Files
Ville Pietilä ae4e632c7d [rocm-libraries] ROCm/rocm-libraries#4797 (commit 1a30400)
[CK_TILE] Add CK Tile bwd weight profiler
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Motivation

To compare old CK and CK Tile, we need to extend the current CK profiler
to also support running CK Tile instances with the same API. In order to
have the same instance coverage in CK Tile compared to the old CK, I've
added code generation from old CK configurations to CK Tile instances
using the CK Builder.

## Technical Details

- The codegen Python script for CK Tile fwd convs is extended to also
support bwd weight and bwd data.
- The generated instances are added to the CMake build (target
`device_grouped_conv_bwd_weight_tile_instances`).
- A new profiler op (`grouped_conv_bwd_weight_tile`) has been added to
the CK Profiler.
2026-03-04 21:50:29 +00:00

303 lines
12 KiB
Python
Executable File

#!/usr/bin/env python3
# filepath: /home/AMD/vpietila/git/rocm-libraries/projects/composablekernel/experimental/grouped_convolution_tile_instances/check_instances.py
"""
Script to check which backward weight convolution instances compile successfully.
Compiles each .cpp file independently and reports failures grouped by layout/datatype.
"""
import subprocess
import sys
from pathlib import Path
from collections import defaultdict
import argparse
import tempfile
from concurrent.futures import ThreadPoolExecutor, as_completed
import re
# Configuration
CXX_COMPILER = "/opt/rocm/bin/hipcc"
CXX_STANDARD = "20"
def get_include_dirs(project_root: Path) -> list[str]:
    """Return the include search paths needed to compile an instance file.

    All paths are resolved relative to *project_root* and returned as strings
    suitable for building ``-I`` compiler flags.
    """
    relative_parts = [
        ("build", "include"),
        ("include",),
        ("library", "include"),
        ("experimental", "builder", "include"),
        ("experimental", "builder", "test", "utils"),
        ("experimental", "grouped_convolution_tile_instances", "instances"),
    ]
    return [str(project_root.joinpath(*parts)) for parts in relative_parts]
def compile_single_file(cpp_file: Path, project_root: Path, gpu_target: str, verbose: bool) -> tuple[bool, str]:
    """Try to compile one .cpp file to an object file in a throwaway directory.

    Returns ``(True, "")`` on success, otherwise ``(False, <error text>)``.
    The *verbose* flag is accepted for signature compatibility with the
    caller's submission loop.
    """
    include_flags = [f"-I{inc}" for inc in get_include_dirs(project_root)]
    # The object file lands in a temporary directory that is discarded after
    # the compile attempt — only the pass/fail outcome matters.
    with tempfile.TemporaryDirectory() as scratch_dir:
        object_path = Path(scratch_dir) / "output.o"
        command = [
            CXX_COMPILER,
            "-c",  # Compile only, don't link
            f"-std=c++{CXX_STANDARD}",
            f"--offload-arch={gpu_target}",
            "-D__HIP_PLATFORM_AMD__",
            "-D CK_EXPERIMENTAL_BUILDER=ON",
            "-O3",
        ]
        command.extend(include_flags)
        command.extend([str(cpp_file), "-o", str(object_path)])
        try:
            proc = subprocess.run(
                command,
                capture_output=True,
                text=True,
                timeout=300,  # 5 minute timeout per file
            )
        except subprocess.TimeoutExpired:
            return False, "TIMEOUT: Compilation took too long"
        except Exception as exc:
            return False, f"EXCEPTION: {str(exc)}"
        if proc.returncode != 0:
            # Hand the raw stderr back; the caller extracts the key error.
            return False, proc.stderr
        return True, ""
def extract_key_error(error_output: str) -> str:
    """Pick the single most informative line out of raw compiler stderr.

    Prefers the first line containing an explicit ``error:`` marker; falls
    back to the first non-blank line (capped at 200 characters), and finally
    to a fixed placeholder when the output is empty.
    """
    all_lines = error_output.split('\n')
    explicit = next((ln.strip() for ln in all_lines if 'error:' in ln), None)
    if explicit is not None:
        return explicit
    for candidate in all_lines:
        stripped = candidate.strip()
        if stripped:
            return stripped[:200]  # Limit length
    return "Unknown error"
def find_instance_files(instances_dir: Path, direction: str = "backward_weight") -> dict[str, list[Path]]:
    """Collect instance .cpp files grouped by layout/datatype subdirectory.

    Returns a mapping of subdirectory name -> sorted list of .cpp paths.
    Subdirectories with no .cpp files are omitted. Exits the process with
    status 1 when the direction directory does not exist.
    """
    target_dir = instances_dir / direction
    if not target_dir.exists():
        print(f"Error: Directory {target_dir} does not exist")
        sys.exit(1)
    grouped = defaultdict(list)
    for entry in sorted(target_dir.iterdir()):
        if not entry.is_dir():
            continue
        sources = sorted(entry.glob("*.cpp"))
        if sources:
            grouped[entry.name] = sources
    return grouped
def parse_args():
    """Build and evaluate the command-line interface for this checker."""
    parser = argparse.ArgumentParser(description="Check which convolution instances compile")
    # What to compile, and optional narrowing of the candidate set.
    parser.add_argument("--direction", default="backward_weight",
                        choices=["forward", "backward_weight", "backward_data"],
                        help="Convolution direction to check")
    parser.add_argument("--subdir", default=None,
                        help="Only check specific subdirectory (e.g., 'nhwgc_bf16')")
    parser.add_argument("--max-files", type=int, default=None,
                        help="Maximum number of files to check per subdirectory")
    parser.add_argument("--instance", type=int, default=None,
                        help="Only check a single instance by its index in the config file.")
    # Reporting and environment knobs.
    parser.add_argument("--verbose", "-v", action="store_true",
                        help="Show verbose output including compile commands")
    parser.add_argument("--output", "-o", default=None,
                        help="Output file for the blacklist")
    parser.add_argument("--project-root", default=None,
                        help="Project root directory (auto-detected if not specified)")
    parser.add_argument("--parallel-jobs", "-j", type=int, default=1,
                        help="Number of parallel compilation jobs (default: 1)")
    parser.add_argument("--gpu-target", type=str, default="gfx950",
                        help="GPU target architecture (default: gfx950)")
    return parser.parse_args()
def main() -> int:
    """Compile every selected instance file and report per-subdirectory results.

    Returns:
        0 when all checked files compile, 1 when at least one fails.
    """
    args = parse_args()
    # NOTE(review): args.output ("blacklist" file) is parsed but never
    # written anywhere — confirm whether emitting the failure list to it
    # is still intended.

    # Resolve the project root: an explicit flag wins; otherwise assume this
    # script lives in experimental/grouped_convolution_tile_instances/.
    if args.project_root:
        project_root = Path(args.project_root)
    else:
        script_dir = Path(__file__).resolve().parent
        project_root = script_dir.parent.parent  # Go up to composablekernel/
    instances_dir = project_root / "experimental" / "grouped_convolution_tile_instances" / "instances"

    print(f"Project root: {project_root}")
    print(f"Instances directory: {instances_dir}")
    print(f"Compiler: {CXX_COMPILER}")
    print(f"GPU Target: {args.gpu_target}")
    print(f"Direction: {args.direction}")
    if args.instance is not None:
        print(f"Checking only instance index: {args.instance}")
    print()

    # Find all instance files, then narrow the set by --subdir / --instance.
    files_by_subdir = find_instance_files(instances_dir, args.direction)
    if args.subdir:
        if args.subdir not in files_by_subdir:
            print(f"Error: Subdirectory '{args.subdir}' not found")
            print(f"Available: {list(files_by_subdir.keys())}")
            sys.exit(1)
        files_by_subdir = {args.subdir: files_by_subdir[args.subdir]}

    if args.instance is not None:
        # Keep only the file whose name ends with _<instance>.cpp per subdir.
        instance_files = {}
        for subdir, files in files_by_subdir.items():
            if args.instance < 0:
                continue  # negative indices never match a _<n>.cpp suffix
            target_suffix = f"_{args.instance}.cpp"
            matched_files = [f for f in files if f.name.endswith(target_suffix)]
            if matched_files:
                assert len(matched_files) == 1, f"Expected exactly one file ending with {target_suffix} in {subdir}, found {len(matched_files)}"
                instance_files[subdir] = matched_files
            elif args.subdir is None:
                print(f"Warning: Subdirectory '{subdir}' does not have instance index {args.instance}")
        files_by_subdir = instance_files
        # The requested instance may not exist in the requested subdir.
        if args.subdir and args.subdir not in files_by_subdir:
            print(f"Instance index {args.instance} was not found in subdirectory '{args.subdir}'")
            sys.exit(1)

    # Trackers shared by the parallel and sequential paths.
    all_failures = defaultdict(list)   # subdir -> list of (filename, key error)
    all_successes = defaultdict(list)  # subdir -> list of filenames
    error_types = defaultdict(set)     # key error -> set of "subdir/filename"

    total_files = sum(len(files) for files in files_by_subdir.values())
    if args.max_files:
        total_files = min(total_files, args.max_files * len(files_by_subdir))
    print(f"Found {total_files} instance files to check")
    print("=" * 60)

    checked = 0

    def record(subdir_name: str, filename: str, success: bool, error: str) -> None:
        # Print one per-file progress line and file the outcome into the
        # trackers above (shared by both execution modes).
        if success:
            print(f"  [{checked}/{total_files}] {filename}... OK", flush=True)
            all_successes[subdir_name].append(filename)
        else:
            key_error = extract_key_error(error)
            print(f"  [{checked}/{total_files}] {filename}... FAILED", flush=True)
            if args.verbose:
                print(f"    Error: {key_error}")
            all_failures[subdir_name].append((filename, key_error))
            error_types[key_error].add(f"{subdir_name}/{filename}")

    for subdir_name, cpp_files in sorted(files_by_subdir.items()):
        print(f"\nChecking {subdir_name}...", flush=True)
        files_to_check = cpp_files[:args.max_files] if args.max_files else cpp_files

        if args.parallel_jobs > 1:
            # Parallel compilation: submit everything, report as results land.
            with ThreadPoolExecutor(max_workers=args.parallel_jobs) as executor:
                futures = {
                    executor.submit(compile_single_file, cpp_file, project_root,
                                    args.gpu_target, args.verbose): cpp_file
                    for cpp_file in files_to_check
                }
                for future in as_completed(futures):
                    checked += 1
                    success, error = future.result()
                    record(subdir_name, futures[future].name, success, error)
        else:
            # Sequential compilation.
            print(f"Compiling {len(files_to_check)} files sequentially...")
            for cpp_file in files_to_check:
                checked += 1
                success, error = compile_single_file(cpp_file, project_root,
                                                     args.gpu_target, args.verbose)
                record(subdir_name, cpp_file.name, success, error)

    # Print summary.
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    # Filenames end with _<instance_index>.cpp; compile the pattern once and
    # sort failures numerically by that index for a stable report.
    index_pattern = re.compile(r'_(\d+)\.cpp$')

    def instance_index(item: tuple[str, str]) -> int:
        match = index_pattern.search(item[0])
        return int(match.group(1)) if match else 0

    for subdir_name in sorted(files_by_subdir.keys()):
        successes = len(all_successes.get(subdir_name, []))
        failures = len(all_failures.get(subdir_name, []))
        total = successes + failures
        print(f"\n{subdir_name}: {successes}/{total} passed, {failures} failed")
        if failures > 0:
            print("  Failed files:")
            for filename, error in sorted(all_failures[subdir_name], key=instance_index):
                print(f"    - {filename}")

    # Exit status mirrors the overall outcome.
    total_failures = sum(len(f) for f in all_failures.values())
    if total_failures > 0:
        print(f"\n{total_failures} total failures found")
        return 1
    print("\nAll instances compiled successfully!")
    return 0
if __name__ == "__main__":
    # Propagate main()'s status (1 on any compile failure, 0 otherwise)
    # as the process exit code.
    sys.exit(main())