mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-03-21 15:47:38 +00:00
[CK_TILE] Add CK Tile bwd weight profiler
## Motivation
To compare old CK and CK Tile, we need to extend the current CK profiler so that it can also run CK Tile instances with the same API. To have the same instance coverage in CK Tile as in the old CK, I've added code generation from old CK configurations to CK Tile instances using the CK Builder.
## Technical Details
- The codegen Python script for CK Tile fwd convs is extended to also support bwd weight and bwd data.
- The generated instances are added to the CMake build (target `device_grouped_conv_bwd_weight_tile_instance`s).
- A new profiler op (`grouped_conv_bwd_weight_tile`) has been added to the CK Profiler.
303 lines
12 KiB
Python
Executable File
303 lines
12 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
# filepath: /home/AMD/vpietila/git/rocm-libraries/projects/composablekernel/experimental/grouped_convolution_tile_instances/check_instances.py
|
|
|
|
"""
Script to check which grouped convolution instances compile successfully.

Compiles each .cpp file of the selected direction (backward weight by default)
independently and reports failures grouped by layout/datatype.
"""
|
|
|
|
import subprocess
|
|
import sys
|
|
from pathlib import Path
|
|
from collections import defaultdict
|
|
import argparse
|
|
import tempfile
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
import re
|
|
|
|
# Toolchain configuration shared by every compile invocation in this script.
CXX_COMPILER: str = "/opt/rocm/bin/hipcc"  # first element of the compile command
CXX_STANDARD: str = "20"  # substituted into the -std=c++<N> flag
|
|
|
|
def get_include_dirs(project_root: Path) -> list[str]:
    """Return the include directories, as strings, rooted at *project_root*.

    The order of the returned list is the order in which the directories are
    handed to the compiler as ``-I`` flags by the caller.
    """
    # Each entry is the sequence of path components below the project root.
    relative_dirs = (
        ("build", "include"),
        ("include",),
        ("library", "include"),
        ("experimental", "builder", "include"),
        ("experimental", "builder", "test", "utils"),
        ("experimental", "grouped_convolution_tile_instances", "instances"),
    )
    return [str(project_root.joinpath(*parts)) for parts in relative_dirs]
|
|
|
|
def compile_single_file(cpp_file: Path, project_root: Path, gpu_target: str, verbose: bool) -> tuple[bool, str]:
    """
    Attempt to compile a single .cpp file to an object file.

    The object file is written into a temporary directory that is removed
    before the function returns; only the compiler's exit status and its
    diagnostics are kept.

    Args:
        cpp_file: Source file to compile.
        project_root: Root directory used to resolve the include directories.
        gpu_target: Value passed to the compiler's ``--offload-arch`` flag.
        verbose: When true, echo the full compile command to stderr.

    Returns:
        ``(True, "")`` when the compiler exits with status 0. Otherwise
        ``(False, message)`` where ``message`` is the captured compiler
        output, or a ``TIMEOUT:`` / ``EXCEPTION:`` prefixed description when
        the compiler could not be run to completion.
    """
    timeout_seconds = 300  # 5 minute timeout per file
    include_flags = [f"-I{d}" for d in get_include_dirs(project_root)]

    # The object file itself is never inspected, so it lives in a throwaway
    # directory that is cleaned up automatically.
    with tempfile.TemporaryDirectory() as tmpdir:
        output_file = Path(tmpdir) / "output.o"

        cmd = [
            CXX_COMPILER,
            "-c",  # Compile only, don't link
            f"-std=c++{CXX_STANDARD}",
            f"--offload-arch={gpu_target}",
            "-D__HIP_PLATFORM_AMD__",
            "-D CK_EXPERIMENTAL_BUILDER=ON",
            "-O3",
            *include_flags,
            str(cpp_file),
            "-o", str(output_file),
        ]

        if verbose:
            # Written to stderr so the progress report on stdout stays intact.
            print(f"Command: {' '.join(cmd)}", file=sys.stderr, flush=True)

        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                # Undecodable bytes in compiler output must not be reported
                # as a compilation failure of the instance itself.
                errors="replace",
                timeout=timeout_seconds,
            )
        except subprocess.TimeoutExpired:
            return False, "TIMEOUT: Compilation took too long"
        except Exception as e:
            # Deliberately broad: a compiler that cannot be launched marks the
            # file as failed instead of aborting the whole run.
            return False, f"EXCEPTION: {str(e)}"

    if result.returncode == 0:
        return True, ""

    # Diagnostics normally arrive on stderr; fall back to stdout so a failure
    # is never reported with an empty message when output exists.
    return False, result.stderr or result.stdout
|
|
|
|
def extract_key_error(error_output: str) -> str:
    """Pick the single most informative line out of compiler output.

    Preference order:
      1. the first line containing ``error:`` (returned stripped, full length);
      2. the first non-blank line, stripped and cut to 200 characters;
      3. the literal ``"Unknown error"`` when there is no non-blank line.
    """
    stripped_lines = [raw.strip() for raw in error_output.split('\n')]

    first_error = next((text for text in stripped_lines if 'error:' in text), None)
    if first_error is not None:
        return first_error

    # No explicit error marker: fall back to whatever was printed first.
    first_nonblank = next((text for text in stripped_lines if text), None)
    if first_nonblank is None:
        return "Unknown error"
    return first_nonblank[:200]  # Limit length
|
|
|
|
def find_instance_files(instances_dir: Path, direction: str = "backward_weight") -> dict[str, list[Path]]:
    """
    Collect the instance .cpp files of one direction, grouped by subdirectory.

    Only immediate subdirectories of ``instances_dir / direction`` that hold
    at least one ``*.cpp`` file appear in the result. Prints an error and
    exits the process with status 1 when that directory does not exist.

    Returns dict: subdirectory_name -> sorted list of cpp files
    """
    target_dir = instances_dir / direction
    if not target_dir.exists():
        print(f"Error: Directory {target_dir} does not exist")
        sys.exit(1)

    files_by_subdir = defaultdict(list)
    for entry in sorted(target_dir.iterdir()):
        if not entry.is_dir():
            continue
        sources = sorted(entry.glob("*.cpp"))
        if not sources:
            continue
        files_by_subdir[entry.name] = sources

    return files_by_subdir
|
|
|
|
def parse_args():
    """Parse the command line of the instance checker and return the namespace."""
    # (option strings, add_argument keyword arguments), in help-output order.
    option_specs = [
        (
            ("--direction",),
            {
                "default": "backward_weight",
                "choices": ["forward", "backward_weight", "backward_data"],
                "help": "Convolution direction to check",
            },
        ),
        (
            ("--subdir",),
            {
                "default": None,
                "help": "Only check specific subdirectory (e.g., 'nhwgc_bf16')",
            },
        ),
        (
            ("--max-files",),
            {
                "type": int,
                "default": None,
                "help": "Maximum number of files to check per subdirectory",
            },
        ),
        (
            ("--verbose", "-v"),
            {
                "action": "store_true",
                "help": "Show verbose output including compile commands",
            },
        ),
        (
            ("--output", "-o"),
            {
                "default": None,
                "help": "Output file for the blacklist",
            },
        ),
        (
            ("--project-root",),
            {
                "default": None,
                "help": "Project root directory (auto-detected if not specified)",
            },
        ),
        (
            ("--instance",),
            {
                "type": int,
                "default": None,
                "help": "Only check a single instance by its index in the config file.",
            },
        ),
        (
            ("--parallel-jobs", "-j"),
            {
                "type": int,
                "default": 1,
                "help": "Number of parallel compilation jobs (default: 1)",
            },
        ),
        (
            ("--gpu-target",),
            {
                "type": str,
                "default": "gfx950",
                "help": "GPU target architecture (default: gfx950)",
            },
        ),
    ]

    parser = argparse.ArgumentParser(description="Check which convolution instances compile")
    for option_strings, settings in option_specs:
        parser.add_argument(*option_strings, **settings)

    return parser.parse_args()
|
|
|
|
def _instance_index(filename: str) -> int:
    """Return the numeric index from a ``*_<index>.cpp`` filename, or 0 if absent."""
    match = re.search(r'_(\d+)\.cpp$', filename)
    return int(match.group(1)) if match else 0


def _select_instance_files(files_by_subdir, instance: int, warn_missing: bool):
    """Keep, per subdirectory, only the file whose name ends with ``_<instance>.cpp``."""
    target_suffix = f"_{instance}.cpp"
    selected = {}
    for subdir, files in files_by_subdir.items():
        matched_files = [f for f in files if f.name.endswith(target_suffix)]
        if len(matched_files) > 1:
            # Explicit check instead of assert: asserts are stripped under -O.
            print(f"Error: Expected exactly one file ending with {target_suffix} in {subdir}, found {len(matched_files)}")
            sys.exit(1)
        if matched_files:
            selected[subdir] = matched_files
        elif warn_missing:
            print(f"Warning: Subdirectory '{subdir}' does not have instance index {instance}")
    return selected


def _record_result(subdir_name, filename, success, error, verbose, all_successes, all_failures):
    """Store one compile outcome; in verbose mode also print the key error line."""
    if success:
        all_successes[subdir_name].append(filename)
        return
    key_error = extract_key_error(error)
    if verbose:
        print(f" Error: {key_error}")
    all_failures[subdir_name].append((filename, key_error))


def main():
    """
    Compile the selected instance files and print a per-subdirectory report.

    Returns:
        0 when every checked file compiled, 1 when at least one failed.
        Invalid selections (unknown subdirectory, negative or unmatched
        instance index, negative --max-files) exit the process with status 1.
    """
    args = parse_args()

    # NOTE(review): --output ("Output file for the blacklist") is parsed but
    # not used anywhere in this function; the expected file format is not
    # defined in this script, so nothing is written. Confirm intended behavior.

    if args.instance is not None and args.instance < 0:
        print(f"Error: --instance must be non-negative, got {args.instance}")
        sys.exit(1)
    if args.max_files is not None and args.max_files < 0:
        print(f"Error: --max-files must be non-negative, got {args.max_files}")
        sys.exit(1)

    # Find project root
    if args.project_root:
        project_root = Path(args.project_root)
    else:
        # Assume script is in experimental/grouped_convolution_tile_instances/
        script_dir = Path(__file__).resolve().parent
        project_root = script_dir.parent.parent  # Go up to composablekernel/

    instances_dir = project_root / "experimental" / "grouped_convolution_tile_instances" / "instances"

    print(f"Project root: {project_root}")
    print(f"Instances directory: {instances_dir}")
    print(f"Compiler: {CXX_COMPILER}")
    print(f"GPU Target: {args.gpu_target}")
    print(f"Direction: {args.direction}")
    if args.instance is not None:
        print(f"Checking only instance index: {args.instance}")
    print()

    # Find all instance files
    files_by_subdir = find_instance_files(instances_dir, args.direction)

    # If sub directory is defined, check only that sub directory
    if args.subdir:
        if args.subdir not in files_by_subdir:
            print(f"Error: Subdirectory '{args.subdir}' not found")
            print(f"Available: {list(files_by_subdir.keys())}")
            sys.exit(1)
        files_by_subdir = {args.subdir: files_by_subdir[args.subdir]}

    if args.instance is not None:
        files_by_subdir = _select_instance_files(
            files_by_subdir, args.instance, warn_missing=args.subdir is None
        )
        if args.subdir and args.subdir not in files_by_subdir:
            print(f"Instance index {args.instance} was not found in subdirectory '{args.subdir}'")
            sys.exit(1)
        if not files_by_subdir:
            # Checking zero files must not be reported as a successful run.
            print(f"Error: Instance index {args.instance} was not found in any subdirectory")
            sys.exit(1)

    # Apply the per-subdirectory limit up front so the progress total is the
    # number of files that will actually be compiled. 0 means "no limit".
    if args.max_files:
        files_by_subdir = {
            name: files[:args.max_files] for name, files in files_by_subdir.items()
        }

    # Track results
    all_failures = defaultdict(list)  # subdir -> list of (filename, error)
    all_successes = defaultdict(list)  # subdir -> list of filenames

    total_files = sum(len(files) for files in files_by_subdir.values())

    print(f"Found {total_files} instance files to check")
    print("=" * 60)

    checked = 0
    for subdir_name, files_to_check in sorted(files_by_subdir.items()):
        print(f"\nChecking {subdir_name}...", flush=True)

        if args.parallel_jobs > 1:
            # Parallel compilation
            with ThreadPoolExecutor(max_workers=args.parallel_jobs) as executor:
                # Submit all compilation jobs
                futures = {
                    executor.submit(compile_single_file, cpp_file, project_root, args.gpu_target, args.verbose): cpp_file
                    for cpp_file in files_to_check
                }

                # Process results as they complete (completion order, not
                # submission order)
                for future in as_completed(futures):
                    filename = futures[future].name
                    checked += 1
                    success, error = future.result()
                    status = "OK" if success else "FAILED"
                    print(f" [{checked}/{total_files}] {filename}... {status}", flush=True)
                    _record_result(subdir_name, filename, success, error,
                                   args.verbose, all_successes, all_failures)
        else:
            # Sequential compilation
            print(f"Compiling {len(files_to_check)} files sequentially...")
            for cpp_file in files_to_check:
                checked += 1
                filename = cpp_file.name

                # Print the prefix before compiling so long compiles show
                # which file is in progress; the status completes the line.
                print(f" [{checked}/{total_files}] {filename}...", end=" ", flush=True)

                success, error = compile_single_file(cpp_file, project_root, args.gpu_target, args.verbose)

                print("OK" if success else "FAILED")
                _record_result(subdir_name, filename, success, error,
                               args.verbose, all_successes, all_failures)

    # Print summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)

    for subdir_name in sorted(files_by_subdir):
        successes = len(all_successes.get(subdir_name, []))
        failed_entries = all_failures.get(subdir_name, [])
        failures = len(failed_entries)
        total = successes + failures
        print(f"\n{subdir_name}: {successes}/{total} passed, {failures} failed")

        if failures > 0:
            print(" Failed files:")
            # Order the failures by the filename for consistency
            # Each filename ends with _{instance_index}.cpp, so we can sort by instance index
            sorted_failures = sorted(failed_entries, key=lambda entry: _instance_index(entry[0]))
            for filename, _ in sorted_failures:
                print(f" - {filename}")

    # Return exit code based on failures
    total_failures = sum(len(f) for f in all_failures.values())
    if total_failures > 0:
        print(f"\n{total_failures} total failures found")
        return 1

    print("\nAll instances compiled successfully!")
    return 0
|
|
|
|
# Script entry point: the return value of main() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())