composable_kernel/experimental/grouped_convolution_tile_instances/test-instances.py

#!/usr/bin/env python3
# filepath: /home/AMD/vpietila/git/rocm-libraries/projects/composablekernel/experimental/grouped_convolution_tile_instances/check_instances.py

"""
Script to check which backward weight convolution instances compile successfully.
Compiles each .cpp file independently and reports failures grouped by layout/datatype.
"""

import subprocess
import sys
from pathlib import Path
from collections import defaultdict
import argparse
import tempfile
from concurrent.futures import ThreadPoolExecutor, as_completed
import re

# Configuration
# Compiler driver invoked for every instance file (first element of the compile command).
CXX_COMPILER = "/opt/rocm/bin/hipcc"
# C++ language standard; passed to the compiler as -std=c++<value>.
CXX_STANDARD = "20"

def get_include_dirs(project_root: Path) -> list[str]:
    """Return the include search paths, as strings, rooted at *project_root*.

    The order of the returned list is the order in which the paths are later
    handed to the compiler as ``-I`` flags.
    """
    # Each tuple is a path relative to the project root.
    relative_locations = (
        ("build", "include"),
        ("include",),
        ("library", "include"),
        ("experimental", "builder", "include"),
        ("experimental", "builder", "test", "utils"),
        ("experimental", "grouped_convolution_tile_instances", "instances"),
    )
    return [str(project_root.joinpath(*parts)) for parts in relative_locations]

def compile_single_file(cpp_file: Path, project_root: Path, gpu_target: str, verbose: bool) -> tuple[bool, str]:
    """
    Attempt to compile a single .cpp file to an object file (no linking).

    Args:
        cpp_file: Source file to compile.
        project_root: Root used to derive the ``-I`` include directories.
        gpu_target: Value passed to ``--offload-arch``.
        verbose: When true, the full compiler command line is echoed to stderr
            before the compiler is started.

    Returns (success, error_message). On success the message is empty. On
    failure it is the compiler's stderr (falling back to stdout, then to the
    exit code, so the message is never empty), or a ``TIMEOUT:`` /
    ``EXCEPTION:`` marker when the compiler could not be run to completion.
    """
    import shlex  # local import: only needed to render the command in verbose mode

    include_flags = [f"-I{d}" for d in get_include_dirs(project_root)]

    # The object file is thrown away; only the compiler's verdict matters.
    with tempfile.TemporaryDirectory() as tmpdir:
        output_file = Path(tmpdir) / "output.o"

        cmd = [
            CXX_COMPILER,
            "-c",  # Compile only, don't link
            f"-std=c++{CXX_STANDARD}",
            f"--offload-arch={gpu_target}",
            "-D__HIP_PLATFORM_AMD__",
            # Joined form: a single argv element must not contain a blank
            # between -D and the macro name.
            "-DCK_EXPERIMENTAL_BUILDER=ON",
            "-O3",
            *include_flags,
            str(cpp_file),
            "-o", str(output_file)
        ]

        if verbose:
            # stderr keeps the progress report on stdout free of command lines.
            print(f"    Command: {shlex.join(cmd)}", file=sys.stderr, flush=True)

        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=300  # 5 minute timeout per file
            )
        except subprocess.TimeoutExpired:
            return False, "TIMEOUT: Compilation took too long"
        except Exception as e:
            # Best effort: e.g. a missing compiler binary is reported as a
            # failure of this file instead of aborting the whole run.
            return False, f"EXCEPTION: {str(e)}"

    if result.returncode == 0:
        return True, ""

    # Never hand back an empty message for a failed compilation.
    error_output = (
        result.stderr
        or result.stdout
        or f"Compiler exited with code {result.returncode}"
    )
    return False, error_output

def extract_key_error(error_output: str) -> str:
    """Condense compiler output into a single representative line.

    Preference order:
      1. the first line containing ``error:`` (stripped, full length),
      2. the first non-blank line (stripped, cut to 200 characters),
      3. the literal ``"Unknown error"`` when there is no text at all.
    """
    trimmed_lines = [raw.strip() for raw in error_output.split('\n')]

    flagged = next((text for text in trimmed_lines if 'error:' in text), None)
    if flagged is not None:
        return flagged

    first_text = next((text for text in trimmed_lines if text), None)
    if first_text is None:
        return "Unknown error"
    return first_text[:200]  # Limit length

def find_instance_files(instances_dir: Path, direction: str = "backward_weight") -> dict[str, list[Path]]:
    """
    Collect the instance .cpp files below ``instances_dir / direction``.

    Every immediate subdirectory (layout/datatype) that contains at least one
    ``*.cpp`` file contributes one entry: subdirectory name -> sorted list of
    its .cpp files. Subdirectories are visited in sorted order.

    Prints an error and exits the process with status 1 when the direction
    directory does not exist.
    """
    direction_root = instances_dir / direction
    if not direction_root.exists():
        print(f"Error: Directory {direction_root} does not exist")
        sys.exit(1)

    grouped = defaultdict(list)

    layout_dirs = (entry for entry in sorted(direction_root.iterdir()) if entry.is_dir())
    for layout_dir in layout_dirs:
        sources = sorted(layout_dir.glob("*.cpp"))
        if not sources:
            # Directories without any .cpp file are left out entirely.
            continue
        grouped[layout_dir.name] = sources

    return grouped

def parse_args():
    """Build the command-line parser and return the parsed ``sys.argv`` options."""
    cli = argparse.ArgumentParser(description="Check which convolution instances compile")

    # (flags, keyword arguments) in the order the options appear in --help.
    option_table = [
        (("--direction",), dict(
            default="backward_weight",
            choices=["forward", "backward_weight", "backward_data"],
            help="Convolution direction to check")),
        (("--subdir",), dict(
            default=None,
            help="Only check specific subdirectory (e.g., 'nhwgc_bf16')")),
        (("--max-files",), dict(
            type=int,
            default=None,
            help="Maximum number of files to check per subdirectory")),
        (("--verbose", "-v"), dict(
            action="store_true",
            help="Show verbose output including compile commands")),
        (("--output", "-o"), dict(
            default=None,
            help="Output file for the blacklist")),
        (("--project-root",), dict(
            default=None,
            help="Project root directory (auto-detected if not specified)")),
        (("--instance",), dict(
            type=int,
            default=None,
            help="Only check a single instance by its index in the config file.")),
        (("--parallel-jobs", "-j"), dict(
            type=int,
            default=1,
            help="Number of parallel compilation jobs (default: 1)")),
        (("--gpu-target",), dict(
            type=str,
            default="gfx950",
            help="GPU target architecture (default: gfx950)")),
    ]
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)

    return cli.parse_args()

# Matches the trailing "_<index>.cpp" of an instance file name.
_INSTANCE_INDEX_RE = re.compile(r'_(\d+)\.cpp$')


def _instance_index(filename: str) -> int:
    """Return the numeric index encoded as ``_<index>.cpp`` in *filename*, or 0 if absent."""
    match = _INSTANCE_INDEX_RE.search(filename)
    return int(match.group(1)) if match else 0


def _select_instance_files(files_by_subdir, instance: int, warn_missing: bool) -> dict:
    """Keep, per subdirectory, only the file whose name ends with ``_<instance>.cpp``.

    Subdirectories without such a file are dropped (with a warning when
    *warn_missing* is true). More than one match in a subdirectory is reported
    as an error and terminates the process with status 1.
    """
    target_suffix = f"_{instance}.cpp"
    selected = {}
    for subdir, files in files_by_subdir.items():
        matched_files = [f for f in files if f.name.endswith(target_suffix)]
        if not matched_files:
            if warn_missing:
                print(f"Warning: Subdirectory '{subdir}' does not have instance index {instance}")
            continue
        if len(matched_files) > 1:
            # Explicit check instead of assert: asserts vanish under `python -O`.
            print(f"Error: Expected exactly one file ending with {target_suffix} in {subdir}, "
                  f"found {len(matched_files)}")
            sys.exit(1)
        selected[subdir] = matched_files
    return selected


def _record_result(subdir_name, filename, success, error, verbose, all_successes, all_failures):
    """Store one compilation outcome; in verbose mode also print the key error line."""
    if success:
        all_successes[subdir_name].append(filename)
        return
    key_error = extract_key_error(error)
    if verbose:
        print(f"    Error: {key_error}")
    all_failures[subdir_name].append((filename, key_error))


def main():
    """Compile the selected instance files and print a per-subdirectory summary.

    Returns the process exit code: 0 when every checked file compiled, 1 when
    at least one file failed or when the selection contained no files at all.
    Invalid selections (unknown subdirectory, instance index missing from the
    requested subdirectory) terminate via ``sys.exit(1)``.
    """
    args = parse_args()

    # Find project root
    if args.project_root:
        project_root = Path(args.project_root)
    else:
        # Assume script is in experimental/grouped_convolution_tile_instances/
        script_dir = Path(__file__).resolve().parent
        project_root = script_dir.parent.parent  # Go up to composablekernel/

    instances_dir = project_root / "experimental" / "grouped_convolution_tile_instances" / "instances"

    print(f"Project root: {project_root}")
    print(f"Instances directory: {instances_dir}")
    print(f"Compiler: {CXX_COMPILER}")
    print(f"GPU Target: {args.gpu_target}")
    print(f"Direction: {args.direction}")
    if args.instance is not None:
        print(f"Checking only instance index: {args.instance}")
    if args.output:
        # The option is accepted by the parser but nothing consumes it; say so
        # instead of silently ignoring the request.
        print("Warning: --output is currently ignored; no blacklist file is written")
    print()

    # Find all instance files
    files_by_subdir = find_instance_files(instances_dir, args.direction)

    # If sub directory is defined, check only that sub directory
    if args.subdir:
        if args.subdir not in files_by_subdir:
            print(f"Error: Subdirectory '{args.subdir}' not found")
            print(f"Available: {list(files_by_subdir.keys())}")
            sys.exit(1)
        files_by_subdir = {args.subdir: files_by_subdir[args.subdir]}

    # If an instance index is given, narrow every subdirectory to that one file.
    if args.instance is not None:
        files_by_subdir = _select_instance_files(
            files_by_subdir, args.instance, warn_missing=args.subdir is None
        )
        if args.subdir and args.subdir not in files_by_subdir:
            print(f"Instance index {args.instance} was not found in subdirectory '{args.subdir}'")
            sys.exit(1)

    # Apply the per-subdirectory limit up front so the announced total is the
    # number of files that will really be compiled.
    if args.max_files:
        files_by_subdir = {
            name: files[:args.max_files] for name, files in files_by_subdir.items()
        }

    # Track results
    all_failures = defaultdict(list)  # subdir -> list of (filename, error)
    all_successes = defaultdict(list)  # subdir -> list of filenames

    total_files = sum(len(files) for files in files_by_subdir.values())

    print(f"Found {total_files} instance files to check")
    print("=" * 60)

    if total_files == 0:
        # Checking nothing must not be reported as "everything compiled".
        print("\nError: No instance files matched the selection; nothing was checked")
        return 1

    checked = 0
    for subdir_name, cpp_files in sorted(files_by_subdir.items()):
        print(f"\nChecking {subdir_name}...", flush=True)

        if args.parallel_jobs > 1:
            # Parallel compilation
            with ThreadPoolExecutor(max_workers=args.parallel_jobs) as executor:
                # Submit all compilation jobs
                futures = {
                    executor.submit(compile_single_file, cpp_file, project_root, args.gpu_target, args.verbose): cpp_file
                    for cpp_file in cpp_files
                }

                # Process results as they complete
                for future in as_completed(futures):
                    filename = futures[future].name
                    checked += 1
                    success, error = future.result()

                    status = "OK" if success else "FAILED"
                    print(f"  [{checked}/{total_files}] {filename}... {status}", flush=True)
                    _record_result(subdir_name, filename, success, error,
                                   args.verbose, all_successes, all_failures)
        else:
            # Sequential compilation
            print(f"Compiling {len(cpp_files)} files sequentially...")
            for cpp_file in cpp_files:
                checked += 1
                filename = cpp_file.name

                # Progress prefix first, verdict appended once the compiler returns.
                print(f"  [{checked}/{total_files}] {filename}...", end=" ", flush=True)

                success, error = compile_single_file(cpp_file, project_root, args.gpu_target, args.verbose)

                print("OK" if success else "FAILED")
                _record_result(subdir_name, filename, success, error,
                               args.verbose, all_successes, all_failures)

    # Print summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)

    for subdir_name in sorted(files_by_subdir):
        successes = len(all_successes.get(subdir_name, []))
        failures = len(all_failures.get(subdir_name, []))
        total = successes + failures
        print(f"\n{subdir_name}: {successes}/{total} passed, {failures} failed")

        if failures > 0:
            print("  Failed files:")
            # Order the failures by instance index (file names end with
            # _{instance_index}.cpp) so the report is stable across runs.
            sorted_failures = sorted(
                all_failures[subdir_name],
                key=lambda failure: _instance_index(failure[0])
            )
            for filename, _error in sorted_failures:
                print(f"    - {filename}")

    # Return exit code based on failures
    total_failures = sum(len(f) for f in all_failures.values())
    if total_failures > 0:
        print(f"\n{total_failures} total failures found")
        return 1
    else:
        print("\nAll instances compiled successfully!")
        return 0

if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())