Files
composable_kernel/dispatcher/scripts/parallel_kernel_builder.py
Vidyasagar Ananthan 9e049a32a1 Adding dispatcher architecture (#3300)
* WIP POC of dispatcher

* Dispatcher python workflow setup.

* Dispatcher cleanup and updates.

Further dispatcher cleanup and updates.

Build fixes

Improvements and python to CK example

Improvements to readme

* Fixes to python paths

* Cleaning up code

* Improving dispatcher support for different arch

Fixing typos

* Fix formatting errors

* Cleaning up examples

* Improving codegeneration

* Improving and fixing C++ examples

* Adding conv functionality (fwd,bwd,bwdw) and examples.

* Fixes based on feedback.

* Further fixes based on feedback.

* Adding stress test for autogeneration and autocorrection, and fixing preshuffle bug.

* Another round of improvements  based on feedback.

* Trimming out unnecessary code.

* Fixing the multi-D implementation.

* Using gpu verification for gemms and fixing convolutions tflops calculation.

* Fix counter usage issue and arch filtering per ops.

* Adding changelog and other fixes.

* Improve examples and resolve critical bugs.

* Reduce build time for python examples.

* Fixing minor bug.

* Fix compilation error.

* Improve installation instructions for dispatcher.

* Add docker based  installation instructions for dispatcher.

* Fixing arch-based filtering to match tile engine.

* Remove dead code and fix arch filtering.

* Minor bugfix.

* Updates after rebase.

* Trimming code.

* Fix copyright headers.

* Consolidate examples, cut down code.

* Minor fixes.

* Improving python examples.

* Update readmes.

* Remove conv functionality.

* Cleanup following conv removable.
2026-01-22 09:34:33 -08:00

143 lines
4.0 KiB
Python
Executable File

#!/usr/bin/env python3
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT
"""
Build kernels in parallel - one translation unit per kernel.
This script is called at make time (not cmake time) to avoid slow cmake configuration.
"""
import argparse
import os
import subprocess
import sys
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed
def find_hipcc():
"""Find hipcc compiler."""
candidates = [
os.environ.get("HIPCC"),
"/opt/rocm/bin/hipcc",
shutil.which("hipcc") if shutil else None,
]
for path in candidates:
if path and os.path.isfile(path):
return path
return "hipcc" # Assume in PATH
def compile_kernel(args):
"""Compile a single kernel."""
kernel_hpp, output_dir, include_dirs, hipcc = args
kernel_name = kernel_hpp.stem
# Create wrapper .cpp
wrapper_cpp = output_dir / f"{kernel_name}.cpp"
wrapper_cpp.write_text(f'''// Auto-generated wrapper
#include "{kernel_hpp.name}"
namespace {{ volatile bool _k = true; }}
''')
# Compile to object
obj_file = output_dir / f"{kernel_name}.o"
cmd = [
hipcc,
"-c",
"-fPIC",
"-std=c++17",
"-O3",
"--offload-arch=gfx942",
"-mllvm",
"-enable-noalias-to-md-conversion=0",
"-Wno-undefined-func-template",
"-Wno-float-equal",
"--offload-compress",
]
for inc_dir in include_dirs:
cmd.extend(["-I", str(inc_dir)])
cmd.extend(["-I", str(kernel_hpp.parent)])
cmd.extend(["-o", str(obj_file), str(wrapper_cpp)])
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
return (kernel_name, False, result.stderr)
return (kernel_name, True, str(obj_file))
def main():
parser = argparse.ArgumentParser(description="Build kernels in parallel")
parser.add_argument("--kernel-dir", type=Path, required=True)
parser.add_argument("--output-dir", type=Path, required=True)
parser.add_argument("--include-dirs", type=str, required=True)
parser.add_argument("--jobs", type=int, default=os.cpu_count())
args = parser.parse_args()
# Find kernel headers
kernel_headers = list(args.kernel_dir.glob("gemm_*.hpp")) + list(
args.kernel_dir.glob("conv_*.hpp")
)
if not kernel_headers:
print("No kernels found to build")
return 0
print(f"Building {len(kernel_headers)} kernels with {args.jobs} parallel jobs...")
include_dirs = [Path(p.strip()) for p in args.include_dirs.split(",")]
hipcc = find_hipcc()
args.output_dir.mkdir(parents=True, exist_ok=True)
# Prepare work items
work = [(h, args.output_dir, include_dirs, hipcc) for h in kernel_headers]
# Compile in parallel
obj_files = []
failed = []
with ProcessPoolExecutor(max_workers=args.jobs) as executor:
futures = {executor.submit(compile_kernel, w): w[0].name for w in work}
for i, future in enumerate(as_completed(futures), 1):
name, success, result = future.result()
if success:
obj_files.append(result)
print(f"[{i}/{len(kernel_headers)}] Built: {name}")
else:
failed.append((name, result))
print(f"[{i}/{len(kernel_headers)}] FAILED: {name}")
if failed:
print(f"\n{len(failed)} kernels failed to compile:")
for name, err in failed[:5]:
print(f" {name}: {err[:100]}")
return 1
# Link into shared library
print(f"\nLinking {len(obj_files)} objects into libdispatcher_kernels.so...")
lib_path = args.output_dir / "libdispatcher_kernels.so"
link_cmd = [hipcc, "-shared", "-fPIC", "-o", str(lib_path)] + obj_files
result = subprocess.run(link_cmd, capture_output=True, text=True)
if result.returncode != 0:
print(f"Linking failed: {result.stderr}")
return 1
print(f"✓ Built: {lib_path}")
return 0
if __name__ == "__main__":
import shutil
sys.exit(main())