mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-19 22:39:03 +00:00
Adding dispatcher architecture (#3300)
* WIP POC of dispatcher * Dispatcher python workflow setup. * Dispatcher cleanup and updates. Further dispatcher cleanup and updates. Build fixes Improvements and python to CK example Improvements to readme * Fixes to python paths * Cleaning up code * Improving dispatcher support for different arch Fixing typos * Fix formatting errors * Cleaning up examples * Improving code generation * Improving and fixing C++ examples * Adding conv functionality (fwd,bwd,bwdw) and examples. * Fixes based on feedback. * Further fixes based on feedback. * Adding stress test for autogeneration and autocorrection, and fixing preshuffle bug. * Another round of improvements based on feedback. * Trimming out unnecessary code. * Fixing the multi-D implementation. * Using gpu verification for gemms and fixing convolutions tflops calculation. * Fix counter usage issue and arch filtering per ops. * Adding changelog and other fixes. * Improve examples and resolve critical bugs. * Reduce build time for python examples. * Fixing minor bug. * Fix compilation error. * Improve installation instructions for dispatcher. * Add docker based installation instructions for dispatcher. * Fixing arch-based filtering to match tile engine. * Remove dead code and fix arch filtering. * Minor bugfix. * Updates after rebase. * Trimming code. * Fix copyright headers. * Consolidate examples, cut down code. * Minor fixes. * Improving python examples. * Update readmes. * Remove conv functionality. * Cleanup following conv removal.
This commit is contained in:
committed by
GitHub
parent
44f481a45c
commit
9e049a32a1
142
dispatcher/scripts/parallel_kernel_builder.py
Executable file
142
dispatcher/scripts/parallel_kernel_builder.py
Executable file
@@ -0,0 +1,142 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
"""
|
||||
Build kernels in parallel - one translation unit per kernel.
|
||||
|
||||
This script is called at make time (not cmake time) to avoid slow cmake configuration.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from concurrent.futures import ProcessPoolExecutor, as_completed
|
||||
|
||||
|
||||
def find_hipcc():
    """Locate the hipcc compiler.

    Candidates are tried in order: the ``HIPCC`` environment variable,
    ``/opt/rocm/bin/hipcc``, then whatever ``shutil.which("hipcc")`` finds on
    ``PATH``. The first candidate that names an existing file wins.

    Returns:
        str: Path of the first existing candidate, or the bare name
        ``"hipcc"`` when none exists on disk, leaving resolution to the
        process launcher.
    """
    # Imported locally: the module's top-level imports do not include shutil,
    # so relying on a global bound elsewhere raises NameError whenever this
    # function is called without that binding (e.g. from an importing module).
    import shutil

    candidates = (
        os.environ.get("HIPCC"),
        "/opt/rocm/bin/hipcc",
        shutil.which("hipcc"),
    )
    for path in candidates:
        # Skip unset candidates (None / empty string) and stale paths.
        if path and os.path.isfile(path):
            return path
    return "hipcc"  # Assume in PATH
|
||||
|
||||
|
||||
def compile_kernel(args):
    """Compile a single kernel header into an object file.

    A one-line wrapper ``<kernel>.cpp`` that includes the header is written
    into the output directory and compiled to ``<kernel>.o`` next to it.

    Args:
        args: 4-tuple ``(kernel_hpp, output_dir, include_dirs, hipcc)`` packed
            into one argument so it can be submitted as a single work item.
            ``kernel_hpp`` and ``output_dir`` are ``pathlib.Path`` objects,
            ``include_dirs`` is an iterable of include directories, and
            ``hipcc`` is the compiler executable.

    Returns:
        tuple: ``(kernel_name, True, object_file_path)`` on success, or
        ``(kernel_name, False, error_text)`` when the compiler exits non-zero
        or cannot be started / the wrapper cannot be written.
    """
    kernel_hpp, output_dir, include_dirs, hipcc = args
    kernel_name = kernel_hpp.stem

    wrapper_cpp = output_dir / f"{kernel_name}.cpp"
    obj_file = output_dir / f"{kernel_name}.o"

    cmd = [
        hipcc,
        "-c",
        "-fPIC",
        "-std=c++17",
        "-O3",
        # NOTE(review): the target architecture is fixed here; other GPUs
        # would need this flag changed.
        "--offload-arch=gfx942",
        "-mllvm",
        "-enable-noalias-to-md-conversion=0",
        "-Wno-undefined-func-template",
        "-Wno-float-equal",
        "--offload-compress",
    ]
    for inc_dir in include_dirs:
        cmd.extend(["-I", str(inc_dir)])
    # The wrapper includes the header by bare file name, so the header's own
    # directory must be on the include path.
    cmd.extend(["-I", str(kernel_hpp.parent)])
    cmd.extend(["-o", str(obj_file), str(wrapper_cpp)])

    try:
        # Create wrapper .cpp (one translation unit per kernel).
        wrapper_cpp.write_text(
            "// Auto-generated wrapper\n"
            f'#include "{kernel_hpp.name}"\n'
            "namespace { volatile bool _k = true; }\n"
        )
        # Compile to object
        result = subprocess.run(cmd, capture_output=True, text=True)
    except OSError as exc:
        # A missing/non-executable compiler or an unwritable output directory
        # is reported like any other failed kernel instead of raising out of
        # the worker.
        return (kernel_name, False, f"{type(exc).__name__}: {exc}")

    if result.returncode != 0:
        return (kernel_name, False, result.stderr)
    return (kernel_name, True, str(obj_file))
|
||||
|
||||
|
||||
def main():
    """Build every kernel header in a directory and link one shared library.

    Command-line arguments:
        --kernel-dir    Directory searched for ``gemm_*.hpp`` / ``conv_*.hpp``.
        --output-dir    Directory receiving wrappers, objects and the library
                        (created if missing).
        --include-dirs  Comma-separated include directories.
        --jobs          Number of parallel compile processes
                        (default: ``os.cpu_count()``).

    Returns:
        int: ``0`` when nothing was found to build or the library was linked;
        ``1`` when any kernel failed to compile or linking failed.
    """
    parser = argparse.ArgumentParser(description="Build kernels in parallel")
    parser.add_argument("--kernel-dir", type=Path, required=True)
    parser.add_argument("--output-dir", type=Path, required=True)
    parser.add_argument("--include-dirs", type=str, required=True)
    parser.add_argument("--jobs", type=int, default=os.cpu_count())
    args = parser.parse_args()

    # Find kernel headers; sorted so the work order does not depend on the
    # directory listing order.
    kernel_headers = sorted(
        list(args.kernel_dir.glob("gemm_*.hpp"))
        + list(args.kernel_dir.glob("conv_*.hpp"))
    )
    if not kernel_headers:
        print("No kernels found to build")
        return 0

    # os.cpu_count() may return None and ProcessPoolExecutor rejects
    # max_workers <= 0, so fall back to a usable positive worker count.
    jobs = args.jobs if args.jobs and args.jobs > 0 else (os.cpu_count() or 1)
    total = len(kernel_headers)
    print(f"Building {total} kernels with {jobs} parallel jobs...")

    # Drop empty entries (e.g. from a trailing comma): Path("") is the current
    # directory and would silently add "-I ." to every compile command.
    include_dirs = [
        Path(entry.strip())
        for entry in args.include_dirs.split(",")
        if entry.strip()
    ]
    hipcc = find_hipcc()

    args.output_dir.mkdir(parents=True, exist_ok=True)

    # Prepare work items
    work = [(h, args.output_dir, include_dirs, hipcc) for h in kernel_headers]

    # Compile in parallel
    obj_files = []
    failed = []

    with ProcessPoolExecutor(max_workers=jobs) as executor:
        # Map each future back to its kernel name so a crashed worker can
        # still be attributed to a kernel.
        futures = {executor.submit(compile_kernel, w): w[0].stem for w in work}

        for i, future in enumerate(as_completed(futures), 1):
            try:
                name, success, result = future.result()
            except Exception as exc:  # worker raised or the pool broke
                name = futures[future]
                success = False
                result = f"{type(exc).__name__}: {exc}"

            if success:
                obj_files.append(result)
                print(f"[{i}/{total}] Built: {name}")
            else:
                failed.append((name, result))
                print(f"[{i}/{total}] FAILED: {name}")

    if failed:
        print(f"\n{len(failed)} kernels failed to compile:")
        # Only a short excerpt of the first few failures is shown.
        for name, err in failed[:5]:
            print(f" {name}: {err[:100]}")
        return 1

    # Link into shared library
    print(f"\nLinking {len(obj_files)} objects into libdispatcher_kernels.so...")
    lib_path = args.output_dir / "libdispatcher_kernels.so"

    # Objects arrive in completion order; sort them so the link command is
    # the same from run to run.
    link_cmd = [hipcc, "-shared", "-fPIC", "-o", str(lib_path)] + sorted(obj_files)
    try:
        result = subprocess.run(link_cmd, capture_output=True, text=True)
    except OSError as exc:
        print(f"Linking failed: {exc}")
        return 1

    if result.returncode != 0:
        print(f"Linking failed: {result.stderr}")
        return 1

    print(f"✓ Built: {lib_path}")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point. shutil is bound at module scope here, before main()
    # runs, so code that refers to the global name `shutil` can resolve it.
    import shutil

    # Propagate main()'s return value as the process exit status.
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user