mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 02:02:46 +00:00
* WIP POC of dispatcher
* Dispatcher python workflow setup.
* Dispatcher cleanup and updates.
Further dispatcher cleanup and updates.
Build fixes
Improvements and python to CK example
Improvements to readme
* Fixes to python paths
* Cleaning up code
* Improving dispatcher support for different arch
Fixing typos
* Fix formatting errors
* Cleaning up examples
* Improving code generation
* Improving and fixing C++ examples
* Adding conv functionality (fwd,bwd,bwdw) and examples.
* Fixes based on feedback.
* Further fixes based on feedback.
* Adding stress test for autogeneration and autocorrection, and fixing preshuffle bug.
* Another round of improvements based on feedback.
* Trimming out unnecessary code.
* Fixing the multi-D implementation.
* Using gpu verification for gemms and fixing convolutions tflops calculation.
* Fix counter usage issue and arch filtering per ops.
* Adding changelog and other fixes.
* Improve examples and resolve critical bugs.
* Reduce build time for python examples.
* Fixing minor bug.
* Fix compilation error.
* Improve installation instructions for dispatcher.
* Add docker based installation instructions for dispatcher.
* Fixing arch-based filtering to match tile engine.
* Remove dead code and fix arch filtering.
* Minor bugfix.
* Updates after rebase.
* Trimming code.
* Fix copyright headers.
* Consolidate examples, cut down code.
* Minor fixes.
* Improving python examples.
* Update readmes.
* Remove conv functionality.
* Cleanup following conv removal.
[ROCm/composable_kernel commit: 9e049a32a1]
143 lines
4.0 KiB
Python
Executable File
143 lines
4.0 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
|
# SPDX-License-Identifier: MIT
|
|
|
|
"""
|
|
Build kernels in parallel - one translation unit per kernel.
|
|
|
|
This script is called at make time (not cmake time) to avoid slow cmake configuration.
|
|
"""
|
|
|
|
import argparse
|
|
import os
|
|
import subprocess
|
|
import sys
|
|
from pathlib import Path
|
|
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
|
|
|
|
def find_hipcc():
    """Locate the hipcc compiler executable.

    Candidates are tried in order: the ``HIPCC`` environment variable,
    ``/opt/rocm/bin/hipcc``, then whatever ``hipcc`` resolves to on ``PATH``.

    Returns:
        The first candidate that names an existing file, or the bare string
        ``"hipcc"`` when none does (left for the OS to resolve at run time).
    """
    # Imported locally: this file only imports shutil under its ``__main__``
    # guard, so reading the module global raised NameError whenever this
    # function was called from an importing module.
    import shutil

    candidates = (
        os.environ.get("HIPCC"),
        "/opt/rocm/bin/hipcc",
        shutil.which("hipcc"),
    )
    for path in candidates:
        # Entries may be None (unset env var / not on PATH); skip those.
        if path and os.path.isfile(path):
            return path
    return "hipcc"  # Assume in PATH
|
|
|
|
|
|
def compile_kernel(args):
    """Compile a single kernel header into an object file.

    A one-line wrapper ``.cpp`` that includes the header is written next to
    the outputs and compiled with hipcc, giving one translation unit per
    kernel.

    Args:
        args: Tuple ``(kernel_hpp, output_dir, include_dirs, hipcc)``.
            ``kernel_hpp`` and ``output_dir`` are ``pathlib.Path`` objects,
            ``include_dirs`` is an iterable of include directories and
            ``hipcc`` is the compiler executable. A single tuple is used so
            the function can be submitted directly to an executor.

    Returns:
        ``(kernel_name, True, object_file_path)`` on success, or
        ``(kernel_name, False, error_text)`` when the compiler exits with a
        non-zero status or cannot be started at all.
    """
    kernel_hpp, output_dir, include_dirs, hipcc = args
    kernel_name = kernel_hpp.stem

    # Create wrapper .cpp
    wrapper_cpp = output_dir / f"{kernel_name}.cpp"
    wrapper_cpp.write_text(
        "// Auto-generated wrapper\n"
        f'#include "{kernel_hpp.name}"\n'
        "namespace { volatile bool _k = true; }\n"
    )

    # Compile to object
    obj_file = output_dir / f"{kernel_name}.o"

    # NOTE: the offload architecture is fixed to gfx942 here.
    cmd = [
        hipcc,
        "-c",
        "-fPIC",
        "-std=c++17",
        "-O3",
        "--offload-arch=gfx942",
        "-mllvm",
        "-enable-noalias-to-md-conversion=0",
        "-Wno-undefined-func-template",
        "-Wno-float-equal",
        "--offload-compress",
    ]

    for inc_dir in include_dirs:
        cmd.extend(["-I", str(inc_dir)])
    # The wrapper includes the header by bare file name, so the header's own
    # directory has to be on the include path.
    cmd.extend(["-I", str(kernel_hpp.parent)])

    cmd.extend(["-o", str(obj_file), str(wrapper_cpp)])

    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except OSError as exc:
        # Compiler missing or not executable: report it through the normal
        # failure tuple instead of letting the exception escape the worker.
        return (kernel_name, False, f"Failed to run {hipcc}: {exc}")

    if result.returncode != 0:
        return (kernel_name, False, result.stderr)
    return (kernel_name, True, str(obj_file))
|
|
|
|
|
|
def main():
    """Compile every kernel header in a directory and link the objects.

    Command-line arguments (read from ``sys.argv``):
        --kernel-dir    Directory searched for ``gemm_*.hpp`` / ``conv_*.hpp``.
        --output-dir    Directory for wrappers, objects and the library;
                        created if missing.
        --include-dirs  Comma-separated include directories.
        --jobs          Number of parallel compile processes
                        (default: ``os.cpu_count()``).

    Returns:
        ``0`` when there is nothing to build or the shared library was
        linked, ``1`` when any kernel failed to compile or linking failed.
    """
    parser = argparse.ArgumentParser(description="Build kernels in parallel")
    parser.add_argument("--kernel-dir", type=Path, required=True)
    parser.add_argument("--output-dir", type=Path, required=True)
    parser.add_argument("--include-dirs", type=str, required=True)
    parser.add_argument("--jobs", type=int, default=os.cpu_count())
    args = parser.parse_args()

    # Find kernel headers. Sorted because glob order is not guaranteed and
    # a stable order keeps submission and progress output reproducible.
    kernel_headers = sorted(args.kernel_dir.glob("gemm_*.hpp")) + sorted(
        args.kernel_dir.glob("conv_*.hpp")
    )

    if not kernel_headers:
        print("No kernels found to build")
        return 0

    # os.cpu_count() may return None and ProcessPoolExecutor rejects
    # max_workers <= 0, so always end up with at least one worker.
    jobs = max(1, args.jobs or 1)
    total = len(kernel_headers)

    print(f"Building {total} kernels with {jobs} parallel jobs...")

    # Drop empty entries (e.g. from a trailing comma); Path("") would
    # otherwise silently add the current directory to the include path.
    include_dirs = [
        Path(entry.strip())
        for entry in args.include_dirs.split(",")
        if entry.strip()
    ]
    hipcc = find_hipcc()

    args.output_dir.mkdir(parents=True, exist_ok=True)

    # Prepare work items
    work = [(h, args.output_dir, include_dirs, hipcc) for h in kernel_headers]

    # Compile in parallel
    obj_files = []
    failed = []

    with ProcessPoolExecutor(max_workers=jobs) as executor:
        futures = {executor.submit(compile_kernel, w): w[0].stem for w in work}

        for i, future in enumerate(as_completed(futures), 1):
            try:
                name, success, result = future.result()
            except Exception as exc:
                # A crashed worker counts as one failed kernel rather than
                # aborting the whole build with a traceback.
                name, success, result = futures[future], False, str(exc)

            if success:
                obj_files.append(result)
                print(f"[{i}/{total}] Built: {name}")
            else:
                failed.append((name, result))
                print(f"[{i}/{total}] FAILED: {name}")

    if failed:
        print(f"\n{len(failed)} kernels failed to compile:")
        # Only the first few failures, each truncated, to keep output short.
        for name, err in failed[:5]:
            print(f"  {name}: {err[:100]}")
        return 1

    # Link into shared library
    print(f"\nLinking {len(obj_files)} objects into libdispatcher_kernels.so...")
    lib_path = args.output_dir / "libdispatcher_kernels.so"

    # Objects were collected in completion order; sort them so the link
    # command is the same from run to run.
    obj_files.sort()
    link_cmd = [hipcc, "-shared", "-fPIC", "-o", str(lib_path)] + obj_files
    result = subprocess.run(link_cmd, capture_output=True, text=True)

    if result.returncode != 0:
        print(f"Linking failed: {result.stderr}")
        return 1

    print(f"✓ Built: {lib_path}")
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point. shutil is imported here rather than with the other
    # imports at the top of the file, so the name `shutil` only becomes a
    # module global when this file is executed directly.
    import shutil

    # Use main()'s integer return value as the process exit status.
    sys.exit(main())
|