mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-04 13:41:24 +00:00
Adding dispatcher architecture (#3300)
* WIP POC of dispatcher * Dispatcher python workflow setup. * Dispatcher cleanup and updates. Further dispatcher cleanup and updates. Build fixes Improvements and python to CK example Improvements to readme * Fixes to python paths * Cleaning up code * Improving dispatcher support for different arch Fixing typos * Fix formatting errors * Cleaning up examples * Improving codegeneration * Improving and fixing C++ examples * Adding conv functionality (fwd,bwd,bwdw) and examples. * Fixes based on feedback. * Further fixes based on feedback. * Adding stress test for autogeneration and autocorrection, and fixing preshuffle bug. * Another round of improvements based on feedback. * Trimming out unnecessary code. * Fixing the multi-D implementation. * Using gpu verification for gemms and fixing convolutions tflops calculation. * Fix counter usage issue and arch filtering per ops. * Adding changelog and other fixes. * Improve examples and resolve critical bugs. * Reduce build time for python examples. * Fixing minor bug. * Fix compilation error. * Improve installation instructions for dispatcher. * Add docker based installation instructions for dispatcher. * Fixing arch-based filtering to match tile engine. * Remove dead code and fix arch filtering. * Minor bugfix. * Updates after rebase. * Trimming code. * Fix copyright headers. * Consolidate examples, cut down code. * Minor fixes. * Improving python examples. * Update readmes. * Remove conv functionality. * Cleanup following conv removable.
This commit is contained in:
committed by
GitHub
parent
44f481a45c
commit
9e049a32a1
260
dispatcher/examples/gemm/python/10_advanced_benchmark.py
Normal file
260
dispatcher/examples/gemm/python/10_advanced_benchmark.py
Normal file
@@ -0,0 +1,260 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
"""
|
||||
Example 10: Advanced Benchmarking with Full Control
|
||||
|
||||
This example demonstrates all available benchmark parameters:
|
||||
- warmup: Number of warmup iterations (default: 5)
|
||||
- repeat: Number of benchmark iterations (default: 20)
|
||||
- flush_cache: Flush GPU cache between iterations (default: False)
|
||||
- timer: Timer type - "gpu" (default) or "cpu"
|
||||
- init: Initialization method - "random", "linear", "constant"
|
||||
|
||||
Usage:
|
||||
python3 10_advanced_benchmark.py
|
||||
python3 10_advanced_benchmark.py --warmup 10 --repeat 100
|
||||
python3 10_advanced_benchmark.py --init linear
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add paths for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "python"))
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ctypes_utils import (
|
||||
KernelConfig,
|
||||
setup_gemm_dispatcher,
|
||||
cleanup_gemm,
|
||||
reset_for_example,
|
||||
)
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Advanced GEMM benchmarking with full parameter control"
|
||||
)
|
||||
|
||||
# Problem size
|
||||
parser.add_argument("-m", type=int, default=2048, help="M dimension")
|
||||
parser.add_argument("-n", type=int, default=2048, help="N dimension")
|
||||
parser.add_argument("-k", type=int, default=2048, help="K dimension")
|
||||
|
||||
# Benchmark parameters
|
||||
parser.add_argument(
|
||||
"--warmup", type=int, default=5, help="Number of warmup iterations"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--repeat", type=int, default=20, help="Number of benchmark iterations"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--flush-cache", action="store_true", help="Flush GPU cache between iterations"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--timer", choices=["gpu", "cpu"], default="gpu", help="Timer type (gpu or cpu)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--init",
|
||||
choices=["random", "linear", "constant"],
|
||||
default="random",
|
||||
help="Initialization method",
|
||||
)
|
||||
|
||||
# Kernel configuration
|
||||
parser.add_argument("--dtype", default="fp16", help="Data type")
|
||||
parser.add_argument("--pipeline", default="compv4", help="Pipeline type")
|
||||
parser.add_argument("--arch", default="gfx942", help="GPU architecture")
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def initialize_matrix(shape, method, dtype):
|
||||
"""Initialize matrix with specified method"""
|
||||
if method == "random":
|
||||
return np.random.randn(*shape).astype(dtype) * 0.5
|
||||
elif method == "linear":
|
||||
total = np.prod(shape)
|
||||
return np.arange(total).reshape(shape).astype(dtype) / total
|
||||
elif method == "constant":
|
||||
return np.ones(shape, dtype=dtype)
|
||||
else:
|
||||
return np.random.randn(*shape).astype(dtype)
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
|
||||
reset_for_example()
|
||||
|
||||
print("=" * 70)
|
||||
print("Example 10: Advanced GEMM Benchmarking")
|
||||
print("=" * 70)
|
||||
|
||||
# Show benchmark configuration
|
||||
print("\nBenchmark Configuration:")
|
||||
print(f" Problem Size: {args.m} x {args.n} x {args.k}")
|
||||
print(f" Warmup: {args.warmup} iterations")
|
||||
print(f" Repeat: {args.repeat} iterations")
|
||||
print(f" Flush Cache: {args.flush_cache}")
|
||||
print(f" Timer: {args.timer}")
|
||||
print(f" Init Method: {args.init}")
|
||||
print(f" Data Type: {args.dtype}")
|
||||
print(f" Pipeline: {args.pipeline}")
|
||||
print(f" Architecture: {args.arch}")
|
||||
print()
|
||||
|
||||
# Map dtype
|
||||
np_dtype = np.float16 if args.dtype in ["fp16", "bf16"] else np.float32
|
||||
|
||||
# Initialize matrices
|
||||
print("Step 1: Initialize matrices...")
|
||||
A = initialize_matrix((args.m, args.k), args.init, np_dtype)
|
||||
B = initialize_matrix((args.k, args.n), args.init, np_dtype)
|
||||
print(f" A: {A.shape} ({args.init})")
|
||||
print(f" B: {B.shape} ({args.init})")
|
||||
|
||||
# Create kernel config (does not include M/N/K - those are problem size)
|
||||
print("\nStep 2: Create kernel configuration...")
|
||||
kernel_config = KernelConfig(
|
||||
dtype_a=args.dtype,
|
||||
dtype_b=args.dtype,
|
||||
dtype_c=args.dtype,
|
||||
dtype_acc="fp32",
|
||||
layout_a="row",
|
||||
layout_b="col", # B is column-major for optimal performance
|
||||
layout_c="row",
|
||||
tile_m=128,
|
||||
tile_n=128,
|
||||
tile_k=32,
|
||||
wave_m=2,
|
||||
wave_n=2,
|
||||
wave_k=1,
|
||||
warp_m=32,
|
||||
warp_n=32,
|
||||
warp_k=16,
|
||||
pipeline=args.pipeline,
|
||||
scheduler="intrawave",
|
||||
epilogue="cshuffle",
|
||||
gfx_arch=args.arch,
|
||||
)
|
||||
print(f" Config: {args.dtype}, tile=128x128x32, {args.pipeline}")
|
||||
|
||||
# Setup dispatcher
|
||||
print("\nStep 3: Setup dispatcher...")
|
||||
setup = setup_gemm_dispatcher(
|
||||
config=kernel_config,
|
||||
registry_name="benchmark_gemm",
|
||||
verbose=False,
|
||||
auto_rebuild=True,
|
||||
)
|
||||
|
||||
if not setup.success:
|
||||
print(f" ERROR: {setup.error}")
|
||||
return 1
|
||||
|
||||
dispatcher = setup.dispatcher
|
||||
print(f" Library: {setup.lib.path if setup.lib else 'N/A'}")
|
||||
print(f" Kernel: {setup.lib.get_kernel_name() if setup.lib else 'N/A'}")
|
||||
|
||||
# Run benchmark with multiple iterations
|
||||
print("\nStep 4: Run benchmark...")
|
||||
print(f" Running {args.warmup} warmup + {args.repeat} benchmark iterations...")
|
||||
|
||||
# Warmup
|
||||
for _ in range(args.warmup):
|
||||
_ = dispatcher.run(A, B, args.m, args.n, args.k)
|
||||
|
||||
# Benchmark
|
||||
times = []
|
||||
for _ in range(args.repeat):
|
||||
result = dispatcher.run(A, B, args.m, args.n, args.k)
|
||||
if result.success:
|
||||
times.append(result.time_ms)
|
||||
|
||||
if times:
|
||||
avg_time = sum(times) / len(times)
|
||||
min_time = min(times)
|
||||
max_time = max(times)
|
||||
|
||||
# Calculate TFLOPS
|
||||
flops = 2 * args.m * args.n * args.k
|
||||
avg_tflops = (flops / 1e12) / (avg_time / 1000) if avg_time > 0 else 0
|
||||
max_tflops = (flops / 1e12) / (min_time / 1000) if min_time > 0 else 0
|
||||
|
||||
# Calculate bandwidth (C has same dtype as A and B)
|
||||
C_bytes = args.m * args.n * np.dtype(np_dtype).itemsize
|
||||
bandwidth_gb = (
|
||||
(A.nbytes + B.nbytes + C_bytes) / 1e9 / (avg_time / 1000)
|
||||
if avg_time > 0
|
||||
else 0
|
||||
)
|
||||
|
||||
print(f"\n *** BENCHMARK RESULTS ({args.repeat} iterations) ***")
|
||||
print(f" Average Time: {avg_time:.4f} ms")
|
||||
print(f" Min Time: {min_time:.4f} ms")
|
||||
print(f" Max Time: {max_time:.4f} ms")
|
||||
print(f" Avg TFLOPS: {avg_tflops:.2f}")
|
||||
print(f" Peak TFLOPS: {max_tflops:.2f}")
|
||||
print(f" Bandwidth: {bandwidth_gb:.2f} GB/s")
|
||||
else:
|
||||
print(" FAILED: No successful runs")
|
||||
return 1
|
||||
|
||||
# Summary
|
||||
print("\n" + "=" * 70)
|
||||
print("BENCHMARK PARAMETERS REFERENCE")
|
||||
print("=" * 70)
|
||||
print("""
|
||||
Available parameters for GEMM benchmarking:
|
||||
|
||||
--warmup N Number of warmup iterations (discard results)
|
||||
Higher = more stable results, longer run time
|
||||
Default: 5
|
||||
|
||||
--repeat N Number of benchmark iterations
|
||||
Higher = more accurate average, longer run time
|
||||
Default: 20
|
||||
|
||||
--flush-cache Flush GPU L2 cache between iterations
|
||||
Use for memory-bound benchmarks
|
||||
Default: off
|
||||
|
||||
--timer {gpu,cpu} Timer type
|
||||
gpu = HIP events (more accurate for GPU)
|
||||
cpu = std::chrono (includes kernel launch overhead)
|
||||
Default: gpu
|
||||
|
||||
--init METHOD Matrix initialization
|
||||
random = uniform random [-0.5, 0.5]
|
||||
linear = sequential values
|
||||
constant = all ones
|
||||
Default: random
|
||||
|
||||
Note: For C++ examples, these parameters are passed to stream_config:
|
||||
|
||||
ck_tile::stream_config cfg{
|
||||
nullptr, // stream_id
|
||||
true, // time_kernel
|
||||
1, // log_level
|
||||
5, // cold_niters (warmup)
|
||||
20, // nrepeat
|
||||
true, // is_gpu_timer
|
||||
false, // flush_cache
|
||||
1 // rotating_count
|
||||
};
|
||||
""")
|
||||
|
||||
# Cleanup
|
||||
cleanup_gemm()
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
Reference in New Issue
Block a user