Files
composable_kernel/dispatcher/examples/gemm/python/10_advanced_benchmark.py
Vidyasagar Ananthan 9e049a32a1 Adding dispatcher architecture (#3300)
* WIP POC of dispatcher

* Dispatcher python workflow setup.

* Dispatcher cleanup and updates.

Further dispatcher cleanup and updates.

Build fixes

Improvements and python to CK example

Improvements to readme

* Fixes to python paths

* Cleaning up code

* Improving dispatcher support for different arch

Fixing typos

* Fix formatting errors

* Cleaning up examples

* Improving code generation

* Improving and fixing C++ examples

* Adding conv functionality (fwd,bwd,bwdw) and examples.

* Fixes based on feedback.

* Further fixes based on feedback.

* Adding stress test for autogeneration and autocorrection, and fixing preshuffle bug.

* Another round of improvements  based on feedback.

* Trimming out unnecessary code.

* Fixing the multi-D implementation.

* Using gpu verification for gemms and fixing convolutions tflops calculation.

* Fix counter usage issue and arch filtering per ops.

* Adding changelog and other fixes.

* Improve examples and resolve critical bugs.

* Reduce build time for python examples.

* Fixing minor bug.

* Fix compilation error.

* Improve installation instructions for dispatcher.

* Add docker based  installation instructions for dispatcher.

* Fixing arch-based filtering to match tile engine.

* Remove dead code and fix arch filtering.

* Minor bugfix.

* Updates after rebase.

* Trimming code.

* Fix copyright headers.

* Consolidate examples, cut down code.

* Minor fixes.

* Improving python examples.

* Update readmes.

* Remove conv functionality.

* Cleanup following conv removal.
2026-01-22 09:34:33 -08:00

261 lines
8.0 KiB
Python

#!/usr/bin/env python3
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT
"""
Example 10: Advanced Benchmarking with Full Control
This example demonstrates all available benchmark parameters:
- warmup: Number of warmup iterations (default: 5)
- repeat: Number of benchmark iterations (default: 20)
- flush_cache: Flush GPU cache between iterations (default: False)
- timer: Timer type - "gpu" (default) or "cpu"
- init: Initialization method - "random", "linear", "constant"
Usage:
python3 10_advanced_benchmark.py
python3 10_advanced_benchmark.py --warmup 10 --repeat 100
python3 10_advanced_benchmark.py --init linear
"""
import argparse
import sys
from pathlib import Path
# Add paths for imports
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "python"))
import numpy as np
from ctypes_utils import (
KernelConfig,
setup_gemm_dispatcher,
cleanup_gemm,
reset_for_example,
)
def parse_args():
    """Build the command-line parser for this example and parse ``sys.argv``.

    Returns:
        argparse.Namespace: Parsed options with the attributes ``m``, ``n``,
        ``k`` (problem size), ``warmup``, ``repeat``, ``flush_cache``,
        ``timer``, ``init`` (benchmark controls), and ``dtype``, ``pipeline``,
        ``arch`` (kernel configuration).
    """
    cli = argparse.ArgumentParser(
        description="Advanced GEMM benchmarking with full parameter control"
    )

    # Problem size: three integer dimensions that share one default.
    for flag, axis in (("-m", "M"), ("-n", "N"), ("-k", "K")):
        cli.add_argument(flag, type=int, default=2048, help=f"{axis} dimension")

    # Benchmark parameters: integer iteration counts first ...
    iteration_options = (
        ("--warmup", 5, "Number of warmup iterations"),
        ("--repeat", 20, "Number of benchmark iterations"),
    )
    for flag, default_value, description in iteration_options:
        cli.add_argument(flag, type=int, default=default_value, help=description)

    # ... then the boolean switch and the two enumerated choices.
    cli.add_argument(
        "--flush-cache",
        action="store_true",
        help="Flush GPU cache between iterations",
    )
    cli.add_argument(
        "--timer",
        choices=["gpu", "cpu"],
        default="gpu",
        help="Timer type (gpu or cpu)",
    )
    cli.add_argument(
        "--init",
        choices=["random", "linear", "constant"],
        default="random",
        help="Initialization method",
    )

    # Kernel configuration: free-form strings, each with its own default.
    kernel_options = (
        ("--dtype", "fp16", "Data type"),
        ("--pipeline", "compv4", "Pipeline type"),
        ("--arch", "gfx942", "GPU architecture"),
    )
    for flag, default_value, description in kernel_options:
        cli.add_argument(flag, default=default_value, help=description)

    return cli.parse_args()
def initialize_matrix(shape, method, dtype):
    """Create an input matrix of the requested shape and dtype.

    Args:
        shape: Tuple of dimensions, e.g. ``(rows, cols)``.
        method: Initialization scheme:
            ``"random"``   - standard-normal samples scaled by 0.5,
            ``"linear"``   - ``i / total`` for the i-th element in row-major
                             order, so every value lies in ``[0, 1)`` before
                             rounding to ``dtype``,
            ``"constant"`` - all ones.
            Any other value falls back to unscaled standard-normal samples.
        dtype: NumPy dtype of the returned array.

    Returns:
        numpy.ndarray: Array of shape ``shape`` and dtype ``dtype``.
    """
    if method == "random":
        return np.random.randn(*shape).astype(dtype) * 0.5

    if method == "linear":
        total = int(np.prod(shape))
        # Build the ramp in float64 and cast last. Casting the raw indices to
        # a narrow dtype first overflows (float16 tops out at 65504, giving
        # inf for larger matrices) and loses integer precision, and dividing
        # a float16 array by a NumPy integer scalar can promote the result to
        # float64, so the returned dtype would not match the request.
        ramp = np.arange(total, dtype=np.float64) / total
        return ramp.reshape(shape).astype(dtype)

    if method == "constant":
        return np.ones(shape, dtype=dtype)

    # Unknown method: keep the historical fallback (unscaled normal samples).
    return np.random.randn(*shape).astype(dtype)
def main():
    """Run the advanced GEMM benchmark example end to end.

    Steps: parse the command line, print the chosen configuration, build the
    A and B input matrices, create a ``KernelConfig``, set up the dispatcher,
    run the warmup and timed iterations through ``dispatcher.run``, then print
    timing, TFLOPS and bandwidth figures followed by a parameter reference.

    Returns:
        int: 0 on success; 1 when dispatcher setup fails or when no timed
        iteration reports success.

    NOTE(review): ``--flush-cache`` and ``--timer`` are parsed and echoed but
    are not forwarded to ``dispatcher.run`` in this function, so they do not
    influence the measurement here - confirm whether the dispatcher API
    exposes these controls.
    """
    args = parse_args()
    reset_for_example()

    print("=" * 70)
    print("Example 10: Advanced GEMM Benchmarking")
    print("=" * 70)

    # Show benchmark configuration
    print("\nBenchmark Configuration:")
    print(f" Problem Size: {args.m} x {args.n} x {args.k}")
    print(f" Warmup: {args.warmup} iterations")
    print(f" Repeat: {args.repeat} iterations")
    print(f" Flush Cache: {args.flush_cache}")
    print(f" Timer: {args.timer}")
    print(f" Init Method: {args.init}")
    print(f" Data Type: {args.dtype}")
    print(f" Pipeline: {args.pipeline}")
    print(f" Architecture: {args.arch}")
    print()

    # Map dtype: anything other than fp16/bf16 is treated as float32.
    # NOTE(review): bf16 is mapped to float16 (NumPy has no built-in
    # bfloat16); confirm the dispatcher converts the host data appropriately.
    np_dtype = np.float16 if args.dtype in ["fp16", "bf16"] else np.float32

    # Initialize matrices
    print("Step 1: Initialize matrices...")
    A = initialize_matrix((args.m, args.k), args.init, np_dtype)
    B = initialize_matrix((args.k, args.n), args.init, np_dtype)
    print(f" A: {A.shape} ({args.init})")
    print(f" B: {B.shape} ({args.init})")

    # Create kernel config (does not include M/N/K - those are problem size)
    print("\nStep 2: Create kernel configuration...")
    kernel_config = KernelConfig(
        dtype_a=args.dtype,
        dtype_b=args.dtype,
        dtype_c=args.dtype,
        dtype_acc="fp32",
        layout_a="row",
        # B is column-major for optimal performance.
        # NOTE(review): B above is created as a (K, N) NumPy array; confirm
        # that dispatcher.run handles any layout conversion this implies.
        layout_b="col",
        layout_c="row",
        tile_m=128,
        tile_n=128,
        tile_k=32,
        wave_m=2,
        wave_n=2,
        wave_k=1,
        warp_m=32,
        warp_n=32,
        warp_k=16,
        pipeline=args.pipeline,
        scheduler="intrawave",
        epilogue="cshuffle",
        gfx_arch=args.arch,
    )
    print(f" Config: {args.dtype}, tile=128x128x32, {args.pipeline}")

    # Setup dispatcher
    print("\nStep 3: Setup dispatcher...")
    setup = setup_gemm_dispatcher(
        config=kernel_config,
        registry_name="benchmark_gemm",
        verbose=False,
        auto_rebuild=True,
    )
    if not setup.success:
        print(f" ERROR: {setup.error}")
        return 1

    dispatcher = setup.dispatcher

    # From here on the dispatcher is live: run cleanup_gemm() on every exit
    # path (success, "no successful runs", or an exception during a run).
    try:
        print(f" Library: {setup.lib.path if setup.lib else 'N/A'}")
        print(f" Kernel: {setup.lib.get_kernel_name() if setup.lib else 'N/A'}")

        # Run benchmark with multiple iterations
        print("\nStep 4: Run benchmark...")
        print(
            f" Running {args.warmup} warmup + {args.repeat} benchmark iterations..."
        )

        # Warmup: results are intentionally discarded.
        for _ in range(args.warmup):
            dispatcher.run(A, B, args.m, args.n, args.k)

        # Benchmark: only iterations that report success contribute a timing.
        times = []
        for _ in range(args.repeat):
            result = dispatcher.run(A, B, args.m, args.n, args.k)
            if result.success:
                times.append(result.time_ms)

        if not times:
            print(" FAILED: No successful runs")
            return 1

        avg_time = sum(times) / len(times)
        min_time = min(times)
        max_time = max(times)

        # Calculate TFLOPS: a GEMM performs 2*M*N*K floating-point operations;
        # times are in milliseconds, hence the division by 1000.
        flops = 2 * args.m * args.n * args.k
        avg_tflops = (flops / 1e12) / (avg_time / 1000) if avg_time > 0 else 0
        max_tflops = (flops / 1e12) / (min_time / 1000) if min_time > 0 else 0

        # Calculate bandwidth (C has same dtype as A and B)
        C_bytes = args.m * args.n * np.dtype(np_dtype).itemsize
        bandwidth_gb = (
            (A.nbytes + B.nbytes + C_bytes) / 1e9 / (avg_time / 1000)
            if avg_time > 0
            else 0
        )

        # Report the number of iterations actually averaged; this equals
        # args.repeat whenever every iteration succeeded.
        print(f"\n *** BENCHMARK RESULTS ({len(times)} iterations) ***")
        failed_runs = args.repeat - len(times)
        if failed_runs:
            print(
                f" WARNING: {failed_runs} of {args.repeat} iterations failed"
                " and were excluded"
            )
        print(f" Average Time: {avg_time:.4f} ms")
        print(f" Min Time: {min_time:.4f} ms")
        print(f" Max Time: {max_time:.4f} ms")
        print(f" Avg TFLOPS: {avg_tflops:.2f}")
        print(f" Peak TFLOPS: {max_tflops:.2f}")
        print(f" Bandwidth: {bandwidth_gb:.2f} GB/s")

        # Summary
        print("\n" + "=" * 70)
        print("BENCHMARK PARAMETERS REFERENCE")
        print("=" * 70)
        print("""
Available parameters for GEMM benchmarking:
--warmup N Number of warmup iterations (discard results)
Higher = more stable results, longer run time
Default: 5
--repeat N Number of benchmark iterations
Higher = more accurate average, longer run time
Default: 20
--flush-cache Flush GPU L2 cache between iterations
Use for memory-bound benchmarks
Default: off
--timer {gpu,cpu} Timer type
gpu = HIP events (more accurate for GPU)
cpu = std::chrono (includes kernel launch overhead)
Default: gpu
--init METHOD Matrix initialization
random = normal distribution scaled by 0.5
linear = sequential values
constant = all ones
Default: random
Note: For C++ examples, these parameters are passed to stream_config:
ck_tile::stream_config cfg{
nullptr, // stream_id
true, // time_kernel
1, // log_level
5, // cold_niters (warmup)
20, // nrepeat
true, // is_gpu_timer
false, // flush_cache
1 // rotating_count
};
""")
        return 0
    finally:
        # Cleanup
        cleanup_gemm()
# Script entry point: run the example only when executed directly (not when
# imported) and use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())