#!/usr/bin/env python3

# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT

"""
Example 10: Advanced Benchmarking with Full Control

This example demonstrates all available benchmark parameters:
  - warmup: Number of warmup iterations (default: 5)
  - repeat: Number of benchmark iterations (default: 20)
  - flush_cache: Flush GPU cache between iterations (default: False)
  - timer: Timer type - "gpu" (default) or "cpu"
  - init: Initialization method - "random", "linear", "constant"

Usage:
    python3 10_advanced_benchmark.py
    python3 10_advanced_benchmark.py --warmup 10 --repeat 100
    python3 10_advanced_benchmark.py --init linear
"""

import argparse
import sys
from pathlib import Path

# Add paths for imports
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "python"))

import numpy as np

from ctypes_utils import (
    KernelConfig,
    setup_gemm_dispatcher,
    cleanup_gemm,
    reset_for_example,
)


# Reference text printed at the end of a successful run.
_PARAMETER_REFERENCE = """
Available parameters for GEMM benchmarking:

  --warmup N        Number of warmup iterations (discard results)
                    Higher = more stable results, longer run time
                    Default: 5

  --repeat N        Number of benchmark iterations
                    Higher = more accurate average, longer run time
                    Default: 20

  --flush-cache     Flush GPU L2 cache between iterations
                    Use for memory-bound benchmarks
                    Default: off

  --timer {gpu,cpu} Timer type
                    gpu = HIP events (more accurate for GPU)
                    cpu = std::chrono (includes kernel launch overhead)
                    Default: gpu

  --init METHOD     Matrix initialization
                    random = normal distribution (mean 0, std 0.5)
                    linear = sequential values
                    constant = all ones
                    Default: random

Note: For C++ examples, these parameters are passed to stream_config:

  ck_tile::stream_config cfg{
      nullptr,      // stream_id
      true,         // time_kernel
      1,            // log_level
      5,            // cold_niters (warmup)
      20,           // nrepeat
      true,         // is_gpu_timer
      false,        // flush_cache
      1             // rotating_count
  };
"""


def parse_args() -> argparse.Namespace:
    """Parse the command-line options for the benchmark.

    Returns:
        Namespace with the problem size (``m``, ``n``, ``k``), the benchmark
        controls (``warmup``, ``repeat``, ``flush_cache``, ``timer``,
        ``init``) and the kernel selection (``dtype``, ``pipeline``,
        ``arch``).
    """
    parser = argparse.ArgumentParser(
        description="Advanced GEMM benchmarking with full parameter control"
    )

    # Problem size
    parser.add_argument("-m", type=int, default=2048, help="M dimension")
    parser.add_argument("-n", type=int, default=2048, help="N dimension")
    parser.add_argument("-k", type=int, default=2048, help="K dimension")

    # Benchmark parameters
    parser.add_argument(
        "--warmup", type=int, default=5, help="Number of warmup iterations"
    )
    parser.add_argument(
        "--repeat", type=int, default=20, help="Number of benchmark iterations"
    )
    parser.add_argument(
        "--flush-cache", action="store_true", help="Flush GPU cache between iterations"
    )
    parser.add_argument(
        "--timer", choices=["gpu", "cpu"], default="gpu", help="Timer type (gpu or cpu)"
    )
    parser.add_argument(
        "--init",
        choices=["random", "linear", "constant"],
        default="random",
        help="Initialization method",
    )

    # Kernel configuration
    parser.add_argument("--dtype", default="fp16", help="Data type")
    parser.add_argument("--pipeline", default="compv4", help="Pipeline type")
    parser.add_argument("--arch", default="gfx942", help="GPU architecture")

    return parser.parse_args()


def initialize_matrix(shape, method, dtype) -> np.ndarray:
    """Create a matrix of the given shape and dtype.

    Args:
        shape: Tuple of dimensions for the matrix.
        method: ``"random"`` (standard normal scaled by 0.5), ``"linear"``
            (values ``i / total`` in row-major order, all in ``[0, 1)``) or
            ``"constant"`` (all ones). Any other value falls back to an
            unscaled standard-normal matrix.
        dtype: NumPy dtype of the returned array.

    Returns:
        A NumPy array with the requested shape and dtype.
    """
    if method == "random":
        return np.random.randn(*shape).astype(dtype) * 0.5

    if method == "linear":
        total = int(np.prod(shape))
        # Normalise in float64 *before* casting. Casting the raw indices to a
        # narrow dtype first overflows (float16 tops out at 65504, so a
        # 2048x2048 matrix would be mostly inf), and dividing by a NumPy
        # integer scalar afterwards can promote the result away from `dtype`.
        ramp = np.arange(total, dtype=np.float64) / total
        return ramp.reshape(shape).astype(dtype)

    if method == "constant":
        return np.ones(shape, dtype=dtype)

    # Unknown method: keep the historical best-effort fallback.
    return np.random.randn(*shape).astype(dtype)


def _print_configuration(args) -> None:
    """Print the banner and the effective benchmark configuration."""
    print("=" * 70)
    print("Example 10: Advanced GEMM Benchmarking")
    print("=" * 70)

    print("\nBenchmark Configuration:")
    print(f" Problem Size: {args.m} x {args.n} x {args.k}")
    print(f" Warmup: {args.warmup} iterations")
    print(f" Repeat: {args.repeat} iterations")
    print(f" Flush Cache: {args.flush_cache}")
    print(f" Timer: {args.timer}")
    print(f" Init Method: {args.init}")
    print(f" Data Type: {args.dtype}")
    print(f" Pipeline: {args.pipeline}")
    print(f" Architecture: {args.arch}")
    print()


def _run_benchmark(dispatcher, A, B, args) -> list:
    """Run warmup then timed iterations; return the times (ms) of successful runs."""
    # Warmup: results are intentionally discarded.
    for _ in range(args.warmup):
        dispatcher.run(A, B, args.m, args.n, args.k)

    times = []
    for _ in range(args.repeat):
        result = dispatcher.run(A, B, args.m, args.n, args.k)
        if result.success:
            times.append(result.time_ms)
    return times


def _print_results(times, args, A, B, np_dtype) -> None:
    """Print timing, TFLOPS and bandwidth statistics for a non-empty list of times (ms)."""
    avg_time = sum(times) / len(times)
    min_time = min(times)
    max_time = max(times)

    # A GEMM performs one multiply and one add per (m, n, k) triple.
    flops = 2 * args.m * args.n * args.k
    avg_tflops = (flops / 1e12) / (avg_time / 1000) if avg_time > 0 else 0
    max_tflops = (flops / 1e12) / (min_time / 1000) if min_time > 0 else 0

    # Calculate bandwidth (C has same dtype as A and B)
    C_bytes = args.m * args.n * np.dtype(np_dtype).itemsize
    bandwidth_gb = (
        (A.nbytes + B.nbytes + C_bytes) / 1e9 / (avg_time / 1000)
        if avg_time > 0
        else 0
    )

    print(f"\n *** BENCHMARK RESULTS ({args.repeat} iterations) ***")
    print(f" Average Time: {avg_time:.4f} ms")
    print(f" Min Time: {min_time:.4f} ms")
    print(f" Max Time: {max_time:.4f} ms")
    print(f" Avg TFLOPS: {avg_tflops:.2f}")
    print(f" Peak TFLOPS: {max_tflops:.2f}")
    print(f" Bandwidth: {bandwidth_gb:.2f} GB/s")


def main() -> int:
    """Run the benchmark example end to end.

    Returns:
        Process exit code: 0 on success, 1 if the dispatcher could not be set
        up or no benchmark iteration succeeded.
    """
    args = parse_args()
    reset_for_example()

    _print_configuration(args)

    # Map dtype. NumPy has no native bfloat16, so bf16 shares the float16
    # host buffers here.
    np_dtype = np.float16 if args.dtype in ["fp16", "bf16"] else np.float32

    # Initialize matrices
    print("Step 1: Initialize matrices...")
    A = initialize_matrix((args.m, args.k), args.init, np_dtype)
    B = initialize_matrix((args.k, args.n), args.init, np_dtype)
    print(f" A: {A.shape} ({args.init})")
    print(f" B: {B.shape} ({args.init})")

    # Create kernel config (does not include M/N/K - those are problem size)
    print("\nStep 2: Create kernel configuration...")
    kernel_config = KernelConfig(
        dtype_a=args.dtype,
        dtype_b=args.dtype,
        dtype_c=args.dtype,
        dtype_acc="fp32",
        layout_a="row",
        layout_b="col",  # B is column-major for optimal performance
        layout_c="row",
        tile_m=128,
        tile_n=128,
        tile_k=32,
        wave_m=2,
        wave_n=2,
        wave_k=1,
        warp_m=32,
        warp_n=32,
        warp_k=16,
        pipeline=args.pipeline,
        scheduler="intrawave",
        epilogue="cshuffle",
        gfx_arch=args.arch,
    )
    print(f" Config: {args.dtype}, tile=128x128x32, {args.pipeline}")

    # Setup dispatcher
    print("\nStep 3: Setup dispatcher...")
    setup = setup_gemm_dispatcher(
        config=kernel_config,
        registry_name="benchmark_gemm",
        verbose=False,
        auto_rebuild=True,
    )
    if not setup.success:
        print(f" ERROR: {setup.error}")
        return 1

    dispatcher = setup.dispatcher

    # From here on the dispatcher is live, so always clean up, including on
    # the failure path and on unexpected exceptions.
    try:
        print(f" Library: {setup.lib.path if setup.lib else 'N/A'}")
        print(f" Kernel: {setup.lib.get_kernel_name() if setup.lib else 'N/A'}")

        # Run benchmark with multiple iterations
        print("\nStep 4: Run benchmark...")
        print(
            f" Running {args.warmup} warmup + {args.repeat} benchmark iterations..."
        )

        # NOTE(review): --flush-cache and --timer are displayed above but are
        # not forwarded to dispatcher.run() here; confirm whether the
        # dispatcher API accepts them.
        times = _run_benchmark(dispatcher, A, B, args)

        if not times:
            print(" FAILED: No successful runs")
            return 1

        failed = args.repeat - len(times)
        if failed:
            # Do not silently hide iterations that were dropped from the stats.
            print(
                f" WARNING: {failed} of {args.repeat} iterations failed"
                " and were excluded from the statistics"
            )

        _print_results(times, args, A, B, np_dtype)

        # Summary
        print("\n" + "=" * 70)
        print("BENCHMARK PARAMETERS REFERENCE")
        print("=" * 70)
        print(_PARAMETER_REFERENCE)

        return 0
    finally:
        # Cleanup
        cleanup_gemm()


if __name__ == "__main__":
    sys.exit(main())