Adding dispatcher architecture (#3300)

* WIP POC of dispatcher * Dispatcher python workflow setup. * Dispatcher cleanup and updates. Further dispatcher cleanup and updates. Build fixes Improvements and python to CK example Improvements to readme * Fixes to python paths * Cleaning up code * Improving dispatcher support for different arch Fixing typos * Fix formatting errors * Cleaning up examples * Improving codegeneration * Improving and fixing C++ examples * Adding conv functionality (fwd,bwd,bwdw) and examples. * Fixes based on feedback. * Further fixes based on feedback. * Adding stress test for autogeneration and autocorrection, and fixing preshuffle bug. * Another round of improvements based on feedback. * Trimming out unnecessary code. * Fixing the multi-D implementation. * Using gpu verification for gemms and fixing convolutions tflops calculation. * Fix counter usage issue and arch filtering per ops. * Adding changelog and other fixes. * Improve examples and resolve critical bugs. * Reduce build time for python examples. * Fixing minor bug. * Fix compilation error. * Improve installation instructions for dispatcher. * Add docker based installation instructions for dispatcher. * Fixing arch-based filtering to match tile engine. * Remove dead code and fix arch filtering. * Minor bugfix. * Updates after rebase. * Trimming code. * Fix copyright headers. * Consolidate examples, cut down code. * Minor fixes. * Improving python examples. * Update readmes. * Remove conv functionality. * Cleanup following conv removable.
2026-05-04 13:41:24 +00:00 · 2026-01-22 09:34:33 -08:00
parent 44f481a45c
commit 9e049a32a1
97 changed files with 33472 additions and 0 deletions
--- a/dispatcher/examples/gemm/python/10_advanced_benchmark.py
+++ b/dispatcher/examples/gemm/python/10_advanced_benchmark.py
@@ -0,0 +1,260 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+"""
+Example 10: Advanced Benchmarking with Full Control
+
+This example demonstrates all available benchmark parameters:
+  - warmup: Number of warmup iterations (default: 5)
+  - repeat: Number of benchmark iterations (default: 20)
+  - flush_cache: Flush GPU cache between iterations (default: False)
+  - timer: Timer type - "gpu" (default) or "cpu"
+  - init: Initialization method - "random", "linear", "constant"
+
+Usage:
+    python3 10_advanced_benchmark.py
+    python3 10_advanced_benchmark.py --warmup 10 --repeat 100
+    python3 10_advanced_benchmark.py --init linear
+"""
+
+import argparse
+import sys
+from pathlib import Path
+
+# Add paths for imports
+sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "python"))
+
+import numpy as np
+
+from ctypes_utils import (
+    KernelConfig,
+    setup_gemm_dispatcher,
+    cleanup_gemm,
+    reset_for_example,
+)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Advanced GEMM benchmarking with full parameter control"
+    )
+
+    # Problem size
+    parser.add_argument("-m", type=int, default=2048, help="M dimension")
+    parser.add_argument("-n", type=int, default=2048, help="N dimension")
+    parser.add_argument("-k", type=int, default=2048, help="K dimension")
+
+    # Benchmark parameters
+    parser.add_argument(
+        "--warmup", type=int, default=5, help="Number of warmup iterations"
+    )
+    parser.add_argument(
+        "--repeat", type=int, default=20, help="Number of benchmark iterations"
+    )
+    parser.add_argument(
+        "--flush-cache", action="store_true", help="Flush GPU cache between iterations"
+    )
+    parser.add_argument(
+        "--timer", choices=["gpu", "cpu"], default="gpu", help="Timer type (gpu or cpu)"
+    )
+    parser.add_argument(
+        "--init",
+        choices=["random", "linear", "constant"],
+        default="random",
+        help="Initialization method",
+    )
+
+    # Kernel configuration
+    parser.add_argument("--dtype", default="fp16", help="Data type")
+    parser.add_argument("--pipeline", default="compv4", help="Pipeline type")
+    parser.add_argument("--arch", default="gfx942", help="GPU architecture")
+
+    return parser.parse_args()
+
+
+def initialize_matrix(shape, method, dtype):
+    """Initialize matrix with specified method"""
+    if method == "random":
+        return np.random.randn(*shape).astype(dtype) * 0.5
+    elif method == "linear":
+        total = np.prod(shape)
+        return np.arange(total).reshape(shape).astype(dtype) / total
+    elif method == "constant":
+        return np.ones(shape, dtype=dtype)
+    else:
+        return np.random.randn(*shape).astype(dtype)
+
+
+def main():
+    args = parse_args()
+
+    reset_for_example()
+
+    print("=" * 70)
+    print("Example 10: Advanced GEMM Benchmarking")
+    print("=" * 70)
+
+    # Show benchmark configuration
+    print("\nBenchmark Configuration:")
+    print(f"  Problem Size:   {args.m} x {args.n} x {args.k}")
+    print(f"  Warmup:         {args.warmup} iterations")
+    print(f"  Repeat:         {args.repeat} iterations")
+    print(f"  Flush Cache:    {args.flush_cache}")
+    print(f"  Timer:          {args.timer}")
+    print(f"  Init Method:    {args.init}")
+    print(f"  Data Type:      {args.dtype}")
+    print(f"  Pipeline:       {args.pipeline}")
+    print(f"  Architecture:   {args.arch}")
+    print()
+
+    # Map dtype
+    np_dtype = np.float16 if args.dtype in ["fp16", "bf16"] else np.float32
+
+    # Initialize matrices
+    print("Step 1: Initialize matrices...")
+    A = initialize_matrix((args.m, args.k), args.init, np_dtype)
+    B = initialize_matrix((args.k, args.n), args.init, np_dtype)
+    print(f"  A: {A.shape} ({args.init})")
+    print(f"  B: {B.shape} ({args.init})")
+
+    # Create kernel config (does not include M/N/K - those are problem size)
+    print("\nStep 2: Create kernel configuration...")
+    kernel_config = KernelConfig(
+        dtype_a=args.dtype,
+        dtype_b=args.dtype,
+        dtype_c=args.dtype,
+        dtype_acc="fp32",
+        layout_a="row",
+        layout_b="col",  # B is column-major for optimal performance
+        layout_c="row",
+        tile_m=128,
+        tile_n=128,
+        tile_k=32,
+        wave_m=2,
+        wave_n=2,
+        wave_k=1,
+        warp_m=32,
+        warp_n=32,
+        warp_k=16,
+        pipeline=args.pipeline,
+        scheduler="intrawave",
+        epilogue="cshuffle",
+        gfx_arch=args.arch,
+    )
+    print(f"  Config: {args.dtype}, tile=128x128x32, {args.pipeline}")
+
+    # Setup dispatcher
+    print("\nStep 3: Setup dispatcher...")
+    setup = setup_gemm_dispatcher(
+        config=kernel_config,
+        registry_name="benchmark_gemm",
+        verbose=False,
+        auto_rebuild=True,
+    )
+
+    if not setup.success:
+        print(f"  ERROR: {setup.error}")
+        return 1
+
+    dispatcher = setup.dispatcher
+    print(f"  Library: {setup.lib.path if setup.lib else 'N/A'}")
+    print(f"  Kernel: {setup.lib.get_kernel_name() if setup.lib else 'N/A'}")
+
+    # Run benchmark with multiple iterations
+    print("\nStep 4: Run benchmark...")
+    print(f"  Running {args.warmup} warmup + {args.repeat} benchmark iterations...")
+
+    # Warmup
+    for _ in range(args.warmup):
+        _ = dispatcher.run(A, B, args.m, args.n, args.k)
+
+    # Benchmark
+    times = []
+    for _ in range(args.repeat):
+        result = dispatcher.run(A, B, args.m, args.n, args.k)
+        if result.success:
+            times.append(result.time_ms)
+
+    if times:
+        avg_time = sum(times) / len(times)
+        min_time = min(times)
+        max_time = max(times)
+
+        # Calculate TFLOPS
+        flops = 2 * args.m * args.n * args.k
+        avg_tflops = (flops / 1e12) / (avg_time / 1000) if avg_time > 0 else 0
+        max_tflops = (flops / 1e12) / (min_time / 1000) if min_time > 0 else 0
+
+        # Calculate bandwidth (C has same dtype as A and B)
+        C_bytes = args.m * args.n * np.dtype(np_dtype).itemsize
+        bandwidth_gb = (
+            (A.nbytes + B.nbytes + C_bytes) / 1e9 / (avg_time / 1000)
+            if avg_time > 0
+            else 0
+        )
+
+        print(f"\n  *** BENCHMARK RESULTS ({args.repeat} iterations) ***")
+        print(f"  Average Time:   {avg_time:.4f} ms")
+        print(f"  Min Time:       {min_time:.4f} ms")
+        print(f"  Max Time:       {max_time:.4f} ms")
+        print(f"  Avg TFLOPS:     {avg_tflops:.2f}")
+        print(f"  Peak TFLOPS:    {max_tflops:.2f}")
+        print(f"  Bandwidth:      {bandwidth_gb:.2f} GB/s")
+    else:
+        print("  FAILED: No successful runs")
+        return 1
+
+    # Summary
+    print("\n" + "=" * 70)
+    print("BENCHMARK PARAMETERS REFERENCE")
+    print("=" * 70)
+    print("""
+Available parameters for GEMM benchmarking:
+
+  --warmup N          Number of warmup iterations (discard results)
+                      Higher = more stable results, longer run time
+                      Default: 5
+
+  --repeat N          Number of benchmark iterations
+                      Higher = more accurate average, longer run time
+                      Default: 20
+
+  --flush-cache       Flush GPU L2 cache between iterations
+                      Use for memory-bound benchmarks
+                      Default: off
+
+  --timer {gpu,cpu}   Timer type
+                      gpu = HIP events (more accurate for GPU)
+                      cpu = std::chrono (includes kernel launch overhead)
+                      Default: gpu
+
+  --init METHOD       Matrix initialization
+                      random = uniform random [-0.5, 0.5]
+                      linear = sequential values
+                      constant = all ones
+                      Default: random
+
+Note: For C++ examples, these parameters are passed to stream_config:
+
+    ck_tile::stream_config cfg{
+        nullptr,    // stream_id
+        true,       // time_kernel
+        1,          // log_level
+        5,          // cold_niters (warmup)
+        20,         // nrepeat
+        true,       // is_gpu_timer
+        false,      // flush_cache
+        1           // rotating_count
+    };
+""")
+
+    # Cleanup
+    cleanup_gemm()
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())