recover benchmark_gemm and fix

2026-06-29 19:28:33 +00:00 · 2025-05-16 10:37:59 +00:00
parent fb63cd5923
commit 012c77125a
7 changed files with 64 additions and 95 deletions
--- a/tile_engine/ops/gemm/README.md
+++ b/tile_engine/ops/gemm/README.md
@@ -12,7 +12,7 @@ User can provide kernel configuration such as tile size, warp size, padding, pip
 mkdir build && cd build
 # build composable kernel
 ## replace <arch> with the appropriate architecture (example gfx942) or leave blank
-## "USE_CUSTOM_CONFIG=ON" for default configuration, "USE_CUSTOM_CONFIG=ON" for user provided configuration
+## "USE_CUSTOM_CONFIG=OFF" for default configuration, "USE_CUSTOM_CONFIG=ON" for user provided configuration
 sh ../script/cmake-ck-dev.sh  ../ <arch> -D USE_CUSTOM_CONFIG=ON 
 # generate the executable
 make tile_engine_gemm -j
@@ -29,22 +29,23 @@ rm -rf tile_engine/ && make tile_engine_gemm -j  # rebuild
                      -m    The value for m dimension. Default is 3840.
                      -n    The value for n dimension. Default is 4096.
                      -k    The value for k dimension. Default is 2048.
-              -stride_a    The stride value for tensor A. Default is 0.
-              -stride_b    The stride value for tensor B. Default is 0.
-              -stride_c    The stride value for tensor C  Default is 0.
+               -stride_a    The stride value for tensor A. Default is 0.
+               -stride_b    The stride value for tensor B. Default is 0.
+               -stride_c    The stride value for tensor C  Default is 0.
                -split_k    The split value for k dimension. Default is 1.
                      -v    The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 for validation on GPU. Default is 2, validation on GPU.
-                -warmup    The number of iterations before benchmark the kernel. Default is 50.
-                -repeat    The number of iterations to benchmark the kernel. Default is 100.
+                    -log    Wether output kernel instance information or not. Possible values are true or false. Default is false.
+                 -warmup    The number of iterations before benchmark the kernel. Default is 50.
+                 -repeat    The number of iterations to benchmark the kernel. Default is 100.
                  -timer    The type of timer. Possible values are gpu timer or cpu timer. Default is gpu timer.
-                  -init    The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 for constant(1). Default is 0, random.
-                -metric    Metric with which to measure kernel performance. Set to 0 for latency, 1 for tflops, or 2 for bandwidth. Default is 0, latency.
-   -structured_sparsity    whether use sparsity kernel or not. Possible values are true or false. Default is false.
-              -pipeline    The type of pipeline. Possible values are compv3, compv4 or mem. Default is compv3.     
-              -epilogue    The type of epilogue. Possible values are cshuffle or default. Default is cshuffle.
-                 -pad_m    Whether pad or not in m direction. Possible values are true or false. Default is false. 
-                 -pad_n    Whether pad or not in n direction. Possible values are true or false. Default is false. 
-                 -pad_k    Whether pad or not in k direction. Possible values are true or false. Default is false. 
+                   -init    The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 for constant(1). Default is 0, random.
+                 -metric    Metric with which to measure kernel performance. Set to 0 for latency, 1 for tflops, or 2 for bandwidth. Default is 0, latency.
+    -structured_sparsity    whether use sparsity kernel or not. Possible values are true or false. Default is false.
+               -pipeline    The type of pipeline. Possible values are compv3, compv4 or mem. Default is compv3.     
+               -epilogue    The type of epilogue. Possible values are cshuffle or default. Default is cshuffle.
+                  -pad_m    Whether pad or not in m direction. Possible values are true or false. Default is false. 
+                  -pad_n    Whether pad or not in n direction. Possible values are true or false. Default is false. 
+                  -pad_k    Whether pad or not in k direction. Possible values are true or false. Default is false. 

 Note: pipeline, scheduler, epilogue, pad_m, pad_n, pad_k should be one of the options specified in user_provided_config.json 
 ```
--- a/tile_engine/ops/gemm/benchmark_gemm.hpp
+++ b/tile_engine/ops/gemm/benchmark_gemm.hpp
@@ -137,44 +137,6 @@ class GemmProfiler
        return "Unknown";
    }

-    template <typename Timer>
-    static float time_kernel(const ck_tile::stream_config& stream,
-                             const std::function<void()>& kernel_launch_func)
-    {
-        Timer timer;
-
-        // flush cache
-        auto flush_cache = [&] {
-#if defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__
-            asm volatile("s_dcache_wb; s_dcache_inv; s_icache_inv" ::: "memory");
-#endif
-        };
-
-        // Cold runs - excludes from final timing measurement
-        for(int i = 0; i < stream.cold_niters_; ++i)
-        {
-            timer.start(stream.stream_id_);
-            kernel_launch_func();
-            timer.stop(stream.stream_id_);
-        }
-
-        // Hot runs - actual performance measurement
-        std::vector<float> measured_times;
-        for(int i = 0; i < stream.nrepeat_; ++i)
-        {
-            timer.start(stream.stream_id_);
-            kernel_launch_func();
-            timer.stop(stream.stream_id_);
-            measured_times.push_back(timer.duration());
-            // Periodic cache flushing
-            if(i % 4 == 0)
-                flush_cache();
-        }
-
-        return std::accumulate(measured_times.begin(), measured_times.end(), 0.0f) /
-               measured_times.size();
-    }
-
    template <typename Kernel>
    void benchmark_kernel(ck_tile::DeviceMem& c_m_n_dev_buf,
                          ck_tile::HostTensor<CDataType>& c_m_n_host_result,
@@ -202,18 +164,7 @@ class GemmProfiler

        KernelInstance kernel_instance{description, problem, {-1.0f, -1.0f, -1.0f}};

-        auto kernel_launch_func = [&] { Kernel::launch(args, stream); };
-
-        float avg_time = 0.f;
-        if(stream.is_gpu_timer_)
-        {
-            avg_time = time_kernel<ck_tile::gpu_timer>(stream, kernel_launch_func);
-        }
-        else
-        {
-            avg_time = time_kernel<ck_tile::cpu_timer>(stream, kernel_launch_func);
-        }
-
+        float avg_time       = Kernel::launch(args, stream);
        std::size_t flop     = std::size_t(2) * args.M * args.N * args.K;
        std::size_t num_byte = sizeof(ADataType) * args.M * args.K +
                               sizeof(BDataType) * args.N * args.K +
@@ -223,7 +174,10 @@ class GemmProfiler
        kernel_instance.perf_result.tflops    = static_cast<float>(flop) / 1.E9 / avg_time;
        kernel_instance.perf_result.bandwidth = num_byte / 1.E6 / avg_time;

-        std::cout << kernel_instance << std::endl;
+        if(stream.log_level_ > 0)
+        {
+            std::cout << kernel_instance << std::endl;
+        }

        c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
        bool verified_correct =
--- a/tile_engine/ops/gemm/codegen_utils.py
+++ b/tile_engine/ops/gemm/codegen_utils.py
@@ -88,9 +88,16 @@ RUN_MEM = """
                RunSplitk(ck_tile::bool_constant<true>{},
                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
            }
-            // Variadic call using fold expression
+            
            auto check_tail = [&](auto... TNs) {
-                (try_run< BaseGemmPipeline, decltype(TNs)::value>(tail_num), ...);
+                ([&]{
+                    if constexpr(BaseGemmPipeline::PrefetchStages > static_cast<int>(decltype(TNs)::value)) {
+                        if(tail_num == decltype(TNs)::value) {
+                            RunSplitk(ck_tile::bool_constant<true>{},
+                                    ck_tile::integral_constant<ck_tile::TailNumber, decltype(TNs)::value>{});
+                        }
+                    }
+                }(), ...);
            };

            check_tail(
@@ -200,28 +207,31 @@ def element_size(data_type: str) -> float:
    else:
        raise ValueError(f"Unsupported data type: {data_type}")

+GPU_NAME_PATTERN = re.compile(r'Name:\s*(gfx\d+\w*)')

@lru_cache(maxsize=1)
 def get_gpu_name_by_id(gpu_id: int = 0) -> str:
    """Retrieve GPU name (e.g. gfx90a) by device ID"""
    try:
-        cmd = ['rocm-smi', '--showproductname', '-d', str(gpu_id)]
-        result = subprocess.run(
-            cmd,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+        output = subprocess.check_output(
+            ["rocminfo"],
            text=True,
-            check=True
+            stderr=subprocess.PIPE,
+            timeout=5
        )
-
-        arch_pattern = r'gfx\d{3,4}[a-z]?'
-        match = re.search(arch_pattern, result.stdout.lower())
-        return match.group() if match else ""
-
-    except (FileNotFoundError, subprocess.CalledProcessError) as e:
-        print(f"System Error: {str(e)}, when get the name of gpu:{gpu_id}")
+        if matches := GPU_NAME_PATTERN.finditer(output):
+            gpu_list = [m.group(1) for m in matches]
+            return gpu_list[gpu_id] if gpu_id < len(gpu_list) else ""
+            
        return ""
+        
+    except subprocess.CalledProcessError as e:
+        print(f"GPU query failed (exit {e.returncode}): {e.stderr.strip()}")
+    except FileNotFoundError:
+        print("ROCm tools not installed (requires rocminfo)")
+    except subprocess.TimeoutExpired:
+        print("GPU query timeout (5s)")
    except Exception as e:
-        print(
-            f"Runtime Exception: {str(e)}, when get the name of gpu:{gpu_id}")
-        return ""
+        print(f"GPU detection error: {str(e)}")
+    
+    return ""
--- a/tile_engine/ops/gemm/configs/user_provided_config.json
+++ b/tile_engine/ops/gemm/configs/user_provided_config.json
@@ -34,12 +34,16 @@
  "tile_config": {
    "tile_m": {
      "values": [
-        256
+        256,
+        128,
+        64
      ]
    },
    "tile_n": {
      "values": [
-        256
+        256,
+        128,
+        64
      ]
    },
    "tile_k": {
--- a/tile_engine/ops/gemm/gemm_host_api.cpp
+++ b/tile_engine/ops/gemm/gemm_host_api.cpp
@@ -48,6 +48,7 @@ void run(const ck_tile::ArgParser& arg_parser)
    ck_tile::index_t stride_B = arg_parser.get_int("stride_b");
    ck_tile::index_t stride_C = arg_parser.get_int("stride_c");

+    bool log                     = arg_parser.get_int("log");
    int n_warmup                 = arg_parser.get_int("warmup");
    int n_repeat                 = arg_parser.get_int("repeat");
    int verify                   = arg_parser.get_int("v");
@@ -173,7 +174,7 @@ void run(const ck_tile::ArgParser& arg_parser)
                       structured_sparsity,
                       trait,
                       gemm_args,
-                       ck_tile::stream_config{nullptr, false, 0, n_warmup, n_repeat});
+                       ck_tile::stream_config{nullptr, true, log, n_warmup, n_repeat});

    return;
 }
--- a/tile_engine/ops/gemm/gemm_host_api.hpp
+++ b/tile_engine/ops/gemm/gemm_host_api.hpp
@@ -118,6 +118,10 @@ inline auto create_args(int argc, char* argv[])
                "2",
                "The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 "
                "for validation on GPU. Default is 2, validation on GPU.")
+        .insert("log",
+                "false",
+                "Wether output kernel instance information or not. Possible values are true or "
+                "false. Default is false")
        .insert(
            "warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.")
        .insert(
--- a/tile_engine/ops/gemm/gemm_instance_builder.py
+++ b/tile_engine/ops/gemm/gemm_instance_builder.py
@@ -109,6 +109,7 @@ class GemmCodeGenerator:
 // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once
+
 #include "ck_tile/core.hpp"

 // Data types
@@ -141,6 +142,8 @@ using CLayout = {LAYOUT_MAP[self.config.problem.layout_map['matrix_c']]};
        content = f"""// SPDX-License-Identifier: MIT
 // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.

+#pragma once
+
 #include "gemm_common.hpp"
 #include "ck_tile/ops/gemm.hpp"
 #include "ck_tile/ops/epilogue.hpp"
@@ -159,15 +162,7 @@ namespace {trait} {{
                                pad_m: str, pad_n: str, pad_k: str) -> str:
        """Generate the code block of kernel struct"""
        return f"""
-template <typename Pipeline, ck_tile::TailNumber TN>
-void try_run(ck_tile::TailNumber tn) {{
-    if constexpr (Pipeline::PrefetchStages > static_cast<int>(TN) - 1) {{
-        if (tn == TN) {{
-            RunSplitk(ck_tile::bool_constant<true>{{}},
-                ck_tile::integral_constant<ck_tile::TailNumber, TN>{{}});
-        }}
-    }}
-}}
+
 template <int TileM, int TileN, int TileK,
          int WarpM, int WarpN, int WarpK,
          int WarpTileM, int WarpTileN, int WarpTileK,