From 012c77125a0ef4b5491a28ea2dfc3187ff91fe60 Mon Sep 17 00:00:00 2001
From: Yanxing-Shi <yanxishi@amd.com>
Date: Fri, 16 May 2025 10:37:59 +0000
Subject: [PATCH] recover benchmark_gemm and fix

---
 tile_engine/ops/gemm/README.md                | 29 +++++-----
 tile_engine/ops/gemm/benchmark_gemm.hpp       | 56 ++-----------------
 tile_engine/ops/gemm/codegen_utils.py         | 46 +++++++++------
 .../gemm/configs/user_provided_config.json    |  8 ++-
 tile_engine/ops/gemm/gemm_host_api.cpp        |  3 +-
 tile_engine/ops/gemm/gemm_host_api.hpp        |  4 ++
 tile_engine/ops/gemm/gemm_instance_builder.py | 13 ++---
 7 files changed, 64 insertions(+), 95 deletions(-)
diff --git a/tile_engine/ops/gemm/README.md b/tile_engine/ops/gemm/README.md
index aeac2f541f..26ece438bb 100644
--- a/tile_engine/ops/gemm/README.md
+++ b/tile_engine/ops/gemm/README.md
@@ -12,7 +12,7 @@ User can provide kernel configuration such as tile size, warp size, padding, pip
 mkdir build && cd build
 # build composable kernel
 ## replace <arch> with the appropriate architecture (example gfx942) or leave blank
-## "USE_CUSTOM_CONFIG=ON" for default configuration, "USE_CUSTOM_CONFIG=ON" for user provided configuration
+## "USE_CUSTOM_CONFIG=OFF" for default configuration, "USE_CUSTOM_CONFIG=ON" for user provided configuration
 sh ../script/cmake-ck-dev.sh  ../ <arch> -D USE_CUSTOM_CONFIG=ON 
 # generate the executable
 make tile_engine_gemm -j
@@ -29,22 +29,23 @@ rm -rf tile_engine/ && make tile_engine_gemm -j  # rebuild
                       -m    The value for m dimension. Default is 3840.
                       -n    The value for n dimension. Default is 4096.
                       -k    The value for k dimension. Default is 2048.
-              -stride_a    The stride value for tensor A. Default is 0.
-              -stride_b    The stride value for tensor B. Default is 0.
-              -stride_c    The stride value for tensor C  Default is 0.
+               -stride_a    The stride value for tensor A. Default is 0.
+               -stride_b    The stride value for tensor B. Default is 0.
+               -stride_c    The stride value for tensor C  Default is 0.
                 -split_k    The split value for k dimension. Default is 1.
                       -v    The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 for validation on GPU. Default is 2, validation on GPU.
-                -warmup    The number of iterations before benchmark the kernel. Default is 50.
-                -repeat    The number of iterations to benchmark the kernel. Default is 100.
+                    -log    Wether output kernel instance information or not. Possible values are true or false. Default is false.
+                 -warmup    The number of iterations before benchmark the kernel. Default is 50.
+                 -repeat    The number of iterations to benchmark the kernel. Default is 100.
                   -timer    The type of timer. Possible values are gpu timer or cpu timer. Default is gpu timer.
-                  -init    The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 for constant(1). Default is 0, random.
-                -metric    Metric with which to measure kernel performance. Set to 0 for latency, 1 for tflops, or 2 for bandwidth. Default is 0, latency.
-   -structured_sparsity    whether use sparsity kernel or not. Possible values are true or false. Default is false.
-              -pipeline    The type of pipeline. Possible values are compv3, compv4 or mem. Default is compv3.     
-              -epilogue    The type of epilogue. Possible values are cshuffle or default. Default is cshuffle.
-                 -pad_m    Whether pad or not in m direction. Possible values are true or false. Default is false. 
-                 -pad_n    Whether pad or not in n direction. Possible values are true or false. Default is false. 
-                 -pad_k    Whether pad or not in k direction. Possible values are true or false. Default is false. 
+                   -init    The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 for constant(1). Default is 0, random.
+                 -metric    Metric with which to measure kernel performance. Set to 0 for latency, 1 for tflops, or 2 for bandwidth. Default is 0, latency.
+    -structured_sparsity    whether use sparsity kernel or not. Possible values are true or false. Default is false.
+               -pipeline    The type of pipeline. Possible values are compv3, compv4 or mem. Default is compv3.     
+               -epilogue    The type of epilogue. Possible values are cshuffle or default. Default is cshuffle.
+                  -pad_m    Whether pad or not in m direction. Possible values are true or false. Default is false. 
+                  -pad_n    Whether pad or not in n direction. Possible values are true or false. Default is false. 
+                  -pad_k    Whether pad or not in k direction. Possible values are true or false. Default is false. 
 
 Note: pipeline, scheduler, epilogue, pad_m, pad_n, pad_k should be one of the options specified in user_provided_config.json 
 ```
diff --git a/tile_engine/ops/gemm/benchmark_gemm.hpp b/tile_engine/ops/gemm/benchmark_gemm.hpp
index 3c08388b50..a6aaad1a60 100644
--- a/tile_engine/ops/gemm/benchmark_gemm.hpp
+++ b/tile_engine/ops/gemm/benchmark_gemm.hpp
@@ -137,44 +137,6 @@ class GemmProfiler
         return "Unknown";
     }
 
-    template <typename Timer>
-    static float time_kernel(const ck_tile::stream_config& stream,
-                             const std::function<void()>& kernel_launch_func)
-    {
-        Timer timer;
-
-        // flush cache
-        auto flush_cache = [&] {
-#if defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__
-            asm volatile("s_dcache_wb; s_dcache_inv; s_icache_inv" ::: "memory");
-#endif
-        };
-
-        // Cold runs - excludes from final timing measurement
-        for(int i = 0; i < stream.cold_niters_; ++i)
-        {
-            timer.start(stream.stream_id_);
-            kernel_launch_func();
-            timer.stop(stream.stream_id_);
-        }
-
-        // Hot runs - actual performance measurement
-        std::vector<float> measured_times;
-        for(int i = 0; i < stream.nrepeat_; ++i)
-        {
-            timer.start(stream.stream_id_);
-            kernel_launch_func();
-            timer.stop(stream.stream_id_);
-            measured_times.push_back(timer.duration());
-            // Periodic cache flushing
-            if(i % 4 == 0)
-                flush_cache();
-        }
-
-        return std::accumulate(measured_times.begin(), measured_times.end(), 0.0f) /
-               measured_times.size();
-    }
-
     template <typename Kernel>
     void benchmark_kernel(ck_tile::DeviceMem& c_m_n_dev_buf,
                           ck_tile::HostTensor<CDataType>& c_m_n_host_result,
@@ -202,18 +164,7 @@ class GemmProfiler
 
         KernelInstance kernel_instance{description, problem, {-1.0f, -1.0f, -1.0f}};
 
-        auto kernel_launch_func = [&] { Kernel::launch(args, stream); };
-
-        float avg_time = 0.f;
-        if(stream.is_gpu_timer_)
-        {
-            avg_time = time_kernel<ck_tile::gpu_timer>(stream, kernel_launch_func);
-        }
-        else
-        {
-            avg_time = time_kernel<ck_tile::cpu_timer>(stream, kernel_launch_func);
-        }
-
+        float avg_time       = Kernel::launch(args, stream);
         std::size_t flop     = std::size_t(2) * args.M * args.N * args.K;
         std::size_t num_byte = sizeof(ADataType) * args.M * args.K +
                                sizeof(BDataType) * args.N * args.K +
@@ -223,7 +174,10 @@ class GemmProfiler
         kernel_instance.perf_result.tflops    = static_cast<float>(flop) / 1.E9 / avg_time;
         kernel_instance.perf_result.bandwidth = num_byte / 1.E6 / avg_time;
 
-        std::cout << kernel_instance << std::endl;
+        if(stream.log_level_ > 0)
+        {
+            std::cout << kernel_instance << std::endl;
+        }
 
         c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
         bool verified_correct =
diff --git a/tile_engine/ops/gemm/codegen_utils.py b/tile_engine/ops/gemm/codegen_utils.py
index 1480300b68..1a696fef51 100644
--- a/tile_engine/ops/gemm/codegen_utils.py
+++ b/tile_engine/ops/gemm/codegen_utils.py
@@ -88,9 +88,16 @@ RUN_MEM = """
                 RunSplitk(ck_tile::bool_constant<true>{},
                     ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
             }
-            // Variadic call using fold expression
+            
             auto check_tail = [&](auto... TNs) {
-                (try_run< BaseGemmPipeline, decltype(TNs)::value>(tail_num), ...);
+                ([&]{
+                    if constexpr(BaseGemmPipeline::PrefetchStages > static_cast<int>(decltype(TNs)::value)) {
+                        if(tail_num == decltype(TNs)::value) {
+                            RunSplitk(ck_tile::bool_constant<true>{},
+                                    ck_tile::integral_constant<ck_tile::TailNumber, decltype(TNs)::value>{});
+                        }
+                    }
+                }(), ...);
             };
 
             check_tail(
@@ -200,28 +207,31 @@ def element_size(data_type: str) -> float:
     else:
         raise ValueError(f"Unsupported data type: {data_type}")
 
+GPU_NAME_PATTERN = re.compile(r'Name:\s*(gfx\d+\w*)')
 
 @lru_cache(maxsize=1)
 def get_gpu_name_by_id(gpu_id: int = 0) -> str:
     """Retrieve GPU name (e.g. gfx90a) by device ID"""
     try:
-        cmd = ['rocm-smi', '--showproductname', '-d', str(gpu_id)]
-        result = subprocess.run(
-            cmd,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+        output = subprocess.check_output(
+            ["rocminfo"],
             text=True,
-            check=True
+            stderr=subprocess.PIPE,
+            timeout=5
         )
-
-        arch_pattern = r'gfx\d{3,4}[a-z]?'
-        match = re.search(arch_pattern, result.stdout.lower())
-        return match.group() if match else ""
-
-    except (FileNotFoundError, subprocess.CalledProcessError) as e:
-        print(f"System Error: {str(e)}, when get the name of gpu:{gpu_id}")
+        if matches := GPU_NAME_PATTERN.finditer(output):
+            gpu_list = [m.group(1) for m in matches]
+            return gpu_list[gpu_id] if gpu_id < len(gpu_list) else ""
+            
         return ""
+        
+    except subprocess.CalledProcessError as e:
+        print(f"GPU query failed (exit {e.returncode}): {e.stderr.strip()}")
+    except FileNotFoundError:
+        print("ROCm tools not installed (requires rocminfo)")
+    except subprocess.TimeoutExpired:
+        print("GPU query timeout (5s)")
     except Exception as e:
-        print(
-            f"Runtime Exception: {str(e)}, when get the name of gpu:{gpu_id}")
-        return ""
+        print(f"GPU detection error: {str(e)}")
+    
+    return ""
\ No newline at end of file
diff --git a/tile_engine/ops/gemm/configs/user_provided_config.json b/tile_engine/ops/gemm/configs/user_provided_config.json
index 312cefb4ed..7744d33d3c 100644
--- a/tile_engine/ops/gemm/configs/user_provided_config.json
+++ b/tile_engine/ops/gemm/configs/user_provided_config.json
@@ -34,12 +34,16 @@
   "tile_config": {
     "tile_m": {
       "values": [
-        256
+        256,
+        128,
+        64
       ]
     },
     "tile_n": {
       "values": [
-        256
+        256,
+        128,
+        64
       ]
     },
     "tile_k": {
diff --git a/tile_engine/ops/gemm/gemm_host_api.cpp b/tile_engine/ops/gemm/gemm_host_api.cpp
index 673ba63f94..43d6509bf8 100755
--- a/tile_engine/ops/gemm/gemm_host_api.cpp
+++ b/tile_engine/ops/gemm/gemm_host_api.cpp
@@ -48,6 +48,7 @@ void run(const ck_tile::ArgParser& arg_parser)
     ck_tile::index_t stride_B = arg_parser.get_int("stride_b");
     ck_tile::index_t stride_C = arg_parser.get_int("stride_c");
 
+    bool log                     = arg_parser.get_int("log");
     int n_warmup                 = arg_parser.get_int("warmup");
     int n_repeat                 = arg_parser.get_int("repeat");
     int verify                   = arg_parser.get_int("v");
@@ -173,7 +174,7 @@ void run(const ck_tile::ArgParser& arg_parser)
                        structured_sparsity,
                        trait,
                        gemm_args,
-                       ck_tile::stream_config{nullptr, false, 0, n_warmup, n_repeat});
+                       ck_tile::stream_config{nullptr, true, log, n_warmup, n_repeat});
 
     return;
 }
diff --git a/tile_engine/ops/gemm/gemm_host_api.hpp b/tile_engine/ops/gemm/gemm_host_api.hpp
index bbf71b84de..58eeb9a9ee 100755
--- a/tile_engine/ops/gemm/gemm_host_api.hpp
+++ b/tile_engine/ops/gemm/gemm_host_api.hpp
@@ -118,6 +118,10 @@ inline auto create_args(int argc, char* argv[])
                 "2",
                 "The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 "
                 "for validation on GPU. Default is 2, validation on GPU.")
+        .insert("log",
+                "false",
+                "Wether output kernel instance information or not. Possible values are true or "
+                "false. Default is false")
         .insert(
             "warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.")
         .insert(
diff --git a/tile_engine/ops/gemm/gemm_instance_builder.py b/tile_engine/ops/gemm/gemm_instance_builder.py
index bc74054723..75ddaf76c7 100755
--- a/tile_engine/ops/gemm/gemm_instance_builder.py
+++ b/tile_engine/ops/gemm/gemm_instance_builder.py
@@ -109,6 +109,7 @@ class GemmCodeGenerator:
 // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
+
 #include "ck_tile/core.hpp"
 
 // Data types
@@ -141,6 +142,8 @@ using CLayout = {LAYOUT_MAP[self.config.problem.layout_map['matrix_c']]};
         content = f"""// SPDX-License-Identifier: MIT
 // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
+#pragma once
+
 #include "gemm_common.hpp"
 #include "ck_tile/ops/gemm.hpp"
 #include "ck_tile/ops/epilogue.hpp"
@@ -159,15 +162,7 @@ namespace {trait} {{
                                 pad_m: str, pad_n: str, pad_k: str) -> str:
         """Generate the code block of kernel struct"""
         return f"""
-template <typename Pipeline, ck_tile::TailNumber TN>
-void try_run(ck_tile::TailNumber tn) {{
-    if constexpr (Pipeline::PrefetchStages > static_cast<int>(TN) - 1) {{
-        if (tn == TN) {{
-            RunSplitk(ck_tile::bool_constant<true>{{}},
-                ck_tile::integral_constant<ck_tile::TailNumber, TN>{{}});
-        }}
-    }}
-}}
+
 template <int TileM, int TileN, int TileK,
           int WarpM, int WarpN, int WarpK,
           int WarpTileM, int WarpTileN, int WarpTileK,