Improve parallelism in the benchmark

2026-07-18 09:38:17 +00:00 · 2026-02-09 13:16:21 +00:00
parent ec1e8ec58e
commit 8b59a1e192
2 changed files with 11 additions and 8 deletions
--- a/example/ck_tile/42_mhc/mhc_v3_bf16_benchmark.cpp
+++ b/example/ck_tile/42_mhc/mhc_v3_bf16_benchmark.cpp
@@ -94,9 +94,10 @@ bool run_mhc_benchmark(const ck_tile::ArgParser& arg_parser)
    d_phi_mem.ToDevice(h_phi.data());
    d_output_mem.ToDevice(h_output.data());

-    // Define block shape - 128 threads (2 warps) to match BlockGemmShape configuration
-    using BlockShape = ck_tile::Generic2dBlockShape<ck_tile::sequence<1, 128>,
-                                                    ck_tile::sequence<1, 128>,
+    // Define block shape - 64 threads (1 warp) to match BlockGemmShape configuration
+    // This matches a 16x16 block tile with 1 warp (1x1 warp layout)
+    using BlockShape = ck_tile::Generic2dBlockShape<ck_tile::sequence<1, 64>,
+                                                    ck_tile::sequence<1, 64>,
                                                    ck_tile::sequence<1, 1>>;

    using Problem = ck_tile::MHCProblem<XDataType, ComputeDataType, YDataType, BlockShape>;