From 8b59a1e19272ea4cfcae8a2ae08d28d595865d2f Mon Sep 17 00:00:00 2001
From: Damien Lejeune <damien.lejeune@amd.com>
Date: Mon, 9 Feb 2026 13:16:21 +0000
Subject: [PATCH] Improve parallelism in the benchmark

---
 example/ck_tile/42_mhc/mhc_v3_bf16_benchmark.cpp |  7 ++++---
 include/ck_tile/ops/mhc/pipeline/mhc_problem.hpp | 12 +++++++-----
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/example/ck_tile/42_mhc/mhc_v3_bf16_benchmark.cpp b/example/ck_tile/42_mhc/mhc_v3_bf16_benchmark.cpp
index 15be81d9e7..019e8a15b3 100644
--- a/example/ck_tile/42_mhc/mhc_v3_bf16_benchmark.cpp
+++ b/example/ck_tile/42_mhc/mhc_v3_bf16_benchmark.cpp
@@ -94,9 +94,10 @@ bool run_mhc_benchmark(const ck_tile::ArgParser& arg_parser)
     d_phi_mem.ToDevice(h_phi.data());
     d_output_mem.ToDevice(h_output.data());
 
-    // Define block shape - 128 threads (2 warps) to match BlockGemmShape configuration
-    using BlockShape = ck_tile::Generic2dBlockShape<ck_tile::sequence<1, 128>,
-                                                    ck_tile::sequence<1, 128>,
+    // Define block shape - 64 threads (1 warp) to match BlockGemmShape configuration
+    // This matches a 16x16 block tile with 1 warp (1x1 warp layout)
+    using BlockShape = ck_tile::Generic2dBlockShape<ck_tile::sequence<1, 64>,
+                                                    ck_tile::sequence<1, 64>,
                                                     ck_tile::sequence<1, 1>>;
 
     using Problem = ck_tile::MHCProblem<XDataType, ComputeDataType, YDataType, BlockShape>;
diff --git a/include/ck_tile/ops/mhc/pipeline/mhc_problem.hpp b/include/ck_tile/ops/mhc/pipeline/mhc_problem.hpp
index 6096605357..d596cd23b9 100644
--- a/include/ck_tile/ops/mhc/pipeline/mhc_problem.hpp
+++ b/include/ck_tile/ops/mhc/pipeline/mhc_problem.hpp
@@ -26,12 +26,14 @@ struct MHCProblem
     using CDataType = ComputeDataType; // Output/accumulator matrix C
 
     // BlockGemmShape with kM, kN, kK members for BlockGemm
-    // Using 32x32x8 warp tiles (supported by MFMA) with 2x1 warp layout for 64x32 block
-    // This gives better parallelism than 64x32 while using supported warp sizes
+    // Using 16x16x16 warp tiles with 1x1 warp layout for 16x16 block
+    // Minimal tile size to maximize block count: (1024/16) x ceil(24/16) = 64 x 2 = 128 blocks
+    // This gives 8x more blocks than the previous 64x32 block tile (128 blocks vs 16 blocks)
+    // Experimental: still evaluating whether the overhead of many small blocks becomes a problem
     using BlockGemmShape =
-        TileGemmShape<sequence<64, 32, 8>,  // BlockTile (M, N, K) - keep original for now
-                      sequence<2, 1, 1>,    // BlockWarps (2 warps in M, 1 in N, 1 in K)
-                      sequence<32, 32, 8>>; // WarpTile (32x32x8 is supported by MFMA)
+        TileGemmShape<sequence<16, 16, 16>,  // BlockTile (M, N, K) - minimal tiles for max blocks
+                      sequence<1, 1, 1>,     // BlockWarps (1 warp per block)
+                      sequence<16, 16, 16>>; // WarpTile (16x16x16 is supported by MFMA)
 
     // Layout types for BlockGemm
     using ALayout = ck_tile::tensor_layout::gemm::RowMajor; // x is row-major [B, nC]