From 8b59a1e19272ea4cfcae8a2ae08d28d595865d2f Mon Sep 17 00:00:00 2001 From: Damien Lejeune Date: Mon, 9 Feb 2026 13:16:21 +0000 Subject: [PATCH] Improve parallelism in the benchmark --- example/ck_tile/42_mhc/mhc_v3_bf16_benchmark.cpp | 7 ++++--- include/ck_tile/ops/mhc/pipeline/mhc_problem.hpp | 12 +++++++----- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/example/ck_tile/42_mhc/mhc_v3_bf16_benchmark.cpp b/example/ck_tile/42_mhc/mhc_v3_bf16_benchmark.cpp index 15be81d9e7..019e8a15b3 100644 --- a/example/ck_tile/42_mhc/mhc_v3_bf16_benchmark.cpp +++ b/example/ck_tile/42_mhc/mhc_v3_bf16_benchmark.cpp @@ -94,9 +94,10 @@ bool run_mhc_benchmark(const ck_tile::ArgParser& arg_parser) d_phi_mem.ToDevice(h_phi.data()); d_output_mem.ToDevice(h_output.data()); - // Define block shape - 128 threads (2 warps) to match BlockGemmShape configuration - using BlockShape = ck_tile::Generic2dBlockShape, - ck_tile::sequence<1, 128>, + // Define block shape - 64 threads (1 warp) to match BlockGemmShape configuration + // This matches a 16x16 block tile with 1 warp (1x1 warp layout) + using BlockShape = ck_tile::Generic2dBlockShape, + ck_tile::sequence<1, 64>, ck_tile::sequence<1, 1>>; using Problem = ck_tile::MHCProblem; diff --git a/include/ck_tile/ops/mhc/pipeline/mhc_problem.hpp b/include/ck_tile/ops/mhc/pipeline/mhc_problem.hpp index 6096605357..d596cd23b9 100644 --- a/include/ck_tile/ops/mhc/pipeline/mhc_problem.hpp +++ b/include/ck_tile/ops/mhc/pipeline/mhc_problem.hpp @@ -26,12 +26,14 @@ struct MHCProblem using CDataType = ComputeDataType; // Output/accumulator matrix C // BlockGemmShape with kM, kN, kK members for BlockGemm - // Using 32x32x8 warp tiles (supported by MFMA) with 2x1 warp layout for 64x32 block - // This gives better parallelism than 64x32 while using supported warp sizes + // Using 16x16x16 warp tiles with 1x1 warp layout for 16x16 block + // Minimal tile size to maximize block count: (1024/16) × (24/16) = 64 × 2 = 128 blocks + // This provides 8x better parallelism than original (128 blocks vs 16 blocks) + // Testing if overhead from many small blocks becomes a problem using BlockGemmShape = - TileGemmShape, // BlockTile (M, N, K) - keep original for now - sequence<2, 1, 1>, // BlockWarps (2 warps in M, 1 in N, 1 in K) - sequence<32, 32, 8>>; // WarpTile (32x32x8 is supported by MFMA) + TileGemmShape, // BlockTile (M, N, K) - minimal tiles for max blocks + sequence<1, 1, 1>, // BlockWarps (1 warp per block) + sequence<16, 16, 16>>; // WarpTile (16x16x16 is supported by MFMA) // Layout types for BlockGemm using ALayout = ck_tile::tensor_layout::gemm::RowMajor; // x is row-major [B, nC]