Add benchmark example

2026-05-12 01:10:17 +00:00 · 2026-02-06 14:55:13 +00:00
parent 804a9d488c
commit ec1e8ec58e
6 changed files with 231 additions and 28 deletions
--- a/include/ck_tile/ops/mhc/kernel/mhc_kernel_tile_v3.hpp
+++ b/include/ck_tile/ops/mhc/kernel/mhc_kernel_tile_v3.hpp
@@ -22,9 +22,6 @@ namespace ck_tile {

 template <typename Problem_,
          typename Policy_     = MHCDefaultPolicy,
-          index_t kMTile_      = 64, // Batch tile size
-          index_t kNTile_      = 32, // Output dimension tile (can cover all 24 outputs)
-          index_t kKTile_      = 8,  // K-tile for C dimension (must match BlockGemmShape::kK)
          typename Activation_ = element_wise::Sigmoid>
 struct MHCKernelV3
 {
@@ -37,9 +34,10 @@ struct MHCKernelV3
    using YDataType       = ck_tile::remove_cvref_t<typename Problem::YDataType>;
    using PhiDataType     = ck_tile::remove_cvref_t<typename Problem::PhiDataType>;

-    static constexpr index_t kMTile = kMTile_; // Batch tile
-    static constexpr index_t kNTile = kNTile_; // Output tile
-    static constexpr index_t kKTile = kKTile_; // K tile for C dimension
+    // Automatically derive tile sizes from BlockGemmShape (single source of truth!)
+    static constexpr index_t kMTile = Problem::BlockGemmShape::kM; // Batch tile
+    static constexpr index_t kNTile = Problem::BlockGemmShape::kN; // Output tile
+    static constexpr index_t kKTile = Problem::BlockGemmShape::kK; // K tile for C dimension

    static constexpr index_t kBlockSize = Problem::kBlockSize;

--- a/include/ck_tile/ops/mhc/pipeline/mhc_problem.hpp
+++ b/include/ck_tile/ops/mhc/pipeline/mhc_problem.hpp
@@ -26,12 +26,12 @@ struct MHCProblem
    using CDataType = ComputeDataType; // Output/accumulator matrix C

    // BlockGemmShape with kM, kN, kK members for BlockGemm
-    // Use supported warp gemm configuration for float32: 32x32x8
-    // We'll use 2 warps in M and 1 warp in N to get 64x32 block
+    // Two 32x32x8 warp tiles (supported by MFMA) in a 2x1 warp layout form the 64x32 block.
+    // Each warp covers one 32x32 sub-tile of the block, so only supported warp sizes are used.
    using BlockGemmShape =
-        TileGemmShape<sequence<64, 32, 8>,  // BlockTile (M, N, K)
+        TileGemmShape<sequence<64, 32, 8>,  // BlockTile (M, N, K) - keep original for now
                      sequence<2, 1, 1>,    // BlockWarps (2 warps in M, 1 in N, 1 in K)
-                      sequence<32, 32, 8>>; // WarpTile (matches available float32 MFMA)
+                      sequence<32, 32, 8>>; // WarpTile (32x32x8 is supported by MFMA)

    // Layout types for BlockGemm
    using ALayout = ck_tile::tensor_layout::gemm::RowMajor; // x is row-major [B, nC]