[CK_TILE] Fix flatmm on gfx11 and gfx12 (#2790)

1. Correct shuffle_b and MakeBFlatDramTileDistribution according to WMMA warp layout 2. Add FlatmmConfig16_Wmma for gfx11 and gfx12
2026-05-03 21:21:22 +00:00 · 2025-09-10 08:28:00 +08:00
parent 82890192dd
commit df4ee556d6
14 changed files with 224 additions and 67 deletions
--- a/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_base_policy.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_base_policy.hpp
@@ -89,14 +89,19 @@ struct UniversalWeightPreshufflePipelineAgBgCrPolicy
    CK_TILE_HOST_DEVICE static constexpr auto GetKBPerLoad()
    {
        using TileShape = typename Problem::BlockGemmShape;
+#if defined(__gfx11__)
+        constexpr index_t scale = 4;
+#else
+        constexpr index_t scale = get_warp_size() == 32 ? 2 : 1;
+#endif
        if constexpr(TileShape::WarpTile::at(I1) == 32)
        {
-            return TileShape::WarpTile::at(I2) / 2;
+            return TileShape::WarpTile::at(I2) * scale / 2;
        }
        else
        {
            static_assert(TileShape::WarpTile::at(I1) == 16);
-            return TileShape::WarpTile::at(I2) / 4;
+            return TileShape::WarpTile::at(I2) * scale / 4;
        }
    }

@@ -192,7 +197,7 @@ struct UniversalWeightPreshufflePipelineAgBgCrPolicy
    }

    template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr auto MakeBFlatDramTileDistribution()
+    CK_TILE_DEVICE static constexpr auto MakeBFlatDramTileDistribution()
    {
        using TileShape = typename Problem::BlockGemmShape;

@@ -200,8 +205,13 @@ struct UniversalWeightPreshufflePipelineAgBgCrPolicy
        constexpr index_t WaveSize  = get_warp_size();
        constexpr index_t WaveNum   = BlockSize / WaveSize;

-        constexpr index_t KBPerLoad   = GetKBPerLoad<Problem>();
-        constexpr index_t KThdPerWave = WaveSize; // threads cnt in K dim
+        constexpr index_t KBPerLoad = GetKBPerLoad<Problem>();
+#if defined(__gfx11__)
+        constexpr index_t KRepeatInWave = 2;
+#else
+        constexpr index_t KRepeatInWave = 1;
+#endif
+        constexpr index_t KThdPerWave = WaveSize / KRepeatInWave; // threads cnt in K dim
        constexpr index_t KWavePerBlk = 1;
        constexpr index_t KRepeat     = 1;
        static_assert(TileShape::flatKPerWarp == KThdPerWave * KBPerLoad, "wrong");
@@ -212,16 +222,15 @@ struct UniversalWeightPreshufflePipelineAgBgCrPolicy
        constexpr index_t NRepeat     = 1;

        constexpr index_t WaveRepeat = WaveNum / TileShape::flatNPerWarp;
-
        return make_static_tile_distribution(
            tile_distribution_encoding<
-                sequence<WaveRepeat>,                                          // ?
+                sequence<WaveRepeat, KRepeatInWave>,                           // ?
                tuple<sequence<NRepeat, NWavePerBlk, NThdPerWave, NBPerLoad>,  // second direction
                      sequence<KRepeat, KWavePerBlk, KThdPerWave, KBPerLoad>>, // first  direction
                // wave in blk,     // thd in wave
                // <M, K>           // <M, K>
-                tuple<sequence<0, 1, 2>, sequence<1, 2>>, // which direction
-                tuple<sequence<0, 1, 1>, sequence<2, 2>>, // which index
+                tuple<sequence<0, 1, 2>, sequence<0, 1, 2>>, // which direction
+                tuple<sequence<0, 1, 1>, sequence<1, 2, 2>>, // which index
                // <repeat, vec_load>
                sequence<1, 1, 2, 2>,
                sequence<0, 3, 0, 3>>{});
--- a/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v1.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v1.hpp
@@ -189,11 +189,11 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV1
    }

    template <typename ADramBlockWindowTmp, typename BFlatBlockWindowTmp, typename AElementFunction>
-    CK_TILE_HOST_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
-                                        const AElementFunction& a_element_func,
-                                        const BFlatBlockWindowTmp& b_flat_dram_block_window_tmp,
-                                        index_t num_loop,
-                                        void* p_smem) const
+    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const AElementFunction& a_element_func,
+                                   const BFlatBlockWindowTmp& b_flat_dram_block_window_tmp,
+                                   index_t num_loop,
+                                   void* p_smem) const
    {
        static_assert(
            std::is_same_v<ADataType, remove_cvref_t<typename ADramBlockWindowTmp::DataType>> &&
--- a/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v2.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v2.hpp
@@ -146,10 +146,14 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2
    static constexpr index_t mfma_per_wg = 1;
 #endif
    static constexpr index_t dsread_per_wg =
-        WG::kM * WG::kK * sizeof(ADataType) / WaveSize / Problem::VectorLoadSize;
-    static_assert((WG::kM * WG::kK * sizeof(ADataType) / WaveSize) % Problem::VectorLoadSize == 0);
-
-    static constexpr index_t dsread_num_perK  = dsread_per_wg * MIterPerWarp;
+        max(index_t(WG::kM * WG::kK * sizeof(ADataType) / WaveSize / Problem::VectorLoadSize), 1);
+#if defined(__HIP_DEVICE_COMPILE__)
+    static_assert((WG::kM * WG::kK * sizeof(ADataType) * MIterPerWarp / WaveSize) %
+                      Problem::VectorLoadSize ==
+                  0);
+#endif
+    static constexpr index_t dsread_num_perK =
+        WG::kM * WG::kK * sizeof(ADataType) * MIterPerWarp / WaveSize / Problem::VectorLoadSize;
    static constexpr index_t dswrite_num_perK = dsread_num_perK / (MWarp * NWarp);
    static constexpr index_t dswrite_rep    = (dswrite_num_perK + MIterPerWarp - 1) / MIterPerWarp;
    static constexpr index_t Aload_num_perK = dswrite_num_perK;
@@ -499,12 +503,12 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2
              typename ADramBlockWindowTmp,
              typename BFlatBlockWindowTmp,
              typename AElementFunction>
-    CK_TILE_HOST_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
-                                        const AElementFunction& a_element_func,
-                                        const BFlatBlockWindowTmp& b_flat_dram_block_window_tmp,
-                                        index_t num_loop,
-                                        void* p_smem_ping,
-                                        void* p_smem_pong) const
+    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const AElementFunction& a_element_func,
+                                   const BFlatBlockWindowTmp& b_flat_dram_block_window_tmp,
+                                   index_t num_loop,
+                                   void* p_smem_ping,
+                                   void* p_smem_pong) const
    {
        static_assert(
            std::is_same_v<ADataType, remove_cvref_t<typename ADramBlockWindowTmp::DataType>>,