[CK_TILE] Update flatmm related kernels (#3022)

--------- Co-authored-by: Ding, Yi <yi.ding@amd.com> Co-authored-by: felix <felix.li@amd.com>
2026-05-04 13:41:24 +00:00 · 2025-10-22 22:36:11 +08:00
parent cbd1279ae6
commit 211d64e18a
39 changed files with 11183 additions and 739 deletions
--- a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp
+++ b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp
@@ -9,9 +9,9 @@
 #include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"

 #include <optional>
-#include <type_traits>

 namespace ck_tile {
+
 template <typename AsDataType_,
          typename BsDataType_,
          typename DsDataType_,
@@ -29,10 +29,11 @@ template <typename AsDataType_,
          index_t KPerXdl_,
          bool isCTransposed_,
          memory_operation_enum MemoryOperation_,
-          index_t kNumWaveGroups_ = 1,
-          bool FixedVectorSize_   = false,
-          index_t VectorSizeC_    = 1,
-          bool TiledMMAPermuteN_  = false>
+          index_t kNumWaveGroups_      = 1,
+          bool FixedVectorSize_        = false,
+          index_t VectorSizeC_         = 1,
+          bool TiledMMAPermuteN_       = false,
+          index_t BlockedXDLN_PerWarp_ = 1> // The number of continuous xdl_output per warp
 struct CShuffleEpilogueProblem
 {
    using AsDataType                                       = remove_cvref_t<AsDataType_>;
@@ -55,6 +56,7 @@ struct CShuffleEpilogueProblem
    static constexpr memory_operation_enum MemoryOperation = MemoryOperation_;
    static constexpr bool FixedVectorSize                  = FixedVectorSize_;
    static constexpr index_t VectorSizeC                   = VectorSizeC_;
+    static constexpr index_t BlockedXDLN_PerWarp           = BlockedXDLN_PerWarp_;
    static constexpr bool TiledMMAPermuteN                 = TiledMMAPermuteN_;
    static constexpr index_t kNumWaveGroups                = kNumWaveGroups_;
    static constexpr index_t NumDTensor                    = DsDataType::size();
@@ -107,6 +109,7 @@ struct CShuffleEpilogue
    static constexpr index_t isCTransposed                 = Problem::isCTransposed;
    static constexpr bool FixedVectorSize                  = Problem::FixedVectorSize;
    static constexpr bool TiledMMAPermuteN                 = Problem::TiledMMAPermuteN;
+    static constexpr index_t BlockedXDLN_PerWarp           = Problem::BlockedXDLN_PerWarp;
    static constexpr index_t VectorSizeC                   = Problem::VectorSizeC;
    static constexpr index_t MPerIteration                 = MPerXdl * MWave;
    static constexpr index_t NPerIteration                 = NPerXdl * NWave;
@@ -212,7 +215,8 @@ struct CShuffleEpilogue
        }
    }();
    static constexpr index_t NumMXdlPerWavePerShuffle = std::get<0>(shuffle_tile_tuple);
-    static constexpr index_t NumNXdlPerWavePerShuffle = std::get<1>(shuffle_tile_tuple);
+    static constexpr index_t NumNXdlPerWavePerShuffle =
+        max(BlockedXDLN_PerWarp, std::get<1>(shuffle_tile_tuple));

    static constexpr auto MNPerIterationShuffle = [] {
        constexpr index_t m_val = MPerXdl * MWave * NumMXdlPerWavePerShuffle;
@@ -265,14 +269,31 @@ struct CShuffleEpilogue

    CK_TILE_DEVICE static constexpr auto MakeLdsDistributionEncode()
    {
-        constexpr auto block_outer_dstr_encoding =
-            tile_distribution_encoding<sequence<>,
-                                       tuple<sequence<NumMXdlPerWavePerShuffle, MWave>,
-                                             sequence<NumNXdlPerWavePerShuffle, NWave>>,
-                                       tuple<sequence<1, 2>>,
-                                       tuple<sequence<1, 1>>,
-                                       sequence<1, 2>,
-                                       sequence<0, 0>>{};
+        constexpr auto block_outer_dstr_encoding = [] {
+            if constexpr(BlockedXDLN_PerWarp == 1)
+            {
+                return tile_distribution_encoding<sequence<>,
+                                                  tuple<sequence<NumMXdlPerWavePerShuffle, MWave>,
+                                                        sequence<NumNXdlPerWavePerShuffle, NWave>>,
+                                                  tuple<sequence<1, 2>>,
+                                                  tuple<sequence<1, 1>>,
+                                                  sequence<1, 2>,
+                                                  sequence<0, 0>>{};
+            }
+            else
+            {
+                constexpr int RakedXDLN_PerWarp = NumNXdlPerWavePerShuffle / BlockedXDLN_PerWarp;
+                // BlockedLayout
+                return tile_distribution_encoding<
+                    sequence<>,
+                    tuple<sequence<NumMXdlPerWavePerShuffle, MWave>,
+                          sequence<RakedXDLN_PerWarp, NWave, BlockedXDLN_PerWarp>>,
+                    tuple<sequence<1, 2>>,
+                    tuple<sequence<1, 1>>,
+                    sequence<1, 2, 2>,
+                    sequence<0, 0, 2>>{};
+            }
+        }();
        constexpr auto block_dstr_encoding = detail::make_embed_tile_distribution_encoding(
            block_outer_dstr_encoding, typename CWarpDstr::DstrEncode{});

@@ -437,7 +458,6 @@ struct CShuffleEpilogue

        static_assert(MPerXdl % RowsPerLane == 0,
                      "CShuffle (permuteN): MPerXdl must be divisible by per-lane row count.");
-
        constexpr int kM0 = MWave;
        constexpr int kM2 = RowsPerLane;
        constexpr int kM1 = MPerXdl / kM2;
@@ -527,6 +547,7 @@ struct CShuffleEpilogue
                    const int src = n_idx * plane + m_lane;   // source row in this N-plane
                    const int dst = n_idx + m_lane * NRepeat; // permuted N layout in output
                    AccDataType v = shuffle_acc.get_thread_buffer()[src];
+
                    if constexpr(has_scalar_scales)
                    {
                        v = static_cast<AccDataType>(v * scale_m * scale_n);
@@ -537,6 +558,7 @@ struct CShuffleEpilogue
                        const auto sn = static_cast<float>(sn_tile.get_thread_buffer()[dst]);
                        v             = static_cast<AccDataType>(v * sm * sn);
                    }
+
                    c_out_tensor.get_thread_buffer()[dst] = type_convert<ODataType>(v);
                });
            });