Support flatmm scaling (apply scale_m/scale_n in the CShuffle epilogue)

2026-05-03 13:11:25 +00:00 · 2025-07-23 19:04:22 +00:00
parent 3f7d848dd3
commit 5a1183ebbd
7 changed files with 476 additions and 318 deletions
--- a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp
+++ b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp
@@ -282,8 +282,8 @@ struct CShuffleEpilogue
            {0, 0});

        using SFC                    = space_filling_curve<sequence<kMPerBlock, kNPerBlock>,
-                                        sequence<0, 1>,
-                                        sequence<MPerIterationShuffle, NPerIterationShuffle>>;
+                                                           sequence<0, 1>,
+                                                           sequence<MPerIterationShuffle, NPerIterationShuffle>>;
        constexpr index_t num_access = SFC::get_num_of_access();

        static_assert(std::is_same_v<ELayout, tensor_layout::gemm::RowMajor>,
@@ -334,8 +334,8 @@ struct CShuffleEpilogue

            const auto c_ds_tiles = concat_tuple_of_reference(
                tie(c_out_tensor, c_out_tensor),
-                generate_tie(
-                    [&](auto idx) -> const auto& { return ds_tensor[idx]; }, number<NumDTensor>{}));
+                generate_tie([&](auto idx) -> const auto& { return ds_tensor[idx]; },
+                             number<NumDTensor>{}));

            tile_elementwise_inout_unpack(typename Problem::CDElementwise{}, c_ds_tiles);

@@ -360,7 +360,12 @@ struct CShuffleEpilogue
            }
        });
    }
-    template <typename ODramWindow, typename OAccTile, typename DsDramWindows, typename ScaleM, typename ScaleN>
+
+    template <typename ODramWindow,
+              typename OAccTile,
+              typename DsDramWindows,
+              typename ScaleM,
+              typename ScaleN>
    CK_TILE_DEVICE auto operator()(ODramWindow& out_dram_window,
                                   const OAccTile& o_acc_tile,
                                   const DsDramWindows& ds_dram_windows,
@@ -368,118 +373,133 @@ struct CShuffleEpilogue
                                   ScaleM scale_m,
                                   ScaleN scale_n)
    {
-        // const index_t iMWarp = get_warp_id() / kNWave;
-        // const index_t iNWarp = get_warp_id() - iMWarp * kNWave;
-        // const index_t iMLane = get_lane_id() / NPerXdl;
-        // const index_t iNLane = get_lane_id() % NPerXdl;
+        constexpr auto LdsTileDistr = make_static_tile_distribution(MakeLdsDistributionEncode());

-        // constexpr auto LdsTileDistr = make_static_tile_distribution(MakeLdsDistributionEncode());
+        auto lds_tile = make_static_distributed_tensor<AccDataType>(LdsTileDistr);

-        // auto lds_tile = make_static_distributed_tensor<AccDataType>(LdsTileDistr);
+        constexpr auto lds_block_desc = MakeLdsBlockDescriptor<Problem>();
+        auto o_lds_block              = make_tensor_view<address_space_enum::lds>(
+            static_cast<ODataType*>(p_smem), lds_block_desc);

-        // constexpr auto lds_block_desc = MakeLdsBlockDescriptor<Problem>();
-        // auto o_lds_block              = make_tensor_view<address_space_enum::lds>(
-        //     static_cast<ODataType*>(p_smem), lds_block_desc);
+        auto in_lds_window = make_tile_window(
+            o_lds_block,
+            make_tuple(number<MPerIterationShuffle>{}, number<NPerIterationShuffle>{}),
+            {0, 0},
+            LdsTileDistr);

-        // auto in_lds_window = make_tile_window(
-        //     o_lds_block,
-        //     make_tuple(number<MPerIterationShuffle>{}, number<NPerIterationShuffle>{}),
-        //     {0, 0},
-        //     LdsTileDistr);
+        auto out_lds_window = make_tile_window(
+            o_lds_block,
+            make_tuple(number<MPerIterationShuffle>{}, number<NPerIterationShuffle>{}),
+            {0, 0});

-        // auto out_lds_window = make_tile_window(
-        //     o_lds_block,
-        //     make_tuple(number<MPerIterationShuffle>{}, number<NPerIterationShuffle>{}),
-        //     {0, 0});
+        using SFC                    = space_filling_curve<sequence<kMPerBlock, kNPerBlock>,
+                                                           sequence<0, 1>,
+                                                           sequence<MPerIterationShuffle, NPerIterationShuffle>>;
+        constexpr index_t num_access = SFC::get_num_of_access();

-        // using SFC                    = space_filling_curve<sequence<kMPerBlock, kNPerBlock>,
-        //                                 sequence<0, 1>,
-        //                                 sequence<MPerIterationShuffle, NPerIterationShuffle>>;
-        // constexpr index_t num_access = SFC::get_num_of_access();
+        static_assert(std::is_same_v<ELayout, tensor_layout::gemm::RowMajor>,
+                      "Currently, the CShuffle Epilogue only supports the Row Major Output layout");

-        // static_assert(std::is_same_v<ELayout, tensor_layout::gemm::RowMajor>,
-        //               "Currently, the CShuffle Epilogue only supports the Row Major Output layout");
+        using TileEncodingPattern =
+            TileDistributionEncodingPattern2D<kBlockSize,
+                                              MPerIterationShuffle,
+                                              NPerIterationShuffle,
+                                              GetVectorSizeC(),
+                                              tile_distribution_pattern::thread_raked,
+                                              Problem::kNumWaveGroups>;
+        constexpr auto dram_tile_distribution = TileEncodingPattern::Make2DStaticTileDistribution();

-        // using TileEncodingPattern =
-        //     TileDistributionEncodingPattern2D<kBlockSize,
-        //                                       MPerIterationShuffle,
-        //                                       NPerIterationShuffle,
-        //                                       GetVectorSizeC(),
-        //                                       tile_distribution_pattern::thread_raked,
-        //                                       Problem::kNumWaveGroups>;
-        // constexpr auto dram_tile_distribution = TileEncodingPattern::Make2DStaticTileDistribution();
+        auto d_dram_windows = generate_tuple(
+            [&](auto idx) {
+                return make_tile_window(ds_dram_windows[idx], dram_tile_distribution);
+            },
+            number<NumDTensor>{});

-        // auto d_dram_windows = generate_tuple(
-        //     [&](auto idx) {
-        //         return make_tile_window(ds_dram_windows[idx], dram_tile_distribution);
-        //     },
-        //     number<NumDTensor>{});
+        constexpr auto c_warp_y_lengths =
+            to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
+        constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t<CWarpDstr::NDimY, 0>{};

-        // constexpr auto c_warp_y_lengths =
-        //     to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
-        // constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t<CWarpDstr::NDimY, 0>{};
+        constexpr int kM2 = 4;              // Val
+        constexpr int kM1 = (64 / NPerXdl); // Thr
+        constexpr int kM0 = MPerXdl / kM1;  // Val

-        // static_for<0, num_access, 1>{}([&](auto iAccess) {
-        //     block_sync_lds();
-        //     constexpr auto idx_y_start = SFC::get_index(iAccess);
+        const index_t iMWarp = get_warp_id() / NWave;
+        const index_t iNWarp = get_warp_id() - iMWarp * NWave;
+        const index_t iMLane = get_lane_id() / NPerXdl;
+        const index_t iNLane = get_lane_id() % NPerXdl;

-        //     constexpr auto mIter = number<idx_y_start.at(number<0>{}) / (MPerIterationShuffle)>{};
-        //     constexpr auto nIter = number<idx_y_start.at(number<1>{}) / (NPerIterationShuffle)>{};
+        static_for<0, num_access, 1>{}([&](auto iAccess) {
+            block_sync_lds();
+            constexpr auto idx_y_start = SFC::get_index(iAccess);

-        //     lds_tile.get_thread_buffer() = o_acc_tile.get_y_sliced_thread_data(
-        //         merge_sequences(
-        //             sequence<mIter * NumMXdlPerWavePerShuffle, nIter * NumNXdlPerWavePerShuffle>{},
-        //             c_warp_y_index_zeros),
-        //         merge_sequences(sequence<NumMXdlPerWavePerShuffle, NumNXdlPerWavePerShuffle>{},
-        //                         c_warp_y_lengths));
+            constexpr auto mIter = number<idx_y_start.at(number<0>{}) / (MPerIterationShuffle)>{};
+            constexpr auto nIter = number<idx_y_start.at(number<1>{}) / (NPerIterationShuffle)>{};

-        //     const auto c_warptile_in_tensor_casted = cast_tile<ODataType>(lds_tile);
+            lds_tile.get_thread_buffer() = o_acc_tile.get_y_sliced_thread_data(
+                merge_sequences(
+                    sequence<mIter * NumMXdlPerWavePerShuffle, nIter * NumNXdlPerWavePerShuffle>{},
+                    c_warp_y_index_zeros),
+                merge_sequences(sequence<NumMXdlPerWavePerShuffle, NumNXdlPerWavePerShuffle>{},
+                                c_warp_y_lengths));

-        //     store_tile(in_lds_window, c_warptile_in_tensor_casted);
-        //     block_sync_lds();
+            static_for<0, NumNXdlPerWavePerShuffle, 1>{}([&](auto n_xdl) {
+                float scale_B =
+                    scale_n[nIter * NPerIterationShuffle +
+                            iNWarp * NumNXdlPerWavePerShuffle * NPerXdl + n_xdl * NPerXdl + iNLane];
+                static_for<0, NumMXdlPerWavePerShuffle, 1>{}([&](auto m_xdl) {
+                    constexpr int acc_xdl_offset =
+                        (m_xdl * NumMXdlPerWavePerShuffle + n_xdl) * c_warp_y_lengths.product();

-        //     auto c_out_tensor = load_tile(make_tile_window(out_lds_window, dram_tile_distribution));
-            
-        //     auto m1       = iMLane;
-        //     float scale_B = scale_n[nIter * NPerIterationShuffle];
-        //     static_for<0, kM0, 1>{}([&](auto m0) {
-        //         static_for<0, kM2, 1>{}([&](auto m2) {
-        //             float scale_A = scale_m[mIter * MPerIterationShuffle + iMWarp * MPerXdl +
-        //                                     m0 * kM1 * kM2 + m1 * kM2 + m2];
-        //             c_out_tensor.get_thread_buffer()[m0 * kM2 + m2] *= scale_A * scale_B;
-        //         });
-        //     });
+                    static_for<0, kM0, 1>{}([&](auto m0) {
+                        static_for<0, kM2, 1>{}([&](auto m2) {
+                            float scale_A =
+                                scale_m[mIter * MPerIterationShuffle +
+                                        iMWarp * NumMXdlPerWavePerShuffle * MPerXdl +
+                                        m_xdl * MPerXdl + m0 * kM1 * kM2 + iMLane * kM2 + m2];
+                            lds_tile.get_thread_buffer()[acc_xdl_offset + m0 * kM2 + m2] *=
+                                scale_A * scale_B;
+                        });
+                    });
+                });
+            });

-        //     const auto ds_tensor = generate_tuple(
-        //         [&](auto idx) { return load_tile(d_dram_windows[idx]); }, number<NumDTensor>{});
+            const auto c_warptile_in_tensor_casted = cast_tile<ODataType>(lds_tile);

-        //     const auto c_ds_tiles = concat_tuple_of_reference(
-        //         tie(c_out_tensor, c_out_tensor),
-        //         generate_tie(
-        //             [&](auto idx) -> const auto& { return ds_tensor[idx]; }, number<NumDTensor>{}));
+            store_tile(in_lds_window, c_warptile_in_tensor_casted);
+            block_sync_lds();

-        //     tile_elementwise_inout_unpack(typename Problem::CDElementwise{}, c_ds_tiles);
+            auto c_out_tensor = load_tile(make_tile_window(out_lds_window, dram_tile_distribution));

-        //     if constexpr(MemoryOperation == memory_operation_enum::set)
-        //     {
-        //         store_tile(out_dram_window, c_out_tensor);
-        //     }
-        //     else
-        //     {
-        //         update_tile(out_dram_window, c_out_tensor);
-        //     }
-        //     if constexpr(iAccess != num_access - 1)
-        //     {
-        //         constexpr auto step = SFC::get_forward_step(iAccess);
+            const auto ds_tensor = generate_tuple(
+                [&](auto idx) { return load_tile(d_dram_windows[idx]); }, number<NumDTensor>{});

-        //         move_tile_window(out_dram_window, {step.at(number<0>{}), step.at(number<1>{})});
+            const auto c_ds_tiles = concat_tuple_of_reference(
+                tie(c_out_tensor, c_out_tensor),
+                generate_tie([&](auto idx) -> const auto& { return ds_tensor[idx]; },
+                             number<NumDTensor>{}));

-        //         static_for<0, NumDTensor, 1>{}([&](auto idx) {
-        //             move_tile_window(d_dram_windows[idx],
-        //                              {step.at(number<0>{}), step.at(number<1>{})});
-        //         });
-        //     }
-        // });
+            tile_elementwise_inout_unpack(typename Problem::CDElementwise{}, c_ds_tiles);
+
+            if constexpr(MemoryOperation == memory_operation_enum::set)
+            {
+                store_tile(out_dram_window, c_out_tensor);
+            }
+            else
+            {
+                update_tile(out_dram_window, c_out_tensor);
+            }
+            if constexpr(iAccess != num_access - 1)
+            {
+                constexpr auto step = SFC::get_forward_step(iAccess);
+
+                move_tile_window(out_dram_window, {step.at(number<0>{}), step.at(number<1>{})});
+
+                static_for<0, NumDTensor, 1>{}([&](auto idx) {
+                    move_tile_window(d_dram_windows[idx],
+                                     {step.at(number<0>{}), step.at(number<1>{})});
+                });
+            }
+        });
    }
 };
 } // namespace ck_tile
--- a/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp
+++ b/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp
@@ -102,17 +102,17 @@ struct BaseFlatmmHostArgs
 {
    CK_TILE_HOST BaseFlatmmHostArgs() = default;
    CK_TILE_HOST BaseFlatmmHostArgs(const void* a_ptr_,
-                                const void* b_ptr_,
-                                const std::array<const void*, NumDTensor>& ds_ptr_,
-                                void* e_ptr_,
-                                index_t k_batch_,
-                                index_t M_,
-                                index_t N_,
-                                index_t K_,
-                                index_t stride_A_,
-                                index_t stride_B_,
-                                const std::array<index_t, NumDTensor>& stride_Ds_,
-                                index_t stride_E_)
+                                    const void* b_ptr_,
+                                    const std::array<const void*, NumDTensor>& ds_ptr_,
+                                    void* e_ptr_,
+                                    index_t k_batch_,
+                                    index_t M_,
+                                    index_t N_,
+                                    index_t K_,
+                                    index_t stride_A_,
+                                    index_t stride_B_,
+                                    const std::array<index_t, NumDTensor>& stride_Ds_,
+                                    index_t stride_E_)
        : a_ptr(a_ptr_),
          b_ptr(b_ptr_),
          ds_ptr(ds_ptr_),
@@ -151,35 +151,49 @@ struct BaseFlatmmHostArgs
    index_t k_batch;
 };

-template <class ScaleM = FlatmmScalePointer<-1>, class ScaleN = FlatmmScalePointer<-1>, index_t NumDTensor = 0>
+template <class ScaleM       = FlatmmScalePointer<-1>,
+          class ScaleN       = FlatmmScalePointer<-1>,
+          index_t NumDTensor = 0>
 struct ScaleFlatmmHostArgs : public BaseFlatmmHostArgs<>
 {
    CK_TILE_HOST ScaleFlatmmHostArgs() = default;
    CK_TILE_HOST ScaleFlatmmHostArgs(const void* a_ptr_,
-                                        const void* b_shuffle_ptr_,
-                                        const std::array<const void*, NumDTensor>& ds_ptr_,
-                                        void* c_ptr_,
-                                        index_t k_batch_,
-                                        index_t M_,
-                                        index_t N_,
-                                        index_t K_,
-                                        index_t stride_A_,
-                                        index_t stride_B_,
-                                        const std::array<index_t, NumDTensor>& stride_Ds_,
-                                        index_t stride_C_,
-                                        ScaleM scale_m_ = nullptr,
-                                        ScaleN scale_n_ = nullptr)
-        : BaseFlatmmHostArgs(a_ptr_, b_shuffle_ptr_, ds_ptr_, c_ptr_, k_batch_, M_, N_, K_, stride_A_, stride_B_, stride_Ds_, stride_C_),
-            scale_m(scale_m_),
-            scale_n(scale_n_)
+                                     const void* b_shuffle_ptr_,
+                                     const std::array<const void*, NumDTensor>& ds_ptr_,
+                                     void* c_ptr_,
+                                     index_t k_batch_,
+                                     index_t M_,
+                                     index_t N_,
+                                     index_t K_,
+                                     index_t stride_A_,
+                                     index_t stride_B_,
+                                     const std::array<index_t, NumDTensor>& stride_Ds_,
+                                     index_t stride_C_,
+                                     ScaleM scale_m_ = nullptr,
+                                     ScaleN scale_n_ = nullptr)
+        : BaseFlatmmHostArgs(a_ptr_,
+                             b_shuffle_ptr_,
+                             ds_ptr_,
+                             c_ptr_,
+                             k_batch_,
+                             M_,
+                             N_,
+                             K_,
+                             stride_A_,
+                             stride_B_,
+                             stride_Ds_,
+                             stride_C_),
+          scale_m(scale_m_),
+          scale_n(scale_n_)
    {
    }
    ScaleM scale_m = nullptr;
    ScaleN scale_n = nullptr;
 };

-template <int NumberTensor=0>
-using FlatmmHostArgs = ScaleFlatmmHostArgs<FlatmmScalePointer<-1>, FlatmmScalePointer<-1>, NumberTensor>;
+template <int NumberTensor = 0>
+using FlatmmHostArgs =
+    ScaleFlatmmHostArgs<FlatmmScalePointer<-1>, FlatmmScalePointer<-1>, NumberTensor>;

 template <class ScaleM, class ScaleN, index_t NumDTensor = 0>
 struct FlatmmKernelArgs
@@ -278,7 +292,8 @@ struct FlatmmKernel
    struct SplitKBatchOffset
    {
        template <class KernelArgs>
-        __device__ SplitKBatchOffset(const KernelArgs& kargs, const std::size_t k_id = blockIdx.z)        {
+        __device__ SplitKBatchOffset(const KernelArgs& kargs, const std::size_t k_id = blockIdx.z)
+        {
            constexpr auto K1   = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{});
            const index_t K_t   = kargs.k_batch * K1;
            const index_t KRead = (kargs.K + K_t - 1) / K_t * K1;
@@ -681,16 +696,17 @@ struct FlatmmKernel
    }

    template <class ScaleM, class ScaleN, bool UseDefaultScheduler = true>
-    CK_TILE_DEVICE static void RunFlatmm(const ADataType* a_ptr,
-                                         const BDataType* b_flat_ptr,
-                                         const std::array<const void*, NumDTensor>& ds_ptr,
-                                         EDataType* e_ptr,
-                                         void* smem_ptr_ping,
-                                         void* smem_ptr_pong,
-                                         const FlatmmKernelArgs<ScaleM, ScaleN, DsDataType::size()>& kargs,
-                                         const SplitKBatchOffset& splitk_batch_offset,
-                                         const index_t block_idx_m,
-                                         const index_t block_idx_n)
+    CK_TILE_DEVICE static void
+    RunFlatmm(const ADataType* a_ptr,
+              const BDataType* b_flat_ptr,
+              const std::array<const void*, NumDTensor>& ds_ptr,
+              EDataType* e_ptr,
+              void* smem_ptr_ping,
+              void* smem_ptr_pong,
+              const FlatmmKernelArgs<ScaleM, ScaleN, DsDataType::size()>& kargs,
+              const SplitKBatchOffset& splitk_batch_offset,
+              const index_t block_idx_m,
+              const index_t block_idx_n)
    {
        // Create Gemm tensor views, pad views and tile windows
        const auto& gemm_tensor_views_tuple =
@@ -712,19 +728,21 @@ struct FlatmmKernel
        if constexpr(ScaleM::granularity != -1 || ScaleN::granularity != -1)
        {
            auto& c_block_window = gemm_tile_windows.at(I3);
-            EpiloguePipeline{}.template operator()<decltype(c_block_window), decltype(c_block_tile), decltype(d_block_window)>(
-                    c_block_window,
-                    c_block_tile,
-                    d_block_window,
-                    smem_ptr_ping,
-                    kargs.scale_m_ptr + block_idx_m,
-                    kargs.scale_n_ptr + block_idx_n);
+            EpiloguePipeline{}.template
+            operator()<decltype(c_block_window), decltype(c_block_tile), decltype(d_block_window)>(
+                c_block_window,
+                c_block_tile,
+                d_block_window,
+                smem_ptr_ping,
+                kargs.scale_m_ptr + block_idx_m,
+                kargs.scale_n_ptr + block_idx_n);
        }
        else if(UseDefaultScheduler || (get_warp_id() == 0))
        {
            // Run Epilogue Pipeline
            auto& c_block_window = gemm_tile_windows.at(I3);
-            EpiloguePipeline{}.template operator()<decltype(c_block_window), decltype(c_block_tile), decltype(d_block_window)>(
+            EpiloguePipeline{}.template
+            operator()<decltype(c_block_window), decltype(c_block_tile), decltype(d_block_window)>(
                c_block_window, c_block_tile, d_block_window, smem_ptr_ping);
        }
    }
@@ -755,15 +773,15 @@ struct FlatmmKernel
        {
            constexpr auto scheduler_type = (FlatmmPipeline::NumWaveGroups == 1);
            RunFlatmm<ScaleM, ScaleN, scheduler_type>(a_ptr,
-                                      b_flat_ptr,
-                                      kargs.ds_ptr,
-                                      e_ptr,
-                                      smem_ptr_ping,
-                                      smem_ptr_pong,
-                                      kargs,
-                                      splitk_batch_offset,
-                                      i_m,
-                                      i_n);
+                                                      b_flat_ptr,
+                                                      kargs.ds_ptr,
+                                                      e_ptr,
+                                                      smem_ptr_ping,
+                                                      smem_ptr_pong,
+                                                      kargs,
+                                                      splitk_batch_offset,
+                                                      i_m,
+                                                      i_n);
        }
    }
 };