sync

2026-04-19 22:39:03 +00:00 · 2025-07-23 15:01:53 +08:00
parent 46a538e39e
commit 7e1bd4b839
3 changed files with 285 additions and 29 deletions
--- a/example/ck_tile/18_flatmm/run_flatmm_example.inc
+++ b/example/ck_tile/18_flatmm/run_flatmm_example.inc
@@ -97,6 +97,8 @@ template <typename FlatmmConfig,
          typename BLayout,
          typename DsLayout,
          typename CLayout,
+          typename ScaleM,
+          typename ScaleN,
          typename CDEElementWise = ck_tile::element_wise::PassThrough>
 float invoke_flatmm(ck_tile::DeviceMem& a_dev_buf,
                    ck_tile::DeviceMem& b_shuffle_dev_buf,
--- a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp
+++ b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp
@@ -339,6 +339,127 @@ struct CShuffleEpilogue

            tile_elementwise_inout_unpack(typename Problem::CDElementwise{}, c_ds_tiles);

+            if constexpr(MemoryOperation == memory_operation_enum::set)
+            {
+                store_tile(out_dram_window, c_out_tensor);
+            }
+            else
+            {
+                update_tile(out_dram_window, c_out_tensor);
+            }
+            if constexpr(iAccess != num_access - 1)
+            {
+                constexpr auto step = SFC::get_forward_step(iAccess);
+
+                move_tile_window(out_dram_window, {step.at(number<0>{}), step.at(number<1>{})});
+
+                static_for<0, NumDTensor, 1>{}([&](auto idx) {
+                    move_tile_window(d_dram_windows[idx],
+                                     {step.at(number<0>{}), step.at(number<1>{})});
+                });
+            }
+        });
+    }
+    template <typename ODramWindow, typename OAccTile, typename DsDramWindows, typename ScaleM, typename ScaleN>
+    CK_TILE_DEVICE auto operator()(ODramWindow& out_dram_window,
+                                   const OAccTile& o_acc_tile,
+                                   const DsDramWindows& ds_dram_windows,
+                                   void* p_smem,
+                                   ScaleM scale_m,
+                                   ScaleN scale_n)
+    {
+        const index_t iMWarp = get_warp_id() / kNWave;
+        const index_t iNWarp = get_warp_id() - iMWarp * kNWave;
+        const index_t iMLane = get_lane_id() / NPerXdl;
+        const index_t iNLane = get_lane_id() % NPerXdl;
+
+        constexpr auto LdsTileDistr = make_static_tile_distribution(MakeLdsDistributionEncode());
+
+        auto lds_tile = make_static_distributed_tensor<AccDataType>(LdsTileDistr);
+
+        constexpr auto lds_block_desc = MakeLdsBlockDescriptor<Problem>();
+        auto o_lds_block              = make_tensor_view<address_space_enum::lds>(
+            static_cast<ODataType*>(p_smem), lds_block_desc);
+
+        auto in_lds_window = make_tile_window(
+            o_lds_block,
+            make_tuple(number<MPerIterationShuffle>{}, number<NPerIterationShuffle>{}),
+            {0, 0},
+            LdsTileDistr);
+
+        auto out_lds_window = make_tile_window(
+            o_lds_block,
+            make_tuple(number<MPerIterationShuffle>{}, number<NPerIterationShuffle>{}),
+            {0, 0});
+
+        using SFC                    = space_filling_curve<sequence<kMPerBlock, kNPerBlock>,
+                                        sequence<0, 1>,
+                                        sequence<MPerIterationShuffle, NPerIterationShuffle>>;
+        constexpr index_t num_access = SFC::get_num_of_access();
+
+        static_assert(std::is_same_v<ELayout, tensor_layout::gemm::RowMajor>,
+                      "Currently, the CShuffle Epilogue only supports the Row Major Output layout");
+
+        using TileEncodingPattern =
+            TileDistributionEncodingPattern2D<kBlockSize,
+                                              MPerIterationShuffle,
+                                              NPerIterationShuffle,
+                                              GetVectorSizeC(),
+                                              tile_distribution_pattern::thread_raked,
+                                              Problem::kNumWaveGroups>;
+        constexpr auto dram_tile_distribution = TileEncodingPattern::Make2DStaticTileDistribution();
+
+        auto d_dram_windows = generate_tuple(
+            [&](auto idx) {
+                return make_tile_window(ds_dram_windows[idx], dram_tile_distribution);
+            },
+            number<NumDTensor>{});
+
+        constexpr auto c_warp_y_lengths =
+            to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
+        constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t<CWarpDstr::NDimY, 0>{};
+
+        static_for<0, num_access, 1>{}([&](auto iAccess) {
+            block_sync_lds();
+            constexpr auto idx_y_start = SFC::get_index(iAccess);
+
+            constexpr auto mIter = number<idx_y_start.at(number<0>{}) / (MPerIterationShuffle)>{};
+            constexpr auto nIter = number<idx_y_start.at(number<1>{}) / (NPerIterationShuffle)>{};
+
+            lds_tile.get_thread_buffer() = o_acc_tile.get_y_sliced_thread_data(
+                merge_sequences(
+                    sequence<mIter * NumMXdlPerWavePerShuffle, nIter * NumNXdlPerWavePerShuffle>{},
+                    c_warp_y_index_zeros),
+                merge_sequences(sequence<NumMXdlPerWavePerShuffle, NumNXdlPerWavePerShuffle>{},
+                                c_warp_y_lengths));
+
+            const auto c_warptile_in_tensor_casted = cast_tile<ODataType>(lds_tile);
+
+            store_tile(in_lds_window, c_warptile_in_tensor_casted);
+            block_sync_lds();
+
+            auto c_out_tensor = load_tile(make_tile_window(out_lds_window, dram_tile_distribution));
+            
+            auto m1       = iMLane;
+            float scale_B = scale_n[nIter * NPerIterationShuffle];
+            static_for<0, kM0, 1>{}([&](auto m0) {
+                static_for<0, kM2, 1>{}([&](auto m2) {
+                    float scale_A = scale_m[mIter * MPerIterationShuffle + iMWarp * MPerXdl +
+                                            m0 * kM1 * kM2 + m1 * kM2 + m2];
+                    c_out_tensor.get_thread_buffer()[m0 * kM2 + m2] *= scale_A * scale_B;
+                });
+            });
+
+            const auto ds_tensor = generate_tuple(
+                [&](auto idx) { return load_tile(d_dram_windows[idx]); }, number<NumDTensor>{});
+
+            const auto c_ds_tiles = concat_tuple_of_reference(
+                tie(c_out_tensor, c_out_tensor),
+                generate_tie(
+                    [&](auto idx) -> const auto& { return ds_tensor[idx]; }, number<NumDTensor>{}));
+
+            tile_elementwise_inout_unpack(typename Problem::CDElementwise{}, c_ds_tiles);
+
            if constexpr(MemoryOperation == memory_operation_enum::set)
            {
                store_tile(out_dram_window, c_out_tensor);
--- a/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp
+++ b/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp
@@ -11,12 +11,97 @@
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"

 namespace ck_tile {
-
-template <index_t NumDTensor = 0>
-struct FlatmmHostArgs
+struct FlatmmProblem // plain bundle of three extents (M, N, K) and three strides (A, B, C)
 {
-    CK_TILE_HOST FlatmmHostArgs() = default;
-    CK_TILE_HOST FlatmmHostArgs(const void* a_ptr_,
+    CK_TILE_HOST FlatmmProblem() = default; // fields have no default initializers, so nothing is set here
+    CK_TILE_HOST FlatmmProblem( // member-wise constructor: each argument is copied into the field of the same name
+        index_t M_, index_t N_, index_t K_, index_t stride_A_, index_t stride_B_, index_t stride_C_)
+        : M(M_), N(N_), K(K_), stride_A(stride_A_), stride_B(stride_B_), stride_C(stride_C_)
+    {
+    }
+
+    index_t M; // presumably the GEMM M/N/K problem sizes — TODO confirm against callers
+    index_t N;
+    index_t K;
+    index_t stride_A; // presumably the leading-dimension strides of A, B and C — TODO confirm
+    index_t stride_B;
+    index_t stride_C;
+};
+
+template <int SharedGranularity> // number of consecutive indices that share one scale; 0 selects a single scalar
+struct FlatmmScalePointer
+{
+    static constexpr int granularity = SharedGranularity;
+
+    union // only one member is read, chosen at compile time from `granularity`
+    {
+        const float* ptr; // read when granularity is not 0: one float per group of `granularity` indices
+        float scalar; // read when granularity is 0: every index uses this same scale value
+    };
+
+    CK_TILE_HOST_DEVICE FlatmmScalePointer() = default; // does not initialize the union
+    CK_TILE_HOST_DEVICE FlatmmScalePointer(float scalar_) : scalar(scalar_) {}
+    CK_TILE_HOST_DEVICE FlatmmScalePointer(const float* ptr_) : ptr(ptr_) {}
+
+    CK_TILE_HOST_DEVICE FlatmmScalePointer operator+(index_t offset) const // returns a copy advanced by `offset` indices
+    {
+        FlatmmScalePointer ret;
+        if constexpr(granularity == 0)
+        {
+            ret.scalar = scalar; // a scalar has no position, so the offset is ignored
+        }
+        else if constexpr(granularity == 1)
+        {
+            ret.ptr = ptr + offset;
+        }
+        else
+        {
+            ret.ptr = ptr + offset / granularity; // truncating division: exact only when offset is a multiple of granularity
+        }
+        return ret;
+    }
+
+    CK_TILE_HOST_DEVICE float operator[](index_t i) const // scale value that applies to index i
+    {
+        if constexpr(granularity == 0)
+        {
+            return scalar;
+        }
+        else if constexpr(granularity == 1)
+        {
+            return ptr[i];
+        }
+        else
+        {
+            return ptr[i / granularity]; // all indices inside one granularity-sized group read the same element
+        }
+    }
+};
+// Specialization for shared granularity = -1, which means "no scale": it stores nothing and always yields 1.
+template <>
+struct FlatmmScalePointer<-1>
+{
+    static constexpr int granularity = -1;
+
+    CK_TILE_HOST_DEVICE constexpr FlatmmScalePointer() = default;
+    CK_TILE_HOST_DEVICE constexpr FlatmmScalePointer(float scalar_) {} // argument is ignored
+    CK_TILE_HOST_DEVICE constexpr FlatmmScalePointer(const float* ptr_) {} // argument is ignored (also accepts nullptr)
+
+    CK_TILE_HOST_DEVICE constexpr FlatmmScalePointer operator+(index_t) const // offset is ignored; there is no state to advance
+    {
+        return FlatmmScalePointer{};
+    }
+    CK_TILE_HOST_DEVICE constexpr float operator[](index_t) const // index is ignored
+    {
+        return 1; // always return 1, so it doesn't change the result
+    }
+};
+
+template <>
+struct BaseFlatmmHostArgs
+{
+    CK_TILE_HOST BaseFlatmmHostArgs() = default;
+    CK_TILE_HOST BaseFlatmmHostArgs(const void* a_ptr_,
                                const void* b_ptr_,
                                const std::array<const void*, NumDTensor>& ds_ptr_,
                                void* e_ptr_,
@@ -66,7 +151,37 @@ struct FlatmmHostArgs
    index_t k_batch;
 };

-template <index_t NumDTensor = 0>
+template <class ScaleM = FlatmmScalePointer<-1>, class ScaleN = FlatmmScalePointer<-1>, index_t NumDTensor = 0>
+struct ScaleFlatmmHostArgs : public BaseFlatmmHostArgs<NumDTensor> // base must use the same D-tensor count as ds_ptr_/stride_Ds_
+{ // Host-side flatmm arguments: the base arguments plus one scale accessor for M and one for N.
+    CK_TILE_HOST ScaleFlatmmHostArgs() = default;
+    CK_TILE_HOST ScaleFlatmmHostArgs(const void* a_ptr_,
+                                        const void* b_shuffle_ptr_,
+                                        const std::array<const void*, NumDTensor>& ds_ptr_,
+                                        void* c_ptr_,
+                                        index_t k_batch_, // note: forwarded to the base constructor as its last argument
+                                        index_t M_,
+                                        index_t N_,
+                                        index_t K_,
+                                        index_t stride_A_,
+                                        index_t stride_B_,
+                                        const std::array<index_t, NumDTensor>& stride_Ds_,
+                                        index_t stride_C_,
+                                        ScaleM scale_m_ = nullptr, // omitted scales are built from nullptr
+                                        ScaleN scale_n_ = nullptr)
+        : BaseFlatmmHostArgs<NumDTensor>(a_ptr_, b_shuffle_ptr_, ds_ptr_, c_ptr_, M_, N_, K_, stride_A_, stride_B_, stride_Ds_, stride_C_, k_batch_),
+            scale_m(scale_m_),
+            scale_n(scale_n_)
+    {
+    }
+    ScaleM scale_m = nullptr; // scale accessor for the M dimension
+    ScaleN scale_n = nullptr; // scale accessor for the N dimension
+};
+
+template <int NumberTensor=0> // unscaled host args: both scale slots use the "no scale" FlatmmScalePointer<-1>
+using FlatmmHostArgs = ScaleFlatmmHostArgs<FlatmmScalePointer<-1>, FlatmmScalePointer<-1>, NumberTensor>;
+
+template <class ScaleM, class ScaleN, index_t NumDTensor = 0>
 struct FlatmmKernelArgs
 {
    const void* a_ptr;
@@ -82,6 +197,8 @@ struct FlatmmKernelArgs
    std::array<index_t, NumDTensor> stride_Ds;
    index_t stride_E;
    index_t k_batch;
+    ScaleM scale_m_ptr = nullptr;
+    ScaleN scale_n_ptr = nullptr;
 };

 template <typename TilePartitioner_, typename FlatmmPipeline_, typename EpiloguePipeline_>
@@ -113,7 +230,7 @@ struct FlatmmKernel

    static_assert(DsLayout::size() == DsDataType::size(),
                  "The size of DsLayout and DsDataType should be the same");
-    using KernelArgs = FlatmmKernelArgs<DsLayout::size()>;
+    // using KernelArgs = FlatmmKernelArgs<DsLayout::size()>;

    [[nodiscard]] CK_TILE_HOST static const std::string GetName()
    {
@@ -129,21 +246,24 @@ struct FlatmmKernel

    CK_TILE_HOST static constexpr auto BlockSize() { return dim3(KernelBlockSize); }

-    CK_TILE_HOST static constexpr KernelArgs
-    MakeKernelArgs(const FlatmmHostArgs<NumDTensor>& hostArgs)
+    template <class ScaleM, class ScaleN> // deduced from the host-args type, so callers need no explicit template arguments
+    CK_TILE_HOST static constexpr FlatmmKernelArgs<ScaleM, ScaleN, DsDataType::size()>
+    MakeKernelArgs(const ScaleFlatmmHostArgs<ScaleM, ScaleN, DsDataType::size()>& hostArgs) // the FlatmmHostArgs alias only takes the D-tensor count, so the three-parameter template is named directly
    {
-        return KernelArgs{hostArgs.a_ptr,
-                          hostArgs.b_ptr,
-                          hostArgs.ds_ptr,
-                          hostArgs.e_ptr,
-                          hostArgs.M,
-                          hostArgs.N,
-                          hostArgs.K,
-                          hostArgs.stride_A,
-                          hostArgs.stride_B,
-                          hostArgs.stride_Ds,
-                          hostArgs.stride_E,
-                          hostArgs.k_batch};
+        return {hostArgs.a_ptr, // brace-initializes the returned FlatmmKernelArgs field by field, in declaration order
+                hostArgs.b_ptr,
+                hostArgs.ds_ptr,
+                hostArgs.e_ptr,
+                hostArgs.M,
+                hostArgs.N,
+                hostArgs.K,
+                hostArgs.stride_A,
+                hostArgs.stride_B,
+                hostArgs.stride_Ds,
+                hostArgs.stride_E,
+                hostArgs.k_batch,
+                hostArgs.scale_m, // fills scale_m_ptr
+                hostArgs.scale_n}; // fills scale_n_ptr
    }

    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemPingSize()
@@ -157,8 +277,8 @@ struct FlatmmKernel

    struct SplitKBatchOffset
    {
-        __device__ SplitKBatchOffset(const KernelArgs& kargs, const std::size_t k_id = blockIdx.z)
-        {
+        template <class KernelArgs>
+        __device__ SplitKBatchOffset(const KernelArgs& kargs, const std::size_t k_id = blockIdx.z)        {
            constexpr auto K1   = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{});
            const index_t K_t   = kargs.k_batch * K1;
            const index_t KRead = (kargs.K + K_t - 1) / K_t * K1;
@@ -196,6 +316,7 @@ struct FlatmmKernel
        index_t splitted_k;
    };

+    template <class KernelArgs>
    CK_TILE_HOST static bool IsSupportedArgument(const KernelArgs& kargs)
    {
        if constexpr(EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
@@ -341,7 +462,7 @@ struct FlatmmKernel
        return DTesnorIsValid;
    }

-    template <memory_operation_enum DstInMemOp = memory_operation_enum::set>
+    template <memory_operation_enum DstInMemOp = memory_operation_enum::set, class KernelArgs>
    CK_TILE_DEVICE static auto
    MakeGemmTensorViews(const ADataType* a_ptr,
                        const BDataType* b_flat_ptr,
@@ -559,14 +680,14 @@ struct FlatmmKernel
        return make_tuple(a_block_window, b_flat_block_window, ds_block_window, e_block_window);
    }

-    template <bool UseDefaultScheduler = true>
+    template <class ScaleM, class ScaleN, bool UseDefaultScheduler = true>
    CK_TILE_DEVICE static void RunFlatmm(const ADataType* a_ptr,
                                         const BDataType* b_flat_ptr,
                                         const std::array<const void*, NumDTensor>& ds_ptr,
                                         EDataType* e_ptr,
                                         void* smem_ptr_ping,
                                         void* smem_ptr_pong,
-                                         const KernelArgs& kargs,
+                                         const FlatmmKernelArgs<ScaleM, ScaleN, DsDataType::size()>& kargs,
                                         const SplitKBatchOffset& splitk_batch_offset,
                                         const index_t block_idx_m,
                                         const index_t block_idx_n)
@@ -588,8 +709,18 @@ struct FlatmmKernel
            a_block_window, b_flat_block_window, num_loop, smem_ptr_ping, smem_ptr_pong);

        // Run Epilogue Pipeline
-
-        if(UseDefaultScheduler || (get_warp_id() == 0))
+        if constexpr(ScaleM::granularity != -1 || ScaleN::granularity != -1)
+        {
+            auto& c_block_window = gemm_tile_windows.at(I3);
+            EpiloguePipeline{}.template operator()<decltype(c_block_window), decltype(c_block_tile), decltype(d_block_window)>(
+                    c_block_window,
+                    c_block_tile,
+                    d_block_window,
+                    smem_ptr_ping,
+                    kargs.scale_m_ptr + block_idx_m,
+                    kargs.scale_n_ptr + block_idx_n);
+        }
+        else if(UseDefaultScheduler || (get_warp_id() == 0))
        {
            // Run Epilogue Pipeline
            auto& c_block_window = gemm_tile_windows.at(I3);
@@ -598,7 +729,9 @@ struct FlatmmKernel
        }
    }

-    CK_TILE_DEVICE void operator()(KernelArgs kargs) const
+    template <class ScaleM, class ScaleN>
+    CK_TILE_DEVICE void operator()(FlatmmKernelArgs<ScaleM, ScaleN, DsDataType::size()> kargs,
+                                   int partition_idx = blockIdx.x) const
    {
        const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(blockIdx.x);
        const index_t i_m   = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);