supporting prefill shapes for preshuffle block scale gemm (#2975)

* debugging

* debugging for prefill shapes

* comment out unused code

* fix for prefill shapes

* cleaning up the code

* add int4 to universal gemm example

* clang-formatted

* adding test for prefill shapes in block scale gemm

* small improvement to the block pipeline

* address review comments

---------

Co-authored-by: ThomasNing <thomas.ning@amd.com>
Author: Khushbu Agarwal (committed by GitHub)
Date: 2025-10-10 15:36:24 -07:00
Parent: 9d060d3e3c
Commit: 3c39d279ab
10 changed files with 137 additions and 89 deletions
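
The core of the diff below is a restructuring of the inner GEMM loop: the single reused `CWarpTensor` (initialized on the first K-iteration of a quant group via the `if constexpr(kIterInQScale == 0)` branch, accumulated on the rest) is replaced by an `MIterPerWarp` x `NIterPerWarp` array of accumulators, `c_acc`, zeroed at the start of every quant group and folded into `c_block_tensor` once the group finishes. A minimal stand-alone sketch of that zero-then-accumulate pattern, using hypothetical plain-C++ stand-ins (`Frag`, `AccGrid`, `process_quant_group`, `FragElems` are invented names; the real types are `WG::CWarpTensor` and `statically_indexed_array`):

#include <array>

constexpr int MIterPerWarp = 2;
constexpr int NIterPerWarp = 2;
constexpr int FragElems    = 4; // cf. WG::kM * WG::kN / warp_size

using Frag    = std::array<float, FragElems>;
using AccGrid = std::array<std::array<Frag, NIterPerWarp>, MIterPerWarp>;

// Zero-then-accumulate: rather than special-casing the first K-iteration of a
// quant group, clear every accumulator up front and use one unconditional
// accumulate in the loop body.
void process_quant_group(AccGrid& c_acc,
                         const std::array<Frag, MIterPerWarp>& a,
                         const std::array<Frag, NIterPerWarp>& b,
                         int k_iters_per_qscale)
{
    for(auto& row : c_acc)
        for(auto& frag : row)
            frag.fill(0.0f); // cf. the zero_accumulators lambda in the diff

    // Fragments are held fixed here for brevity; the kernel loads new A/B
    // fragments per kIter.
    for(int k = 0; k < k_iters_per_qscale; ++k)
        for(int m = 0; m < MIterPerWarp; ++m)
            for(int n = 0; n < NIterPerWarp; ++n)
                for(int e = 0; e < FragElems; ++e)
                    c_acc[m][n][e] += a[m][e] * b[n][e]; // warp-GEMM stand-in
}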


@@ -127,62 +127,72 @@ struct BlockGemmWeightPreshuffleBQuantARegBRegCReg
                 BQBlockTensor& bq_block_tensor,
                 ABlockWindow& a_warp_windows) const
     {
-        using CWarpDstr   = typename WG::CWarpDstr;
-        using CWarpTensor = typename WG::CWarpTensor;
+        using CWarpDstr = typename WG::CWarpDstr;
+        using AccTensor = typename WG::CWarpTensor;
 
         constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t<CWarpDstr::NDimY, 0>{};
 
-        static_for<0, QScalesPerBlockRow, 1>{}([&](auto kQScale) {
-            CWarpTensor c_warp_tensor;
-            static_for<0, KIterPerQScale, 1>{}([&](auto kIterInQScale) {
-                static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
-                    static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
-                        constexpr auto kIter     = kQScale * KIterPerQScale + kIterInQScale;
-                        constexpr auto AwarpIter = (kIter * MIterPerWarp + mIter) % m_preload;
-                        // warp GEMM
-                        if constexpr(kIterInQScale == 0)
-                            c_warp_tensor = WG{}(a_warp_tensor(number<AwarpIter>{}),
-                                                 b_warp_tensor(nIter)(number<kIter>{}));
-                        else
-                            WG{}(c_warp_tensor,
-                                 a_warp_tensor(number<AwarpIter>{}),
-                                 b_warp_tensor(nIter)(number<kIter>{}));
-                        __builtin_amdgcn_sched_barrier(0x7F6);
-                        // preload next A from lds
-                        if constexpr((kIter * MIterPerWarp + mIter) <
-                                     (KIterPerWarp * MIterPerWarp - m_preload))
-                        {
-                            constexpr auto AmIter = (mIter + m_preload) % MIterPerWarp;
-                            constexpr auto AkIter = (kIter + (mIter + m_preload) / MIterPerWarp);
-                            a_warp_tensor(number<AwarpIter>{}) =
-                                load_tile(a_warp_windows(number<AmIter>{})(number<AkIter>{}));
-                        }
-                        // barrier
-                        if constexpr((kIter == KIterPerWarp - 1) && (mIter == MIter_2nd_last))
-                        {
-                            block_sync_lds();
-                        }
-                    });
-                });
-            });
+        statically_indexed_array<statically_indexed_array<AccTensor, NIterPerWarp>, MIterPerWarp>
+            c_acc;
+        auto zero_accumulators = [&] {
+            static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
+                static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
+                    static_for<0, (WG::kM * WG::kN) / warp_size, 1>{}([&](auto i) {
+                        c_acc(mIter)(nIter).get_thread_buffer()[i] = 0.0f;
+                    }); // make sure WG::CWarpTensor exposes a clear/zero
+                });
+            });
+        };
+        static_for<0, QScalesPerBlockRow, 1>{}([&](auto kQScale) {
+            zero_accumulators();
+            static_for<0, KIterPerQScale, 1>{}([&](auto kIterInQScale) {
+                constexpr auto kIter = kQScale * KIterPerQScale + kIterInQScale;
+                static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
+                    constexpr auto AwarpIter = (kIter * MIterPerWarp + mIter) % m_preload;
+                    static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
+                        // warp GEMM
+                        WG{}(c_acc(mIter)(nIter),
+                             a_warp_tensor(number<AwarpIter>{}),
+                             b_warp_tensor(nIter)(number<kIter>{}));
+                    });
+                    __builtin_amdgcn_sched_barrier(0x7F6);
+                    // preload next A from lds
+                    if constexpr((kIter * MIterPerWarp + mIter) <
+                                 (KIterPerWarp * MIterPerWarp - m_preload))
+                    {
+                        constexpr auto AmIter = (mIter + m_preload) % MIterPerWarp;
+                        constexpr auto AkIter = (kIter + (mIter + m_preload) / MIterPerWarp);
+                        a_warp_tensor(number<AwarpIter>{}) =
+                            load_tile(a_warp_windows(number<AmIter>{})(number<AkIter>{}));
+                    }
+                    // barrier
+                    // Could be deleted
+                    if constexpr((mIter == MIter_2nd_last))
+                    {
+                        block_sync_lds();
+                    }
+                });
+            });
             static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
                 static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
-                    constexpr auto tbuf_offset =
-                        number<typename CBlockTensor::ThreadTensorDesc{}.calculate_offset(
-                                   merge_sequences(sequence<mIter, nIter>{},
-                                                   c_warp_y_index_zeros)) /
-                               CBlockTensor::PackedSize>{};
-                    constexpr index_t reg_offset = nIter * KPerBlockBQ + kQScale;
-                    auto& scale_reg   = bq_block_tensor.get_thread_buffer()[reg_offset];
-                    float scale_reg_f = cvt_scale_to_fp32(scale_reg);
-                    static_for<0, WG::kM * WG::kN / warp_size, 1>{}([&](auto c_row) {
-                        c_block_tensor.get_thread_buffer()[tbuf_offset + c_row] +=
-                            (c_warp_tensor.get_thread_buffer()[c_row] * scale_reg_f);
-                    });
+                    constexpr auto tbuf_offset =
+                        number<typename CBlockTensor::ThreadTensorDesc{}.calculate_offset(merge_sequences(
+                                   sequence<number<0>{}, number<0>{}>{}, c_warp_y_index_zeros)) /
+                               CBlockTensor::PackedSize>{};
+                    constexpr index_t reg_offset = kQScale;
+                    // nIter * KPerBlockBQ + kQScale; //((kIter * WG::kK) / kQuantGroupSize);
+                    auto& scale_reg   = bq_block_tensor.get_thread_buffer()[reg_offset];
+                    float scale_reg_f = cvt_scale_to_fp32(scale_reg);
+                    static_for<0, WG::kM * WG::kN / warp_size, 1>{}([&](auto c_row) {
+                        auto& c_ref        = c_block_tensor.get_thread_buffer()[tbuf_offset + c_row];
+                        const auto acc_val = c_acc(mIter)(nIter).get_thread_buffer()[c_row];
+                        c_ref              = c_ref + acc_val * scale_reg_f;
+                    });
                 });
             });
         });
     }
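
For orientation, the arithmetic contract behind the kernel above can be written as a scalar reference: within each quant group along K, products of A and the raw (unscaled) B values are accumulated at full precision, and the group's dequantization scale is applied exactly once when the partial sum is folded into C — mirroring the `c_acc` accumulation plus the `scale_reg_f` epilogue. A sketch, assuming a simplified layout of one scale per (n, group); `block_scale_gemm_ref` and its parameters are illustrative, not the library's API:

#include <cstddef>
#include <vector>

// Scalar reference for B-quantized block scale GEMM:
// C[m][n] += sum over groups q of scale_b[n][q] * sum_{k in group q} A[m][k] * B[k][n]
void block_scale_gemm_ref(const std::vector<float>& a,        // M x K
                          const std::vector<float>& b_q,      // K x N, raw (unscaled) B
                          const std::vector<float>& b_scales, // N x (K / group_size)
                          std::vector<float>& c,              // M x N, accumulated into
                          std::size_t M, std::size_t N, std::size_t K,
                          std::size_t group_size)
{
    const std::size_t num_groups = K / group_size;
    for(std::size_t m = 0; m < M; ++m)
        for(std::size_t n = 0; n < N; ++n)
            for(std::size_t q = 0; q < num_groups; ++q)
            {
                float acc = 0.0f; // fresh accumulator per quant group
                for(std::size_t i = 0; i < group_size; ++i)
                {
                    const std::size_t k = q * group_size + i;
                    acc += a[m * K + k] * b_q[k * N + n];
                }
                // one multiply by the dequantization scale per group,
                // cf. cvt_scale_to_fp32(scale_reg) in the epilogue
                c[m * N + n] += acc * b_scales[n * num_groups + q];
            }
}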