Preshuffle AQ matrix in block scale gemm (#2624)

* Preshuffle AQ matrix in block scale gemm

* Turn the output to fp16. Increase the repetition count.

---------

Co-authored-by: ThomasNing <thomas.ning@amd.com>
Cong Ma
2025-08-12 22:32:51 -06:00
committed by GitHub
parent 0f42a92fc1
commit 452791a3ba
13 changed files with 667 additions and 228 deletions


@@ -156,6 +156,8 @@ struct AQuantBlockUniversalGemmAsBsCr : public BlockGemmQuantBase<Problem_>
        static constexpr index_t KPack      = WarpGemm::kKPerThread;
        static constexpr index_t KPerThread = KIterPerWarp * WarpGemm::kKPerThread;

        static constexpr bool Preshuffle = Problem::Traits::Preshuffle;
    };

    public:
@@ -322,6 +324,7 @@ struct AQuantBlockUniversalGemmAsBsCr : public BlockGemmQuantBase<Problem_>
        static_assert(std::is_same_v<CDataType, typename CBlockTensor::DataType>,
                      "The CDataType as defined in traits should be the same as the "
                      "corresponding C block tensor data type!");

        constexpr auto warp_size = get_warp_size();

        // hot loop:
        static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
@@ -354,82 +357,153 @@ struct AQuantBlockUniversalGemmAsBsCr : public BlockGemmQuantBase<Problem_>
                }
            });

            if constexpr(Traits::Preshuffle)
            {
                // A view is created on top of the preshuffled AQ, where each row of
                // the view is composed of a row from a warp tile within an AQ block
                // tile. Multiple warp tile rows that belong to the same block tile
                // are laid out as consecutive rows.
                //
                // When we need to multiply a C warp tile with an AQ warp tile,
                // thread 0 in the warp loads AQ_warp_tile[0], thread 1 loads
                // AQ_warp_tile[1], and so on, up to thread 63, which loads
                // AQ_warp_tile[63]. The VGPR file of the warp acts like LDS in this
                // context, but we use cross-lane operations to access the data.
                // (Cross-lane operations are faster than going through LDS.)
                //
                // Note that when the AQ warp tile is smaller than the warp size,
                // the rows in the view must be padded so that each thread can read
                // one element.
                constexpr auto tbuf_offset =
                    number<typename CBlockTensor::ThreadTensorDesc{}.calculate_offset(
                               merge_sequences(sequence<mIter, nIter>{},
                                               c_warp_y_index_zeros)) /
                           CBlockTensor::PackedSize>{};
                constexpr uint32_t kTileRowsOfCPerThread = 4;
                static_for<0, WarpGemm::kM * WarpGemm::kN / warp_size, 1>{}(
                    [&](auto c_row) {
                        // For a warp tile of [16x16x32], take thread 0 as an
                        // example. Its VGPR[0] stores the value from C_tile[0,0],
                        // VGPR[1] stores C_tile[1,0], VGPR[2] stores C_tile[2,0],
                        // and VGPR[3] stores C_tile[3,0]. This means VGPR[0] should
                        // be multiplied by AQ_tile[0,0], VGPR[1] by AQ_tile[1,0],
                        // VGPR[2] by AQ_tile[2,0], and VGPR[3] by AQ_tile[3,0].
                        //
                        // Thread 0 can read AQ_tile[0,0] from itself, AQ_tile[1,0]
                        // from thread 1, ..., and AQ_tile[3,0] from thread 3.
                        auto pull_from_lane =
                            ((threadIdx.x & (warp_size - 1)) / Traits::WarpGemm::kN *
                                 kTileRowsOfCPerThread +
                             c_row) *
                                Traits::QScalesPerBlockRow +
                            kQScale;
                        auto& scale_reg = aq_block_tensor.get_thread_buffer()[mIter];
                        // cross lane ops
                        uint32_t scale_reg_dword;
                        if constexpr(std::is_same_v<AQDataType, float>)
                        {
                            scale_reg_dword = ck_tile::bit_cast<uint32_t>(scale_reg);
                        }
                        else
                        {
                            scale_reg_dword = static_cast<uint32_t>(scale_reg);
                        }
                        int gathered_scale_reg = __builtin_amdgcn_ds_bpermute(
                            pull_from_lane << 2,
                            __builtin_bit_cast(int, scale_reg_dword));
                        float scale_reg_f = Base::cvt_scale_to_fp32(gathered_scale_reg);
                        c_block_tensor.get_thread_buffer()[tbuf_offset + c_row] +=
                            (c_warp_tensor.get_thread_buffer()[c_row] * scale_reg_f *
                             kA_cvt_scale * kB_cvt_scale);
                    });
            }
            else
            {
                // Need to multiply aquant with the accumulated C.
                //
                // The accumulated C tile has the standard distribution. For example,
                // lane 0 holds elements [0,0], [1,0], [2,0], [3,0], [8,0], [9,0],
                // [10,0], [11,0], [16,0], [17,0], [18,0], [19,0], [24,0], [25,0],
                // [26,0], [27,0].
                //
                // These elements are in different rows, so each needs the scale
                // value of its corresponding row. Based on aquant's tile
                // distribution, it can be inferred which lane holds the relevant
                // scale. For example, the scales for the 16 elements held by lane 0
                // are held by lanes 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24,
                // 25, 26, 27, respectively.
                //
                // These scales can be obtained using __builtin_amdgcn_ds_bpermute.
                // MIters per warp
                constexpr index_t mIters_per_warp = get_warp_size() / WarpGemm::kM;
                // Reg block offset based on mIter
                constexpr index_t reg_block_offset =
                    ((mIter / mIters_per_warp) * Traits::AQPerBlock);
                constexpr index_t lane_base_offset =
                    (mIter % mIters_per_warp) * WarpGemm::kM;
                // Scale tensor offset along K
                constexpr index_t src_reg_offset = reg_block_offset + kQScale;
                constexpr uint32_t kTileRows        = 4;
                constexpr uint32_t kTiledCMsPerWarp = WarpGemm::kCMLane * kTileRows;
                constexpr auto tbuf_offset =
                    number<typename CBlockTensor::ThreadTensorDesc{}.calculate_offset(
                               merge_sequences(sequence<mIter, nIter>{},
                                               c_warp_y_index_zeros)) /
                           CBlockTensor::PackedSize>{};
                static_for<0, WarpGemm::kM, WarpGemm::kCMLane>{}([&](auto c_row) {
                    // Multiply by 4 because output is stored in tiles of
                    // 4 x CNLane
                    constexpr uint32_t row_base =
                        ((c_row / kTiledCMsPerWarp) * kTiledCMsPerWarp) +
                        ((c_row % kTiledCMsPerWarp) / WarpGemm::kCMLane);
                    constexpr uint32_t reg_offset_for_row_data =
                        c_row / WarpGemm::kCMLane;
                    // Lane index to source the scale from
                    uint32_t src_lane_idx = lane_base_offset + row_base +
                                            (__lane_id() / WarpGemm::kN * kTileRows);
                    // Directly index into the thread buffer entry holding the
                    // desired row coefficient
                    auto& scale_reg =
                        aq_block_tensor.get_thread_buffer()[src_reg_offset];
                    uint32_t scale_reg_dword;
                    if constexpr(std::is_same_v<AQDataType, float>)
                    {
                        scale_reg_dword = ck_tile::bit_cast<uint32_t>(scale_reg);
                    }
                    else
                    {
                        scale_reg_dword = static_cast<uint32_t>(scale_reg);
                    }
                    // Pull scale data across lanes
                    int gathered_scale_reg = __builtin_amdgcn_ds_bpermute(
                        src_lane_idx * 4, __builtin_bit_cast(int, scale_reg_dword));
                    float scale_reg_f = Base::cvt_scale_to_fp32(gathered_scale_reg);
                    c_block_tensor
                        .get_thread_buffer()[tbuf_offset + reg_offset_for_row_data] +=
                        (c_warp_tensor.get_thread_buffer()[reg_offset_for_row_data] *
                         scale_reg_f * kA_cvt_scale * kB_cvt_scale);
                });
            }
        });
    });
});
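
The cross-lane gather that both branches rely on can be seen in isolation in the sketch below: a minimal, self-contained HIP kernel (hypothetical, not part of this commit) in which each lane of a 64-wide wavefront owns one per-row scale in a VGPR, and the lanes accumulating rows of a 16x16 C warp tile pull the scales they need from the owning lanes with __builtin_amdgcn_ds_bpermute instead of round-tripping through LDS. It assumes one scale per row (i.e. QScalesPerBlockRow == 1 and kQScale == 0 in the code above); the kernel name and buffers are invented for illustration.

// Minimal HIP sketch (hypothetical, not from this commit) of the
// cross-lane scale gather described above.
#include <hip/hip_runtime.h>
#include <cstdio>

__global__ void gather_row_scales(const float* scales, float* out)
{
    constexpr int kWarpSize      = 64; // wavefront size on CDNA GPUs
    constexpr int kN             = 16; // C warp-tile width
    constexpr int kRowsPerThread = 4;  // C rows each lane accumulates

    const int lane = threadIdx.x & (kWarpSize - 1);
    // Like aq_block_tensor above: one scale per lane, kept in a VGPR.
    const float my_scale = scales[lane];

    for(int r = 0; r < kRowsPerThread; ++r)
    {
        // This lane accumulates C row (lane / kN) * kRowsPerThread + r,
        // so pull the scale from the lane that owns that row. The
        // bpermute index is a byte address, hence the << 2.
        const int src_lane = (lane / kN) * kRowsPerThread + r;
        const int gathered = __builtin_amdgcn_ds_bpermute(
            src_lane << 2, __builtin_bit_cast(int, my_scale));
        out[lane * kRowsPerThread + r] = __builtin_bit_cast(float, gathered);
    }
}

int main()
{
    float h_scales[64], h_out[256];
    for(int i = 0; i < 64; ++i)
        h_scales[i] = static_cast<float>(i);

    float *d_scales, *d_out;
    hipMalloc(&d_scales, sizeof(h_scales));
    hipMalloc(&d_out, sizeof(h_out));
    hipMemcpy(d_scales, h_scales, sizeof(h_scales), hipMemcpyHostToDevice);

    gather_row_scales<<<1, 64>>>(d_scales, d_out);

    hipMemcpy(h_out, d_out, sizeof(h_out), hipMemcpyDeviceToHost);
    // Lane 0 accumulates rows 0..3, so it should have gathered the
    // scales owned by lanes 0..3.
    printf("lane 0 gathered: %g %g %g %g\n",
           h_out[0], h_out[1], h_out[2], h_out[3]);

    hipFree(d_scales);
    hipFree(d_out);
    return 0;
}

Note that ds_bpermute addresses lanes in bytes, which is why the sketch and the preshuffled branch shift the lane index left by 2; the non-preshuffled branch multiplies by 4, which is equivalent.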