[CK_Tile] Support for preshuffle weight(B) quant tensor for block scale gemm (#3165)

* formatted * formatted * formatting * formatting * formatting * [CK TILE GEMM] Refactor block_scale_gemm examples - Split cpp file to reduce building time - Support multiple GemmConfig * [CK TILE GEMM] Refactor block_scale_gemm examples - Update Readme * enable prefill shapes * [CK TILE GEMM] Refactor block_scale_gemm examples - Add support for rowcol and tensor GEMM operations * [CK TILE GEMM] Refactor block_scale_gemm examples - Update README * adding preshuffle quant as new parameter and its associated new files * remove debugging statements * adding test * enable preshuffle quant with permuteN * updating readme and corresponding gemmconfigs * updating cmake file * fixing CI failures for grouped quant gemm * addressing review comments * fixing CI issue * addressing review comments * formatting * formatting * fixing aquant operator overloading * formatting --------- Co-authored-by: Cong Ma <congma13@amd.com> Co-authored-by: Thomas Ning <Thomas.Ning@amd.com>
2026-04-20 06:49:15 +00:00 · 2025-11-24 07:48:42 -08:00
parent e857e26bf6
commit 8111572785
31 changed files with 855 additions and 247 deletions
--- a/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_ar_flatbr_bquant_cr.hpp
+++ b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_ar_flatbr_bquant_cr.hpp
@@ -54,6 +54,8 @@ struct BlockGemmWeightPreshuffleBQuantARegBRegCReg

    static constexpr index_t kBlockSize = Problem::kBlockSize;

+    static constexpr bool PreshuffleQuant = Problem::Traits::PreshuffleQuant;
+
    static constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WG::kM);
    static constexpr index_t NIterPerWarp =
        BlockTile::at(idxN) / (WarpTile::at(idxN) * BlockWarps::at(idxN));
@@ -172,16 +174,47 @@ struct BlockGemmWeightPreshuffleBQuantARegBRegCReg
                                                   c_warp_y_index_zeros)) /
                               CBlockTensor::PackedSize>{};

-                    constexpr index_t reg_offset = nIter * KPerBlockBQ + kQScale;
+                    if constexpr(PreshuffleQuant)
+                    {
+                        constexpr index_t reg_offset = nIter;
+                        auto pull_from_lane = (__lane_id() & (WG::kN - 1)) * KPerBlockBQ + kQScale;
+                        auto& scale_reg     = bq_block_tensor.get_thread_buffer()[reg_offset];
+                        // cross lane ops
+                        uint32_t scale_reg_dword;

-                    auto& scale_reg   = bq_block_tensor.get_thread_buffer()[reg_offset];
-                    float scale_reg_f = cvt_scale_to_fp32(scale_reg);
+                        if constexpr(std::is_same_v<BQDataType, float>)
+                        {
+                            scale_reg_dword = ck_tile::bit_cast<uint32_t>(scale_reg);
+                        }
+                        else
+                        {
+                            scale_reg_dword = static_cast<uint32_t>(scale_reg);
+                        }

-                    static_for<0, WG::kM * WG::kN / warp_size, 1>{}([&](auto c_row) {
-                        auto& c_ref = c_block_tensor.get_thread_buffer()[tbuf_offset + c_row];
-                        const auto acc_val = c_acc(mIter)(nIter).get_thread_buffer()[c_row];
-                        c_ref              = c_ref + acc_val * scale_reg_f;
-                    });
+                        // cross lane ops to get the value of scale_reg.
+                        int gathered_scale_reg = __builtin_amdgcn_ds_bpermute(
+                            pull_from_lane << 2, __builtin_bit_cast(int, scale_reg_dword));
+
+                        float scale_reg_f = cvt_scale_to_fp32(gathered_scale_reg);
+
+                        static_for<0, WG::kM * WG::kN / warp_size, 1>{}([&](auto c_row) {
+                            auto& c_ref = c_block_tensor.get_thread_buffer()[tbuf_offset + c_row];
+                            const auto acc_val = c_acc(mIter)(nIter).get_thread_buffer()[c_row];
+                            c_ref              = c_ref + acc_val * scale_reg_f;
+                        });
+                    }
+                    else
+                    {
+                        constexpr index_t reg_offset = nIter * KPerBlockBQ + kQScale;
+                        auto& scale_reg   = bq_block_tensor.get_thread_buffer()[reg_offset];
+                        float scale_reg_f = cvt_scale_to_fp32(scale_reg);
+
+                        static_for<0, WG::kM * WG::kN / warp_size, 1>{}([&](auto c_row) {
+                            auto& c_ref = c_block_tensor.get_thread_buffer()[tbuf_offset + c_row];
+                            const auto acc_val = c_acc(mIter)(nIter).get_thread_buffer()[c_row];
+                            c_ref              = c_ref + acc_val * scale_reg_f;
+                        });
+                    }
                });
            });
        });
--- a/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp
+++ b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp
@@ -274,7 +274,6 @@ struct AQuantBlockUniversalGemmAsBsCr : public BlockGemmAQuantBase<Problem_>

            int gathered_scale_reg = __builtin_amdgcn_ds_bpermute(
                pull_from_lane << 2, __builtin_bit_cast(int, scale_reg_dword));
-
            return Base::cvt_scale_to_fp32(gathered_scale_reg);
        }

@@ -368,7 +367,6 @@ struct AQuantBlockUniversalGemmAsBsCr : public BlockGemmAQuantBase<Problem_>
                            static_assert(false, "WarpGemm::kM is not 16 nor 32.");
                        }
                        auto& scale_reg = aq_block_tensor.get_thread_buffer()[mIter];
-
                        return exchange_quant_value_across_lanes(scale_reg, pull_from_lane);
                    }
                    else
@@ -511,6 +509,7 @@ struct AQuantBlockUniversalGemmAsBsCr : public BlockGemmAQuantBase<Problem_>
                        static_for<0, WarpGemm::kM * WarpGemm::kN / warp_size, 1>{}(
                            [&](auto c_row) {
                                float scale_reg_f = aq_picker.template pick<c_row>();
+
                                c_block_tensor.get_thread_buffer()[tbuf_offset + c_row] +=
                                    (c_warp_tensor.get_thread_buffer()[c_row] * scale_reg_f);
                            });
--- a/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_bs_bquant_cr.hpp
+++ b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_bs_bquant_cr.hpp
@@ -100,6 +100,8 @@ struct BQuantBlockUniversalGemmAsBsCr : public BlockGemmBQuantBase<Problem_>
        static constexpr index_t NIterPerWarp = NPerBlock / (NWarp * WarpGemm::kN);
        static constexpr index_t KIterPerWarp = KPerBlock / WarpGemm::kK;

+        static constexpr bool PreshuffleQuant = Problem::Traits::PreshuffleQuant;
+
        static constexpr index_t QScalesPerBlockRow =
            integer_divide_ceil(KPerBlock, QuantGroupSize::kK);
        static constexpr index_t QScalesPerWarpGemmRow =
@@ -173,6 +175,8 @@ struct BQuantBlockUniversalGemmAsBsCr : public BlockGemmBQuantBase<Problem_>
    using BWarpTensor = typename WarpGemm::BWarpTensor;
    using CWarpTensor = typename WarpGemm::CWarpTensor;

+    static constexpr bool PreshuffleQuant = Traits::PreshuffleQuant;
+
    static_assert(std::is_same_v<typename WarpGemm::CDataType, float>);

    static constexpr auto a_warp_y_lengths =
@@ -321,31 +325,65 @@ struct BQuantBlockUniversalGemmAsBsCr : public BlockGemmBQuantBase<Problem_>
                            }
                        });

-                        // Multiply bquant with accumulated C
-                        constexpr index_t reg_offset = [&]() {
-                            if constexpr(GemmTraits::QuantGroupSize::kN >= (NWarp * WarpGemm::kN))
-                                return (nIter * NWarp * WarpGemm::kN) /
-                                           GemmTraits::QuantGroupSize::kN * Traits::KQPerBlock +
-                                       kQScale;
-                            else
-                            {
-                                return nIter * Traits::KQPerBlock + kQScale;
-                            }
-                        }();
-
                        constexpr auto tbuf_offset =
                            number<typename CBlockTensor::ThreadTensorDesc{}.calculate_offset(
                                       merge_sequences(sequence<mIter, nIter>{},
                                                       c_warp_y_index_zeros)) /
                                   CBlockTensor::PackedSize>{};

-                        auto& scale_reg   = bq_block_tensor.get_thread_buffer()[reg_offset];
-                        float scale_reg_f = Base::cvt_scale_to_fp32(scale_reg);
-                        static_for<0, WarpGemm::kM * WarpGemm::kN / warp_size, 1>{}(
-                            [&](auto c_row) {
-                                c_block_tensor.get_thread_buffer()[tbuf_offset + c_row] +=
-                                    (c_warp_tensor.get_thread_buffer()[c_row] * scale_reg_f);
-                            });
+                        if constexpr(PreshuffleQuant)
+                        {
+                            constexpr index_t reg_offset = nIter;
+                            auto pull_from_lane =
+                                (__lane_id() & (WarpGemm::kN - 1)) * Traits::KQPerBlock + kQScale;
+                            auto& scale_reg = bq_block_tensor.get_thread_buffer()[reg_offset];
+                            // cross lane ops
+                            uint32_t scale_reg_dword;
+
+                            if constexpr(std::is_same_v<BQDataType, float>)
+                            {
+                                scale_reg_dword = ck_tile::bit_cast<uint32_t>(scale_reg);
+                            }
+                            else
+                            {
+                                scale_reg_dword = static_cast<uint32_t>(scale_reg);
+                            }
+
+                            // cross lane ops to get the value of scale_reg.
+                            int gathered_scale_reg = __builtin_amdgcn_ds_bpermute(
+                                pull_from_lane << 2, __builtin_bit_cast(int, scale_reg_dword));
+
+                            float scale_reg_f = Base::cvt_scale_to_fp32(gathered_scale_reg);
+
+                            static_for<0, WarpGemm::kM * WarpGemm::kN / warp_size, 1>{}(
+                                [&](auto c_row) {
+                                    c_block_tensor.get_thread_buffer()[tbuf_offset + c_row] +=
+                                        (c_warp_tensor.get_thread_buffer()[c_row] * scale_reg_f);
+                                });
+                        }
+                        else
+                        {
+                            // Multiply bquant with accumulated C
+                            constexpr index_t reg_offset = [&]() {
+                                if constexpr(GemmTraits::QuantGroupSize::kN >=
+                                             (NWarp * WarpGemm::kN))
+                                    return (nIter * NWarp * WarpGemm::kN) /
+                                               GemmTraits::QuantGroupSize::kN * Traits::KQPerBlock +
+                                           kQScale;
+                                else
+                                {
+                                    return nIter * Traits::KQPerBlock + kQScale;
+                                }
+                            }();
+
+                            auto& scale_reg   = bq_block_tensor.get_thread_buffer()[reg_offset];
+                            float scale_reg_f = Base::cvt_scale_to_fp32(scale_reg);
+                            static_for<0, WarpGemm::kM * WarpGemm::kN / warp_size, 1>{}(
+                                [&](auto c_row) {
+                                    c_block_tensor.get_thread_buffer()[tbuf_offset + c_row] +=
+                                        (c_warp_tensor.get_thread_buffer()[c_row] * scale_reg_f);
+                                });
+                        }
                    });
                });
            });
--- a/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp
+++ b/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp
@@ -271,6 +271,94 @@ struct QuantGemmKernel
        return max(GemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
    }

+    private:
+    CK_TILE_DEVICE static constexpr index_t get_padding_size(index_t length, index_t alignment)
+    {
+        return ck_tile::integer_least_multiple(length, alignment) - length; // gap from length up to integer_least_multiple(length, alignment)
+    };
+    // ===================================================================
+    // Helper: build the global-memory tensor view over a pre-shuffled BQ buffer
+    // ===================================================================
+    template <index_t KPerBlockBQ,
+              index_t NPerBlock,
+              index_t WarpTileN,
+              index_t GetVectorSizeBQ,
+              typename BQDataType_>
+    CK_TILE_DEVICE static auto
+    MakePreshuffledQuantTensorView(const BQDataType_* bq_ptr, index_t N, index_t QK_B)
+    {
+        // Step 1: Calculate base BQ tensor dimensions
+        // ----------------------------------------------------------
+        // bq_x: Row length (stride-1 dimension) of the base descriptor
+        //       = N * KPerBlockBQ, where KPerBlockBQ is the number of
+        //       K-dimension groups per block
+        // bq_y: Number of rows (row stride is bq_x) of the base descriptor
+        //       = Total K groups (QK_B) / groups per block
+        const auto bq_x = N * KPerBlockBQ;
+        const auto bq_y = QK_B / KPerBlockBQ;
+
+        const auto bq_desc = make_naive_tensor_descriptor(
+            make_tuple(bq_y, bq_x), make_tuple(bq_x, 1), number<GetVectorSizeBQ>{}, number<1>{});
+
+        // Step 2: First padding transformation (block-level alignment)
+        // ----------------------------------------------------------
+        // Right-pad the X dimension up to a multiple of block_tile_size
+        // (= NPerBlock * KPerBlockBQ); the Y dimension is passed through
+        const auto block_tile_size = NPerBlock * KPerBlockBQ;
+        const auto bq_pad0_desc    = transform_tensor_descriptor(
+            bq_desc,
+            make_tuple(make_pass_through_transform(bq_y),
+                       make_right_pad_transform(bq_x, get_padding_size(bq_x, block_tile_size))),
+            make_tuple(sequence<0>{}, sequence<1>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}));
+
+        // Step 3: Unmerge transformation (wave-level decomposition)
+        // ----------------------------------------------------------
+        // Split the padded X dimension into [wave_tile_count_x, wave_tile_size],
+        // with wave_tile_size = WarpTileN * KPerBlockBQ, so the descriptor
+        // becomes 3D: [bq_y, wave_tile_count_x, wave_tile_size]
+        const auto pad_bq_x          = bq_pad0_desc.get_lengths()[I1];
+        const auto wave_tile_size    = WarpTileN * KPerBlockBQ;
+        const auto wave_tile_count_x = ck_tile::integer_divide_ceil(pad_bq_x, wave_tile_size);
+
+        const auto bq_unmerge_pad0_desc = transform_tensor_descriptor(
+            bq_pad0_desc,
+            make_tuple(make_pass_through_transform(bq_y),
+                       make_unmerge_transform(make_tuple(wave_tile_count_x, wave_tile_size))),
+            make_tuple(sequence<0>{}, sequence<1>{}),
+            make_tuple(sequence<0>{}, sequence<1, 2>{}));
+
+        // Step 4: Second padding transformation (warp-level alignment)
+        // ----------------------------------------------------------
+        // Right-pad wave_tile_size up to the next multiple of get_warp_size()
+        // NOTE(review): presumably for warp-aligned loads of the scales - confirm
+        const auto bq_pad1_desc = transform_tensor_descriptor(
+            bq_unmerge_pad0_desc,
+            make_tuple(make_pass_through_transform(bq_y),
+                       make_pass_through_transform(wave_tile_count_x),
+                       make_right_pad_transform(wave_tile_size,
+                                                get_padding_size(wave_tile_size, get_warp_size()))),
+            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}));
+
+        // Step 5: Final merge transformation (prepare for indexing)
+        // ----------------------------------------------------------
+        // Merge [bq_y, wave_tile_count_x] into a single outer dimension
+        // This creates a 2D layout: [merged_outer_dim, pad_wave_size]
+        // where merged_outer_dim = bq_y * wave_tile_count_x
+        // The merged descriptor backs the global-memory view returned below
+        const auto pad_wave_size = ck_tile::integer_least_multiple(wave_tile_size, get_warp_size());
+        const auto bq_merge_pad1_desc = transform_tensor_descriptor(
+            bq_pad1_desc,
+            make_tuple(make_merge_transform(make_tuple(bq_y, wave_tile_count_x)),
+                       make_pass_through_transform(pad_wave_size)),
+            make_tuple(sequence<0, 1>{}, sequence<2>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}));
+
+        return make_tensor_view<address_space_enum::global>(bq_ptr, bq_merge_pad1_desc);
+    }
+
+    public:
    struct SplitKBatchOffset
    {
        __device__ SplitKBatchOffset(const QuantGemmKernelArgs& kargs,
@@ -509,17 +597,12 @@ struct QuantGemmKernel
            }
        }();

-        const auto get_padding_size = [](index_t length, index_t alignment) {
-            return ck_tile::integer_least_multiple(length, alignment) - length;
-        };
-
        const auto& aq_tensor_view = [&]() {
            if constexpr(kQuantType == QuantType::AQuantGrouped && PreshuffleQuant)
            {
                static_assert(std::is_same_v<AQLayout, tensor_layout::gemm::RowMajor>);
                const auto aq_x = kargs.M * GemmPipeline::KPerBlockAQ;
                const auto aq_y = kargs.QK_A / GemmPipeline::KPerBlockAQ;
-
                const auto aq_desc =
                    make_naive_tensor_descriptor(make_tuple(aq_y, aq_x),
                                                 make_tuple(aq_x, 1),
@@ -540,6 +623,7 @@ struct QuantGemmKernel
                    GemmPipeline::BlockGemmShape::WarpTile::at(I0) * GemmPipeline::KPerBlockAQ;
                const auto wave_tile_count_x =
                    ck_tile::integer_divide_ceil(pad_aq_x, wave_tile_size);
+
                const auto aq_unmerge_pad0_desc = transform_tensor_descriptor(
                    aq_pad0_desc,
                    make_tuple(
@@ -686,14 +770,27 @@ struct QuantGemmKernel
            }
            else if constexpr(kQuantType == QuantType::BQuantGrouped)
            {
-                static_assert(std::is_same_v<BQLayout, tensor_layout::gemm::ColumnMajor>);
-                using QuantGroupSize = remove_cvref_t<typename GemmPipeline::QuantGroupSize>;
-                return make_naive_tensor_view<address_space_enum::global>(
-                    bq_ptr,
-                    make_tuple(kargs.QK_B, integer_divide_ceil(kargs.N, QuantGroupSize::kN)),
-                    make_tuple(1, kargs.stride_BQ),
-                    number<GemmPipeline::GetVectorSizeBQ()>{},
-                    number<1>{});
+                if constexpr(PreshuffleQuant)
+                {
+                    static_assert(std::is_same_v<BQLayout, tensor_layout::gemm::ColumnMajor>);
+
+                    return MakePreshuffledQuantTensorView<
+                        GemmPipeline::KPerBlockBQ,
+                        GemmPipeline::NPerBlock,
+                        TilePartitioner::BlockGemmShape::WarpTile::at(I1),
+                        GemmPipeline::GetVectorSizeBQ()>(bq_ptr, kargs.N, kargs.QK_B);
+                }
+                else
+                {
+                    static_assert(std::is_same_v<BQLayout, tensor_layout::gemm::ColumnMajor>);
+                    using QuantGroupSize = remove_cvref_t<typename GemmPipeline::QuantGroupSize>;
+                    return make_naive_tensor_view<address_space_enum::global>(
+                        bq_ptr,
+                        make_tuple(kargs.QK_B, integer_divide_ceil(kargs.N, QuantGroupSize::kN)),
+                        make_tuple(1, kargs.stride_BQ),
+                        number<GemmPipeline::GetVectorSizeBQ()>{},
+                        number<1>{});
+                }
            }
            else
            {
@@ -910,13 +1007,33 @@ struct QuantGemmKernel
            }
            else if constexpr(kQuantType == QuantType::BQuantGrouped)
            {
-                static_assert(std::is_same_v<BQLayout, tensor_layout::gemm::ColumnMajor>);
-                using QuantGroupSize = remove_cvref_t<typename GemmPipeline::QuantGroupSize>;
-                return make_tile_window(
-                    bq_pad_view,
-                    make_tuple(number<TilePartitioner::KPerBlock / QuantGroupSize::kK>{},
-                               number<TilePartitioner::NPerBlock / QuantGroupSize::kN>{}),
-                    {0, i_n / QuantGroupSize::kN});
+                if constexpr(PreshuffleQuant)
+                {
+                    static_assert(std::is_same_v<BQLayout, tensor_layout::gemm::ColumnMajor>);
+                    using QuantGroupSize   = remove_cvref_t<typename GemmPipeline::QuantGroupSize>;
+                    constexpr auto block_n = TilePartitioner::NPerBlock / QuantGroupSize::kN;
+                    constexpr auto warp_n  = TilePartitioner::BlockGemmShape::WarpTile::at(I1);
+                    constexpr auto bqk_per_block = TilePartitioner::KPerBlock / QuantGroupSize::kK;
+                    constexpr auto tile_window_width =
+                        ck_tile::integer_least_multiple(warp_n * bqk_per_block, get_warp_size());
+                    constexpr auto tile_window_height = block_n / warp_n;
+                    auto block_n_idx                  = i_n / block_n;
+
+                    return make_tile_window(
+                        bq_pad_view,
+                        make_tuple(number<tile_window_height>{}, number<tile_window_width>{}),
+                        {block_n_idx * tile_window_height, 0});
+                }
+                else
+                {
+                    static_assert(std::is_same_v<BQLayout, tensor_layout::gemm::ColumnMajor>);
+                    using QuantGroupSize = remove_cvref_t<typename GemmPipeline::QuantGroupSize>;
+                    return make_tile_window(
+                        bq_pad_view,
+                        make_tuple(number<TilePartitioner::KPerBlock / QuantGroupSize::kK>{},
+                                   number<TilePartitioner::NPerBlock / QuantGroupSize::kN>{}),
+                        {0, i_n / QuantGroupSize::kN});
+                }
            }
            else
            {
@@ -979,14 +1096,24 @@ struct QuantGemmKernel
            if constexpr(kQuantType == QuantType::AQuantGrouped)
            {
                const auto& aq_block_window = gemm_tile_windows.at(I1);
+                index_t m                   = 0;
+                if constexpr(PreshuffleQuant)
+                {
+                    m = kargs.M;
+                }
                return GemmPipeline{}.template operator()(
-                    a_block_window, b_block_window, aq_block_window, kargs.M, num_loop, smem_ptr_0);
+                    a_block_window, b_block_window, aq_block_window, num_loop, smem_ptr_0, m);
            }
            else if constexpr(kQuantType == QuantType::BQuantGrouped)
            {
                const auto& bq_block_window = gemm_tile_windows.at(I3);
+                index_t n                   = 0;
+                if constexpr(PreshuffleQuant)
+                {
+                    n = kargs.N;
+                }
                return GemmPipeline{}.template operator()(
-                    a_block_window, b_block_window, bq_block_window, num_loop, smem_ptr_0);
+                    a_block_window, b_block_window, bq_block_window, num_loop, smem_ptr_0, n);
            }
            else if constexpr(kQuantType == QuantType::RowColQuant ||
                              kQuantType == QuantType::TensorQuant)
@@ -1074,12 +1201,18 @@ struct QuantGemmKernel
            if constexpr(kQuantType == QuantType::BQuantGrouped)
            {
                const auto& bq_block_window = gemm_tile_windows.at(I3);
+                index_t n                   = 0;
+                if constexpr(PreshuffleQuant)
+                {
+                    n = kargs.N;
+                }
                return GemmPipeline{}.template operator()(a_block_window,
                                                          b_block_window,
                                                          bq_block_window,
                                                          num_loop,
                                                          smem_ptr_0,
-                                                          smem_ptr_1);
+                                                          smem_ptr_1,
+                                                          n);
            }
            else
            {
@@ -1109,7 +1242,6 @@ struct QuantGemmKernel
        const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(blockId);
        const index_t i_m   = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
        const index_t i_n   = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
-
        const SplitKBatchOffset splitk_batch_offset(kargs);
        // options
        const ADataType* a_ptr   = static_cast<const ADataType*>(kargs.a_ptr);
--- a/include/ck_tile/ops/gemm_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_mem.hpp
+++ b/include/ck_tile/ops/gemm_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_mem.hpp
@@ -463,11 +463,10 @@ struct AQuantGemmPipelineAgBgCrMem : public BaseAQuantGemmPipelineAgBgCrMem<Prob
    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
                                   const AQDramBlockWindowTmp& aq_dram_block_window_tmp,
-                                   index_t m,
                                   index_t num_loop,
-                                   void* p_smem) const
+                                   void* p_smem,
+                                   index_t m = 0) const
    {
-
        return PipelineImpl<GemmPipelineScheduler::Interwave>{}
            .template operator()<HasHotLoop, TailNum>(
                a_dram_block_window_tmp,
--- a/include/ck_tile/ops/gemm_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_v3.hpp
+++ b/include/ck_tile/ops/gemm_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_v3.hpp
@@ -465,9 +465,9 @@ struct AQuantGemmPipelineAgBgCrCompV3 : public BaseAQuantGemmPipelineAgBgCrCompV
    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
                                   const AQDramBlockWindowTmp& aq_dram_block_window_tmp,
-                                   index_t m,
                                   index_t num_loop,
-                                   void* p_smem) const
+                                   void* p_smem,
+                                   index_t m = 0) const
    {
        return PipelineImpl<Scheduler>{}.template operator()<HasHotLoop, TailNum>(
            a_dram_block_window_tmp,
--- a/include/ck_tile/ops/gemm_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_policy.hpp
+++ b/include/ck_tile/ops/gemm_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_policy.hpp
@@ -35,30 +35,48 @@ struct GemmBQuantPipelineAgBgCrDefaultPolicy : public UniversalGemmPipelineAgBgC
        using BQLayout       = remove_cvref_t<typename Problem::BQLayout>;
        using BlockGemmShape = typename Problem::BlockGemmShape;

-        constexpr index_t BlockSize   = Problem::kBlockSize;
-        constexpr index_t NPerBlock   = Problem::BlockGemmShape::kN;
-        constexpr index_t NPerBlockBQ = NPerBlock / Problem::QuantGroupSize::kN;
-        constexpr index_t KPerBlock   = Problem::BlockGemmShape::kK;
-        constexpr index_t KPerBlockBQ = KPerBlock / Problem::QuantGroupSize::kK;
-        using WarpTile                = typename Problem::BlockGemmShape::WarpTile;
-        using WarpGemm                = WarpGemmDispatcher<typename Problem::ComputeDataType,
-                                                           typename Problem::ComputeDataType,
-                                                           typename Problem::CDataType,
-                                                           WarpTile::at(I0),
-                                                           WarpTile::at(I1),
-                                                           WarpTile::at(I2),
-                                                           Problem::TransposeC>;
+        constexpr index_t BlockSize    = Problem::kBlockSize;
+        constexpr index_t NPerBlock    = Problem::BlockGemmShape::kN;
+        constexpr index_t NPerBlockBQ  = NPerBlock / Problem::QuantGroupSize::kN;
+        constexpr index_t KPerBlock    = Problem::BlockGemmShape::kK;
+        constexpr index_t KPerBlockBQ  = KPerBlock / Problem::QuantGroupSize::kK;
+        constexpr index_t VecLoadSize  = GetVectorSizeBQ<Problem>();
+        constexpr bool PreshuffleQuant = Problem::Traits::PreshuffleQuant;
+
+        using WarpTile = typename Problem::BlockGemmShape::WarpTile;
+        using WarpGemm = WarpGemmDispatcher<typename Problem::ComputeDataType,
+                                            typename Problem::ComputeDataType,
+                                            typename Problem::CDataType,
+                                            WarpTile::at(I0),
+                                            WarpTile::at(I1),
+                                            WarpTile::at(I2),
+                                            Problem::TransposeC>;

        static_assert(std::is_same_v<BQLayout, tensor_layout::gemm::ColumnMajor>);
-        using TileEncodingPattern =
-            tile_distribution_encoding_pattern_bq<BlockGemmShape,
-                                                  WarpGemm,
-                                                  BlockSize,
-                                                  KPerBlockBQ,
-                                                  NPerBlockBQ,
-                                                  Problem::QuantGroupSize::kN>;
+        if constexpr(PreshuffleQuant)
+        {
+            using TileEncodingPattern = tile_distribution_encoding_pattern_bq<
+                BlockGemmShape,
+                WarpGemm,
+                BlockSize,
+                NPerBlock / WarpGemm::kN,
+                ck_tile::integer_least_multiple(WarpGemm::kN * KPerBlockBQ, get_warp_size()),
+                VecLoadSize,
+                PreshuffleQuant>;
+            return TileEncodingPattern::make_2d_static_tile_distribution();
+        }
+        else
+        {
+            using TileEncodingPattern =
+                tile_distribution_encoding_pattern_bq<BlockGemmShape,
+                                                      WarpGemm,
+                                                      BlockSize,
+                                                      KPerBlockBQ,
+                                                      NPerBlockBQ,
+                                                      Problem::QuantGroupSize::kN>;

-        return TileEncodingPattern::make_2d_static_tile_distribution();
+            return TileEncodingPattern::make_2d_static_tile_distribution();
+        }
    }

    template <typename Problem>
--- a/include/ck_tile/ops/gemm_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_v3.hpp
+++ b/include/ck_tile/ops/gemm_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_v3.hpp
@@ -137,6 +137,7 @@ struct BQuantGemmPipelineAgBgCrCompV3 : public BaseBQuantGemmPipelineAgBgCrCompV
    static constexpr bool kPadK = Problem::kPadK;

    static constexpr bool DoubleSmemBuffer = Problem::DoubleSmemBuffer;
+    static constexpr bool PreshuffleQuant  = Problem::Traits::PreshuffleQuant;

    static constexpr bool HasHotLoop = Problem::HasHotLoop;
    static constexpr auto TailNum    = Problem::TailNum;
@@ -238,6 +239,7 @@ struct BQuantGemmPipelineAgBgCrCompV3 : public BaseBQuantGemmPipelineAgBgCrCompV
                                       const BDramBlockWindowTmp& b_dram_block_window_tmp,
                                       const BElementFunction& b_element_func,
                                       const BQDramBlockWindowTmp& bq_dram_block_window_tmp,
+                                       index_t n,
                                       index_t num_loop,
                                       void* p_smem) const
        {
@@ -257,9 +259,6 @@ struct BQuantGemmPipelineAgBgCrCompV3 : public BaseBQuantGemmPipelineAgBgCrCompV
            constexpr bool is_b_row_major = std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>;

            static_assert(is_bq_col_major, "Bq must be col major (row major not supported yet)");
-            static_assert(KPerBlockBQ == BQDramBlockWindowTmp{}.get_window_lengths()[I0{}] &&
-                              NPerBlockBQ == BQDramBlockWindowTmp{}.get_window_lengths()[I1{}],
-                          "Bq block window has incorrect lengths for defined BqLayout!");

            static_assert(is_a_col_major
                              ? (KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I0{}] &&
@@ -315,8 +314,12 @@ struct BQuantGemmPipelineAgBgCrCompV3 : public BaseBQuantGemmPipelineAgBgCrCompV
                is_a_col_major ? make_array(KPerBlock, 0) : make_array(0, KPerBlock);
            constexpr BDramTileWindowStep b_dram_tile_window_step =
                is_b_row_major ? make_array(KPerBlock, 0) : make_array(0, KPerBlock);
-            constexpr BQDramTileWindowStep bq_dram_tile_window_step =
-                is_bq_col_major ? make_array(KPerBlockBQ, 0) : make_array(0, KPerBlockBQ);
+            const BQDramTileWindowStep bq_dram_tile_window_step =
+                (PreshuffleQuant) ? make_array(ck_tile::integer_least_multiple(n, NPerBlock) /
+                                                   BlockGemmShape::WarpTile::at(number<1>{}),
+                                               0)
+                : is_bq_col_major ? make_array(KPerBlockBQ, 0)
+                                  : make_array(0, KPerBlockBQ);

            // DRAM prefetch (global read 0)
            Base::GlobalPrefetch(a_block_tile, a_copy_dram_window, a_dram_tile_window_step);
@@ -457,6 +460,7 @@ struct BQuantGemmPipelineAgBgCrCompV3 : public BaseBQuantGemmPipelineAgBgCrCompV
            return c_block_tile;
        }
    };
+    // Forwards to PipelineImpl; `n` is only read when PreshuffleQuant is true (defaults to 0).
    template <typename ADramBlockWindowTmp,
              typename BDramBlockWindowTmp,
              typename BQDramBlockWindowTmp>
@@ -464,7 +468,8 @@ struct BQuantGemmPipelineAgBgCrCompV3 : public BaseBQuantGemmPipelineAgBgCrCompV
                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
                                   const BQDramBlockWindowTmp& bq_dram_block_window_tmp,
                                   index_t num_loop,
-                                   void* p_smem) const
+                                   void* p_smem,
+                                   index_t n = 0) const
    {
        return PipelineImpl<Scheduler>{}.template operator()<HasHotLoop, TailNum>(
            a_dram_block_window_tmp,
@@ -472,6 +477,7 @@ struct BQuantGemmPipelineAgBgCrCompV3 : public BaseBQuantGemmPipelineAgBgCrCompV
            b_dram_block_window_tmp,
            [](const BDataType& b) { return b; },
            bq_dram_block_window_tmp,
+            n,
            num_loop,
            p_smem);
    }
@@ -502,7 +508,8 @@ struct BQuantGemmPipelineAgBgCrCompV3 : public BaseBQuantGemmPipelineAgBgCrCompV
                                   index_t num_loop,
                                   bool has_hot_loop,
                                   TailNumber tail_number,
-                                   void* p_smem) const
+                                   void* p_smem,
+                                   index_t n = 0) const
    {
        const auto RunPipeline = [&](auto has_hot_loop_, auto tail_number_) {
            constexpr bool hot_loop = has_hot_loop_.value;
@@ -513,6 +520,7 @@ struct BQuantGemmPipelineAgBgCrCompV3 : public BaseBQuantGemmPipelineAgBgCrCompV
                b_dram_block_window_tmp,
                [](const BDataType& b) { return b; },
                bq_dram_block_window_tmp,
+                n, // only read when PreshuffleQuant is true
                num_loop,
                p_smem);
        };
--- a/include/ck_tile/ops/gemm_quant/pipeline/gemm_group_quant_utils.hpp
+++ b/include/ck_tile/ops/gemm_quant/pipeline/gemm_group_quant_utils.hpp
@@ -171,7 +171,8 @@ template <typename BlockGemmShape,
          index_t BlockSize,
          index_t YPerTile,
          index_t XPerTile,
-          index_t XPerQ>
+          index_t XPerQ,
+          bool PreshuffleQuant = false>
 struct tile_distribution_encoding_pattern_bq : public tile_distribution_encoding_pattern
 {
    static constexpr index_t warp_size = get_warp_size();
@@ -213,52 +214,71 @@ struct tile_distribution_encoding_pattern_bq : public tile_distribution_encoding
    /// @return A static tile distribution encoding for the BQ scale tensor
    CK_TILE_HOST_DEVICE static constexpr auto make_2d_static_tile_distribution()
    {
-        if constexpr(XPerQ < WarpGemm::kN)
+        if constexpr(PreshuffleQuant) // preshuffled BQ: split Y by NWarps and X by warp_size
        {
-            // Case 1: Fine-grained - multiple quantization scales within a single warp
-            constexpr index_t Y  = YPerTile;             // Full Y dimension of tile
-            constexpr index_t YR = 1;                    // No Y replication needed
-            constexpr index_t X0 = NIterPerWarp;         // Iterations per warp in N-dim
-            constexpr index_t X1 = NWarps;               // Number of warps in N-dim
-            constexpr index_t X2 = WarpGemm::kN / XPerQ; // Number of scales per warp
-            constexpr index_t XR = XPerQ;                // Elements per quantization group
-
-            static_assert(X0 * X1 * X2 == XPerTile, "X0, X1, X2 must cover the blocktile along X.");
+            constexpr index_t X1 = warp_size;            // Inner X factor: the warp size
+            constexpr index_t X0 = XPerTile / warp_size; // Outer X factor (truncating division)
+            constexpr index_t Y1 = NWarps;               // Inner Y factor: warps in N-dim
+            constexpr index_t Y0 = YPerTile / Y1;        // Outer Y factor (truncating division)

            return make_static_tile_distribution(
-                tile_distribution_encoding<sequence<MWarps, YR, XR>,
-                                           tuple<sequence<Y>, sequence<X0, X1, X2>>,
-                                           tuple<sequence<0, 2>, sequence<0, 2, 0>>,
-                                           tuple<sequence<0, 1>, sequence<1, 2, 2>>,
-                                           sequence<2, 1>,
-                                           sequence<0, 0>>{});
-        }
-        else if constexpr(XPerQ <= WarpGemm::kN * NWarps)
-        {
-            // Case 2: Medium-grained - one quantization scale per warp
-            constexpr auto XR = XPerQ / WarpGemm::kN; // Scale replication factor
-            constexpr auto X1 = NWarps / XR;          // Warps per unique scale
-            constexpr auto X0 = XPerTile / X1;        // Iterations to cover X dimension
-            return make_static_tile_distribution(
-                tile_distribution_encoding<sequence<MWarps, XR, get_warp_size()>,
-                                           tuple<sequence<YPerTile>, sequence<X0, X1>>,
-                                           tuple<sequence<0, 2, 0>, sequence<0>>,
-                                           tuple<sequence<0, 1, 1>, sequence<2>>,
-                                           sequence<2, 1>,
-                                           sequence<0, 0>>{});
-        }
-        else // XPerQ > WarpGemm::kN * NWarps
-        {
-            // Case 3: Coarse-grained - quantization group spans all warps
-            // All warps in N-dimension share the same quantization scale
-            return make_static_tile_distribution(
-                tile_distribution_encoding<sequence<MWarps, NWarps, get_warp_size()>,
-                                           tuple<sequence<YPerTile>, sequence<XPerTile>>,
-                                           tuple<sequence<0, 0>, sequence<0>>,
+                tile_distribution_encoding<sequence<MWarps>,
+                                           tuple<sequence<Y0, Y1>, sequence<X0, X1>>,
                                           tuple<sequence<0, 1>, sequence<2>>,
-                                           sequence<2, 1>,
+                                           tuple<sequence<0, 1>, sequence<1>>,
+                                           sequence<1, 2>,
                                           sequence<0, 0>>{});
        }
+        else // not preshuffled: pick a case from XPerQ relative to WarpGemm::kN and NWarps
+        {
+            if constexpr(XPerQ < WarpGemm::kN)
+            {
+                // Case 1: Fine-grained - multiple quantization scales within a single warp
+                constexpr index_t Y  = YPerTile;             // Full Y dimension of tile
+                constexpr index_t YR = 1;                    // No Y replication needed
+                constexpr index_t X0 = NIterPerWarp;         // Iterations per warp in N-dim
+                constexpr index_t X1 = NWarps;               // Number of warps in N-dim
+                constexpr index_t X2 = WarpGemm::kN / XPerQ; // Number of scales per warp
+                constexpr index_t XR = XPerQ;                // Elements per quantization group
+
+                static_assert(X0 * X1 * X2 == XPerTile,
+                              "X0, X1, X2 must cover the blocktile along X.");
+
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<MWarps, YR, XR>,
+                                               tuple<sequence<Y>, sequence<X0, X1, X2>>,
+                                               tuple<sequence<0, 2>, sequence<0, 2, 0>>,
+                                               tuple<sequence<0, 1>, sequence<1, 2, 2>>,
+                                               sequence<2, 1>,
+                                               sequence<0, 0>>{});
+            }
+            else if constexpr(XPerQ <= WarpGemm::kN * NWarps)
+            {
+                // Case 2: Medium-grained - one quantization scale per warp
+                constexpr auto XR = XPerQ / WarpGemm::kN; // Scale replication factor
+                constexpr auto X1 = NWarps / XR;          // Warps per unique scale
+                constexpr auto X0 = XPerTile / X1;        // Iterations to cover X dimension
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<MWarps, XR, get_warp_size()>,
+                                               tuple<sequence<YPerTile>, sequence<X0, X1>>,
+                                               tuple<sequence<0, 2, 0>, sequence<0>>,
+                                               tuple<sequence<0, 1, 1>, sequence<2>>,
+                                               sequence<2, 1>,
+                                               sequence<0, 0>>{});
+            }
+            else // XPerQ > WarpGemm::kN * NWarps
+            {
+                // Case 3: Coarse-grained - quantization group spans all warps
+                // All warps in N-dimension share the same quantization scale
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<MWarps, NWarps, get_warp_size()>,
+                                               tuple<sequence<YPerTile>, sequence<XPerTile>>,
+                                               tuple<sequence<0, 0>, sequence<0>>,
+                                               tuple<sequence<0, 1>, sequence<2>>,
+                                               sequence<2, 1>,
+                                               sequence<0, 0>>{});
+            }
+        }
    }
 };

--- a/include/ck_tile/ops/gemm_quant/pipeline/gemm_wp_bquant_pipeline_ag_bg_cr_v2.hpp
+++ b/include/ck_tile/ops/gemm_quant/pipeline/gemm_wp_bquant_pipeline_ag_bg_cr_v2.hpp
@@ -68,6 +68,7 @@ struct WPQuantBPipelineAgBgCrV2 : public WeightPreshufflePipelineAGmemBGmemCRegV

    using Base::m_preload;

+    static constexpr bool PreshuffleQuant = Problem::Traits::PreshuffleQuant;
    static constexpr index_t KPerBlockBQ =
        integer_divide_ceil(BlockGemmShape::kK, QuantGroupSize::kK);
    static constexpr index_t QScalesPerBlockRow =
@@ -106,6 +107,7 @@ struct WPQuantBPipelineAgBgCrV2 : public WeightPreshufflePipelineAGmemBGmemCRegV
                                   const AElementFunction& a_element_func,
                                   const BFlatBlockWindowTmp& b_flat_dram_block_window_tmp,
                                   const BQDramBlockWindowTmp& bq_dram_block_window_tmp,
+                                   index_t n,
                                   index_t num_loop,
                                   void* p_smem_ping,
                                   void* p_smem_pong) const
@@ -236,7 +238,7 @@ struct WPQuantBPipelineAgBgCrV2 : public WeightPreshufflePipelineAGmemBGmemCRegV
        // BQ DRAM window for load
        auto bq_copy_dram_window =
            make_tile_window(bq_dram_block_window_tmp.get_bottom_tensor_view(),
-                             make_tuple(number<KPerBlockBQ>{}, number<kNPerBlock>{}),
+                             bq_dram_block_window_tmp.get_window_lengths(),
                             bq_dram_block_window_tmp.get_window_origin(),
                             PipelinePolicy::template MakeBQDramTileDistribution<Problem>());

@@ -269,8 +271,17 @@ struct WPQuantBPipelineAgBgCrV2 : public WeightPreshufflePipelineAGmemBGmemCRegV
        BQBlockTile bq_block_tile, bq_block_tile_2;
        bq_block_tile = load_tile(bq_copy_dram_window);
        // move BQ to tile 1
-        move_tile_window(bq_copy_dram_window, {KPerBlockBQ, 0});
-
+        if constexpr(PreshuffleQuant)
+        {
+            move_tile_window(bq_copy_dram_window,
+                             {ck_tile::integer_least_multiple(n, kNPerBlock) /
+                                  BlockGemmShape::WarpTile::at(number<1>{}),
+                              0});
+        }
+        else
+        {
+            move_tile_window(bq_copy_dram_window, {KPerBlockBQ, 0});
+        }
        // Prefill A0
        auto a_block_tile_tmp = tile_elementwise_in(a_element_func, a_block_tile);
        store_tile(a_copy_lds_window_ping, a_block_tile_tmp);
@@ -318,7 +329,17 @@ struct WPQuantBPipelineAgBgCrV2 : public WeightPreshufflePipelineAGmemBGmemCRegV
            move_tile_window(b_flat_dram_window, {0, BlockGemmShape::flatKPerBlock});

            bq_block_tile_2 = load_tile(bq_copy_dram_window);
-            move_tile_window(bq_copy_dram_window, {KPerBlockBQ, 0});
+            if constexpr(PreshuffleQuant)
+            {
+                move_tile_window(bq_copy_dram_window,
+                                 {ck_tile::integer_least_multiple(n, kNPerBlock) /
+                                      BlockGemmShape::WarpTile::at(number<1>{}),
+                                  0});
+            }
+            else
+            {
+                move_tile_window(bq_copy_dram_window, {KPerBlockBQ, 0});
+            }

            // Prefill A(2i+1)
            a_block_tile_tmp = tile_elementwise_in(a_element_func, a_block_tile);
@@ -360,7 +381,17 @@ struct WPQuantBPipelineAgBgCrV2 : public WeightPreshufflePipelineAGmemBGmemCRegV
            move_tile_window(b_flat_dram_window, {0, BlockGemmShape::flatKPerBlock});

            bq_block_tile = load_tile(bq_copy_dram_window);
-            move_tile_window(bq_copy_dram_window, {KPerBlockBQ, 0});
+            if constexpr(PreshuffleQuant)
+            {
+                move_tile_window(bq_copy_dram_window,
+                                 {ck_tile::integer_least_multiple(n, kNPerBlock) /
+                                      BlockGemmShape::WarpTile::at(number<1>{}),
+                                  0});
+            }
+            else
+            {
+                move_tile_window(bq_copy_dram_window, {KPerBlockBQ, 0});
+            }

            // Prefill A(2i+2)
            a_block_tile_tmp = tile_elementwise_in(a_element_func, a_block_tile);
@@ -448,6 +479,7 @@ struct WPQuantBPipelineAgBgCrV2 : public WeightPreshufflePipelineAGmemBGmemCRegV
        return c_block_tile;
    }

+    // Forwards with an identity A element function; `n` is only read when PreshuffleQuant.
    template <typename ADramBlockWindowTmp,
              typename BFlatBlockWindowTmp,
              typename BQDramBlockWindowTmp>
@@ -456,14 +488,15 @@ struct WPQuantBPipelineAgBgCrV2 : public WeightPreshufflePipelineAGmemBGmemCRegV
                                   const BQDramBlockWindowTmp& bq_dram_block_window_tmp,
                                   index_t num_loop,
                                   void* p_smem_ping,
-                                   void* p_smem_pong) const
+                                   void* p_smem_pong,
+                                   index_t n = 0) const // Default value for non-preshuffle case
    {
-
        return operator()<TailNum>(
            a_dram_block_window_tmp,
            [](const ADataType& a) { return a; },
            b_flat_dram_block_window_tmp,
            bq_dram_block_window_tmp,
+            n,
            num_loop,
            p_smem_ping,
            p_smem_pong);
@@ -478,7 +511,8 @@ struct WPQuantBPipelineAgBgCrV2 : public WeightPreshufflePipelineAGmemBGmemCRegV
                                   index_t num_loop,
                                   TailNumber tail_number,
                                   void* p_smem_ping,
-                                   void* p_smem_pong) const
+                                   void* p_smem_pong,
+                                   index_t n = 0) const
    {
        const auto RunPipeline = [&](auto bool_val, auto tail_num_) {
            (void)bool_val; // Suppress unused parameter warning
@@ -488,6 +522,7 @@ struct WPQuantBPipelineAgBgCrV2 : public WeightPreshufflePipelineAGmemBGmemCRegV
                [](const ADataType& a) { return a; },
                b_flat_dram_block_window_tmp,
                bq_dram_block_window_tmp,
+                n, // only read when PreshuffleQuant is true
                num_loop,
                p_smem_ping,
                p_smem_pong);