[CK_TILE] Support more layouts for BQuant GEMM (#3349)

* WIP: preparing to add transpose bq support * WIP: handle both row/col layout for BQ windows/tile dstr * Fix build * WIP: adding some test, debugging numerical errors * Fix all but pkint4 tests * Remove test_gemm_quant_typed.cpp again * update disabled tests * add conversion from pkint4 for b matrix * fix formatting * fix formatting * Fix tr_load and use override b datatype for clarity * fix formatting * make bquant preshuffle tests bqlayout column-major
2026-05-01 20:21:23 +00:00 · 2025-12-08 21:05:56 +00:00
parent fe07b5a1bf
commit c363a98d41
10 changed files with 359 additions and 189 deletions
--- a/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp
+++ b/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp
@@ -426,7 +426,6 @@ struct QuantGemmKernel

        if constexpr(kQuantType == QuantType::BQuantGrouped)
        {
-            static_assert(std::is_same_v<BQLayout, tensor_layout::gemm::ColumnMajor>);
            if(kargs.QK_B % GemmPipeline::GetVectorSizeBQ() != 0)
            {
                if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
@@ -781,7 +780,9 @@ struct QuantGemmKernel
            {
                if constexpr(PreshuffleQuant)
                {
-                    static_assert(std::is_same_v<BQLayout, tensor_layout::gemm::ColumnMajor>);
+                    static_assert(std::is_same_v<BQLayout, tensor_layout::gemm::ColumnMajor>,
+                                  "PreshuffleQuant with BQuantGrouped currently only supports "
+                                  "ColumnMajor BQ layout");

                    return MakePreshuffledQuantTensorView<
                        GemmPipeline::KPerBlockBQ,
@@ -791,14 +792,35 @@ struct QuantGemmKernel
                }
                else
                {
-                    static_assert(std::is_same_v<BQLayout, tensor_layout::gemm::ColumnMajor>);
                    using QuantGroupSize = remove_cvref_t<typename GemmPipeline::QuantGroupSize>;
-                    return make_naive_tensor_view<address_space_enum::global>(
-                        bq_ptr,
-                        make_tuple(integer_divide_ceil(kargs.N, QuantGroupSize::kN), kargs.QK_B),
-                        make_tuple(kargs.stride_BQ, 1),
-                        number<GemmPipeline::GetVectorSizeBQ()>{},
-                        number<1>{});
+
+                    if constexpr(std::is_same_v<BQLayout, tensor_layout::gemm::RowMajor>)
+                    {
+                        // For RowMajor BQ: memory layout is [K/QuantGroupK][N/QuantGroupN]
+                        // Dimensions: [K/QuantGroupK, N/QuantGroupN]
+                        // Strides: [N/QuantGroupN, 1]
+                        return make_naive_tensor_view<address_space_enum::global>(
+                            bq_ptr,
+                            make_tuple(integer_divide_ceil(kargs.K, QuantGroupSize::kK),
+                                       integer_divide_ceil(kargs.N, QuantGroupSize::kN)),
+                            make_tuple(integer_divide_ceil(kargs.N, QuantGroupSize::kN), 1),
+                            number<GemmPipeline::GetVectorSizeBQ()>{},
+                            number<1>{});
+                    }
+                    else
+                    {
+                        static_assert(std::is_same_v<BQLayout, tensor_layout::gemm::ColumnMajor>);
+                        // For ColumnMajor BQ: memory layout is [N/QuantGroupN][K/QuantGroupK]
+                        // Dimensions: [N/QuantGroupN, K/QuantGroupK]
+                        // Strides: [K/QuantGroupK, 1]
+                        return make_naive_tensor_view<address_space_enum::global>(
+                            bq_ptr,
+                            make_tuple(integer_divide_ceil(kargs.N, QuantGroupSize::kN),
+                                       integer_divide_ceil(kargs.K, QuantGroupSize::kK)),
+                            make_tuple(integer_divide_ceil(kargs.K, QuantGroupSize::kK), 1),
+                            number<GemmPipeline::GetVectorSizeBQ()>{},
+                            number<1>{});
+                    }
                }
            }
            else
@@ -1023,10 +1045,10 @@ struct QuantGemmKernel
            }
            else if constexpr(kQuantType == QuantType::BQuantGrouped)
            {
+                using QuantGroupSize = remove_cvref_t<typename GemmPipeline::QuantGroupSize>;
                if constexpr(PreshuffleQuant)
                {
                    static_assert(std::is_same_v<BQLayout, tensor_layout::gemm::ColumnMajor>);
-                    using QuantGroupSize   = remove_cvref_t<typename GemmPipeline::QuantGroupSize>;
                    constexpr auto block_n = TilePartitioner::NPerBlock / QuantGroupSize::kN;
                    constexpr auto warp_n  = TilePartitioner::BlockGemmShape::WarpTile::at(I1);
                    constexpr auto bqk_per_block = TilePartitioner::KPerBlock / QuantGroupSize::kK;
@@ -1042,13 +1064,23 @@ struct QuantGemmKernel
                }
                else
                {
-                    static_assert(std::is_same_v<BQLayout, tensor_layout::gemm::ColumnMajor>);
-                    using QuantGroupSize = remove_cvref_t<typename GemmPipeline::QuantGroupSize>;
-                    return make_tile_window(
-                        bq_pad_view,
-                        make_tuple(number<TilePartitioner::NPerBlock / QuantGroupSize::kN>{},
-                                   number<TilePartitioner::KPerBlock / QuantGroupSize::kK>{}),
-                        {i_n / QuantGroupSize::kN, 0});
+                    if constexpr(std::is_same_v<BQLayout, tensor_layout::gemm::RowMajor>)
+                    {
+                        return make_tile_window(
+                            bq_pad_view,
+                            make_tuple(number<TilePartitioner::KPerBlock / QuantGroupSize::kK>{},
+                                       number<TilePartitioner::NPerBlock / QuantGroupSize::kN>{}),
+                            {0, i_n / QuantGroupSize::kN});
+                    }
+                    else
+                    {
+                        static_assert(std::is_same_v<BQLayout, tensor_layout::gemm::ColumnMajor>);
+                        return make_tile_window(
+                            bq_pad_view,
+                            make_tuple(number<TilePartitioner::NPerBlock / QuantGroupSize::kN>{},
+                                       number<TilePartitioner::KPerBlock / QuantGroupSize::kK>{}),
+                            {i_n / QuantGroupSize::kN, 0});
+                    }
                }
            }
            else