[CK_TILE] Fix gemm_quant (#3186)

[ROCm/composable_kernel commit: 1b1c46e508]
2026-05-18 03:49:41 +00:00 · 2025-11-12 00:23:57 +08:00
parent c1b5372db3
commit 13cf0bd17f
13 changed files with 135 additions and 49 deletions
--- a/include/ck_tile/host/tensor_shuffle_utils.hpp
+++ b/include/ck_tile/host/tensor_shuffle_utils.hpp
@@ -24,16 +24,43 @@ template <typename GemmConfig, typename T>
 auto shuffle_b(const ck_tile::HostTensor<T>& t)
 {
    assert(t.get_lengths().size() == 2);
-    int n_                = t.get_lengths()[1];
-    int k_                = t.get_lengths()[0];
-    constexpr int divisor = GemmConfig::N_Warp_Tile == 32 ? 2 : 4;
-    ck_tile::HostTensor<T> t_view({n_ / GemmConfig::N_Warp_Tile,
-                                   GemmConfig::N_Warp_Tile,
-                                   k_ / GemmConfig::K_Warp_Tile,
-                                   divisor,
-                                   GemmConfig::K_Warp_Tile / divisor});
-    std::copy(t.begin(), t.end(), t_view.begin());
-    return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
+    int n_ = t.get_lengths()[1];
+    int k_ = t.get_lengths()[0];
+
+    if(ck_tile::is_gfx12_supported())
+    {
+        constexpr int divisor      = 2;
+        constexpr int kABK1PerLane = 8;
+        constexpr int kABK0PerLane = GemmConfig::K_Warp_Tile / divisor / kABK1PerLane;
+        ck_tile::HostTensor<T> t_view({n_ / GemmConfig::N_Warp_Tile,
+                                       GemmConfig::N_Warp_Tile,
+                                       k_ / GemmConfig::K_Warp_Tile,
+                                       kABK0PerLane,
+                                       divisor,
+                                       kABK1PerLane});
+        std::copy(t.begin(), t.end(), t_view.begin());
+        return ck_tile::reference_permute(t_view, {0, 2, 4, 1, 3, 5});
+    }
+    else
+    {
+        int divisor = 1;
+        if(ck_tile::is_gfx11_supported())
+        {
+            divisor = 1;
+        }
+        else
+        {
+            assert(is_wave32() == false);
+            divisor = GemmConfig::N_Warp_Tile == 32 ? 2 : 4;
+        }
+        ck_tile::HostTensor<T> t_view({n_ / GemmConfig::N_Warp_Tile,
+                                       GemmConfig::N_Warp_Tile,
+                                       k_ / GemmConfig::K_Warp_Tile,
+                                       divisor,
+                                       GemmConfig::K_Warp_Tile / divisor});
+        std::copy(t.begin(), t.end(), t_view.begin());
+        return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
+    }
 }

 template <typename GemmConfig, typename T>
@@ -55,21 +82,46 @@ template <typename GemmConfig, typename T>
 auto shuffle_b_permuteN(const ck_tile::HostTensor<T>& t)
 {
    assert(t.get_lengths().size() == 2);
-
    int n_                = t.get_lengths()[1];
    int k_                = t.get_lengths()[0];
-    constexpr int divisor = GemmConfig::N_Warp_Tile == 32 ? 2 : 4;
    constexpr int NRepeat = GemmConfig::N_Tile / GemmConfig::N_Warp_Tile / GemmConfig::N_Warp;
-
-    ck_tile::HostTensor<T> t_view({n_ / GemmConfig::N_Tile,
-                                   GemmConfig::N_Warp,
-                                   GemmConfig::N_Warp_Tile,
-                                   NRepeat,
-                                   k_ / GemmConfig::K_Warp_Tile,
-                                   divisor,
-                                   GemmConfig::K_Warp_Tile / divisor});
-
-    std::copy(t.begin(), t.end(), t_view.begin());
-    return ck_tile::reference_permute(t_view, {0, 3, 1, 4, 5, 2, 6});
+    if(ck_tile::is_gfx12_supported())
+    {
+        constexpr int divisor      = 2;
+        constexpr int kABK1PerLane = 8;
+        constexpr int kABK0PerLane = GemmConfig::K_Warp_Tile / divisor / kABK1PerLane;
+        ck_tile::HostTensor<T> t_view({n_ / GemmConfig::N_Tile,
+                                       GemmConfig::N_Warp,
+                                       GemmConfig::N_Warp_Tile,
+                                       NRepeat,
+                                       k_ / GemmConfig::K_Warp_Tile,
+                                       kABK0PerLane,
+                                       divisor,
+                                       kABK1PerLane});
+        std::copy(t.begin(), t.end(), t_view.begin());
+        return ck_tile::reference_permute(t_view, {0, 3, 1, 4, 6, 5, 2, 7});
+    }
+    else
+    {
+        int divisor = 1;
+        if(ck_tile::is_gfx11_supported())
+        {
+            divisor = 1;
+        }
+        else
+        {
+            assert(is_wave32() == false);
+            divisor = GemmConfig::N_Warp_Tile == 32 ? 2 : 4;
+        }
+        ck_tile::HostTensor<T> t_view({n_ / GemmConfig::N_Tile,
+                                       GemmConfig::N_Warp,
+                                       GemmConfig::N_Warp_Tile,
+                                       NRepeat,
+                                       k_ / GemmConfig::K_Warp_Tile,
+                                       divisor,
+                                       GemmConfig::K_Warp_Tile / divisor});
+        std::copy(t.begin(), t.end(), t_view.begin());
+        return ck_tile::reference_permute(t_view, {0, 3, 1, 4, 5, 2, 6});
+    }
 }
 } // namespace ck_tile
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma.hpp
@@ -79,6 +79,7 @@ struct WarpGemmAttributeWmma
    static constexpr index_t kM          = Impl::kM;
    static constexpr index_t kN          = Impl::kN;
    static constexpr index_t kK          = Impl::kK;
+    static constexpr index_t kCMLane     = Impl::kCMLane;
    static constexpr index_t kKPerThread = Impl::kABK0PerLane * Impl::kABK1PerLane;

    CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; }
--- a/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_ar_flatbr_bquant_cr.hpp
+++ b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_ar_flatbr_bquant_cr.hpp
@@ -82,11 +82,11 @@ struct BlockGemmWeightPreshuffleBQuantARegBRegCReg
        float scale_reg_f = 0.f;
        if constexpr(std::is_same_v<BQDataType, ck_tile::fp8_t>)
        {
-            scale_reg_f = element_wise::amd_assembly_fp8_to_fp32(static_cast<uint32_t>(scale));
+            scale_reg_f = __builtin_amdgcn_cvt_f32_fp8(static_cast<uint32_t>(scale), 0);
        }
        else if constexpr(std::is_same_v<BQDataType, ck_tile::bf8_t>)
        {
-            scale_reg_f = element_wise::amd_assembly_bf8_to_fp32(static_cast<uint32_t>(scale));
+            scale_reg_f = __builtin_amdgcn_cvt_f32_bf8(static_cast<uint32_t>(scale), 0);
        }
        else if constexpr(std::is_same_v<BQDataType, float>)
        {
--- a/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp
+++ b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp
@@ -25,13 +25,11 @@ struct BlockGemmAQuantBase
        float scale_reg_f = 0.f;
        if constexpr(std::is_same_v<AQDataType, ck_tile::fp8_t>)
        {
-            scale_reg_f =
-                ck_tile::element_wise::amd_assembly_fp8_to_fp32(static_cast<uint32_t>(scale));
+            scale_reg_f = __builtin_amdgcn_cvt_f32_fp8(static_cast<uint32_t>(scale), 0);
        }
        else if constexpr(std::is_same_v<AQDataType, ck_tile::bf8_t>)
        {
-            scale_reg_f =
-                ck_tile::element_wise::amd_assembly_bf8_to_fp32(static_cast<uint32_t>(scale));
+            scale_reg_f = __builtin_amdgcn_cvt_f32_bf8(static_cast<uint32_t>(scale), 0);
        }
        else if constexpr(std::is_same_v<AQDataType, float>)
        {
@@ -349,7 +347,7 @@ struct AQuantBlockUniversalGemmAsBsCr : public BlockGemmAQuantBase<Problem_>
                        // Thread 0 can read AQ_tile[0, 0] from itself, AQ_tile[1,
                        // 0] from thread 1, ..., and AQ_tile[3, 0] from thread 3.

-                        constexpr uint32_t kTileRowsOfCPerThread = 4;
+                        constexpr uint32_t kTileRowsOfCPerThread = (get_warp_size() == 64) ? 4 : 8;
                        decltype(threadIdx.x) pull_from_lane     = 0;
                        if constexpr(WarpGemm::kM == 16)
                        {
@@ -410,7 +408,7 @@ struct AQuantBlockUniversalGemmAsBsCr : public BlockGemmAQuantBase<Problem_>
                        // desired row coefficient
                        auto& scale_reg = aq_block_tensor.get_thread_buffer()[src_reg_offset];

-                        constexpr uint32_t kTileRows               = 4;
+                        constexpr uint32_t kTileRows = (get_warp_size() == 64) ? 4 : 8;
                        constexpr uint32_t kTiledCMsPerWarp        = WarpGemm::kCMLane * kTileRows;
                        constexpr uint32_t reg_offset_for_row_data = c_row * WarpGemm::kCMLane;
                        // Multiply by 4 because output is stored in tiles of 4
--- a/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_bs_bquant_cr.hpp
+++ b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_bs_bquant_cr.hpp
@@ -25,13 +25,11 @@ struct BlockGemmBQuantBase
        float scale_reg_f = 0.f;
        if constexpr(std::is_same_v<BQDataType, ck_tile::fp8_t>)
        {
-            scale_reg_f =
-                ck_tile::element_wise::amd_assembly_fp8_to_fp32(static_cast<uint32_t>(scale));
+            scale_reg_f = __builtin_amdgcn_cvt_f32_fp8(static_cast<uint32_t>(scale), 0);
        }
        else if constexpr(std::is_same_v<BQDataType, ck_tile::bf8_t>)
        {
-            scale_reg_f =
-                ck_tile::element_wise::amd_assembly_bf8_to_fp32(static_cast<uint32_t>(scale));
+            scale_reg_f = __builtin_amdgcn_cvt_f32_bf8(static_cast<uint32_t>(scale), 0);
        }
        else if constexpr(std::is_same_v<BQDataType, float>)
        {
--- a/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp
+++ b/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp
@@ -240,7 +240,10 @@ struct QuantGemmKernel
        return dim3(TilePartitioner::GridSize(M, N), 1, KBatch);
    }

-    CK_TILE_HOST static constexpr auto BlockSize() { return dim3(kBlockSize); }
+    CK_TILE_HOST static auto BlockSize()
+    {
+        return is_wave32() ? dim3(kBlockSize / 2) : dim3(kBlockSize);
+    }

    CK_TILE_HOST static constexpr QuantGemmKernelArgs
    MakeKernelArgs(const QuantGemmHostArgs& hostArgs)
--- a/include/ck_tile/ops/gemm_quant/pipeline/tile_gemm_quant_traits.hpp
+++ b/include/ck_tile/ops/gemm_quant/pipeline/tile_gemm_quant_traits.hpp
@@ -41,7 +41,8 @@ template <bool kPadM_,
          typename BQLayout_        = BLayout_,
          bool TransposeC_          = false,
          bool DoubleSmemBuffer_    = false,
-          bool UsePersistentKernel_ = false>
+          bool UsePersistentKernel_ = false,
+          int VectorSize_           = 16>
 struct TileGemmQuantTraits
 {
    static constexpr bool kPadM = kPadM_;
@@ -50,7 +51,7 @@ struct TileGemmQuantTraits

    static constexpr QuantType kQuantType = QuantType_;

-    static constexpr int _VectorSize       = 16;
+    static constexpr int _VectorSize       = VectorSize_;
    static constexpr bool DoubleSmemBuffer = DoubleSmemBuffer_;

    using ALayout  = ALayout_;