[rocm-libraries] ROCm/rocm-libraries#4368 (commit 17f7dfc)

[CK_TILE][FMHA] Support microscaling (mxfp8 and mxfp4) on gfx950 (#4368)

## Motivation

Add support for microscaling types (mxfp8 and mxfp4) to the forward "qr" pipeline.

## Technical Details

Microscaling is used when the quant scale mode is `BlockAttentionQuantScaleEnum::MX` and `Q/K/P/VDataType` are fp8/bf8/fp4.

Supported features:

* only the "qr" pipeline is implemented
* hdim 128 and 256 (smaller hdims are not possible due to restrictions of the "qr" pipeline, but they can be computed using instances with padding)
* both 32x32x64 and 16x16x128 scale MFMAs are supported
* Q and K scales are applied along the hdim dimension, V scales along the seqlen dimension
* column-major V only
* batch and group mode
* bias, Alibi (tested but no instances by default, just like fp8)
* masking etc.

Aiter PR with new API args: https://github.com/ROCm/aiter/pull/2008

## Test Plan

```
ninja test_ck_tile_fmha_fwd_mxfp8 && bin/test_ck_tile_fmha_fwd_mxfp8
ninja test_ck_tile_fmha_fwd_mxfp4 && bin/test_ck_tile_fmha_fwd_mxfp4
```

## Test Result

The tests must pass.

## Submission Checklist

- [x] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
2026-05-05 06:01:23 +00:00 · 2026-03-11 10:00:52 +00:00
parent c85c272c39
commit 2312eef6c3
29 changed files with 2167 additions and 356 deletions
--- a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp
@@ -407,6 +407,12 @@ using WarpGemmMfma_f32_16x16x128_bf8_bf8_CTransposed =
        WarpGemmAttributeMfmaImpl_f32_16x16x128_f8f6f4<bf8_t, bf8_t>,
        AttrNumAccess>>;

+template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
+using WarpGemmMfma_f32_16x16x128_fp4_fp4_CTransposed =
+    WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
+        WarpGemmAttributeMfmaImpl_f32_16x16x128_f8f6f4<pk_fp4_t, pk_fp4_t>,
+        AttrNumAccess>>; // pk_fp4_t A and B on the f8f6f4 impl, wrapped in the transposed-C attribute
+
 template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
 using WarpGemmMfma_f32_32x32x64_fp8_fp8 = WarpGemmImpl<
    WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_fp8<WGAttrCtlEnum::Default_>,
@@ -427,6 +433,36 @@ using WarpGemmMfma_f32_32x32x64_bf8_bf8 = WarpGemmImpl<
    WarpGemmAttributeMfma<WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_bf8<WGAttrCtlEnum::Default_>,
                          AttrNumAccess>>;

+template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
+using WarpGemmMfma_f32_32x32x64_fp8_fp8_CTransposed =
+    WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
+        WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_fp8<WGAttrCtlEnum::Default_>,
+        AttrNumAccess>>; // same impl as WarpGemmMfma_f32_32x32x64_fp8_fp8, but with the transposed-C attribute
+
+template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
+using WarpGemmMfma_f32_32x32x64_fp8_bf8_CTransposed =
+    WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
+        WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_bf8<WGAttrCtlEnum::Default_>,
+        AttrNumAccess>>; // A = fp8, B = bf8
+
+template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
+using WarpGemmMfma_f32_32x32x64_bf8_fp8_CTransposed =
+    WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
+        WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_fp8<WGAttrCtlEnum::Default_>,
+        AttrNumAccess>>; // A = bf8, B = fp8
+
+template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
+using WarpGemmMfma_f32_32x32x64_bf8_bf8_CTransposed =
+    WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
+        WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_bf8<WGAttrCtlEnum::Default_>,
+        AttrNumAccess>>; // same impl as WarpGemmMfma_f32_32x32x64_bf8_bf8, but with the transposed-C attribute
+
+template <WGAttrNumAccessEnum AttrNumAccess = WGAttrNumAccessEnum::Single>
+using WarpGemmMfma_f32_32x32x64_fp4_fp4_CTransposed =
+    WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
+        WarpGemmAttributeMfmaImpl_f32_32x32x64_f8f6f4<pk_fp4_t, pk_fp4_t>,
+        AttrNumAccess>>; // pk_fp4_t A and B, instantiating the generic 32x32x64 f8f6f4 impl directly
+
 using WarpGemmMfma_f32_32x32x16_fp8_fp8_CTransposed =
    WarpGemmImpl<WarpGemmAttributeMfmaTransposedCDistribution<
        WarpGemmAttributeMfmaImpl_f32_32x32x16_fp8_fp8<WGAttrCtlEnum::Default_>>>;
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp
@@ -446,6 +446,19 @@ struct WarpGemmAttributeMfmaTransposedCDistribution
        Impl{}(c_vec, b_vec, a_vec, bool_constant<post_nop_>{});
    }

+    template <index_t opselA, index_t opselB, bool post_nop_ = false> // scaled overload: c_vec is updated in place
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const int32_t& a_scale,
+                                   const BVecType& b_vec,
+                                   const int32_t& b_scale,
+                                   bool_constant<post_nop_> = {}) const
+    {
+        // swap A and B: vectors, scales and opsel indices are all exchanged when forwarding to Impl
+        Impl{}.template operator()<opselB, opselA>(
+            c_vec, b_vec, b_scale, a_vec, a_scale, bool_constant<post_nop_>{});
+    }
+
    // c_vec = a_vec * b_vec
    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
    {
@@ -540,6 +553,19 @@ struct WarpGemmAttributeMfmaTransposedCDistribution_SwizzleB
        Impl{}(c_vec, b_vec, a_vec, bool_constant<post_nop_>{});
    }

+    template <index_t opselA, index_t opselB, bool post_nop_ = false> // scaled overload: c_vec is updated in place
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const int32_t& a_scale,
+                                   const BVecType& b_vec,
+                                   const int32_t& b_scale,
+                                   bool_constant<post_nop_> = {}) const
+    {
+        // swap A and B: vectors, scales and opsel indices are all exchanged when forwarding to Impl
+        Impl{}.template operator()<opselB, opselA>(
+            c_vec, b_vec, b_scale, a_vec, a_scale, bool_constant<post_nop_>{});
+    }
+
    // c_vec = a_vec * b_vec
    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
    {
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp
@@ -1599,6 +1599,8 @@ struct WarpGemmAttributeMfmaImpl_f32_16x16x128_f8f6f4
    static constexpr index_t kCM0PerLane = 1;
    static constexpr index_t kCM1PerLane = 4;

+    static constexpr index_t kScaleGranularity = 32; // NOTE(review): presumably elements along K per scale value; confirm with callers
+
    // To get unity scale: 2^(kDefaultScale - 127) = 1.0
    static constexpr index_t kDefaultScale = 0x7F7F7F7F;

@@ -1683,15 +1685,15 @@ struct WarpGemmAttributeMfmaImpl_f32_16x16x128_f8f6f4
 };

 template <typename AType_, typename BType_, WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
-struct WarpGemmAttributeMfmaImpl_f32_32x32x64_f8_bf8_base
+struct WarpGemmAttributeMfmaImpl_f32_32x32x64_f8f6f4
 {
    static constexpr WGAttrCtlEnum Ctrl = Ctrl_;
    using ADataType                     = AType_;
    using BDataType                     = BType_;
    using CDataType                     = float;

-    using AVecType = ext_vector_t<ADataType, 32>;
-    using BVecType = ext_vector_t<BDataType, 32>;
+    using AVecType = ext_vector_t<ADataType, 32 / numeric_traits<ADataType>::PackedSize>;
+    using BVecType = ext_vector_t<BDataType, 32 / numeric_traits<BDataType>::PackedSize>;
    using CVecType = ext_vector_t<CDataType, 16>;

    static constexpr index_t kM = 32;
@@ -1711,6 +1713,71 @@ struct WarpGemmAttributeMfmaImpl_f32_32x32x64_f8_bf8_base
    static constexpr index_t kCM0PerLane = 4;
    static constexpr index_t kCM1PerLane = 4;

+    static constexpr index_t kScaleGranularity = 32; // NOTE(review): presumably elements along K per scale value; confirm with callers
+
+    // c_vec += a_vec * b_vec via the scaled MFMA builtin; a_scale/b_scale and opselA/opselB are passed through
+    template <index_t opselA, index_t opselB, bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const int32_t& a_scale,
+                                   const BVecType& b_vec,
+                                   const int32_t& b_scale,
+                                   bool_constant<post_nop_> = {}) const // post_nop_ is not referenced in this body
+    {
+#if defined(__gfx950__)
+        auto dtype2conf = [](auto dtype) { // data type -> (code used for cbsz/blgp, vector type for the builtin argument)
+            if constexpr(std::is_same_v<decltype(dtype), fp8_t>)
+                return make_tuple(number<0>{}, int32x8_t{});
+            else if constexpr(std::is_same_v<decltype(dtype), bf8_t>)
+                return make_tuple(number<1>{}, int32x8_t{});
+            else if constexpr(std::is_same_v<decltype(dtype), pk_fp6x16_t>)
+                return make_tuple(number<2>{}, pk_fp6x32_t{});
+            // not handled yet: e3m2 => make_tuple(number<3>{}, int32x6_t{})
+            else if constexpr(std::is_same_v<decltype(dtype), pk_fp4_t>)
+                return make_tuple(number<4>{}, int32x4_t{});
+            else
+                static_assert(false, "Unsupported data type for mfma scale");
+        };
+        auto dtype2code = [&](auto dtype) { return dtype2conf(dtype)(number<0>{}); }; // first tuple entry: the code
+        auto dtype2vec  = [&](auto dtype) { return dtype2conf(dtype)(number<1>{}); }; // second tuple entry: the vector
+        auto arg256     = [&](auto x) { // widens 16- and 24-byte vectors to int32x8_t, zero-filling the upper lanes
+            if constexpr(sizeof(x) == 16)
+                return int32x8_t{x[0], x[1], x[2], x[3], 0, 0, 0, 0};
+            else if constexpr(sizeof(x) == 24)
+                return int32x8_t{x[0], x[1], x[2], x[3], x[4], x[5], 0, 0};
+            else if constexpr(sizeof(x) == 32)
+                return x;
+            else
+                static_assert(false, "Unexpected vector size for mfma scale");
+        };
+
+        auto arg_a         = bit_cast<decltype(dtype2vec(ADataType{}))>(a_vec);
+        auto arg_b         = bit_cast<decltype(dtype2vec(BDataType{}))>(b_vec);
+        constexpr int cbsz = decltype(dtype2code(ADataType{}))::value; // code selected by ADataType
+        constexpr int blgp = decltype(dtype2code(BDataType{}))::value; // code selected by BDataType
+        c_vec              = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(
+            arg256(arg_a), arg256(arg_b), c_vec, cbsz, blgp, opselA, a_scale, opselB, b_scale);
+#else // not gfx950: c_vec is left unchanged, the arguments are only marked as used
+        ck_tile::ignore = c_vec;
+        ck_tile::ignore = a_vec;
+        ck_tile::ignore = b_vec;
+        ck_tile::ignore = a_scale;
+        ck_tile::ignore = b_scale;
+#endif
+    }
+
+    // c_vec = a_vec * b_vec (scaled): accumulates into a local c_vec{0.f} and returns it
+    template <index_t opselA, index_t opselB>
+    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec,
+                                       const int32_t& a_scale,
+                                       const BVecType& b_vec,
+                                       const int32_t& b_scale) const
+    {
+        CVecType c_vec{0.f};
+        operator()<opselA, opselB>(c_vec, a_vec, a_scale, b_vec, b_scale); // in-place scaled overload
+        return c_vec;
+    }
+
    // c_vec += a_vec * b_vec
    template <bool post_nop_ = false>
    CK_TILE_DEVICE void operator()(CVecType& c_vec,
@@ -1718,67 +1785,31 @@ struct WarpGemmAttributeMfmaImpl_f32_32x32x64_f8_bf8_base
                                   const BVecType& b_vec,
                                   bool_constant<post_nop_> = {}) const
    {
-        //__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, b, c, cbsz, blgp, opsel, scale_a,
-        // opsel, scale_b)
-#if defined(__gfx950__)
-        if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, fp8_t>)
-            c_vec = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(
-                a_vec, b_vec, c_vec, 0, 0, 0, 0, 0, 0);
-        else if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, bf8_t>)
-            c_vec = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(
-                a_vec, b_vec, c_vec, 0, 1, 0, 0, 0, 0);
-        else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, fp8_t>)
-            c_vec = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(
-                a_vec, b_vec, c_vec, 1, 0, 0, 0, 0, 0);
-        else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, bf8_t>)
-            c_vec = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(
-                a_vec, b_vec, c_vec, 1, 1, 0, 0, 0, 0);
-#else
-        ck_tile::ignore = c_vec;
-        ck_tile::ignore = a_vec;
-        ck_tile::ignore = b_vec;
-#endif
+        operator()<0, 0>(c_vec, a_vec, 0, b_vec, 0);
    }

    // c_vec = a_vec * b_vec
    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
    {
-#if defined(__gfx950__)
-        if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, fp8_t>)
-            return bit_cast<CVecType>(__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(
-                a_vec, b_vec, CVecType{0.f}, 0, 0, 0, 0, 0, 0));
-        else if constexpr(std::is_same_v<ADataType, fp8_t> && std::is_same_v<BDataType, bf8_t>)
-            return bit_cast<CVecType>(__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(
-                a_vec, b_vec, CVecType{0.f}, 0, 1, 0, 0, 0, 0));
-        else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, fp8_t>)
-            return bit_cast<CVecType>(__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(
-                a_vec, b_vec, CVecType{0.f}, 1, 0, 0, 0, 0, 0));
-        else if constexpr(std::is_same_v<ADataType, bf8_t> && std::is_same_v<BDataType, bf8_t>)
-            return bit_cast<CVecType>(__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(
-                a_vec, b_vec, CVecType{0.f}, 1, 1, 0, 0, 0, 0));
-#else
-        ck_tile::ignore = a_vec;
-        ck_tile::ignore = b_vec;
-        return CVecType{0.f};
-#endif
+        return operator()<0, 0>(a_vec, 0, b_vec, 0); // scaled overload with opsel 0 and scale arguments 0, as the removed calls passed
    }
 };

 template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 using WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_fp8 =
-    WarpGemmAttributeMfmaImpl_f32_32x32x64_f8_bf8_base<fp8_t, fp8_t, Ctrl_>;
+    WarpGemmAttributeMfmaImpl_f32_32x32x64_f8f6f4<fp8_t, fp8_t, Ctrl_>;

 template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 using WarpGemmAttributeMfmaImpl_f32_32x32x64_fp8_bf8 =
-    WarpGemmAttributeMfmaImpl_f32_32x32x64_f8_bf8_base<fp8_t, bf8_t, Ctrl_>;
+    WarpGemmAttributeMfmaImpl_f32_32x32x64_f8f6f4<fp8_t, bf8_t, Ctrl_>;

 template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 using WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_fp8 =
-    WarpGemmAttributeMfmaImpl_f32_32x32x64_f8_bf8_base<bf8_t, fp8_t, Ctrl_>;
+    WarpGemmAttributeMfmaImpl_f32_32x32x64_f8f6f4<bf8_t, fp8_t, Ctrl_>;

 template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
 using WarpGemmAttributeMfmaImpl_f32_32x32x64_bf8_bf8 =
-    WarpGemmAttributeMfmaImpl_f32_32x32x64_f8_bf8_base<bf8_t, bf8_t, Ctrl_>;
+    WarpGemmAttributeMfmaImpl_f32_32x32x64_f8f6f4<bf8_t, bf8_t, Ctrl_>;

 // int8
 template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp
@@ -130,6 +130,8 @@ template<WGAttrNumAccessEnum I> struct Dispatcher<fp8_t, bf8_t, float, 16, 16, 1
 template<WGAttrNumAccessEnum I> struct Dispatcher<bf8_t, fp8_t, float, 16, 16, 128,  true, false, false, I> { using Type = WarpGemmMfma_f32_16x16x128_bf8_fp8_CTransposed<I>; };
 template<WGAttrNumAccessEnum I> struct Dispatcher<bf8_t, bf8_t, float, 16, 16, 128,  true, false, false, I> { using Type = WarpGemmMfma_f32_16x16x128_bf8_bf8_CTransposed<I>; };

+template<WGAttrNumAccessEnum I> struct Dispatcher<pk_fp4_t, pk_fp4_t, float, 16, 16, 128,  true, false, false, I> { using Type = WarpGemmMfma_f32_16x16x128_fp4_fp4_CTransposed<I>; }; // pk_fp4_t, 7th argument (TransposeC) = true
+
 template<> struct Dispatcher<fp8_t, fp8_t, float, 32, 32,  64, false> { using Type = WarpGemmMfma_f32_32x32x64_fp8_fp8<>; };
 template<> struct Dispatcher<fp8_t, bf8_t, float, 32, 32,  64, false> { using Type = WarpGemmMfma_f32_32x32x64_fp8_bf8<>; };
 template<> struct Dispatcher<bf8_t, fp8_t, float, 32, 32,  64, false> { using Type = WarpGemmMfma_f32_32x32x64_bf8_fp8<>; };
@@ -143,6 +145,13 @@ template<> struct Dispatcher<fp8_t, bf8_t, float, 32, 32,  64, false, false, fal
 template<> struct Dispatcher<bf8_t, fp8_t, float, 32, 32,  64, false, false, false, EQuad> { using Type = WarpGemmMfma_f32_32x32x64_bf8_fp8<EQuad>; };
 template<> struct Dispatcher<bf8_t, bf8_t, float, 32, 32,  64, false, false, false, EQuad> { using Type = WarpGemmMfma_f32_32x32x64_bf8_bf8<EQuad>; };

+template<WGAttrNumAccessEnum I> struct Dispatcher<fp8_t, fp8_t, float, 32, 32,  64,  true, false, false, I> { using Type = WarpGemmMfma_f32_32x32x64_fp8_fp8_CTransposed<I>; }; // 7th argument (TransposeC) = true; I is forwarded as AttrNumAccess
+template<WGAttrNumAccessEnum I> struct Dispatcher<fp8_t, bf8_t, float, 32, 32,  64,  true, false, false, I> { using Type = WarpGemmMfma_f32_32x32x64_fp8_bf8_CTransposed<I>; };
+template<WGAttrNumAccessEnum I> struct Dispatcher<bf8_t, fp8_t, float, 32, 32,  64,  true, false, false, I> { using Type = WarpGemmMfma_f32_32x32x64_bf8_fp8_CTransposed<I>; };
+template<WGAttrNumAccessEnum I> struct Dispatcher<bf8_t, bf8_t, float, 32, 32,  64,  true, false, false, I> { using Type = WarpGemmMfma_f32_32x32x64_bf8_bf8_CTransposed<I>; };
+
+template<WGAttrNumAccessEnum I> struct Dispatcher<pk_fp4_t, pk_fp4_t, float, 32, 32,  64,  true, false, false, I> { using Type = WarpGemmMfma_f32_32x32x64_fp4_fp4_CTransposed<I>; }; // pk_fp4_t, TransposeC = true
+
 template<> struct Dispatcher<fp8_t, fp8_t, float, 32, 32,  32, false> { using Type = WarpGemmMfma_f32_32x32x32_fp8_fp8<>; };
 template<> struct Dispatcher<fp8_t, fp8_t, float, 32, 32,  32, false, false, false, EDouble> { using Type = WarpGemmMfma_f32_32x32x32_fp8_fp8<EDouble>; };
 template<> struct Dispatcher<bf8_t, bf8_t, float, 32, 32,  32, false> { using Type = WarpGemmMfma_f32_32x32x32_bf8_bf8<>; };
@@ -152,7 +161,6 @@ template<> struct Dispatcher<fp8_t, fp8_t, float, 16, 16,  64,  true> { using Ty
 template<> struct Dispatcher<fp8_t, fp8_t, float, 16, 16,  64, false> { using Type = WarpGemmMfma_f32_16x16x64_fp8_fp8<>; };
 template<> struct Dispatcher<fp8_t, fp8_t, float, 16, 16,  64, false, false, false, EDouble> { using Type = WarpGemmMfma_f32_16x16x64_fp8_fp8<EDouble>; };

-
 //WMMA cases
 template<bool TransposeC> struct Dispatcher<fp8_t, fp8_t, float, 16, 16, 16, TransposeC, false> { using Type = WarpGemmWmma_f32_16x16x16_f8_f8<TransposeC>; };
 template<bool TransposeC> struct Dispatcher<bf8_t, bf8_t, float, 16, 16, 16, TransposeC, false> { using Type = WarpGemmWmma_f32_16x16x16_bf8_bf8<TransposeC>; };