MX FP GEMM - Test MX FP8 MFMA Instructions (#1902)

* Refactored `load_A_row_major` to follow scale mapping
* Refactored `load_A_col_major` to follow scale mapping
* Refactored `load_B_col_major` to follow scale mapping
* Verified non-scaled test
* Verified scaled tests
* Used ReferenceMXGemm for verification
* Updated license headers

[ROCm/composable_kernel commit: ffa13455a2]
2026-05-14 02:02:46 +00:00 · 2025-02-21 13:35:54 -07:00
parent d3f31b32d2
commit c3175995ba
4 changed files with 897 additions and 127 deletions
--- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -780,7 +780,6 @@ struct mfma_type<MfmaInstr::mfma_f32_16x16x32bf8f8>
    }
 };

-// TODO: fix mfma...f8f6f4 instructions
 template <>
 struct mfma_type<MfmaInstr::mfma_f32_32x32x64f8f6f4>
 {
@@ -847,9 +846,14 @@ struct mfma_type<MfmaInstr::mfma_scale_f32_32x32x64f8f6f4>
    // clang-format on

    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
-    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    __device__ void run(const FloatA& a,
+                        const int32_t scale_a,
+                        const FloatB& b,
+                        const int32_t scale_b,
+                        FloatC& reg_c) const
    {
-        intrin_mfma_scale_f32_32x32x64f8f6f4<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+        intrin_mfma_scale_f32_32x32x64f8f6f4<MPerXdlops, NPerXdlops>::Run(
+            a, scale_a, b, scale_b, reg_c);
    }
 };

@@ -871,9 +875,14 @@ struct mfma_type<MfmaInstr::mfma_scale_f32_16x16x128f8f6f4>
    // clang-format on

    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
-    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    __device__ void run(const FloatA& a,
+                        const int32_t scale_a,
+                        const FloatB& b,
+                        const int32_t scale_b,
+                        FloatC& reg_c) const
    {
-        intrin_mfma_scale_f32_16x16x128f8f6f4<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+        intrin_mfma_scale_f32_16x16x128f8f6f4<MPerXdlops, NPerXdlops>::Run(
+            a, scale_a, b, scale_b, reg_c);
    }
 };

--- a/include/ck/utility/amd_xdlops.hpp
+++ b/include/ck/utility/amd_xdlops.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -533,9 +533,9 @@ struct intrin_mfma_scale_f32_32x32x64f8f6f4<32, 32>
                reg_c.template AsType<float16_t>()[Number<0>{}],
                0, // cbsz
                0, // blgp
-                0, // { OPSEL_HI[0], OPSEL[0] }?
+                0, // OPSEL
                scale_a,
-                0, // { OPSEL_HI[1], OPSEL[1] }?
+                0, // OPSEL
                scale_b);
 #else
        ignore = reg_a;
@@ -569,9 +569,9 @@ struct intrin_mfma_scale_f32_16x16x128f8f6f4<16, 16>
                reg_c.template AsType<float4_t>()[Number<0>{}],
                0, // cbsz
                0, // blgp
-                0, // { OPSEL_HI[0], OPSEL[0] }?
+                0, // OPSEL
                scale_a,
-                0, // { OPSEL_HI[1], OPSEL[1] }?
+                0, // OPSEL
                scale_b);
 #else
        ignore = reg_a;
--- a/test/mx_mfma_op/mx_mfma_op.cpp
+++ b/test/mx_mfma_op/mx_mfma_op.cpp
@@ -30,11 +30,11 @@ bool run_mfma_test(ck::index_t init)
    constexpr auto BLOCK_N = mfma_instr.n_per_blk;
    constexpr auto BLOCK_K = mfma_instr.num_input_blks * mfma_instr.k_per_blk;

-    const auto mx_mfma_kernel = ck::matmul<AType, BType, CType, AccType, BLOCK_M, BLOCK_N, BLOCK_K>;
+    const auto mfma_kernel = ck::matmul<AType, BType, CType, AccType, BLOCK_M, BLOCK_N, BLOCK_K>;

    bool pass = true;

-    pass = ck::mfma_test::TestMFMA<decltype(mx_mfma_kernel),
+    pass = ck::mfma_test::TestMFMA<decltype(mfma_kernel),
                                   AType,
                                   BType,
                                   CType,
@@ -45,21 +45,80 @@ bool run_mfma_test(ck::index_t init)
                                   CLayout,
                                   BLOCK_M,
                                   BLOCK_N,
-                                   BLOCK_K>{}(mx_mfma_kernel, init);
+                                   BLOCK_K>{}(mfma_kernel, init);

    return pass;
 }

 TEST(MFMA, FP8MFMA16x16x128)
 {
-    auto AB_init = 0;
+    auto AB_init = 4;
    auto pass    = run_mfma_test<f8_t, f8_t, half_t, ck::MFMA_F8F6F4::F32_16x16x128>(AB_init);
    EXPECT_TRUE(pass);
 }

 TEST(MFMA, FP8MFMA32x32x64)
 {
-    auto AB_init = 0;
+    auto AB_init = 4;
    auto pass    = run_mfma_test<f8_t, f8_t, float, ck::MFMA_F8F6F4::F32_32x32x64>(AB_init);
    EXPECT_TRUE(pass);
 }
+
+/**
+ * @brief Run the test for the given MX MFMA instruction
+ *
+ * @param init - selects initialization algorithm for A and B tensors
+ */
+template <typename AType, typename BType, typename CType, ck::MFMA_F8F6F4 mfma>
+bool run_mxmfma_test(ck::index_t init)
+{
+    static_assert(mfma == ck::MFMA_F8F6F4::SCALE_F32_16x16x128 ||
+                      mfma == ck::MFMA_F8F6F4::SCALE_F32_32x32x64,
+                  "Only SCALE_F32_16x16x128 and SCALE_F32_32x32x64 are supported");
+    using ALayout = ck::tensor_layout::gemm::RowMajor;
+    using BLayout = ck::tensor_layout::gemm::ColumnMajor;
+    using CLayout = ck::tensor_layout::gemm::RowMajor;
+
+    using AccType   = float;           // only MFMA_F32 instructions supported
+    using ScaleType = ck::e8m0_bexp_t; // biased exponent type
+
+    ck::mfma_type<static_cast<ck::MfmaInstr>(mfma)> mfma_instr;
+    constexpr auto BLOCK_M = mfma_instr.m_per_blk;
+    constexpr auto BLOCK_N = mfma_instr.n_per_blk;
+    constexpr auto BLOCK_K = mfma_instr.num_input_blks * mfma_instr.k_per_blk;
+    constexpr auto BLOCK_X = 32; // scaling vector size
+
+    const auto mx_mfma_kernel =
+        ck::matmul<AType, BType, ScaleType, CType, AccType, BLOCK_M, BLOCK_N, BLOCK_K, BLOCK_X>;
+
+    bool pass = true;
+
+    pass = ck::mxmfma_test::TestMXMFMA<decltype(mx_mfma_kernel),
+                                       AType,
+                                       BType,
+                                       ScaleType,
+                                       CType,
+                                       ALayout,
+                                       BLayout,
+                                       CLayout,
+                                       BLOCK_M,
+                                       BLOCK_N,
+                                       BLOCK_K,
+                                       BLOCK_X>{}(mx_mfma_kernel, init);
+
+    return pass;
+}
+
+TEST(MXMFMA, MXFP8MFMA16x16x128)
+{
+    auto AB_init = 7;
+    auto pass = run_mxmfma_test<f8_t, f8_t, float, ck::MFMA_F8F6F4::SCALE_F32_16x16x128>(AB_init);
+    EXPECT_TRUE(pass);
+}
+
+TEST(MXMFMA, MXFP8MFMA32x32x64)
+{
+    auto AB_init = 7;
+    auto pass = run_mxmfma_test<f8_t, f8_t, half_t, ck::MFMA_F8F6F4::SCALE_F32_32x32x64>(AB_init);
+    EXPECT_TRUE(pass);
+}
--- a/test/mx_mfma_op/mx_mfma_op.hpp
+++ b/test/mx_mfma_op/mx_mfma_op.hpp