MX GEMM - Add MX BF8 example (#2071)

* Add MX GEMM example for MX BF8 * Verified MX FP8 with 16x16x128 scale builtin * Verify MX BF8 GEMM with BF16 output
2026-06-05 20:55:59 +00:00 · 2025-04-16 15:25:02 -06:00
parent 3bb62f16cd
commit da54464cce
5 changed files with 139 additions and 0 deletions
--- a/example/67_gemm_microscaling/CMakeLists.txt
+++ b/example/67_gemm_microscaling/CMakeLists.txt
@@ -3,3 +3,6 @@ add_custom_target(example_gemm_mx)
 add_example_executable(example_gemm_mx_fp8 gemm_mx_fp8.cpp)
 add_example_dependencies(example_gemm_mx example_gemm_mx_fp8)

+add_example_executable(example_gemm_mx_bf8 gemm_mx_bf8.cpp)
+add_example_dependencies(example_gemm_mx example_gemm_mx_bf8)
+
--- a/example/67_gemm_microscaling/gemm_mx_bf8.cpp
+++ b/example/67_gemm_microscaling/gemm_mx_bf8.cpp
@@ -0,0 +1,98 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gemm_mx_common.hpp"
+
+using ADataType = ck::bf8_t;
+using BDataType = ck::bf8_t;
+
+using XDataType = ck::e8m0_bexp_t;
+
+using CDataType        = ck::bhalf_t;
+using AccDataType      = float;
+using CShuffleDataType = CDataType;
+
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
+
+using AElementOp = PassThrough; // elementwise transformation for A matrix
+using BElementOp = PassThrough; // elementwise transformation for B matrix
+using CElementOp = PassThrough; // elementwise transformation for C matrix
+
+constexpr ck::index_t ScaleBlockSize = 32; // scaling block size
+constexpr ck::index_t KPerBlock      = 128;
+
+constexpr auto GemmSpec      = ck::tensor_operation::device::GemmSpecialization::Default;
+constexpr auto BlkGemmPSched = ck::BlockGemmPipelineScheduler::Intrawave;
+constexpr auto BlkGemmPVer   = ck::BlockGemmPipelineVersion::v1;
+
+using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMX_Xdl_CShuffleV3<
+    ALayout,          // ALayout
+    BLayout,          // BLayout
+    CLayout,          // CLayout
+    ADataType,        // ADataType
+    XDataType,        // AScaleDataType
+    BDataType,        // BDataType
+    XDataType,        // BScaleDataType
+    CDataType,        // CDataType
+    AccDataType,      // GemmAccDataType
+    CShuffleDataType, // CShuffleDataType
+    AElementOp,       // AElementwiseOperation
+    BElementOp,       // BElementwiseOperation
+    CElementOp,       // CElementwiseOperation
+    GemmSpec,         // GemmSpec
+    ScaleBlockSize,   // ScaleBlockSize: Scaling block size
+    128,              // BlockSize: Thread block size
+    128,              // MPerBlock
+    16,               // NPerBlock
+    KPerBlock,        // KPerBlock
+    16,               // AK1
+    16,               // BK1
+    16,               // MPerXDL
+    16,               // NPerXDL
+    4,                // MXdlPerWave
+    1,                // NXdlPerWave
+    S<8, 16, 1>,      // ABlockTransferThreadClusterLengths_AK0_M_AK1
+    S<1, 0, 2>,       // ABlockTransferThreadClusterArrangeOrder
+    S<1, 0, 2>,       // ABlockTransferSrcAccessOrder
+    2,                // ABlockTransferSrcVectorDim
+    16,               // ABlockTransferSrcScalarPerVector
+    16,               // ABlockTransferDstScalarPerVector_AK1
+    false,            // ABlockLdsExtraM
+    S<8, 16, 1>,      // BBlockTransferThreadClusterLengths_BK0_N_BK1
+    S<1, 0, 2>,       // BBlockTransferThreadClusterArrangeOrder
+    S<1, 0, 2>,       // BBlockTransferSrcAccessOrder
+    2,                // BBlockTransferSrcVectorDim
+    16,               // BBlockTransferSrcScalarPerVector
+    16,               // BBlockTransferDstScalarPerVector_BK1
+    false,            // BBlockLdsExtraN
+    1,                // CShuffleMXdlPerWavePerShuffle
+    1,                // CShuffleNXdlPerWavePerShuffle
+    S<1, 16, 1, 8>,   // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+    2,                // CShuffleBlockTransferScalarPerVector_NPerBlock
+    BlkGemmPSched,    // BlkGemmPipeSched
+    BlkGemmPVer,      // BlkGemmPipelineVer
+    ADataType,        // ComputeTypeA
+    BDataType         // ComputeTypeB
+    >;
+
+int main(int argc, char* argv[])
+{
+    return run_mx_gemm_example<DeviceOpInstance,
+                               ADataType,
+                               BDataType,
+                               XDataType,
+                               CDataType,
+                               ALayout,
+                               BLayout,
+                               CLayout,
+                               AElementOp,
+                               BElementOp,
+                               CElementOp,
+                               AccDataType,
+                               CShuffleDataType,
+                               ScaleBlockSize>(argc, argv)
+               ? 0
+               : -1;
+}
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp
@@ -699,6 +699,9 @@ struct DeviceGemmMX_Xdl_CShuffleV3 : public DeviceGemmMX<ALayout,

        static_assert(ScaleBlockSize == 32, "Only ScaleBlockSize 32 is supported");

+        static_assert(is_same_v<ComputeTypeA, ADataType> && is_same_v<ComputeTypeB, BDataType>,
+                      "ComputeTypeA and ComputeTypeB must be the same as ADataType and BDataType");
+
        return true;
    }

--- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
@@ -1141,6 +1141,12 @@ struct MfmaSelector
        return MfmaInstr::mfma_scale_f32_16x16x128f8f6f4;
    }

+    template <>
+    constexpr auto GetMfma<bf8_t, 16, 16, bf8_t, false, true>()
+    {
+        return MfmaInstr::mfma_scale_f32_16x16x128f8f6f4;
+    }
+
    template <>
    constexpr auto GetMfma<bf8_t, 32, 32>()
    {
--- a/include/ck/utility/amd_xdlops.hpp
+++ b/include/ck/utility/amd_xdlops.hpp
@@ -588,6 +588,35 @@ struct intrin_mfma_scale_f32_16x16x128f8f6f4<16, 16>
        ignore = reg_b;
        ignore = scale_b;
        ignore = reg_c;
+#endif
+    }
+
+    template <class FloatC>
+    __device__ static void Run(const bf8x32_t& reg_a,
+                               const int32_t& scale_a,
+                               const bf8x32_t& reg_b,
+                               const int32_t& scale_b,
+                               FloatC& reg_c)
+    {
+#if defined(__gfx950__)
+        // https://github.com/ROCm/llvm-project/blob/656552edc693e2bb4abc9258399c39d190fce2b3/llvm/test/Verifier/AMDGPU/mfma-scale.ll#L10
+        reg_c.template AsType<float4_t>()(Number<0>{}) =
+            __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(
+                reg_a,
+                reg_b,
+                reg_c.template AsType<float4_t>()[Number<0>{}],
+                1, // cbsz
+                1, // blgp
+                0, // OPSEL
+                scale_a,
+                0, // OPSEL
+                scale_b);
+#else
+        ignore = reg_a;
+        ignore = scale_a;
+        ignore = reg_b;
+        ignore = scale_b;
+        ignore = reg_c;
 #endif
    }
 };