Introduce MX GEMM for FP8 data type (#2000)

2026-05-02 04:31:25 +00:00 · 2025-03-24 15:41:07 -06:00
parent c027637a8f
commit 6660dc6b8e
11 changed files with 4129 additions and 135 deletions
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -181,6 +181,23 @@ struct BlockwiseGemmXdlops_pipeline_base

    using Tuple4 = decltype(CalculateAThreadOriginDataIndex());

+    /**
+     * @brief Constructor for BlockwiseGemmXdlops_pipeline_base.
+     *
+     * This constructor initializes the thread copy objects for matrices A and B.
+     * It also performs several compile-time checks to ensure the correctness of the
+     * matrix tile descriptors.
+     *
+     * @param a_origin The origin data index for matrix A.
+     * @param b_origin The origin data index for matrix B.
+     *
+     * @note The constructor includes static assertions to ensure that:
+     * - The matrix tile descriptors for A and B are known at compile-time.
+     * - The number of threads in the thread block matches the product of MWaves, NWaves, and
+     * WaveSize.
+     * - The dimensions of the block are divisible by the product of the corresponding XDL and
+     * repeat dimensions.
+     */
    __host__ __device__
    BlockwiseGemmXdlops_pipeline_base(Tuple4 a_origin = CalculateAThreadOriginDataIndex(),
                                      Tuple4 b_origin = CalculateBThreadOriginDataIndex())
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_mx_selector.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_mx_selector.hpp
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_mx.hpp"
+
+namespace ck {
+
+template <BlockGemmPipelineVersion BlkGemmPipelineVer,
+          BlockGemmPipelineScheduler BlkGemmPipeSche,
+          index_t ThreadBlockSize,
+          index_t ScaleBlockSize,
+          typename ADataType,
+          typename AScaleDataType,
+          typename BDataType,
+          typename BScaleDataType,
+          typename ComputeDataType, // TODO: remove this as in this pipeline ADataType and BDataType
+                                    // must be used for compute
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPack>
+constexpr auto BlockGemmMXPipeline_Selector()
+{
+    if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
+    {
+        return BlockwiseGemmXdlops_pipeline_v1_mx<BlkGemmPipeSche,
+                                                  ThreadBlockSize,
+                                                  ScaleBlockSize,
+                                                  ADataType,
+                                                  AScaleDataType,
+                                                  BDataType,
+                                                  BScaleDataType,
+                                                  ComputeDataType,
+                                                  AccDataType,
+                                                  ATileDesc,
+                                                  BTileDesc,
+                                                  AMmaTileDesc,
+                                                  BMmaTileDesc,
+                                                  ABlockTransferSrcScalarPerVector,
+                                                  BBlockTransferSrcScalarPerVector,
+                                                  MPerBlock,
+                                                  NPerBlock,
+                                                  KPerBlock,
+                                                  MPerXDL,
+                                                  NPerXDL,
+                                                  MRepeat,
+                                                  NRepeat,
+                                                  KPack>{};
+    }
+    else
+    {
+        std::cerr << "BlockGemmPipeline configuration is not available" << std::endl;
+    }
+}
+
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_mx.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_mx.hpp
@@ -0,0 +1,617 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp"
+
+namespace ck {
+
+// Naive pipeline with lowest resource request per WGP
+// GlobalPrefetchStages: 1
+// LocalPreFillStages: 1
+// LocalPreFetchStages: 0
+// LocalSharedMemoryBuffer: 1
+
+template <BlockGemmPipelineScheduler BlkGemmPipelineVer,
+          index_t ThreadBlockSize,
+          index_t ScaleBlockSize,
+          typename ADataType,
+          typename AScaleDataType,
+          typename BDataType,
+          typename BScaleDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat, // MXdlPerWave
+          index_t NRepeat, // NXdlPerWave
+          index_t KPack>
+struct BlockwiseGemmXdlops_pipeline_v1_mx
+{
+};
+
+template <index_t ThreadBlockSize,
+          index_t ScaleBlockSize,
+          typename ADataType,
+          typename AScaleDataType,
+          typename BDataType,
+          typename BScaleDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat, // MXdlPerWave
+          index_t NRepeat, // NXdlPerWave
+          index_t KPack>
+struct BlockwiseGemmXdlops_pipeline_v1_mx<BlockGemmPipelineScheduler::Intrawave,
+                                          ThreadBlockSize,
+                                          ScaleBlockSize,
+                                          ADataType,
+                                          AScaleDataType,
+                                          BDataType,
+                                          BScaleDataType,
+                                          ComputeDataType,
+                                          AccDataType,
+                                          ATileDesc,
+                                          BTileDesc,
+                                          AMmaTileDesc,
+                                          BMmaTileDesc,
+                                          ABlockTransferSrcScalarPerVector,
+                                          BBlockTransferSrcScalarPerVector,
+                                          MPerBlock,
+                                          NPerBlock,
+                                          KPerBlock,
+                                          MPerXDL,
+                                          NPerXDL,
+                                          MRepeat,
+                                          NRepeat,
+                                          KPack>
+    : BlockwiseGemmXdlops_pipeline_base<ThreadBlockSize,
+                                        ADataType,
+                                        BDataType,
+                                        ComputeDataType,
+                                        AccDataType,
+                                        ATileDesc,
+                                        BTileDesc,
+                                        AMmaTileDesc,
+                                        BMmaTileDesc,
+                                        ABlockTransferSrcScalarPerVector,
+                                        BBlockTransferSrcScalarPerVector,
+                                        MPerBlock,
+                                        NPerBlock,
+                                        KPerBlock,
+                                        MPerXDL,
+                                        NPerXDL,
+                                        MRepeat,
+                                        NRepeat,
+                                        KPack>
+
+{
+    using Base = BlockwiseGemmXdlops_pipeline_base<ThreadBlockSize,
+                                                   ADataType,
+                                                   BDataType,
+                                                   ComputeDataType,
+                                                   AccDataType,
+                                                   ATileDesc,
+                                                   BTileDesc,
+                                                   AMmaTileDesc,
+                                                   BMmaTileDesc,
+                                                   ABlockTransferSrcScalarPerVector,
+                                                   BBlockTransferSrcScalarPerVector,
+                                                   MPerBlock,
+                                                   NPerBlock,
+                                                   KPerBlock,
+                                                   MPerXDL,
+                                                   NPerXDL,
+                                                   MRepeat,
+                                                   NRepeat,
+                                                   KPack>;
+    using Base::I0;
+    using Base::I1;
+    using Base::KRepeat;
+    using Base::MWaves;
+    using Base::NWaves;
+    using Base::WaveSize;
+    using Base::xdlops_gemm;
+
+    using Base::CalculateCThreadOriginDataIndex;
+    using Base::CalculateCThreadOriginDataIndex8D;
+    using Base::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::GetCThreadBuffer;
+    using Base::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::GetWaveIdx;
+    using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+
+    using Base::a_block_desc_m0_m1_m2_k;
+    using Base::b_block_desc_n0_n1_n2_k;
+
+    using Base::AMmaKStride;
+    using Base::BMmaKStride;
+
+    using Tuple4 = typename Base::Tuple4;
+
+    static constexpr index_t PrefetchStages  = 1;
+    static constexpr index_t PrefillStages   = 1;
+    static constexpr index_t GlobalBufferNum = 1;
+
+    static constexpr auto ScalesPerKBlockSize =
+        KPerBlock / ScaleBlockSize; // How many mx-vectors per K block size
+
+    __host__ static constexpr bool BlockHasHotloop(index_t num_loop)
+    {
+        return num_loop > PrefetchStages;
+    }
+
+    __host__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
+    {
+        ignore = num_loop;
+        return TailNumber::Full;
+    }
+
+    __device__ static auto CalculateAThreadOriginDataIndex()
+    {
+        const auto wave_idx = GetWaveIdx();
+
+        const auto waveId_m = wave_idx[I0];
+
+        const auto xdlops_a_idx = xdlops_gemm.CalculateAThreadOriginDataIndex();
+
+        return make_tuple(0, waveId_m, xdlops_a_idx[I1], xdlops_gemm.KPerXdlops * xdlops_a_idx[I0]);
+    }
+
+    __device__ static auto CalculateBThreadOriginDataIndex()
+    {
+        const auto wave_idx = GetWaveIdx();
+
+        const auto waveId_n = wave_idx[I1];
+
+        const auto xdlops_b_idx = xdlops_gemm.CalculateBThreadOriginDataIndex();
+
+        return make_tuple(0, waveId_n, xdlops_b_idx[I1], xdlops_gemm.KPerXdlops * xdlops_b_idx[I0]);
+    }
+
+    /**
+     * @brief Constructor for BlockwiseGemmXdlops_pipeline_v1_mx.
+     *
+     * The primary purpose of this constructor is to modify default initialization of the base class
+     * with the origin data index suitable for microscaling.
+     *
+     * @param a_origin The origin data index for matrix A.
+     * @param b_origin The origin data index for matrix B.
+     *
+     */
+    __host__ __device__
+    BlockwiseGemmXdlops_pipeline_v1_mx(Tuple4 a_origin = CalculateAThreadOriginDataIndex(),
+                                       Tuple4 b_origin = CalculateBThreadOriginDataIndex())
+        : Base(a_origin, b_origin)
+    {
+    }
+
+    template <bool HasMainLoop,
+              TailNumber TailNum,
+              typename AGridDesc,
+              typename ABlockDesc,
+              typename ABlockTransfer,
+              typename AGridBuffer,
+              typename ABlockBuffer,
+              typename ABlockTransferStep,
+              typename BGridDesc,
+              typename BBlockDesc,
+              typename BBlockTransfer,
+              typename BGridBuffer,
+              typename BBlockBuffer,
+              typename BBlockTransferStep,
+              typename CThreadBuffer,
+              typename AScaleGridBuffer,
+              typename AScaleGridDesc,
+              typename AScaleThreadTransfer,
+              typename BScaleGridBuffer,
+              typename BScaleGridDesc,
+              typename BScaleThreadTransfer>
+    __device__ void Run(
+        // ABlockCopy
+        const AGridDesc& a_grid_desc,
+        const ABlockDesc& a_block_desc,
+        ABlockTransfer& a_blockwise_copy,
+        const AGridBuffer& a_grid_buf,
+        ABlockBuffer& a_block_buf,
+        const ABlockTransferStep& a_block_copy_step,
+        // BBlockCopy
+        const BGridDesc& b_grid_desc,
+        const BBlockDesc& b_block_desc,
+        BBlockTransfer& b_blockwise_copy,
+        const BGridBuffer& b_grid_buf,
+        BBlockBuffer& b_block_buf,
+        const BBlockTransferStep& b_block_copy_step,
+        // CThread
+        CThreadBuffer& c_thread_buf,
+        // A and B scales
+        const AScaleGridDesc& a_scale_grid_desc,
+        AScaleThreadTransfer& a_scale_thread_copy,
+        const AScaleGridBuffer& a_scale_grid_buf,
+        const BScaleGridDesc& b_scale_grid_desc,
+        BScaleThreadTransfer& b_scale_thread_copy,
+        const BScaleGridBuffer& b_scale_grid_buf,
+        index_t num_loop) const
+    {
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            a_thread_desc_.GetElementSpaceSize());
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            b_thread_desc_.GetElementSpaceSize());
+
+        auto a_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, AScaleDataType>(
+            a_scale_thread_desc.GetElementSpaceSize());
+
+        auto b_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, BScaleDataType>(
+            b_scale_thread_desc.GetElementSpaceSize());
+
+        // Global prefetch 1
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+        b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
+
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+        static_assert(xdlops_gemm.mfma_instr.num_groups_per_blk *
+                              xdlops_gemm.mfma_instr.group_size ==
+                          xdlops_gemm.GetRegSizePerXdlops(),
+                      "Assume num_regs_per_blk == num_groups_per_blk * group_size");
+
+        // Prefetch a_scales
+        static_for<0, MRepeat, 1>{}([&](auto m0) {
+            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                static_for<0, xdlops_gemm.mfma_instr.num_groups_per_blk, 1>{}([&](auto g) {
+                    auto a_scale_thread_buf_group =
+                        make_static_buffer<AddressSpaceEnum::Vgpr, AScaleDataType>(
+                            a_scale_thread_desc_group.GetElementSpaceSize());
+
+                    a_scale_thread_copy.Run(a_scale_grid_desc,
+                                            a_scale_grid_buf,
+                                            a_scale_thread_desc_group,
+                                            make_tuple(I0, I0),
+                                            a_scale_thread_buf_group);
+
+                    static_for<0, xdlops_gemm.mfma_instr.group_size, 1>{}([&](auto i) {
+                        constexpr index_t a_scale_offset =
+                            a_scale_thread_desc.CalculateOffset(make_tuple(m0, k0, g, i));
+                        a_scale_thread_buf(Number<a_scale_offset>{}) =
+                            a_scale_thread_buf_group[Number<i>{}];
+                    });
+                    // go to the next group
+                    a_scale_thread_copy.MoveSrcSliceWindow(
+                        a_scale_grid_desc,
+                        make_multi_index(2 * xdlops_gemm.mfma_instr.group_size, 0));
+                }); // g
+
+                // restore row id and advance to the next scale
+                a_scale_thread_copy.MoveSrcSliceWindow(
+                    a_scale_grid_desc,
+                    make_multi_index(-2 * xdlops_gemm.mfma_instr.group_size *
+                                         xdlops_gemm.mfma_instr.num_groups_per_blk,
+                                     1));
+            }); // k0
+
+            // restore column id and advance to the next set of rows
+            a_scale_thread_copy.MoveSrcSliceWindow(
+                a_scale_grid_desc, make_multi_index(MWaves * MPerXDL, -ScalesPerKBlockSize));
+        }); // m0
+
+        // restore row id and advance to the next set of scales
+        a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
+                                               make_multi_index(-MPerBlock, ScalesPerKBlockSize));
+
+        // Prefetch b_scales
+        static_for<0, NRepeat, 1>{}([&](auto n0) {
+            b_scale_thread_copy.Run(b_scale_grid_desc,
+                                    b_scale_grid_buf,
+                                    b_scale_thread_desc,
+                                    make_tuple(n0, I0),
+                                    b_scale_thread_buf);
+            b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
+                                                   make_multi_index(NWaves * NPerXDL, 0));
+        });
+
+        b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
+                                               make_multi_index(-NPerBlock, ScalesPerKBlockSize));
+
+        // Local prefill 1
+        a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
+        b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
+
+        // Initialize C
+        c_thread_buf.Clear();
+
+        auto c_thread_buf_per_scale = remove_cvref_t<decltype(c_thread_buf)>();
+
+        // main body
+        if constexpr(HasMainLoop)
+        {
+            // loop over k with the step KPerBlock
+            index_t i = 0;
+            do
+            {
+                // -------------------------------------------------------------------------------------------
+                a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+                b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
+
+                a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+                b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+                block_sync_lds();
+
+                static_for<0, KRepeat, 1>{}([&](auto k) {
+                    constexpr auto a_k_step = k * AMmaKStride * KPack / xdlops_gemm.K1PerXdlops;
+                    constexpr auto b_k_step = k * BMmaKStride * KPack / xdlops_gemm.K1PerXdlops;
+
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
+                                           make_tuple(m0, I0, I0, Number<a_k_step>{}),
+                                           a_block_buf,
+                                           a_thread_desc_,
+                                           make_tuple(m0, I0, k, I0),
+                                           a_thread_buf);
+                    });
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                           make_tuple(n0, I0, I0, Number<b_k_step>{}),
+                                           b_block_buf,
+                                           b_thread_desc_,
+                                           make_tuple(n0, I0, k, I0),
+                                           b_thread_buf);
+                    });
+                });
+
+                static_for<0, MRepeat, 1>{}([&](auto m0) {
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        static_for<0, KRepeat, 1>{}([&](auto k0) {
+                            c_thread_buf_per_scale.Clear();
+                            vector_type<ComputeDataType, KPack> a_thread_vec;
+                            vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                            static_for<0, KPack, 1>{}([&](auto ik) {
+                                a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                        make_tuple(m0, I0, k0, ik))>{}];
+                                b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                    b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                        make_tuple(n0, I0, k0, ik))>{}];
+                            });
+
+                            using mfma_input_type =
+                                typename vector_type<ComputeDataType,
+                                                     xdlops_gemm.K1PerXdlops>::type;
+
+                            // MFMA accumulation
+                            // m = 1:MPerXDL
+                            //   n = 1:NPerXDL
+                            //     k = 1:KPack
+                            //       c(m,n) += a(m,k)*b(k,n)
+                            xdlops_gemm.template Run<>(
+                                a_thread_vec.template AsType<mfma_input_type>(),
+                                b_thread_vec.template AsType<mfma_input_type>(),
+                                c_thread_buf_per_scale.GetVectorTypeReference(I0));
+
+                            // one scale per k0
+                            constexpr index_t b_scale_offset =
+                                b_scale_thread_desc.CalculateOffset(make_tuple(n0, k0));
+
+                            static_for<0, xdlops_gemm.mfma_instr.num_groups_per_blk, 1>{}(
+                                [&](auto g) {
+                                    static_for<0, xdlops_gemm.mfma_instr.group_size, 1>{}(
+                                        [&](auto r) {
+                                            constexpr index_t a_scale_offset =
+                                                a_scale_thread_desc.CalculateOffset(
+                                                    make_tuple(m0, k0, g, r));
+
+                                            constexpr auto reg_offset =
+                                                g * xdlops_gemm.mfma_instr.group_size + r;
+
+                                            constexpr index_t c_offset =
+                                                c_thread_desc_.CalculateOffset(
+                                                    make_tuple(m0, n0, reg_offset));
+
+                                            c_thread_buf(Number<c_offset>{}) +=
+                                                c_thread_buf_per_scale[Number<reg_offset>{}] *
+                                                type_convert<AccDataType>(
+                                                    b_scale_thread_buf[Number<b_scale_offset>{}]) *
+                                                type_convert<AccDataType>(
+                                                    a_scale_thread_buf[Number<a_scale_offset>{}]);
+                                        });
+                                });
+                        });
+                    });
+                });
+
+                static_for<0, MRepeat, 1>{}([&](auto m0) {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        static_for<0, xdlops_gemm.mfma_instr.num_groups_per_blk, 1>{}([&](auto g) {
+                            auto a_scale_thread_buf_group =
+                                make_static_buffer<AddressSpaceEnum::Vgpr, AScaleDataType>(
+                                    a_scale_thread_desc_group.GetElementSpaceSize());
+
+                            a_scale_thread_copy.Run(a_scale_grid_desc,
+                                                    a_scale_grid_buf,
+                                                    a_scale_thread_desc_group,
+                                                    make_tuple(I0, I0),
+                                                    a_scale_thread_buf_group);
+
+                            static_for<0, xdlops_gemm.mfma_instr.group_size, 1>{}([&](auto r) {
+                                constexpr index_t a_scale_offset =
+                                    a_scale_thread_desc.CalculateOffset(make_tuple(m0, k0, g, r));
+                                a_scale_thread_buf(Number<a_scale_offset>{}) =
+                                    a_scale_thread_buf_group[Number<r>{}];
+                            });
+                            // go to the next group
+                            a_scale_thread_copy.MoveSrcSliceWindow(
+                                a_scale_grid_desc,
+                                make_multi_index(2 * xdlops_gemm.mfma_instr.group_size, 0));
+                        }); // g
+
+                        // restore row id and advance to the next scale
+                        a_scale_thread_copy.MoveSrcSliceWindow(
+                            a_scale_grid_desc,
+                            make_multi_index(-2 * xdlops_gemm.mfma_instr.group_size *
+                                                 xdlops_gemm.mfma_instr.num_groups_per_blk,
+                                             1));
+                    }); // k0
+
+                    // restore column id and advance to the next set of rows
+                    a_scale_thread_copy.MoveSrcSliceWindow(
+                        a_scale_grid_desc,
+                        make_multi_index(MWaves * MPerXDL, -ScalesPerKBlockSize));
+                }); // m0
+
+                // restore row id and advance to the next set of scales
+                a_scale_thread_copy.MoveSrcSliceWindow(
+                    a_scale_grid_desc, make_multi_index(-MPerBlock, ScalesPerKBlockSize));
+
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    b_scale_thread_copy.Run(b_scale_grid_desc,
+                                            b_scale_grid_buf,
+                                            b_scale_thread_desc,
+                                            make_tuple(n0, I0),
+                                            b_scale_thread_buf);
+                    b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
+                                                           make_multi_index(NWaves * NPerXDL, 0));
+                });
+                // NWaves * NPerXDL * NRepeat == NPerBlock
+                b_scale_thread_copy.MoveSrcSliceWindow(
+                    b_scale_grid_desc, make_multi_index(-NPerBlock, ScalesPerKBlockSize));
+
+                block_sync_lds();
+                a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
+                b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
+
+                i += 1;
+
+            } while(i < (num_loop - 1));
+        }
+
+        // tail
+        if constexpr(TailNum == TailNumber::Full)
+        {
+            block_sync_lds();
+
+            static_for<0, KRepeat, 1>{}([&](auto k) {
+                constexpr auto a_k_step = k * AMmaKStride * KPack / xdlops_gemm.K1PerXdlops;
+                constexpr auto b_k_step = k * BMmaKStride * KPack / xdlops_gemm.K1PerXdlops;
+
+                static_for<0, MRepeat, 1>{}([&](auto m0) {
+                    a_thread_copy_.Run(a_block_desc_m0_m1_m2_k,
+                                       make_tuple(m0, I0, I0, Number<a_k_step>{}),
+                                       a_block_buf,
+                                       a_thread_desc_,
+                                       make_tuple(m0, I0, k, I0),
+                                       a_thread_buf);
+                });
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    b_thread_copy_.Run(b_block_desc_n0_n1_n2_k,
+                                       make_tuple(n0, I0, I0, Number<b_k_step>{}),
+                                       b_block_buf,
+                                       b_thread_desc_,
+                                       make_tuple(n0, I0, k, I0),
+                                       b_thread_buf);
+                });
+            });
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        c_thread_buf_per_scale.Clear();
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0, I0, k0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        xdlops_gemm.template Run<>(
+                            a_thread_vec.template AsType<mfma_input_type>(),
+                            b_thread_vec.template AsType<mfma_input_type>(),
+                            c_thread_buf_per_scale.GetVectorTypeReference(I0));
+
+                        // one scale per k0
+                        constexpr index_t b_scale_offset =
+                            b_scale_thread_desc.CalculateOffset(make_tuple(n0, k0));
+
+                        static_for<0, xdlops_gemm.mfma_instr.num_groups_per_blk, 1>{}([&](auto g) {
+                            static_for<0, xdlops_gemm.mfma_instr.group_size, 1>{}([&](auto r) {
+                                constexpr index_t a_scale_offset =
+                                    a_scale_thread_desc.CalculateOffset(make_tuple(m0, k0, g, r));
+
+                                constexpr auto reg_offset =
+                                    g * xdlops_gemm.mfma_instr.group_size + r;
+
+                                constexpr index_t c_offset =
+                                    c_thread_desc_.CalculateOffset(make_tuple(m0, n0, reg_offset));
+
+                                c_thread_buf(Number<c_offset>{}) +=
+                                    c_thread_buf_per_scale[Number<reg_offset>{}] *
+                                    type_convert<AccDataType>(
+                                        b_scale_thread_buf[Number<b_scale_offset>{}]) *
+                                    type_convert<AccDataType>(
+                                        a_scale_thread_buf[Number<a_scale_offset>{}]);
+                            });
+                        });
+                    });
+                });
+            });
+        }
+    }
+
+    // TODO: make this field protected when a_scale_thread_copy_ is moved here
+    static constexpr auto a_scale_thread_desc = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<MRepeat>{},
+                   Number<KRepeat>{},
+                   Number<xdlops_gemm.mfma_instr.num_groups_per_blk>{},
+                   Number<xdlops_gemm.mfma_instr.group_size>{}));
+
+    // Is used to copy data from a_scale_grid to a_scale_thread
+    static constexpr auto a_scale_thread_desc_group = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<xdlops_gemm.mfma_instr.group_size>{}, Number<1>{}));
+
+    // TODO: make this field protected when b_scale_thread_copy_ is moved here
+    static constexpr auto b_scale_thread_desc =
+        make_naive_tensor_descriptor_packed(make_tuple(Number<NRepeat>{}, Number<KRepeat>{}));
+
+    protected:
+    using Base::a_thread_copy_;
+    using Base::a_thread_desc_;
+    using Base::b_thread_copy_;
+    using Base::b_thread_desc_;
+    using Base::c_thread_desc_;
+};
+
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_gemm_mx.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_mx.hpp
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename AScaleDataType,
+          typename BDataType,
+          typename BScaleDataType,
+          typename CDataType,
+          index_t ScaleBlockSize,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation>
+struct DeviceGemmMX : public BaseOperator
+{
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_a,
+                        const void* p_a_scale,
+                        const void* p_b,
+                        const void* p_b_scale,
+                        void* p_c,
+                        ck::index_t M,
+                        ck::index_t N,
+                        ck::index_t K,
+                        ck::index_t StrideA,
+                        ck::index_t StrideAScale,
+                        ck::index_t StrideB,
+                        ck::index_t StrideBScale,
+                        ck::index_t StrideC,
+                        ck::index_t KBatch,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CElementwiseOperation c_element_op) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp
@@ -0,0 +1,877 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+
+#include "ck/utility/common_header.hpp"
+
+#include "ck/host_utility/flush_cache.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_mx.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+/**
+ * \brief WIP: Implements XDL CShuffle V3 GEMM for microscale-compliant data types
+ *
+ * This class is a work-in-progress implementation of the XDL CShuffle V3 GEMM for
+ * microscale-compliant data types.
+ *
+ * Assumptions:
+ * - A and B data types are compliant with the OCP Microscaling Formats (MX) Specification
+ * - Each scale applies to ScaleBlockSize elements in K direction
+ * - A scale matrix is row-major
+ * - B scale matrix is column-major
+ * - Scale data types must have get_exponent_value() specialization, whereas lowest 8 bits of the
+ * exponent will be interpreted as conventional biased Float32 exponent (E8M0)
+ *
+ * Tunable parameters.
+ * The CK instance includes a series of tunable template parameters to control the parallel
+ * granularity of the workload to achieve load balancing on different hardware platforms. These
+ * parameters include Block Size, M/N/K Per Block, M/N per XDL, AK1, BK1, etc.
+ *  - Block Size determines the number of threads in the thread block.
+ *  - M/N/K Per Block determines the size of tile that each thread block is responsible for
+ * calculating.
+ *  - M/N Per XDL refers to M/N size for Instinct accelerator Matrix Fused Multiply Add (MFMA)
+ * instructions operating on a per-wavefront basis.
+ *  - A/B K1 is related to the data type. It can be any value ranging from 1 to K Per Block. To
+ * achieve the optimal load/store performance, 128bit per load is suggested. In addition, the A/B
+ * loading parameters must be changed accordingly to match the A/B K1 value; otherwise, it will
+ * result in compilation errors.
+ *
+ * Conditions for achieving computational load balancing on different hardware platforms can vary.
+ *
+ * Serialized version of the algorithm:
+ * \code
+ * // E = A * B + C
+ * // Loop over E[MPerBlock,NPerBlock] tiles
+ * for(int mb = 0; mb < M; mb += MPerBlock){
+ *    for(int nb = 0; nb < N; nb += NPerBlock){
+ *       // initialize E[MPerBlock,NPerBlock] tile
+ *       for(int mt = mb; mt < mb + MPerBlock; mt++){
+ *          for(int nt = nb; nt < nb + NPerBlock; nt++){
+ *             E[mt,nt] = C[mt,nt];
+ *          }
+ *       }
+ *
+ *       // multiply-accumulate per tile
+ *       for(int kb = 0; kb < K; kb += KPerBlock){
+ *         for(int m0 = mb; m0 < mb + MPerBlock; m0 += MWaves * MPerXDL){
+ *           for(int n0 = nb; n0 < nb + NPerBlock; n0 += NWaves * NPerXDL){
+ *             for(int mw = m0; mw < m0 + MWaves * MPerXDL; mw += MPerXDL){
+ *               for(int nw = n0; nw < n0 + NWaves * NPerXDL; nw += NPerXDL){
+ *                 for(int k0 = kb; k0 < kb + KPerBlock; k0 += mfma.num_input_blks*KPack){
+ *                   // MFMA accumulation for multirate instructions
+ *                   for(int k_pack = k0; k_pack < k0 + mfma.num_input_blks*KPack; k_pack += KPack){
+ *                     for(int k_mfma = k_pack; k_mfma < k_pack + KPack; k_mfma += mfma.k_per_blk){
+ *                       // MFMA instruction
+ *                       for(int m = mw; m < mw + MPerXDL; m++){
+ *                         for(int n = nw; n < nw + NPerXDL; n++){
+ *                           for(int k = k_mfma; k < k_mfma + mfma.k_per_blk; k++){
+ *                            E[m,n] += A[m,k] * B[k,n];
+ *                           }
+ *                         }
+ *                       }
+ *                     }
+ *                   }
+ *                 }
+ *               }
+ *             }
+ *           }
+ *         }
+ *       }
+ *    }
+ * }
+ * \endcode
+ *
+ */
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename AScaleDataType,
+          typename BDataType,
+          typename BScaleDataType,
+          typename CDataType,
+          typename GemmAccDataType,
+          typename CShuffleDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation,
+          GemmSpecialization GemmSpec,
+          index_t ScaleBlockSize, // Scaling block size
+          index_t BlockSize,      // Thread block size
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t AK1,
+          index_t BK1,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MXdlPerWave,
+          index_t NXdlPerWave,
+          typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_AK1,
+          bool ABlockLdsExtraM,
+          typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_BK1,
+          bool BBlockLdsExtraN,
+          index_t CShuffleMXdlPerWavePerShuffle,
+          index_t CShuffleNXdlPerWavePerShuffle,
+          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
+          BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave,
+          BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1,
+          typename ComputeTypeA =
+              ADataType, // XXX: These should always be the same as ADataType and BDataType
+          typename ComputeTypeB =
+              BDataType // TODO: Hardcode them and remove from the list of template parameters
+          >
+struct DeviceGemmMX_Xdl_CShuffleV3 : public DeviceGemmMX<ALayout,
+                                                         BLayout,
+                                                         CLayout,
+                                                         ADataType,
+                                                         AScaleDataType,
+                                                         BDataType,
+                                                         BScaleDataType,
+                                                         CDataType,
+                                                         ScaleBlockSize,
+                                                         AElementwiseOperation,
+                                                         BElementwiseOperation,
+                                                         CElementwiseOperation>
+{
+    // GridwiseGemm
+    using GridwiseGemm = GridwiseGemmMX_xdl_cshuffle_v3<
+        ALayout,
+        BLayout,
+        CLayout,
+        ADataType,
+        AScaleDataType,
+        BDataType,
+        BScaleDataType,
+        GemmAccDataType,
+        CShuffleDataType,
+        CDataType,
+        AElementwiseOperation,
+        BElementwiseOperation,
+        CElementwiseOperation,
+        GemmSpec,
+        ScaleBlockSize,
+        BlockSize,
+        MPerBlock,
+        NPerBlock,
+        KPerBlock,
+        AK1,
+        BK1,
+        MPerXDL,
+        NPerXDL,
+        MXdlPerWave,
+        NXdlPerWave,
+        ABlockTransferThreadClusterLengths_AK0_M_AK1,
+        ABlockTransferThreadClusterArrangeOrder,
+        ABlockTransferSrcAccessOrder,
+        ABlockTransferSrcVectorDim,
+        ABlockTransferSrcScalarPerVector,
+        ABlockTransferDstScalarPerVector_AK1,
+        false,
+        ABlockLdsExtraM,
+        BBlockTransferThreadClusterLengths_BK0_N_BK1,
+        BBlockTransferThreadClusterArrangeOrder,
+        BBlockTransferSrcAccessOrder,
+        BBlockTransferSrcVectorDim,
+        BBlockTransferSrcScalarPerVector,
+        BBlockTransferDstScalarPerVector_BK1,
+        false,
+        BBlockLdsExtraN,
+        CShuffleMXdlPerWavePerShuffle,
+        CShuffleNXdlPerWavePerShuffle,
+        CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+        CShuffleBlockTransferScalarPerVector_NPerBlock,
+        BlkGemmPipeSched,
+        BlkGemmPipelineVer,
+        ComputeTypeA,
+        ComputeTypeB>;
+
+    using Argument = typename GridwiseGemm::Argument;
+
+    // Invoker
+    struct Invoker : public BaseInvoker
+    {
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            if(stream_config.log_level_ > 0)
+            {
+                arg.Print();
+                GridwiseGemm::BlockwiseGemmPipe::HotLoopInstList::Print();
+            }
+
+            if(!GridwiseGemm::CheckValidity(arg))
+            {
+                throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
+            }
+
+            index_t gdx, gdy, gdz;
+            std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(arg.M, arg.N, arg.KBatch);
+
+            float ave_time = 0;
+
+            index_t k_grain = arg.KBatch * KPerBlock;
+            index_t K_split = (arg.K + k_grain - 1) / k_grain * KPerBlock;
+
+            const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split);
+
+            const auto Run = [&](const auto& kernel) {
+                if(stream_config.flush_cache)
+                {
+                    Argument arg_ = arg;
+
+                    const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(
+                        arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0);
+                    const auto b_grid_desc_bk0_n_bk1 = GridwiseGemm::MakeBGridDescriptor_BK0_N_BK1(
+                        arg_.K, arg_.KPadded, arg_.N, arg_.NPadded, arg_.StrideB, arg_.BK0);
+
+                    auto size_a_buffer =
+                        a_grid_desc_ak0_m_ak1.GetElementSpaceSize() * sizeof(ADataType);
+                    auto size_b_buffer =
+                        b_grid_desc_bk0_n_bk1.GetElementSpaceSize() * sizeof(BDataType);
+
+                    ck::utility::RotatingMemWrapper<Argument> rotating_mem(
+                        arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer);
+                    rotating_mem.Print();
+
+                    auto run_flush_cache = [&]() {
+                        // flush icache
+                        ck::utility::flush_icache();
+                        // rotating mem
+                        rotating_mem.Next();
+                        // clear c mem
+                        if(arg_.KBatch > 1)
+                            hipGetErrorString(hipMemsetAsync(arg_.p_c_grid,
+                                                             0,
+                                                             arg_.M * arg_.N * sizeof(CDataType),
+                                                             stream_config.stream_id_));
+                    };
+
+                    ave_time = ck::utility::launch_and_time_kernel_with_preprocess<false>(
+                        stream_config,
+                        run_flush_cache,
+                        kernel,
+                        dim3(gdx, gdy, gdz),
+                        dim3(BlockSize),
+                        0,
+                        arg_);
+                }
+                else
+                {
+                    if(arg.KBatch > 1)
+                        hipGetErrorString(hipMemsetAsync(arg.p_c_grid,
+                                                         0,
+                                                         arg.M * arg.N * sizeof(CDataType),
+                                                         stream_config.stream_id_));
+
+                    ave_time = launch_and_time_kernel(
+                        stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, arg);
+                }
+            };
+
+            // TODO: Check if this is the right algorithm for minimum_occupancy
+            constexpr index_t minimum_occupancy =
+                BlkGemmPipeSched == BlockGemmPipelineScheduler::Intrawave
+                    ? (BlkGemmPipelineVer == BlockGemmPipelineVersion::v3 &&
+                       MPerBlock * NPerBlock * KPerBlock * sizeof(ADataType) <= 128 * 128 * 64 * 2)
+                          ? 2
+                          : 1
+                    : 2;
+
+            if(has_main_k_block_loop)
+            {
+                // Tail number always full
+                if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1 ||
+                             BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        const auto kernel =
+                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                        true,
+                                                        InMemoryDataOperationEnum::AtomicAdd,
+                                                        minimum_occupancy>;
+                        Run(kernel);
+                    }
+                    else
+                    {
+                        const auto kernel =
+                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                        true,
+                                                        InMemoryDataOperationEnum::Set,
+                                                        minimum_occupancy>;
+                        Run(kernel);
+                    }
+                }
+                // Tail number could be One to Seven
+                else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2)
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::One)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::AtomicAdd,
+                                                            minimum_occupancy,
+                                                            TailNumber::One>;
+                            Run(kernel);
+                        }
+                        else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                                TailNumber::Full)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::AtomicAdd,
+                                                            minimum_occupancy,
+                                                            TailNumber::Full>;
+                            Run(kernel);
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Two)
+                            {
+                                const auto kernel = kernel_gemm_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Two>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Three)
+                            {
+                                const auto kernel = kernel_gemm_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Three>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Four)
+                            {
+                                const auto kernel = kernel_gemm_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Four>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Five)
+                            {
+                                const auto kernel = kernel_gemm_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Five>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Six)
+                            {
+                                const auto kernel = kernel_gemm_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Six>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Seven)
+                            {
+                                const auto kernel = kernel_gemm_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Seven>;
+                                Run(kernel);
+                            }
+                        }
+                    }
+                    else
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::One)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::Set,
+                                                            minimum_occupancy,
+                                                            TailNumber::One>;
+                            Run(kernel);
+                        }
+                        else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                                TailNumber::Full)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::Set,
+                                                            minimum_occupancy,
+                                                            TailNumber::Full>;
+                            Run(kernel);
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Two)
+                            {
+                                const auto kernel =
+                                    kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                                true,
+                                                                InMemoryDataOperationEnum::Set,
+                                                                minimum_occupancy,
+                                                                TailNumber::Two>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Three)
+                            {
+                                const auto kernel =
+                                    kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                                true,
+                                                                InMemoryDataOperationEnum::Set,
+                                                                minimum_occupancy,
+                                                                TailNumber::Three>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Four)
+                            {
+                                const auto kernel =
+                                    kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                                true,
+                                                                InMemoryDataOperationEnum::Set,
+                                                                minimum_occupancy,
+                                                                TailNumber::Four>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Five)
+                            {
+                                const auto kernel =
+                                    kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                                true,
+                                                                InMemoryDataOperationEnum::Set,
+                                                                minimum_occupancy,
+                                                                TailNumber::Five>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Six)
+                            {
+                                const auto kernel =
+                                    kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                                true,
+                                                                InMemoryDataOperationEnum::Set,
+                                                                minimum_occupancy,
+                                                                TailNumber::Six>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Seven)
+                            {
+                                const auto kernel =
+                                    kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                                true,
+                                                                InMemoryDataOperationEnum::Set,
+                                                                minimum_occupancy,
+                                                                TailNumber::Seven>;
+                                Run(kernel);
+                            }
+                        }
+                    }
+                }
+                // Tail number could be Odd or Even
+                else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel = kernel_gemm_xdl_cshuffle_v3_2lds<
+                                GridwiseGemm,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel = kernel_gemm_xdl_cshuffle_v3_2lds<
+                                GridwiseGemm,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                    else
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3_2lds<GridwiseGemm,
+                                                                 true,
+                                                                 InMemoryDataOperationEnum::Set,
+                                                                 minimum_occupancy,
+                                                                 TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3_2lds<GridwiseGemm,
+                                                                 true,
+                                                                 InMemoryDataOperationEnum::Set,
+                                                                 minimum_occupancy,
+                                                                 TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                }
+                else
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::AtomicAdd,
+                                                            minimum_occupancy,
+                                                            TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::AtomicAdd,
+                                                            minimum_occupancy,
+                                                            TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                    else
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::Set,
+                                                            minimum_occupancy,
+                                                            TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::Set,
+                                                            minimum_occupancy,
+                                                            TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                }
+            }
+            else
+            {
+                // Tail number always 1
+                if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        const auto kernel =
+                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                        false,
+                                                        InMemoryDataOperationEnum::AtomicAdd,
+                                                        minimum_occupancy>;
+                        Run(kernel);
+                    }
+                    else
+                    {
+                        const auto kernel =
+                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                        false,
+                                                        InMemoryDataOperationEnum::Set,
+                                                        minimum_occupancy>;
+                        Run(kernel);
+                    }
+                }
+            }
+
+            return ave_time;
+        }
+
+        // polymorphic
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        static_assert((is_same_v<ADataType, f8_t> || is_same_v<ADataType, bf8_t> ||
+                       is_same_v<ADataType, f6_t> || is_same_v<ADataType, bf6_t> ||
+                       is_same_v<ADataType, f4_t>)&&(is_same_v<BDataType, f8_t> ||
+                                                     is_same_v<BDataType, bf8_t> ||
+                                                     is_same_v<BDataType, f6_t> ||
+                                                     is_same_v<BDataType, bf6_t> ||
+                                                     is_same_v<BDataType, f4_t>),
+                      "Only microscaling formats are supported for ADataType and BDataType");
+
+        static_assert(ScaleBlockSize == 32, "Only ScaleBlockSize 32 is supported");
+
+        return true;
+    }
+
+    static bool IsSupportedArgument(const Argument& arg)
+    {
+        if(!ck::is_xdl_supported())
+        {
+            return false;
+        }
+
+        if(!is_bf16_atomic_supported() && std::is_same_v<CDataType, ck::bhalf_t> && arg.KBatch > 1)
+        {
+            return false;
+        }
+
+        if((arg.K % AK1 != 0 || arg.K % BK1 != 0) && !(GemmSpec == GemmSpecialization::MKPadding ||
+                                                       GemmSpec == GemmSpecialization::NKPadding ||
+                                                       GemmSpec == GemmSpecialization::MNKPadding ||
+                                                       GemmSpec == GemmSpecialization::KPadding))
+        {
+            return false;
+        }
+
+        return GridwiseGemm::CheckValidity(arg);
+    }
+
+    // polymorphic
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
+    }
+
+    static auto MakeArgument(const ADataType* p_a,
+                             const AScaleDataType* p_a_scale,
+                             const BDataType* p_b,
+                             const BScaleDataType* p_b_scale,
+                             CDataType* p_c,
+                             index_t M,
+                             index_t N,
+                             index_t K,
+                             index_t StrideA,
+                             index_t StrideScaleA,
+                             index_t StrideB,
+                             index_t StrideScaleB,
+                             index_t StrideC,
+                             index_t KBatch,
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op,
+                             CElementwiseOperation c_element_op)
+    {
+        return Argument{p_a,
+                        p_a_scale,
+                        p_b,
+                        p_b_scale,
+                        p_c,
+                        M,
+                        N,
+                        K,
+                        StrideA,
+                        StrideScaleA,
+                        StrideB,
+                        StrideScaleB,
+                        StrideC,
+                        KBatch,
+                        a_element_op,
+                        b_element_op,
+                        c_element_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    // polymorphic
+    std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
+                                                      const void* p_a_scale,
+                                                      const void* p_b,
+                                                      const void* p_b_scale,
+                                                      void* p_c,
+                                                      ck::index_t M,
+                                                      ck::index_t N,
+                                                      ck::index_t K,
+                                                      ck::index_t StrideA,
+                                                      ck::index_t StrideScaleA,
+                                                      ck::index_t StrideB,
+                                                      ck::index_t StrideScaleB,
+                                                      ck::index_t StrideC,
+                                                      ck::index_t KBatch,
+                                                      AElementwiseOperation a_element_op,
+                                                      BElementwiseOperation b_element_op,
+                                                      CElementwiseOperation c_element_op) override
+    {
+        return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
+                                          static_cast<const AScaleDataType*>(p_a_scale),
+                                          static_cast<const BDataType*>(p_b),
+                                          static_cast<const BScaleDataType*>(p_b_scale),
+                                          static_cast<CDataType*>(p_c),
+                                          M,
+                                          N,
+                                          K,
+                                          StrideA,
+                                          StrideScaleA,
+                                          StrideB,
+                                          StrideScaleB,
+                                          StrideC,
+                                          KBatch,
+                                          a_element_op,
+                                          b_element_op,
+                                          c_element_op);
+    }
+
+    // polymorphic
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    // polymorphic
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        std::map<BlockGemmPipelineScheduler, std::string> BlkGemmPipelineSchedulerToString{
+            {BlockGemmPipelineScheduler::Intrawave, "Intrawave"},
+            {BlockGemmPipelineScheduler::Interwave, "Interwave"}};
+
+        std::map<BlockGemmPipelineVersion, std::string> BlkGemmPipelineVersionToString{
+            {BlockGemmPipelineVersion::v1, "v1"},
+            {BlockGemmPipelineVersion::v2, "v2"},
+            {BlockGemmPipelineVersion::v3, "v3"},
+            {BlockGemmPipelineVersion::v4, "v4"},
+            {BlockGemmPipelineVersion::v5, "v5"}};
+
+        // clang-format off
+        str << "DeviceGemmMX_Xdl_CShuffleV3"
+            << "<"
+            << getGemmSpecializationString(GemmSpec) << ", "
+            << std::string(ALayout::name)[0]
+            << std::string(BLayout::name)[0]
+            << std::string(CLayout::name)[0]
+            << ">"
+            << " BlkSize: "
+            << BlockSize << ", "
+            << "BlkTile: "
+            << MPerBlock<<"x"<<NPerBlock<<"x"<<KPerBlock << ", "
+            << "WaveTile: "
+            << MPerXDL<<"x"<<NPerXDL << ", "
+            << "WaveMap: "
+            << MXdlPerWave<<"x" << NXdlPerWave<<", "
+            << "VmemReadVec: "
+            << ABlockTransferSrcScalarPerVector<<"x"<<BBlockTransferSrcScalarPerVector<<", "
+            << "BlkGemmPipelineScheduler: "
+            << BlkGemmPipelineSchedulerToString[BlkGemmPipeSched] << ", "
+            << "BlkGemmPipelineVersion: "
+            << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
+            << "BlkGemmPipelinePrefetchStages: "
+            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages << ", "
+            << "Kpack: "
+            << GridwiseGemm::BlockwiseGemmPipe::AMmaKStride << ", "
+            << "ScaleBlockSize: "
+            << ScaleBlockSize;
+        // clang-format on
+
+        return str.str();
+    }
+    REGISTER_EXTRA_PRINTING_METHODS
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
@@ -189,15 +189,36 @@ struct ThreadwiseTensorSliceTransfer_v1r3
    const ElementwiseOperation element_op_;
 }; // namespace ThreadwiseTensorSliceTransfer_v1r3

-// Assume:
-//   1. src:
-//     1. SrcDesc is not known at compile-time
-//     2. SrcBuffer is DynamicBuffer
-//     3. src_slice_origin_idx is not known at compile-time
-//   2. dst:
-//     1. DstDesc is known at compile-time
-//     2. DstBuffer is StaticBuffer
-//     3. dst_slice_origin_idx is known at compile-time
+/**
+ * @brief Helper structure that facilitates transfer of source (grid) data to destination threads.
+ *
+ * @details The following assumptions are made:
+ *  - For Source (Grid) Data:
+ *     1. The source tensor descriptor SrcDesc is not known at compile-time.
+ *     2. The source buffer is a dynamic buffer.
+ *     3. The source slice origin index src_slice_origin_idx is not known at compile-time.
+ *  - For Destination (Thread) Data:
+ *     1. The destination tensor descriptor DstDesc is known at compile-time.
+ *     2. The destination buffer dst_buf is a static buffer.
+ *     3. The destination slice origin index dst_slice_origin_idx is known at compile-time.
+ *
+ * @tparam SrcData The data type of the source tensor.
+ * @tparam DstData The data type of the destination tensor.
+ * @tparam SrcDesc The descriptor type of the source tensor.
+ * @tparam DstDesc The descriptor type of the destination tensor.
+ * @tparam SliceLengths The lengths of the slice to be transferred.
+ * @tparam DimAccessOrder The order of dimension access for the space-filling curve.
+ * @tparam SrcVectorDim The dimension along which vectorized access is performed in the source
+ * tensor.
+ * @tparam SrcScalarPerVector The number of scalar elements per vector in the source tensor.
+ * @tparam SrcScalarStrideInVector The stride of scalar elements within a vector in the source
+ * tensor.
+ * @tparam SrcResetCoordinateAfterRun controls whether source coordinate is restored after each Run
+ * or rolled back one step in MoveSrcSliceWindow
+ * @tparam InvalidElementAsNaN Whether to fill invalid elements with NaN (only applicable for
+ * floating-point types).
+ *
+ */
 template <typename SrcData,
          typename DstData,
          typename SrcDesc,
--- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
@@ -793,7 +793,7 @@ struct mfma_type<MfmaInstr::mfma_f32_32x32x64f8f6f4>
    static constexpr index_t num_output_blks     = 1;    // (is_k_reduction == true) ???
    static constexpr index_t m_per_blk           = 32;   // from the instruction
    static constexpr index_t n_per_blk           = 32;   // from the instruction
-    static constexpr index_t k_per_blk           = 32;   // (is_k_reduction == true) ? 64 / num_input_blks
+    static constexpr index_t k_per_blk           = 32;   // (is_k_reduction == true) ? KPerXdlops / num_input_blks
    static constexpr bool is_k_reduction         = true; // ???
    // clang-format on

@@ -817,7 +817,7 @@ struct mfma_type<MfmaInstr::mfma_f32_16x16x128f8f6f4>
    static constexpr index_t num_output_blks     = 1;    // (is_k_reduction == true) ???
    static constexpr index_t m_per_blk           = 16;   // from the instruction
    static constexpr index_t n_per_blk           = 16;   // from the instruction
-    static constexpr index_t k_per_blk           = 32;   // (is_k_reduction == true) ? 128 / num_input_blks
+    static constexpr index_t k_per_blk           = 32;   // (is_k_reduction == true) ? KPerXdlops / num_input_blks
    static constexpr bool is_k_reduction         = true; // ???
    // clang-format on

@@ -841,7 +841,7 @@ struct mfma_type<MfmaInstr::mfma_scale_f32_32x32x64f8f6f4>
    static constexpr index_t num_output_blks     = 1;    // (is_k_reduction == true) ???
    static constexpr index_t m_per_blk           = 32;   // from the instruction
    static constexpr index_t n_per_blk           = 32;   // from the instruction
-    static constexpr index_t k_per_blk           = 32;   // (is_k_reduction == true) ? 64 / num_input_blks
+    static constexpr index_t k_per_blk           = 32;   // (is_k_reduction == true) ? KPerXdlops / num_input_blks
    static constexpr bool is_k_reduction         = true; // ???
    // clang-format on

@@ -870,7 +870,7 @@ struct mfma_type<MfmaInstr::mfma_scale_f32_16x16x128f8f6f4>
    static constexpr index_t num_output_blks     = 1;    // (is_k_reduction == true) ???
    static constexpr index_t m_per_blk           = 16;   // from the instruction
    static constexpr index_t n_per_blk           = 16;   // from the instruction
-    static constexpr index_t k_per_blk           = 32;   // (is_k_reduction == true) ? 128 / num_input_blks
+    static constexpr index_t k_per_blk           = 32;   // (is_k_reduction == true) ? KPerXdlops / num_input_blks
    static constexpr bool is_k_reduction         = true; // ???
    // clang-format on