MX GEMM - Add FP6 GEMM Test (#2488)

* Add F6 GEMM MX Test * Add BF6 GEMM MX Test
2026-05-11 08:50:17 +00:00 · 2025-07-11 15:32:12 -06:00
parent 518dc21ae8
commit 25b359d630
11 changed files with 265 additions and 9 deletions
--- a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
+++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
@@ -24,6 +24,8 @@ using F8   = ck::f8_t;
 using BF8  = ck::bf8_t;
 using I4   = ck::pk_i4_t;
 using F4   = ck::f4x2_pk_t;
+using F6   = ck::f6x16_pk_t;
+using BF6  = ck::bf6x16_pk_t;

 using E8M0   = ck::e8m0_bexp_t;
 using E8M0PK = int32_t;
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_mx.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_mx.hpp
@@ -87,6 +87,34 @@ void add_device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn_default_instances(
                                             PassThrough,
                                             PassThrough>>>& instances);

+void add_device_gemm_mx_xdl_f6_f6_f16_mk_nk_mn_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMX<Row,
+                                             Col,
+                                             Row,
+                                             F6,
+                                             E8M0PK,
+                                             F6,
+                                             E8M0PK,
+                                             F16,
+                                             32,
+                                             PassThrough,
+                                             PassThrough,
+                                             PassThrough>>>& instances);
+
+void add_device_gemm_mx_xdl_bf6_bf6_bf16_mk_nk_mn_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMX<Row,
+                                             Col,
+                                             Row,
+                                             BF6,
+                                             E8M0PK,
+                                             BF6,
+                                             E8M0PK,
+                                             BF16,
+                                             32,
+                                             PassThrough,
+                                             PassThrough,
+                                             PassThrough>>>& instances);
+
 template <typename ADataType,
          typename AScaleDataType,
          typename BDataType,
@@ -130,6 +158,8 @@ struct DeviceOperationInstanceFactory<

        if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> && is_same_v<CLayout, Row>)
        {
+            // Row-Col-Row -- one of the two currently supported layouts; the other one is
+            // Row-MFMA-Row
            if constexpr(is_same_v<ADataType, F8> && is_same_v<BDataType, F8> &&
                         is_same_v<CDataType, F16>)
            {
@@ -147,6 +177,16 @@ struct DeviceOperationInstanceFactory<
            {
                add_device_gemm_mx_xdl_f4_f4_f16_mk_nk_mn_default_instances(op_ptrs);
            }
+            else if constexpr(is_same_v<ADataType, F6> && is_same_v<BDataType, F6> &&
+                              is_same_v<CDataType, F16>)
+            {
+                add_device_gemm_mx_xdl_f6_f6_f16_mk_nk_mn_default_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ADataType, BF6> && is_same_v<BDataType, BF6> &&
+                              is_same_v<CDataType, BF16>)
+            {
+                add_device_gemm_mx_xdl_bf6_bf6_bf16_mk_nk_mn_default_instances(op_ptrs);
+            }
        }
        else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                          is_same_v<CLayout, Row>)
--- a/library/src/tensor_operation_instance/gpu/gemm_mx/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_mx/CMakeLists.txt
@@ -2,6 +2,8 @@
 set(GEMM_MX_INSTANCES)

 list(APPEND GEMM_MX_INSTANCES
+        device_gemm_mx_xdl_f6_f6_f16/device_gemm_mx_xdl_f6_f6_f16_mk_nk_mn_default_instance.cpp
+        device_gemm_mx_xdl_bf6_bf6_bf16/device_gemm_mx_xdl_bf6_bf6_bf16_mk_nk_mn_default_instance.cpp
        device_gemm_mx_xdl_f8_f8_f16/device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn_default_instance.cpp
        device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn_default_instance.cpp
        device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn_default_instance.cpp
@@ -11,6 +13,8 @@ list(APPEND GEMM_MX_INSTANCES
    )


+set_source_files_properties(device_gemm_mx_xdl_f6_f6_f16/device_gemm_mx_xdl_f6_f6_f16_mk_nk_mn_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_mx_xdl_bf6_bf6_bf16/device_gemm_mx_xdl_bf6_bf6_bf16_mk_nk_mn_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 set_source_files_properties(device_gemm_mx_xdl_f8_f8_f16/device_gemm_mx_xdl_f8_f8_f16_mk_nk_mn_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 set_source_files_properties(device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_mk_nk_mn_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 set_source_files_properties(device_gemm_mx_xdl_f8_f8_bf16/device_gemm_mx_xdl_f8_f8_bf16_km_nk_mn_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
--- a/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_bf6_bf6_bf16/device_gemm_mx_xdl_bf6_bf6_bf16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_bf6_bf6_bf16/device_gemm_mx_xdl_bf6_bf6_bf16_mk_nk_mn.hpp
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using BF16   = bhalf_t;
+using F32    = float;
+using E8M0   = ck::e8m0_bexp_t;
+using E8M0PK = int32_t;
+using BF6    = ck::bf6x16_pk_t;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+static constexpr auto ScaleBlockSize = 32;
+static constexpr auto KPerBlock      = 256 / ck::packed_size_v<BF6>; // 256 bf6 = 16 bf6x16_pk_t
+
+template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
+using device_gemm_mx_xdl_bf6_bf6_bf16_mk_nk_mn_instances = std::tuple<
+    // clang-format off
+    //###########################| ALayout| BLayout| CLayout|AData| AScale|BData| BScale| CData| AccData| Cshuffle|           A|           B|           C|          GEMM|    Scale Block| Block|  MPer|  NPer|      KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
+    //###########################|        |        |        | Type|   Data| Type|   Data|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|           Size|  Size| Block| Block|     Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
+    //###########################|        |        |        |     |   Type|     |   Type|      |        |         |   Operation|   Operation|   Operation|              |               |      |      |      |          |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                      Version|
+    //###########################|        |        |        |     |       |     |       |      |        |         |            |            |            |              |               |      |      |      |          |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
+    DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,  BF6, E8M0PK,  BF6, E8M0PK,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,   128,   128, KPerBlock,   1,   1,  16,   16,    4,    4,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              1,              1,      true,     S<16,16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           2,           2,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v3>,
+    DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,  BF6, E8M0PK,  BF6, E8M0PK,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,   128,    64, KPerBlock,   1,   1,  16,   16,    4,    2,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              1,              1,      true,     S<16,16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           2,           2,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v3>,
+    DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,  BF6, E8M0PK,  BF6, E8M0PK,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,    64,   128, KPerBlock,   1,   1,  16,   16,    2,    4,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              1,              1,      true,     S<16,16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           2,           2,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v3>,
+    DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,  BF6, E8M0PK,  BF6, E8M0PK,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   128,   128,    32, KPerBlock,   1,   1,  16,   16,    4,    2,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              1,              1,      true,     S<16, 8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           2,           2,                   S<1, 32, 1, 4>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v3>,
+    DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,  BF6, E8M0PK,  BF6, E8M0PK,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,    64,    32,    32, KPerBlock,   1,   1,  16,   16,    2,    2,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              1,              1,      true,     S<16, 4, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           2,           2,                   S<1, 16, 1, 4>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v3>,
+
+    DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,  BF6, E8M0PK,  BF6, E8M0PK,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,   128,   128, KPerBlock,   1,   1,  16,   16,    4,    4,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              1,              1,      true,     S<16,16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           2,           2,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>,
+    DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,  BF6, E8M0PK,  BF6, E8M0PK,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,   128,    64, KPerBlock,   1,   1,  16,   16,    4,    2,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              1,              1,      true,     S<16,16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           2,           2,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>,
+    DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,  BF6, E8M0PK,  BF6, E8M0PK,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,    64,   128, KPerBlock,   1,   1,  16,   16,    2,    4,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              1,              1,      true,     S<16,16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           2,           2,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>,
+    DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,  BF6, E8M0PK,  BF6, E8M0PK,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   128,   128,    32, KPerBlock,   1,   1,  16,   16,    4,    2,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              1,              1,      true,     S<16, 8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           2,           2,                   S<1, 32, 1, 4>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>,
+    DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,  BF6, E8M0PK,  BF6, E8M0PK,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,    64,    32,    32, KPerBlock,   1,   1,  16,   16,    2,    2,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              1,              1,      true,     S<16, 4, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           2,           2,                   S<1, 16, 1, 4>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>,
+    std::nullptr_t
+    // clang-format on
+    >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_bf6_bf6_bf16/device_gemm_mx_xdl_bf6_bf6_bf16_mk_nk_mn_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_bf6_bf6_bf16/device_gemm_mx_xdl_bf6_bf6_bf16_mk_nk_mn_default_instance.cpp
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_mx_xdl_bf6_bf6_bf16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_mx_xdl_bf6_bf6_bf16_mk_nk_mn_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMX<Row,
+                                             Col,
+                                             Row,
+                                             BF6,
+                                             E8M0PK,
+                                             BF6,
+                                             E8M0PK,
+                                             BF16,
+                                             32,
+                                             PassThrough,
+                                             PassThrough,
+                                             PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances, device_gemm_mx_xdl_bf6_bf6_bf16_mk_nk_mn_instances<Intrawave, GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f6_f6_f16/device_gemm_mx_xdl_f6_f6_f16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f6_f6_f16/device_gemm_mx_xdl_f6_f6_f16_mk_nk_mn.hpp
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16    = half_t;
+using F32    = float;
+using E8M0   = ck::e8m0_bexp_t;
+using E8M0PK = int32_t;
+using F6     = ck::f6x16_pk_t;
+using BF6    = ck::bf6x16_pk_t;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+static constexpr auto ScaleBlockSize = 32;
+static constexpr auto KPerBlock      = 256 / ck::packed_size_v<F6>; // 256 f6 = 16 f6x16_pk_t
+
+template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
+using device_gemm_mx_xdl_f6_f6_f16_mk_nk_mn_instances = std::tuple<
+    // clang-format off
+    //###########################| ALayout| BLayout| CLayout|AData| AScale|BData| BScale| CData| AccData| Cshuffle|           A|           B|           C|          GEMM|    Scale Block| Block|  MPer|  NPer|      KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
+    //###########################|        |        |        | Type|   Data| Type|   Data|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|           Size|  Size| Block| Block|     Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
+    //###########################|        |        |        |     |   Type|     |   Type|      |        |         |   Operation|   Operation|   Operation|              |               |      |      |      |          |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                      Version|
+    //###########################|        |        |        |     |       |     |       |      |        |         |            |            |            |              |               |      |      |      |          |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
+    DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F6, E8M0PK,   F6, E8M0PK,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,   128,   128, KPerBlock,   1,   1,  16,   16,    4,    4,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              1,              1,      true,     S<16,16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           2,           2,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v3>,
+    DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F6, E8M0PK,   F6, E8M0PK,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,   128,    64, KPerBlock,   1,   1,  16,   16,    4,    2,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              1,              1,      true,     S<16,16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           2,           2,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v3>,
+    DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F6, E8M0PK,   F6, E8M0PK,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,    64,   128, KPerBlock,   1,   1,  16,   16,    2,    4,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              1,              1,      true,     S<16,16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           2,           2,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v3>,
+    DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F6, E8M0PK,   F6, E8M0PK,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   128,   128,    32, KPerBlock,   1,   1,  16,   16,    4,    2,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              1,              1,      true,     S<16, 8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           2,           2,                   S<1, 32, 1, 4>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v3>,
+    DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F6, E8M0PK,   F6, E8M0PK,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,    64,    32,    32, KPerBlock,   1,   1,  16,   16,    2,    2,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              1,              1,      true,     S<16, 4, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           2,           2,                   S<1, 16, 1, 4>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v3>,
+
+    DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F6, E8M0PK,   F6, E8M0PK,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,   128,   128, KPerBlock,   1,   1,  16,   16,    4,    4,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              1,              1,      true,     S<16,16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           2,           2,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>,
+    DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F6, E8M0PK,   F6, E8M0PK,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,   128,    64, KPerBlock,   1,   1,  16,   16,    4,    2,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              1,              1,      true,     S<16,16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           2,           2,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>,
+    DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F6, E8M0PK,   F6, E8M0PK,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,    64,   128, KPerBlock,   1,   1,  16,   16,    2,    4,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              1,              1,      true,     S<16,16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           2,           2,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>,
+    DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F6, E8M0PK,   F6, E8M0PK,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   128,   128,    32, KPerBlock,   1,   1,  16,   16,    4,    2,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              1,              1,      true,     S<16, 8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           2,           2,                   S<1, 32, 1, 4>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>,
+    DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F6, E8M0PK,   F6, E8M0PK,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,    64,    32,    32, KPerBlock,   1,   1,  16,   16,    2,    2,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              1,              1,      true,     S<16, 4, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           2,           2,                   S<1, 16, 1, 4>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>,
+    std::nullptr_t
+    // clang-format on
+    >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f6_f6_f16/device_gemm_mx_xdl_f6_f6_f16_mk_nk_mn_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f6_f6_f16/device_gemm_mx_xdl_f6_f6_f16_mk_nk_mn_default_instance.cpp
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_mx_xdl_f6_f6_f16_mk_nk_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_mx_xdl_f6_f6_f16_mk_nk_mn_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMX<Row,
+                                             Col,
+                                             Row,
+                                             F6,
+                                             E8M0PK,
+                                             F6,
+                                             E8M0PK,
+                                             F16,
+                                             32,
+                                             PassThrough,
+                                             PassThrough,
+                                             PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances, device_gemm_mx_xdl_f6_f6_f16_mk_nk_mn_instances<Intrawave, GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck