Implement device_gemm_universal_preshuffle_instance for RDNA4 (#3429)

* add device_gemm_wmma_cshuffle_v3_b_preshuffle.hpp
* add examples
* add instances to test
* remove duplicate code between examples
2026-04-20 06:49:15 +00:00 · 2026-01-15 16:19:31 +01:00
parent e30207985a
commit 6df2d70143
20 changed files with 1229 additions and 14 deletions
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_preshuffle.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_preshuffle.hpp
@@ -3,18 +3,19 @@

 #pragma once

-#include <vector>
-#include <memory>
 #include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_gemm_v2.hpp"
-#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#ifdef CK_USE_XDL
+#include "ck/tensor_operation/gpu/device/device_gemm_v2.hpp"
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
+#include "ck/utility/amd_ck_fp8.hpp"
+#include "ck/utility/data_type.hpp"
+#if defined(CK_USE_XDL) || defined(CK_USE_WMMA)
 #include "gemm_universal_preshuffle.inc"
 #endif

+#include <memory>
+#include <vector>
+
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -51,7 +52,7 @@ struct DeviceOperationInstanceFactory<

    static auto GetInstances()
    {
-#ifdef CK_USE_XDL
+#if defined(CK_USE_XDL) || defined(CK_USE_WMMA)
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
 #if(defined(CK_ENABLE_BF16) && defined(CK_ENABLE_FP8))
        if constexpr(is_same_v<ADataType, f8_t> && is_same_v<BDataType, f8_t> &&
@@ -60,6 +61,7 @@ struct DeviceOperationInstanceFactory<
            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
                         is_same_v<CLayout, Row>)
            {
+#ifdef CK_USE_XDL
                add_device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma32x32_mn_instances(
                    op_ptrs);
                add_device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma32x32_mn_compute_instances(
@@ -90,6 +92,17 @@ struct DeviceOperationInstanceFactory<
                    op_ptrs);
                add_device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma16x16_nk_mn_comp_default_instances_part1(
                    op_ptrs);
+#endif
+#ifdef CK_USE_WMMA
+                add_device_gemm_universal_preshuffle_wmma_f8_f8_bf16_mk_wmma_mn_default_instances_p1(
+                    op_ptrs);
+                add_device_gemm_universal_preshuffle_wmma_f8_f8_bf16_mk_wmma_mn_default_instances_p2(
+                    op_ptrs);
+                add_device_gemm_universal_preshuffle_wmma_f8_f8_bf16_mk_wmma_mn_default_instances_p3(
+                    op_ptrs);
+                add_device_gemm_universal_preshuffle_wmma_f8_f8_bf16_mk_wmma_mn_default_instances_p4(
+                    op_ptrs);
+#endif
            }
        }
 #endif
@@ -100,6 +113,7 @@ struct DeviceOperationInstanceFactory<
            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
                         is_same_v<CLayout, Row>)
            {
+#ifdef CK_USE_XDL
                add_device_gemm_universal_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instances_p1(
                    op_ptrs);
                add_device_gemm_universal_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instances_p2(
@@ -136,10 +150,21 @@ struct DeviceOperationInstanceFactory<
                    op_ptrs);
                add_device_gemm_universal_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instances_p6(
                    op_ptrs);
+#endif
+#ifdef CK_USE_WMMA
+                add_device_gemm_universal_preshuffle_wmma_f8_f8_f16_mk_wmma_mn_default_instances_p1(
+                    op_ptrs);
+                add_device_gemm_universal_preshuffle_wmma_f8_f8_f16_mk_wmma_mn_default_instances_p2(
+                    op_ptrs);
+                add_device_gemm_universal_preshuffle_wmma_f8_f8_f16_mk_wmma_mn_default_instances_p3(
+                    op_ptrs);
+                add_device_gemm_universal_preshuffle_wmma_f8_f8_f16_mk_wmma_mn_default_instances_p4(
+                    op_ptrs);
+#endif
            }
        }
 #endif
-#endif // CK_USE_XDL
+#endif // CK_USE_XDL || CK_USE_WMMA

        return op_ptrs;
    }
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_preshuffle.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_preshuffle.inc
@@ -13,8 +13,7 @@ namespace instance {
 using GemmF8F8BF16InstanceVector = std::vector<std::unique_ptr<
    DeviceGemmV2BPreshuffle<Row, Col, Row, F8, F8, BF16, PassThrough, PassThrough, PassThrough>>>&;

-using GemmF8F8F16InstanceVector = std::vector<std::unique_ptr<
-    DeviceGemmV2BPreshuffle<Row, Col, Row, F8, F8, F16, PassThrough, PassThrough, PassThrough>>>&;
+#ifdef CK_USE_XDL

 void add_device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma32x32_mn_instances(
    GemmF8F8BF16InstanceVector& instances);
@@ -61,7 +60,32 @@ void add_device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma16x16_nk_mn_comp
    GemmF8F8BF16InstanceVector& instances);

 #endif
+
+#ifdef CK_USE_WMMA
+
+void add_device_gemm_universal_preshuffle_wmma_f8_f8_bf16_mk_wmma_mn_default_instances_p1(
+    GemmF8F8BF16InstanceVector& instances);
+
+void add_device_gemm_universal_preshuffle_wmma_f8_f8_bf16_mk_wmma_mn_default_instances_p2(
+    GemmF8F8BF16InstanceVector& instances);
+
+void add_device_gemm_universal_preshuffle_wmma_f8_f8_bf16_mk_wmma_mn_default_instances_p3(
+    GemmF8F8BF16InstanceVector& instances);
+
+void add_device_gemm_universal_preshuffle_wmma_f8_f8_bf16_mk_wmma_mn_default_instances_p4(
+    GemmF8F8BF16InstanceVector& instances);
+
+#endif
+
+#endif
+
 #if(defined(CK_ENABLE_FP16) && defined(CK_ENABLE_FP8))
+
+using GemmF8F8F16InstanceVector = std::vector<std::unique_ptr<
+    DeviceGemmV2BPreshuffle<Row, Col, Row, F8, F8, F16, PassThrough, PassThrough, PassThrough>>>&;
+
+#ifdef CK_USE_XDL
+
 void add_device_gemm_universal_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instances_p1(
    GemmF8F8F16InstanceVector& instances);
 void add_device_gemm_universal_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instances_p2(
@@ -99,6 +123,25 @@ void add_device_gemm_universal_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_
    GemmF8F8F16InstanceVector& instances);
 void add_device_gemm_universal_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instances_p6(
    GemmF8F8F16InstanceVector& instances);
+
+#endif
+
+#ifdef CK_USE_WMMA
+
+void add_device_gemm_universal_preshuffle_wmma_f8_f8_f16_mk_wmma_mn_default_instances_p1(
+    GemmF8F8F16InstanceVector& instances);
+
+void add_device_gemm_universal_preshuffle_wmma_f8_f8_f16_mk_wmma_mn_default_instances_p2(
+    GemmF8F8F16InstanceVector& instances);
+
+void add_device_gemm_universal_preshuffle_wmma_f8_f8_f16_mk_wmma_mn_default_instances_p3(
+    GemmF8F8F16InstanceVector& instances);
+
+void add_device_gemm_universal_preshuffle_wmma_f8_f8_f16_mk_wmma_mn_default_instances_p4(
+    GemmF8F8F16InstanceVector& instances);
+
+#endif
+
 #endif
 } // namespace instance
 } // namespace device
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_preshuffle/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_preshuffle/CMakeLists.txt
@@ -1,7 +1,7 @@
 # Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 # SPDX-License-Identifier: MIT

-# ONLY XDL_KERNELS
+# ONLY XDL_AND_WMMA_KERNELS
 set(GEMM_UNIVERSAL_INSTANCES)

 # F8_F8_BF16
@@ -21,6 +21,10 @@ device_gemm_xdl_universal_preshuffle_f8_f8_bf16/device_gemm_xdl_universal_preshu
 device_gemm_xdl_universal_preshuffle_f8_f8_bf16/device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma_mn_p5_default_instance.cpp
 device_gemm_xdl_universal_preshuffle_f8_f8_bf16/device_gemm_xdl_universal_preshuffle_f8_f8_f8_bf16_mk_mfma32x32_mn_default_instance.cpp
 device_gemm_xdl_universal_preshuffle_f8_f8_bf16/device_gemm_xdl_universal_preshuffle_f8_f8_f8_bf16_mk_mfma32x32_mn_comp_instance.cpp
+device_gemm_wmma_universal_preshuffle_f8_f8_bf16/device_gemm_wmma_universal_preshuffle_f8_f8_bf16_mk_wmma_mn_default_instance_p1.cpp
+device_gemm_wmma_universal_preshuffle_f8_f8_bf16/device_gemm_wmma_universal_preshuffle_f8_f8_bf16_mk_wmma_mn_default_instance_p2.cpp
+device_gemm_wmma_universal_preshuffle_f8_f8_bf16/device_gemm_wmma_universal_preshuffle_f8_f8_bf16_mk_wmma_mn_default_instance_p3.cpp
+device_gemm_wmma_universal_preshuffle_f8_f8_bf16/device_gemm_wmma_universal_preshuffle_f8_f8_bf16_mk_wmma_mn_default_instance_p4.cpp
 )

 # F8_F8_F16
@@ -43,6 +47,10 @@ list(APPEND GEMM_UNIVERSAL_INSTANCES
        device_gemm_xdl_universal_preshuffle_f8_f8_f16/device_gemm_universal_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p4.cpp
        device_gemm_xdl_universal_preshuffle_f8_f8_f16/device_gemm_universal_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p5.cpp
        device_gemm_xdl_universal_preshuffle_f8_f8_f16/device_gemm_universal_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p6.cpp
+        device_gemm_wmma_universal_preshuffle_f8_f8_f16/device_gemm_wmma_universal_preshuffle_f8_f8_f16_mk_wmma_mn_default_instance_p1.cpp
+        device_gemm_wmma_universal_preshuffle_f8_f8_f16/device_gemm_wmma_universal_preshuffle_f8_f8_f16_mk_wmma_mn_default_instance_p2.cpp
+        device_gemm_wmma_universal_preshuffle_f8_f8_f16/device_gemm_wmma_universal_preshuffle_f8_f8_f16_mk_wmma_mn_default_instance_p3.cpp
+        device_gemm_wmma_universal_preshuffle_f8_f8_f16/device_gemm_wmma_universal_preshuffle_f8_f8_f16_mk_wmma_mn_default_instance_p4.cpp
 )

 # F8_F8_F16
@@ -64,6 +72,10 @@ set_source_files_properties(device_gemm_xdl_universal_preshuffle_f8_f8_f16/devic
 set_source_files_properties(device_gemm_xdl_universal_preshuffle_f8_f8_f16/device_gemm_universal_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p4.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 set_source_files_properties(device_gemm_xdl_universal_preshuffle_f8_f8_f16/device_gemm_universal_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p5.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 set_source_files_properties(device_gemm_xdl_universal_preshuffle_f8_f8_f16/device_gemm_universal_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance_p6.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_wmma_universal_preshuffle_f8_f8_f16/device_gemm_wmma_universal_preshuffle_f8_f8_f16_mk_wmma_mn_default_instance_p1.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_wmma_universal_preshuffle_f8_f8_f16/device_gemm_wmma_universal_preshuffle_f8_f8_f16_mk_wmma_mn_default_instance_p2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_wmma_universal_preshuffle_f8_f8_f16/device_gemm_wmma_universal_preshuffle_f8_f8_f16_mk_wmma_mn_default_instance_p3.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_wmma_universal_preshuffle_f8_f8_f16/device_gemm_wmma_universal_preshuffle_f8_f8_f16_mk_wmma_mn_default_instance_p4.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")

 # F8_F8_BF16
 set_source_files_properties(device_gemm_xdl_universal_preshuffle_f8_f8_bf16/device_gemm_xdl_universal_preshuffle_f8_f8_f8_bf16_mk_mfma32x32_mn_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
@@ -81,5 +93,9 @@ set_source_files_properties(device_gemm_xdl_universal_preshuffle_f8_f8_bf16/devi
 set_source_files_properties(device_gemm_xdl_universal_preshuffle_f8_f8_bf16/device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma_mn_p5_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 set_source_files_properties(device_gemm_xdl_universal_preshuffle_f8_f8_bf16/device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma_nk_mn_comp_default_instance_p1.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 set_source_files_properties(device_gemm_xdl_universal_preshuffle_f8_f8_bf16/device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma_nk_mn_comp_default_instance_p2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_wmma_universal_preshuffle_f8_f8_bf16/device_gemm_wmma_universal_preshuffle_f8_f8_bf16_mk_wmma_mn_default_instance_p1.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_wmma_universal_preshuffle_f8_f8_bf16/device_gemm_wmma_universal_preshuffle_f8_f8_bf16_mk_wmma_mn_default_instance_p2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_wmma_universal_preshuffle_f8_f8_bf16/device_gemm_wmma_universal_preshuffle_f8_f8_bf16_mk_wmma_mn_default_instance_p3.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_wmma_universal_preshuffle_f8_f8_bf16/device_gemm_wmma_universal_preshuffle_f8_f8_bf16_mk_wmma_mn_default_instance_p4.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")

 add_instance_library(device_gemm_universal_preshuffle_instance ${GEMM_UNIVERSAL_INSTANCES})
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_preshuffle/device_gemm_wmma_universal_preshuffle_f8_f8_bf16/device_gemm_wmma_universal_preshuffle_f8_f8_bf16_mk_wmma_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_preshuffle/device_gemm_wmma_universal_preshuffle_f8_f8_bf16/device_gemm_wmma_universal_preshuffle_f8_f8_bf16_mk_wmma_mn.hpp
@@ -0,0 +1,106 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3_b_preshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
+#include "ck/utility/amd_ck_fp8.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/utility/scheduler_enum.hpp"
+#include "ck/utility/sequence.hpp"
+
+#include <tuple>
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F8   = f8_t;
+using BF16 = bhalf_t;
+using F32  = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault = GemmSpecialization::Default;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+
+static constexpr auto v1 = BlockGemmPipelineVersion::v1;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_universal_preshuffle_wmma_f8_f8_bf16_mk_wmma_mn_instances_p1 = std::tuple<
+    // clang-format off
+        //#####################################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| CShuffle|           A|           B|           C|           GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer| BlkGemmPipeSched| BlkGemmPipelineVer| ComputeTypeA|
+        //#####################################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|    MRepeat|    NRepeat|        ClusterLengths|       ScalarPerVector|                 |                   |             |
+        //#####################################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|               |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |              |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                      |                 |                   |             |
+        //#####################################|        |        |        |     |      |      |        |         |            |            |            |               |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |              |               |               |          |           |           |     _NBlock_NPerBlock|                      |                 |                   |             |
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,   128,   128,  16,  16,   16,   16,       2,       1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1,  8, 1, 32>,            S<4, 4, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    64,   128,   128,  16,  16,   16,   16,       4,       1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1, 16, 1, 16>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    64,   256,   128,  16,  16,   16,   16,       4,       2,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1, 16, 1, 16>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,   256,   128,  16,  16,   16,   16,       2,       2,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1, 16, 1, 16>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,   128,  16,  16,   16,   16,       8,       1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1, 16, 1, 16>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >
+    // clang-format on
+    >;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_universal_preshuffle_wmma_f8_f8_bf16_mk_wmma_mn_instances_p2 = std::tuple<
+    // clang-format off
+        //#####################################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| CShuffle|           A|           B|           C|           GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer| BlkGemmPipeSched| BlkGemmPipelineVer| ComputeTypeA|
+        //#####################################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|    MRepeat|    NRepeat|        ClusterLengths|       ScalarPerVector|                 |                   |             |
+        //#####################################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|               |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |              |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                      |                 |                   |             |
+        //#####################################|        |        |        |     |      |      |        |         |            |            |            |               |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |              |               |               |          |           |           |     _NBlock_NPerBlock|                      |                 |                   |             |
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   256,   128,  16,  16,   16,   16,       8,       2,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1, 16, 1, 16>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   128,   128,  16,  16,   16,   16,       8,       2,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1, 32, 1,  8>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   128,   128,  16,  16,   16,   16,      16,       1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1, 16, 1, 16>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   256,    64,   128,  16,  16,   16,   16,      16,       1,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1, 16, 1,  8>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,    64,   128,  16,  16,   16,   16,       8,       1,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1, 16, 1,  8>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >
+    // clang-format on
+    >;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_universal_preshuffle_wmma_f8_f8_bf16_mk_wmma_mn_instances_p3 = std::tuple<
+    // clang-format off
+        //#####################################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| CShuffle|           A|           B|           C|           GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer| BlkGemmPipeSched| BlkGemmPipelineVer| ComputeTypeA|
+        //#####################################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|    MRepeat|    NRepeat|        ClusterLengths|       ScalarPerVector|                 |                   |             |
+        //#####################################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|               |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |              |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                      |                 |                   |             |
+        //#####################################|        |        |        |     |      |      |        |         |            |            |            |               |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |              |               |               |          |           |           |     _NBlock_NPerBlock|                      |                 |                   |             |
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,   128,   256,  16,  16,   16,   16,       2,       1,      S<16, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,      S<16, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1, 16, 1, 16>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    64,   128,   256,  16,  16,   16,   16,       4,       1,      S<16, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,      S<16, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1, 16, 1, 16>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,   128,   256,  16,  16,   16,   16,       1,       1,      S<16, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,      S<16, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1, 16, 1, 16>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,   256,   256,  16,  16,   16,   16,       1,       2,      S<16, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,      S<16, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1, 16, 1, 16>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,   512,   256,  16,  16,   16,   16,       1,       4,      S<16, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,      S<16, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1, 16, 1, 16>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,   256,  16,  16,   16,   16,       8,       1,      S<16, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,      S<16, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1, 16, 1, 16>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,    64,   256,  16,  16,   16,   16,       8,       1,      S<16,  8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,      S<16,  8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1, 16, 1,  8>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >
+    // clang-format on
+    >;
+
+// Instance list "p4" for the F8 x F8 -> BF16 WMMA B-preshuffle GEMM, templated on the GEMM
+// specialization. Reading the values against the column legend below: every entry uses
+// Row/Col/Row layouts, BlockSize 256, KPerBlock 128 and 16x16 WMMA tiles; the first three
+// entries have MPerBlock 224 (MRepeat 7) and the last three have NPerBlock 224 (NRepeat 7).
+template <GemmSpecialization GemmSpec>
+using device_gemm_universal_preshuffle_wmma_f8_f8_bf16_mk_wmma_mn_instances_p4 = std::tuple<
+    // clang-format off
+        //#####################################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|           GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer| BlkGemmPipeSched| BlkGemmPipelineVer| ComputeTypeA|
+        //#####################################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|    MRepeat|    NRepeat|        ClusterLengths|       ScalarPerVector|                 |                   |             |
+        //#####################################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|               |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |              |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                      |                 |                   |             |
+        //#####################################|        |        |        |     |      |      |        |         |            |            |            |               |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |              |               |               |          |           |           |     _NBlock_NPerBlock|                      |                 |                   |             |
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,   128,  16,  16,   16,   16,       7,       4,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,        S<1, 32, 1, 8>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   128,   128,  16,  16,   16,   16,       7,       2,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,        S<1, 32, 1, 8>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,    64,   128,  16,  16,   16,   16,       7,       1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,        S<1, 32, 1, 8>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,   128,  16,  16,   16,   16,       4,       7,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,        S<1, 64, 1, 4>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   224,   128,  16,  16,   16,   16,       2,       7,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,        S<1, 64, 1, 4>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,  BF16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    64,   224,   128,  16,  16,   16,   16,       1,       7,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,        S<1, 64, 1, 4>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >
+    // clang-format on
+    >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_preshuffle/device_gemm_wmma_universal_preshuffle_f8_f8_bf16/device_gemm_wmma_universal_preshuffle_f8_f8_bf16_mk_wmma_mn_default_instance_p1.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_preshuffle/device_gemm_wmma_universal_preshuffle_f8_f8_bf16/device_gemm_wmma_universal_preshuffle_f8_f8_bf16_mk_wmma_mn_default_instance_p1.cpp
@@ -0,0 +1,33 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "device_gemm_wmma_universal_preshuffle_f8_f8_bf16_mk_wmma_mn.hpp"
+
+#include <memory>
+#include <vector>
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_universal_preshuffle_wmma_f8_f8_bf16_mk_wmma_mn_default_instances_p1(
+    std::vector<std::unique_ptr<DeviceGemmV2BPreshuffle<Row,
+                                                        Col,
+                                                        Row,
+                                                        F8,
+                                                        F8,
+                                                        BF16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances)
+{
+    // The "p1" instance tuple, instantiated with the default GEMM specialization.
+    using DefaultInstances =
+        device_gemm_universal_preshuffle_wmma_f8_f8_bf16_mk_wmma_mn_instances_p1<GemmDefault>;
+
+    // Hand a value-initialized tuple to the shared helper together with the caller's list.
+    add_device_operation_instances(instances, DefaultInstances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_preshuffle/device_gemm_wmma_universal_preshuffle_f8_f8_bf16/device_gemm_wmma_universal_preshuffle_f8_f8_bf16_mk_wmma_mn_default_instance_p2.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_preshuffle/device_gemm_wmma_universal_preshuffle_f8_f8_bf16/device_gemm_wmma_universal_preshuffle_f8_f8_bf16_mk_wmma_mn_default_instance_p2.cpp
@@ -0,0 +1,33 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "device_gemm_wmma_universal_preshuffle_f8_f8_bf16_mk_wmma_mn.hpp"
+
+#include <memory>
+#include <vector>
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_universal_preshuffle_wmma_f8_f8_bf16_mk_wmma_mn_default_instances_p2(
+    std::vector<std::unique_ptr<DeviceGemmV2BPreshuffle<Row,
+                                                        Col,
+                                                        Row,
+                                                        F8,
+                                                        F8,
+                                                        BF16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances)
+{
+    // The "p2" instance tuple, instantiated with the default GEMM specialization.
+    using DefaultInstances =
+        device_gemm_universal_preshuffle_wmma_f8_f8_bf16_mk_wmma_mn_instances_p2<GemmDefault>;
+
+    // Hand a value-initialized tuple to the shared helper together with the caller's list.
+    add_device_operation_instances(instances, DefaultInstances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_preshuffle/device_gemm_wmma_universal_preshuffle_f8_f8_bf16/device_gemm_wmma_universal_preshuffle_f8_f8_bf16_mk_wmma_mn_default_instance_p3.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_preshuffle/device_gemm_wmma_universal_preshuffle_f8_f8_bf16/device_gemm_wmma_universal_preshuffle_f8_f8_bf16_mk_wmma_mn_default_instance_p3.cpp
@@ -0,0 +1,33 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "device_gemm_wmma_universal_preshuffle_f8_f8_bf16_mk_wmma_mn.hpp"
+
+#include <memory>
+#include <vector>
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_universal_preshuffle_wmma_f8_f8_bf16_mk_wmma_mn_default_instances_p3(
+    std::vector<std::unique_ptr<DeviceGemmV2BPreshuffle<Row,
+                                                        Col,
+                                                        Row,
+                                                        F8,
+                                                        F8,
+                                                        BF16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances)
+{
+    // The "p3" instance tuple, instantiated with the default GEMM specialization.
+    using DefaultInstances =
+        device_gemm_universal_preshuffle_wmma_f8_f8_bf16_mk_wmma_mn_instances_p3<GemmDefault>;
+
+    // Hand a value-initialized tuple to the shared helper together with the caller's list.
+    add_device_operation_instances(instances, DefaultInstances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_preshuffle/device_gemm_wmma_universal_preshuffle_f8_f8_bf16/device_gemm_wmma_universal_preshuffle_f8_f8_bf16_mk_wmma_mn_default_instance_p4.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_preshuffle/device_gemm_wmma_universal_preshuffle_f8_f8_bf16/device_gemm_wmma_universal_preshuffle_f8_f8_bf16_mk_wmma_mn_default_instance_p4.cpp
@@ -0,0 +1,33 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "device_gemm_wmma_universal_preshuffle_f8_f8_bf16_mk_wmma_mn.hpp"
+
+#include <memory>
+#include <vector>
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_universal_preshuffle_wmma_f8_f8_bf16_mk_wmma_mn_default_instances_p4(
+    std::vector<std::unique_ptr<DeviceGemmV2BPreshuffle<Row,
+                                                        Col,
+                                                        Row,
+                                                        F8,
+                                                        F8,
+                                                        BF16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances)
+{
+    // The "p4" instance tuple, instantiated with the default GEMM specialization.
+    using DefaultInstances =
+        device_gemm_universal_preshuffle_wmma_f8_f8_bf16_mk_wmma_mn_instances_p4<GemmDefault>;
+
+    // Hand a value-initialized tuple to the shared helper together with the caller's list.
+    add_device_operation_instances(instances, DefaultInstances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_preshuffle/device_gemm_wmma_universal_preshuffle_f8_f8_f16/device_gemm_wmma_universal_preshuffle_f8_f8_f16_mk_wmma_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_preshuffle/device_gemm_wmma_universal_preshuffle_f8_f8_f16/device_gemm_wmma_universal_preshuffle_f8_f8_f16_mk_wmma_mn.hpp
@@ -0,0 +1,106 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma_cshuffle_v3_b_preshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
+#include "ck/utility/amd_ck_fp8.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/utility/scheduler_enum.hpp"
+#include "ck/utility/sequence.hpp"
+
+#include <tuple>
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Short names for the element types written in the instance table rows.
+using F8  = f8_t;
+using F16 = half_t;
+using F32 = float;
+
+// Short names for the GEMM tensor layouts written in the instance table rows.
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+// S<...> abbreviates Sequence<...> so the table rows stay compact.
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+// Element-wise operation passed for the A, B and C operation columns of every row.
+using PassThrough = element_wise::PassThrough;
+
+// GemmDefault is not referenced by the tables visible here; presumably it is the GemmSpec
+// argument supplied by the .cpp files that include this header -- TODO confirm.
+static constexpr auto GemmDefault = GemmSpecialization::Default;
+
+// Value used in the BlkGemmPipeSched column of the tables.
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+
+// Value used in the BlkGemmPipelineVer column of the tables.
+static constexpr auto v1 = BlockGemmPipelineVersion::v1;
+
+// Instance list "p1" for the F8 x F8 -> F16 WMMA B-preshuffle GEMM, templated on the GEMM
+// specialization. Reading the values against the column legend below: five entries, all with
+// Row/Col/Row layouts, BlockSize 256, KPerBlock 128 and 16x16 WMMA tiles; MPerBlock ranges
+// over 32/64/128 and NPerBlock is 128 or 256. Only the first entry uses S<1, 8, 1, 32> with
+// S<4, 4, 1> for the CShuffle block transfer; the others use S<1, 16, 1, 16> with S<8, 8, 1>.
+template <GemmSpecialization GemmSpec>
+using device_gemm_universal_preshuffle_wmma_f8_f8_f16_mk_wmma_mn_instances_p1 = std::tuple<
+    // clang-format off
+        //#####################################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|           GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer| BlkGemmPipeSched| BlkGemmPipelineVer| ComputeTypeA|
+        //#####################################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|    MRepeat|    NRepeat|        ClusterLengths|       ScalarPerVector|                 |                   |             |
+        //#####################################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|               |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |              |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                      |                 |                   |             |
+        //#####################################|        |        |        |     |      |      |        |         |            |            |            |               |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |              |               |               |          |           |           |     _NBlock_NPerBlock|                      |                 |                   |             |
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,   128,   128,  16,  16,   16,   16,       2,       1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1,  8, 1, 32>,            S<4, 4, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    64,   128,   128,  16,  16,   16,   16,       4,       1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1, 16, 1, 16>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    64,   256,   128,  16,  16,   16,   16,       4,       2,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1, 16, 1, 16>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,   256,   128,  16,  16,   16,   16,       2,       2,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1, 16, 1, 16>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,   128,  16,  16,   16,   16,       8,       1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1, 16, 1, 16>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >
+    // clang-format on
+    >;
+
+// Instance list "p2" for the F8 x F8 -> F16 WMMA B-preshuffle GEMM, templated on the GEMM
+// specialization. Reading the values against the column legend below: five entries, all with
+// Row/Col/Row layouts, KPerBlock 128 and 16x16 WMMA tiles. The first three use BlockSize 256
+// (tiles 128x256 and 256x128); the last two use BlockSize 128 with NPerBlock 64.
+template <GemmSpecialization GemmSpec>
+using device_gemm_universal_preshuffle_wmma_f8_f8_f16_mk_wmma_mn_instances_p2 = std::tuple<
+    // clang-format off
+        //#####################################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|           GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer| BlkGemmPipeSched| BlkGemmPipelineVer| ComputeTypeA|
+        //#####################################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|    MRepeat|    NRepeat|        ClusterLengths|       ScalarPerVector|                 |                   |             |
+        //#####################################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|               |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |              |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                      |                 |                   |             |
+        //#####################################|        |        |        |     |      |      |        |         |            |            |            |               |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |              |               |               |          |           |           |     _NBlock_NPerBlock|                      |                 |                   |             |
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   256,   128,  16,  16,   16,   16,       8,       2,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1, 16, 1, 16>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   128,   128,  16,  16,   16,   16,       8,       2,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1, 32, 1,  8>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   128,   128,  16,  16,   16,   16,      16,       1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1, 16, 1, 16>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   256,    64,   128,  16,  16,   16,   16,      16,       1,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1, 16, 1,  8>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,    64,   128,  16,  16,   16,   16,       8,       1,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1, 16, 1,  8>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >
+    // clang-format on
+    >;
+
+// Instance list "p3" for the F8 x F8 -> F16 WMMA B-preshuffle GEMM, templated on the GEMM
+// specialization. Reading the values against the column legend below: seven entries, all with
+// Row/Col/Row layouts, KPerBlock 256 and 16x16 WMMA tiles. The first six use BlockSize 256 with
+// MPerBlock from 16 to 128 and NPerBlock from 128 to 512; the last uses BlockSize 128 with a
+// 128x64 tile.
+template <GemmSpecialization GemmSpec>
+using device_gemm_universal_preshuffle_wmma_f8_f8_f16_mk_wmma_mn_instances_p3 = std::tuple<
+    // clang-format off
+        //#####################################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|           GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer| BlkGemmPipeSched| BlkGemmPipelineVer| ComputeTypeA|
+        //#####################################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|    MRepeat|    NRepeat|        ClusterLengths|       ScalarPerVector|                 |                   |             |
+        //#####################################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|               |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |              |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                      |                 |                   |             |
+        //#####################################|        |        |        |     |      |      |        |         |            |            |            |               |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |              |               |               |          |           |           |     _NBlock_NPerBlock|                      |                 |                   |             |
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,   128,   256,  16,  16,   16,   16,       2,       1,      S<16, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,      S<16, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1, 16, 1, 16>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    64,   128,   256,  16,  16,   16,   16,       4,       1,      S<16, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,      S<16, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1, 16, 1, 16>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,   128,   256,  16,  16,   16,   16,       1,       1,      S<16, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,      S<16, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1, 16, 1, 16>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,   256,   256,  16,  16,   16,   16,       1,       2,      S<16, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,      S<16, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1, 16, 1, 16>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,   512,   256,  16,  16,   16,   16,       1,       4,      S<16, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,      S<16, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1, 16, 1, 16>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,   256,  16,  16,   16,   16,       8,       1,      S<16, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,      S<16, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1, 16, 1, 16>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,    64,   256,  16,  16,   16,   16,       8,       1,      S<16,  8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,      S<16,  8, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,       S<1, 16, 1,  8>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >
+    // clang-format on
+    >;
+
+template <GemmSpecialization GemmSpec> // GemmSpec is forwarded to every tuple entry below
+using device_gemm_universal_preshuffle_wmma_f8_f8_f16_mk_wmma_mn_instances_p4 = std::tuple<
+    // clang-format off
+        //#####################################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| CShuffle|           A|           B|           C|           GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer| BlkGemmPipeSched| BlkGemmPipelineVer| ComputeTypeA|
+        //#####################################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|    MRepeat|    NRepeat|        ClusterLengths|       ScalarPerVector|                 |                   |             |
+        //#####################################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|               |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |              |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                      |                 |                   |             |
+        //#####################################|        |        |        |     |      |      |        |         |            |            |            |               |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |              |               |               |          |           |           |     _NBlock_NPerBlock|                      |                 |                   |             |
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,   128,  16,  16,   16,   16,       7,       4,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,        S<1, 32, 1, 8>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   128,   128,  16,  16,   16,   16,       7,       2,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,        S<1, 32, 1, 8>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,    64,   128,  16,  16,   16,   16,       7,       1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,        S<1, 32, 1, 8>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,   128,  16,  16,   16,   16,       4,       7,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,        S<1, 64, 1, 4>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   224,   128,  16,  16,   16,   16,       2,       7,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,        S<1, 64, 1, 4>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >,
+        DeviceGemm_Wmma_CShuffleV3_BPreshuffle<     Row,     Col,     Row,   F8,    F8,   F16,     F32,      F32, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    64,   224,   128,  16,  16,   16,   16,       1,       7,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         0,          1,          1,        S<1, 64, 1, 4>,            S<8, 8, 1>,        Intrawave,                 v1,            F8 >
+    // clang-format on
+    >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_preshuffle/device_gemm_wmma_universal_preshuffle_f8_f8_f16/device_gemm_wmma_universal_preshuffle_f8_f8_f16_mk_wmma_mn_default_instance_p1.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_preshuffle/device_gemm_wmma_universal_preshuffle_f8_f8_f16/device_gemm_wmma_universal_preshuffle_f8_f8_f16_mk_wmma_mn_default_instance_p1.cpp
@@ -0,0 +1,33 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "device_gemm_wmma_universal_preshuffle_f8_f8_f16_mk_wmma_mn.hpp"
+
+#include <memory>
+#include <vector>
+
+namespace ck::tensor_operation::device::instance {
+
+// Hands the GemmDefault specialization of the `..._instances_p1` list, together with the
+// caller's `instances` vector, to add_device_operation_instances().
+// NOTE(review): that helper is defined outside this file; presumably it appends one
+// operation per list entry to `instances` -- confirm against its definition.
+void add_device_gemm_universal_preshuffle_wmma_f8_f8_f16_mk_wmma_mn_default_instances_p1(
+    std::vector<std::unique_ptr<DeviceGemmV2BPreshuffle<Row,
+                                                        Col,
+                                                        Row,
+                                                        F8,
+                                                        F8,
+                                                        F16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances)
+{
+    // Name the instance list once so the registration call fits on one line.
+    using DefaultInstancesP1 =
+        device_gemm_universal_preshuffle_wmma_f8_f8_f16_mk_wmma_mn_instances_p1<GemmDefault>;
+
+    add_device_operation_instances(instances, DefaultInstancesP1{});
+}
+
+} // namespace ck::tensor_operation::device::instance
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_preshuffle/device_gemm_wmma_universal_preshuffle_f8_f8_f16/device_gemm_wmma_universal_preshuffle_f8_f8_f16_mk_wmma_mn_default_instance_p2.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_preshuffle/device_gemm_wmma_universal_preshuffle_f8_f8_f16/device_gemm_wmma_universal_preshuffle_f8_f8_f16_mk_wmma_mn_default_instance_p2.cpp
@@ -0,0 +1,33 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "device_gemm_wmma_universal_preshuffle_f8_f8_f16_mk_wmma_mn.hpp"
+
+#include <memory>
+#include <vector>
+
+namespace ck::tensor_operation::device::instance {
+
+// Hands the GemmDefault specialization of the `..._instances_p2` list, together with the
+// caller's `instances` vector, to add_device_operation_instances().
+// NOTE(review): that helper is defined outside this file; presumably it appends one
+// operation per list entry to `instances` -- confirm against its definition.
+void add_device_gemm_universal_preshuffle_wmma_f8_f8_f16_mk_wmma_mn_default_instances_p2(
+    std::vector<std::unique_ptr<DeviceGemmV2BPreshuffle<Row,
+                                                        Col,
+                                                        Row,
+                                                        F8,
+                                                        F8,
+                                                        F16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances)
+{
+    // Name the instance list once so the registration call fits on one line.
+    using DefaultInstancesP2 =
+        device_gemm_universal_preshuffle_wmma_f8_f8_f16_mk_wmma_mn_instances_p2<GemmDefault>;
+
+    add_device_operation_instances(instances, DefaultInstancesP2{});
+}
+
+} // namespace ck::tensor_operation::device::instance
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_preshuffle/device_gemm_wmma_universal_preshuffle_f8_f8_f16/device_gemm_wmma_universal_preshuffle_f8_f8_f16_mk_wmma_mn_default_instance_p3.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_preshuffle/device_gemm_wmma_universal_preshuffle_f8_f8_f16/device_gemm_wmma_universal_preshuffle_f8_f8_f16_mk_wmma_mn_default_instance_p3.cpp
@@ -0,0 +1,33 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "device_gemm_wmma_universal_preshuffle_f8_f8_f16_mk_wmma_mn.hpp"
+
+#include <memory>
+#include <vector>
+
+namespace ck::tensor_operation::device::instance {
+
+// Hands the GemmDefault specialization of the `..._instances_p3` list, together with the
+// caller's `instances` vector, to add_device_operation_instances().
+// NOTE(review): that helper is defined outside this file; presumably it appends one
+// operation per list entry to `instances` -- confirm against its definition.
+void add_device_gemm_universal_preshuffle_wmma_f8_f8_f16_mk_wmma_mn_default_instances_p3(
+    std::vector<std::unique_ptr<DeviceGemmV2BPreshuffle<Row,
+                                                        Col,
+                                                        Row,
+                                                        F8,
+                                                        F8,
+                                                        F16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances)
+{
+    // Name the instance list once so the registration call fits on one line.
+    using DefaultInstancesP3 =
+        device_gemm_universal_preshuffle_wmma_f8_f8_f16_mk_wmma_mn_instances_p3<GemmDefault>;
+
+    add_device_operation_instances(instances, DefaultInstancesP3{});
+}
+
+} // namespace ck::tensor_operation::device::instance
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_preshuffle/device_gemm_wmma_universal_preshuffle_f8_f8_f16/device_gemm_wmma_universal_preshuffle_f8_f8_f16_mk_wmma_mn_default_instance_p4.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_preshuffle/device_gemm_wmma_universal_preshuffle_f8_f8_f16/device_gemm_wmma_universal_preshuffle_f8_f8_f16_mk_wmma_mn_default_instance_p4.cpp
@@ -0,0 +1,33 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "device_gemm_wmma_universal_preshuffle_f8_f8_f16_mk_wmma_mn.hpp"
+
+#include <memory>
+#include <vector>
+
+namespace ck::tensor_operation::device::instance {
+
+// Hands the GemmDefault specialization of the `..._instances_p4` list, together with the
+// caller's `instances` vector, to add_device_operation_instances().
+// NOTE(review): that helper is defined outside this file; presumably it appends one
+// operation per list entry to `instances` -- confirm against its definition.
+void add_device_gemm_universal_preshuffle_wmma_f8_f8_f16_mk_wmma_mn_default_instances_p4(
+    std::vector<std::unique_ptr<DeviceGemmV2BPreshuffle<Row,
+                                                        Col,
+                                                        Row,
+                                                        F8,
+                                                        F8,
+                                                        F16,
+                                                        PassThrough,
+                                                        PassThrough,
+                                                        PassThrough>>>& instances)
+{
+    // Name the instance list once so the registration call fits on one line.
+    using DefaultInstancesP4 =
+        device_gemm_universal_preshuffle_wmma_f8_f8_f16_mk_wmma_mn_instances_p4<GemmDefault>;
+
+    add_device_operation_instances(instances, DefaultInstancesP4{});
+}
+
+} // namespace ck::tensor_operation::device::instance