Add WMMA c_shuffle gemm_add_fastgelu f16 instances for mk_kn, mk_nk, km_kn and km_nk layouts

2026-06-07 08:15:04 +00:00 · 2025-06-10 17:23:44 +00:00
parent 881bc3fbb0
commit ea9805b580
10 changed files with 390 additions and 82 deletions
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp
@@ -97,8 +97,47 @@ void add_device_gemm_add_fastgelu_xdl_c_shuffle_bf16_i8_bf16_bf16_mk_kn_mn_mn_in
 #endif // CK_USE_XDL

 #if defined(CK_USE_WMMA)
-void add_device_gemm_add_fastgelu_wmma_universal_f16_f16_f16_f16_mk_nk_mn_mn_instances(
+void add_device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances(
    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Row,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddFastGelu>>>&);
+
+void add_device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddFastGelu>>>&);
+
+void add_device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Col,
+                                                          Row,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddFastGelu>>>&);
+
+void add_device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Col,
                                                          Col,
                                                          Row_Tuple,
                                                          Row,
@@ -159,16 +198,32 @@ struct DeviceOperationInstanceFactory<
        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
                     is_same_v<D0DataType, half_t> && is_same_v<EDataType, half_t>)
        {
-            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                         is_same_v<D0Layout, Row> && is_same_v<ELayout, Row>)
            {
-                add_device_gemm_add_fastgelu_wmma_universal_f16_f16_f16_f16_mk_nk_mn_mn_instances(
+                add_device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<D0Layout, Row> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
+                              is_same_v<D0Layout, Row> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
+                              is_same_v<D0Layout, Row> && is_same_v<ELayout, Row>)
+            {
+                add_device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances(
                    op_ptrs);
            }
        }
-
-        // TODO: Add other types and layouts
-
+        
 #endif // CK_ENABLE_FP16
 #endif // CK_USE_WMMA

--- a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/CMakeLists.txt
@@ -1,6 +1,9 @@
 # ONLY XDL_AND_WMMA_KERNELS
 add_instance_library(device_gemm_add_fastgelu_instance
-    device_gemm_add_fastgelu_wmma_universal_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
+    device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
+    device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
+    device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
+    device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp

    device_gemm_add_fastgelu_xdl_c_shuffle_bf16_i8_bf16_bf16_mk_kn_mn_mn_instance.cpp
    device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
--- a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmDefault    = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+
+// e = elementwise((a * b), d0)
+// elementwise(c, d0) = fastgelu(c + d0)
+// output: e[m, n]
+// input: a[k, m], b[k, n], d0[m, n]
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance =
+    std::tuple<
+        // clang-format off
+        //##################################| ALayout| BLayout|  DsLayout| ELayout| AData| BData|    DsData| EData| AccData| CShuffle|           A|           B|         CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|     BlkGemm|
+        //##################################|        |        |          |        |  Type|  Type|      Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched| PipelineVer|
+        //##################################|        |        |          |        |      |      |          |      |        |         |   Operation|   Operation|   Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |            |
+        //##################################|        |        |          |        |      |      |          |      |        |         |            |            |            |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<       Col,     Row, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Intrawave,          V3>
+        // clang-format on
+        >;
+
+void add_device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Col,
+                                                          Row,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddFastGelu>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance<GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance<GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+static constexpr auto GemmDefault    = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+
+static constexpr auto V1 = BlockGemmPipelineVersion::v1;
+static constexpr auto V3 = BlockGemmPipelineVersion::v3;
+
+// e = elementwise((a * b), d0)
+// elementwise(c, d0) = fastgelu(c + d0)
+// output: e[m, n]
+// input: a[k, m], b[n, k], d0[m, n]
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances =
+    std::tuple<
+        // clang-format off
+        //##################################| ALayout| BLayout|  DsLayout| ELayout| AData| BData|    DsData| EData| AccData| CShuffle|           A|           B|          CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|     BlkGemm|
+        //##################################|        |        |          |        |  Type|  Type|      Type|  Type|    Type| DataType| Elementwise| Elementwise|  Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched| PipelineVer|
+        //##################################|        |        |          |        |      |      |          |      |        |         |   Operation|   Operation|    Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |            |
+        //##################################|        |        |          |        |      |      |          |      |        |         |            |            |             |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 64, 1, 2>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 2>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Col,     Col, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,    64,    64,    32,    64,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Intrawave,          V3>
+        // clang-format on
+        >;
+
+void add_device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Col,
+                                                          Col,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddFastGelu>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances<GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances<GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Values> // shorthand for the integer sequences in the instance table
+using S = ck::Sequence<Values...>;
+
+// GEMM specializations the instance list below is instantiated with.
+static constexpr GemmSpecialization GemmDefault    = GemmSpecialization::Default;
+static constexpr GemmSpecialization GemmMNKPadding = GemmSpecialization::MNKPadding;
+// Shorthands for the block-GEMM pipeline scheduler and pipeline version template arguments.
+static constexpr BlockGemmPipelineScheduler Interwave = BlockGemmPipelineScheduler::Interwave;
+static constexpr BlockGemmPipelineScheduler Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr BlockGemmPipelineVersion V1 = BlockGemmPipelineVersion::v1;
+static constexpr BlockGemmPipelineVersion V3 = BlockGemmPipelineVersion::v3;
+
+// e = elementwise((a * b), d0)
+// elementwise(c, d0) = fastgelu(c + d0)
+// output: e[m, n]
+// input: a[m, k], b[k, n], d0[m, n]
+
+template <GemmSpecialization GemmSpec> // GemmSpec is passed unchanged to every entry below
+using device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances =
+    std::tuple< // entries share layouts, data types and element-wise ops; tuning columns vary
+        // clang-format off
+        //##################################| ALayout| BLayout|  DsLayout| ELayout| AData| BData|    DsData| EData| AccData| CShuffle|           A|           B|          CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|     BlkGemm|
+        //##################################|        |        |          |        |  Type|  Type|      Type|  Type|    Type| DataType| Elementwise| Elementwise|  Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched| PipelineVer|
+        //##################################|        |        |          |        |      |      |          |      |        |         |   Operation|   Operation|    Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |            |
+        //##################################|        |        |          |        |      |      |          |      |        |         |            |            |             |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,              S<8, 8, 8>, Intrawave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         0,          1,          1,        S<1, 32, 1, 2>,              S<8, 8, 8>, Interwave,          V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>, Intrawave,          V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Row, Row_Tuple,     Row,   F16,    F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,              S<8, 8, 8>, Intrawave,          V3>
+        // clang-format on
+        >;
+
+// Hands the GemmDefault list, then the GemmMNKPadding list, to add_device_operation_instances.
+void add_device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Row,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddFastGelu>>>& instances)
+{
+    using DefaultList = device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances<GemmDefault>;
+    using PaddedList  = device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances<GemmMNKPadding>;
+
+    add_device_operation_instances(instances, DefaultList{});
+    add_device_operation_instances(instances, PaddedList{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <ck::index_t... Values> // shorthand for the integer sequences in the instance table
+using S = ck::Sequence<Values...>;
+
+// GEMM specializations the instance list below is instantiated with.
+static constexpr GemmSpecialization GemmDefault    = GemmSpecialization::Default;
+static constexpr GemmSpecialization GemmMNKPadding = GemmSpecialization::MNKPadding;
+// Shorthands for the block-GEMM pipeline scheduler and pipeline version template arguments.
+static constexpr BlockGemmPipelineScheduler Interwave = BlockGemmPipelineScheduler::Interwave;
+static constexpr BlockGemmPipelineScheduler Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr BlockGemmPipelineVersion V1 = BlockGemmPipelineVersion::v1;
+static constexpr BlockGemmPipelineVersion V3 = BlockGemmPipelineVersion::v3;
+
+// e = elementwise((a * b), d0)
+// elementwise(c, d0) = fastgelu(c + d0)
+// output: e[m, n]
+// input: a[m, k], b[n, k], d0[m, n]
+
+template <GemmSpecialization GemmSpec> // passed unchanged to every entry; only tuning columns vary
+using device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances = std::tuple<
+        // clang-format off
+        //##################################| ALayout| BLayout|  DsLayout| ELayout| AData| BData|    DsData| EData| AccData| CShuffle|           A|           B|         CDE| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|     BlkGemm|
+        //##################################|        |        |          |        |  Type|  Type|      Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched| PipelineVer|
+        //##################################|        |        |          |        |      |      |          |      |        |         |   Operation|   Operation|   Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |            |
+        //##################################|        |        |          |        |      |      |          |      |        |         |            |            |            |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |            |
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 64, 1, 4>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 64, 1, 2>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>,  Intrawave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>,  Interwave,         V1>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,              S<8, 8, 8>,  Intrawave,         V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Intrawave,         V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,              S<8, 8, 8>,  Intrawave,         V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,              S<8, 8, 8>,  Intrawave,         V3>,
+        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough, AddFastGelu, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,              S<8, 8, 8>,  Intrawave,         V3>
+        // clang-format on
+    >;
+
+// Hands the GemmDefault list, then the GemmMNKPadding list, to add_device_operation_instances.
+void add_device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Row_Tuple,
+                                                          Row,
+                                                          F16,
+                                                          F16,
+                                                          F16_Tuple,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          AddFastGelu>>>& instances)
+{
+    using DefaultList = device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances<GemmDefault>;
+    using PaddedList  = device_gemm_add_fastgelu_wmma_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances<GemmMNKPadding>;
+
+    add_device_operation_instances(instances, DefaultList{});
+    add_device_operation_instances(instances, PaddedList{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_wmma_universal_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_wmma_universal_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
@@ -1,73 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3.hpp"
-#include "ck/utility/sequence.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-static constexpr auto GemmDefault    = ck::tensor_operation::device::GemmSpecialization::Default;
-static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
-static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
-static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
-
-// e = elementwise((a * b), d0)
-// elementwise(c, d0) = fastgelu(c + d0)
-// output: e[m, n]
-// input: a[m, k], b[n, k], d0[m, n]
-
-using device_gemm_add_fastgelu_wmma_universal_f16_f16_f16_f16_mk_nk_mn_mn_generic_instances =
-    std::tuple<
-        // clang-format off
-        //##################################| ALayout| BLayout|  DsLayout| ELayout| AData| BData|    DsData| EData| AccData| CShuffle|           A|           B|          CDE|       GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|                      BlkGemm|
-        //##################################|        |        |          |        |  Type|  Type|      Type|  Type|    Type| DataType| Elementwise| Elementwise|  Elementwise|               |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched|                  PipelineVer|
-        //##################################|        |        |          |        |      |      |          |      |        |         |   Operation|   Operation|    Operation|               |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |                             |
-        //##################################|        |        |          |        |      |      |          |      |        |         |            |            |             |               |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |                             |
-        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,  AddFastGelu, GemmMNKPadding,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,       S<1, 32, 1, 4>,               S<8, 8, 8>, Interwave, BlockGemmPipelineVersion::v1>
-        // clang-format on
-        >;
-
-using device_gemm_add_fastgelu_wmma_universal_f16_f16_f16_f16_mk_nk_mn_mn_instances = std::tuple<
-    // clang-format off
-        //##################################| ALayout| BLayout|  DsLayout| ELayout| AData| BData|    DsData| EData| AccData| CShuffle|           A|           B|          CDE|       GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CDEShuffleBlockTransfer|   BlkGemm|                      BlkGemm|
-        //##################################|        |        |          |        |  Type|  Type|      Type|  Type|    Type| DataType| Elementwise| Elementwise|  Elementwise|               |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|        ScalarPerVectors| PipeSched|                  PipelineVer|
-        //##################################|        |        |          |        |      |      |          |      |        |         |   Operation|   Operation|    Operation|               |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|                        |          |                             |
-        //##################################|        |        |          |        |      |      |          |      |        |         |            |            |             |               |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                        |          |                             |
-        DeviceGemmMultipleD_Wmma_CShuffleV3<      Row,     Col, Row_Tuple,     Row,   F16,   F16, F16_Tuple,   F16,     F32,      F32, PassThrough, PassThrough,  AddFastGelu,    GemmDefault,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,       S<1, 32, 1, 4>,               S<8, 8, 8>, Intrawave, BlockGemmPipelineVersion::v1>
-    // clang-format on
-    >;
-
-void add_device_gemm_add_fastgelu_wmma_universal_f16_f16_f16_f16_mk_nk_mn_mn_instances(
-    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
-                                                          Col,
-                                                          Row_Tuple,
-                                                          Row,
-                                                          F16,
-                                                          F16,
-                                                          F16_Tuple,
-                                                          F16,
-                                                          PassThrough,
-                                                          PassThrough,
-                                                          AddFastGelu>>>& instances)
-{
-    add_device_operation_instances(
-        instances,
-        device_gemm_add_fastgelu_wmma_universal_f16_f16_f16_f16_mk_nk_mn_mn_generic_instances{});
-    add_device_operation_instances(
-        instances, device_gemm_add_fastgelu_wmma_universal_f16_f16_f16_f16_mk_nk_mn_mn_instances{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_bf16_i8_bf16_bf16_mk_kn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_bf16_i8_bf16_bf16_mk_kn_mn_mn_instance.cpp
@@ -47,7 +47,7 @@ using device_gemm_add_fastgelu_xdl_c_shuffle_bf16_i8_bf16_bf16_mk_kn_mn_mn_insta
    >;

 void add_device_gemm_add_fastgelu_xdl_c_shuffle_bf16_i8_bf16_bf16_mk_kn_mn_mn_instances(
-    std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
                                                    Row,
                                                    Row_Tuple,
                                                    Row,
--- a/test/gemm_add/CMakeLists.txt
+++ b/test/gemm_add/CMakeLists.txt
@@ -1,3 +1,5 @@
+# Declares the gtest executables for the GEMM MultipleD tests, covering both XDL and WMMA builds.
+
 add_gtest_executable(test_gemm_add_xdl test_gemm_add_xdl.cpp)
 if(result EQUAL 0)
    target_link_libraries(test_gemm_add_xdl PRIVATE utility device_gemm_add_instance)
--- a/test/gemm_add/test_gemm_add_fastgelu_wmma.cpp
+++ b/test/gemm_add/test_gemm_add_fastgelu_wmma.cpp
@@ -34,7 +34,10 @@ class TestGemmAddFastgelu : public TestGemmD0Common<Tuple>
    decltype(ProfileGemmAddFastgeluImpl) GetImpl() override { return ProfileGemmAddFastgeluImpl; }
 };

-using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, F16, F16, Row, Col, Row, Row>>;
+using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, F16, F16, Row, Row, Row, Row>, // new: Row/Row (presumably A/B layouts)
+                                     std::tuple<F16, F16, F32, F16, F16, Row, Col, Row, Row>, // unchanged: the previously tested case
+                                     std::tuple<F16, F16, F32, F16, F16, Col, Row, Row, Row>, // new: Col/Row - TODO confirm instances exist
+                                     std::tuple<F16, F16, F32, F16, F16, Col, Col, Row, Row>>; // new: Col/Col - TODO confirm instances exist

 TYPED_TEST_SUITE(TestGemmAddFastgelu, KernelTypes);
 TYPED_TEST(TestGemmAddFastgelu, Test_BF16FP16) { this->Run(); }