Merge commit 'd5ae81b2922773f7cdf4a02a2e1fd57d0e4df851' into develop

2026-05-20 21:09:08 +00:00 · 2026-01-20 22:14:29 +00:00
parent 5f61470a1f
commit b2c76ff10f
22 changed files with 2956 additions and 499 deletions
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add.hpp
@@ -20,6 +20,52 @@ namespace tensor_operation {
 namespace device {
 namespace instance {

+#ifdef CK_USE_WMMA
+#ifdef CK_ENABLE_FP16
+void add_device_batched_gemm_add_relu_gemm_add_wmma_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultipleDGemmMultipleD<Row,
+                                                                        Col,
+                                                                        ck::Tuple<Row>,
+                                                                        Row,
+                                                                        ck::Tuple<Row>,
+                                                                        Row,
+                                                                        F16,
+                                                                        F16,
+                                                                        ck::Tuple<F16>,
+                                                                        F16,
+                                                                        ck::Tuple<F16>,
+                                                                        F16,
+                                                                        PassThrough,
+                                                                        PassThrough,
+                                                                        CDE0ElementOp,
+                                                                        PassThrough,
+                                                                        CDE1ElementOp>>>&
+        instances);
+
+void add_device_batched_gemm_add_relu_gemm_add_wmma_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultipleDGemmMultipleD<Row,
+                                                                        Col,
+                                                                        ck::Tuple<Row>,
+                                                                        Col,
+                                                                        ck::Tuple<Row>,
+                                                                        Row,
+                                                                        F16,
+                                                                        F16,
+                                                                        ck::Tuple<F16>,
+                                                                        F16,
+                                                                        ck::Tuple<F16>,
+                                                                        F16,
+                                                                        PassThrough,
+                                                                        PassThrough,
+                                                                        CDE0ElementOp,
+                                                                        PassThrough,
+                                                                        CDE1ElementOp>>>&
+        instances);
+#endif // CK_ENABLE_FP16
+#endif // CK_USE_WMMA
+
+#ifdef CK_USE_XDL
+#ifdef CK_ENABLE_FP16
 void add_device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance(
    std::vector<std::unique_ptr<DeviceBatchedGemmMultipleDGemmMultipleD<Row,
                                                                        Col,
@@ -59,7 +105,8 @@ void add_device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_
                                                                        PassThrough,
                                                                        CDE1ElementOp>>>&
        instances);
-
+#endif // CK_ENABLE_FP16
+#endif // CK_USE_XDL
 template <typename A0Layout,
          typename B0Layout,
          typename D0sLayout,
@@ -113,22 +160,36 @@ struct DeviceOperationInstanceFactory<
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;

+#ifdef CK_ENABLE_FP16
        if constexpr(is_same_v<A0DataType, half_t> && is_same_v<B0DataType, half_t> &&
                     is_same_v<B1DataType, half_t> && is_same_v<E1DataType, half_t>)
        {
            if constexpr(is_same_v<A0Layout, Row> && is_same_v<B0Layout, Col> &&
                         is_same_v<B1Layout, Row> && is_same_v<E1Layout, Row>)
            {
+#ifdef CK_USE_XDL
                add_device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance(
                    op_ptrs);
+#endif
+#ifdef CK_USE_WMMA
+                add_device_batched_gemm_add_relu_gemm_add_wmma_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance(
+                    op_ptrs);
+#endif
            }
            else if constexpr(is_same_v<A0Layout, Row> && is_same_v<B0Layout, Col> &&
                              is_same_v<B1Layout, Col> && is_same_v<E1Layout, Row>)
            {
+#ifdef CK_USE_XDL
                add_device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance(
                    op_ptrs);
+#endif
+#ifdef CK_USE_WMMA
+                add_device_batched_gemm_add_relu_gemm_add_wmma_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance(
+                    op_ptrs);
+#endif
            }
        }
+#endif
        return op_ptrs;
    }
 };
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/CMakeLists.txt
@@ -1,8 +1,11 @@
 # Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 # SPDX-License-Identifier: MIT

-# ONLY XDL_KERNELS
+# ONLY XDL_AND_WMMA_KERNELS
 add_instance_library(device_batched_gemm_add_relu_gemm_add_instance
    device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
    device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp
+
+    device_batched_gemm_add_relu_gemm_add_wmma_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
+    device_batched_gemm_add_relu_gemm_add_wmma_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp
 )
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_wmma_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_wmma_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
@@ -0,0 +1,72 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough   = ck::tensor_operation::element_wise::PassThrough;
+using CDE0ElementOp = ck::tensor_operation::element_wise::AddRelu;
+using CDE1ElementOp = ck::tensor_operation::element_wise::Add;
+
+// c[g, m, n] = a[g, m, k] * b[g, n, k]
+using device_batched_gemm_add_relu_gemm_add_wmma_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances =
+    std::tuple<
+        // clang-format off
+        //#####################################################| A0Layout| B0Layout|       D0Layout| B1Layout|      D1sLayout| E1Layout| A0Data| B0Data|     D0DataType| B1Data|     D1DataType| E1Data| AccData| CShuffle|           A0|          B0|          CDE0|          B1|          CDE1|              GemmSpecialization| Block|  Gemm0| Gemm0| Gemm0| Gemm1| Gemm1|A0K1|B0K1| B1K1| MPer| NPer| MRepeat| LRepeat| NRepeat|A0BlockTransfer|A0BlockTransfer|A0BlockTransfer|A0BlockTransfer|A0BlockTransfer|A0BlockTransfer|A0BlockLds|  B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds| CDE0BlockTransfer|  B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds|   C1Shuffle|   C1Shuffle| CDE1BlockTransferClusterLengths| CDE1BlockTransfer|
+        //#####################################################|         |         |               |         |               |         |   Type|   Type|               |   Type|               |   Type|    Type| DataType|  Elementwise| Elementwise|   Elementwise| Elementwise|   Elementwise|                                |  Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     | WMMA| WMMA|        |        |        |  ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|         SrcScalar|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|     MRepeat|     NRepeat|                 _MBlock_MRepeat|   ScalarPerVector|
+        //#####################################################|         |         |               |         |               |         |       |       |               |       |               |       |        |         |    Operation|   Operation|     Operation|   Operation|     Operation|                                |      |  Block| Block| Block| Block| Block|    |    |     |     |     |        |        |        |Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |         PerVector|  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  PerShuffle|  PerShuffle|                 _NBlock_NRepeat|          _NRepeat|
+        //#####################################################|         |         |               |         |               |         |       |       |               |       |               |       |        |         |             |            |              |            |              |                                |      |       |      |      |      |      |    |    |     |     |     |        |        |        |               |               |               |               |               |               |          |                 |                |                |                |                |                |           |                  |                 |                |                |                |                |                |           |            |            |                                |                  |
+        // No padding
+        DeviceBatchedGemmMultipleDGemmMultipleD_Wmma_CShuffleV3<      Row,      Col, ck::Tuple<Row>,      Row, ck::Tuple<Row>,      Row,    F16,    F16, ck::Tuple<F16>,    F16, ck::Tuple<F16>,    F16,     F32,      F32,  PassThrough, PassThrough, CDE0ElementOp, PassThrough, CDE1ElementOp,     GemmSpecialization::Default,    32,     16,    64,    64,    64,    64,   8,   8,    8,   16,   16,       1,       4,       4,    S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<2, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,                 4,      S<2, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,       true,           1,           2,                  S<1, 16, 1, 2>,                8>,
+        // Fallback with padding
+        DeviceBatchedGemmMultipleDGemmMultipleD_Wmma_CShuffleV3<      Row,      Col, ck::Tuple<Row>,      Row, ck::Tuple<Row>,      Row,    F16,    F16, ck::Tuple<F16>,    F16, ck::Tuple<F16>,    F16,     F32,      F32,  PassThrough, PassThrough, CDE0ElementOp, PassThrough, CDE1ElementOp, GemmSpecialization::MNKOPadding,    32,     16,    64,    64,    64,    64,   8,   8,    8,   16,   16,       1,       4,       4,    S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,     false,      S<2, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               1,               8,      false,                 1,      S<2, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               1,               1,       true,           1,           2,                  S<1, 16, 1, 2>,                1>
+        // clang-format on
+        >;
+
+void add_device_batched_gemm_add_relu_gemm_add_wmma_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultipleDGemmMultipleD<Row,
+                                                                        Col,
+                                                                        ck::Tuple<Row>,
+                                                                        Row,
+                                                                        ck::Tuple<Row>,
+                                                                        Row,
+                                                                        F16,
+                                                                        F16,
+                                                                        ck::Tuple<F16>,
+                                                                        F16,
+                                                                        ck::Tuple<F16>,
+                                                                        F16,
+                                                                        PassThrough,
+                                                                        PassThrough,
+                                                                        CDE0ElementOp,
+                                                                        PassThrough,
+                                                                        CDE1ElementOp>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_batched_gemm_add_relu_gemm_add_wmma_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_wmma_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_wmma_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp
@@ -0,0 +1,72 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_wmma_cshuffle_v3.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough   = ck::tensor_operation::element_wise::PassThrough;
+using CDE0ElementOp = ck::tensor_operation::element_wise::AddRelu;
+using CDE1ElementOp = ck::tensor_operation::element_wise::Add;
+
+// c[g, m, n] = a[g, m, k] * b[g, n, k]
+using device_batched_gemm_add_relu_gemm_add_wmma_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instances =
+    std::tuple<
+        // clang-format off
+        //#####################################################| A0Layout| B0Layout|       D0Layout| B1Layout|      D1sLayout| E1Layout| A0Data| B0Data|     D0DataType| B1Data|     D1DataType| E1Data| AccData| CShuffle|           A0|          B0|          CDE0|          B1|          CDE1|              GemmSpecialization| Block|  Gemm0| Gemm0| Gemm0| Gemm1| Gemm1|A0K1|B0K1| B1K1| MPer| NPer| MRepeat| LRepeat| NRepeat|A0BlockTransfer|A0BlockTransfer|A0BlockTransfer|A0BlockTransfer|A0BlockTransfer|A0BlockTransfer|A0BlockLds|  B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds| CDE0BlockTransfer|  B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds|   C1Shuffle|   C1Shuffle| CDE1BlockTransferClusterLengths| CDE1BlockTransfer|
+        //#####################################################|         |         |               |         |               |         |   Type|   Type|               |   Type|               |   Type|    Type| DataType|  Elementwise| Elementwise|   Elementwise| Elementwise|   Elementwise|                                |  Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     | WMMA| WMMA|        |        |        |  ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|         SrcScalar|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|     MRepeat|     NRepeat|                 _MBlock_MRepeat|   ScalarPerVector|
+        //#####################################################|         |         |               |         |               |         |       |       |               |       |               |       |        |         |    Operation|   Operation|     Operation|   Operation|     Operation|                                |      |  Block| Block| Block| Block| Block|    |    |     |     |     |        |        |        |Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |         PerVector|  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  PerShuffle|  PerShuffle|                 _NBlock_NRepeat|          _NRepeat|
+        //#####################################################|         |         |               |         |               |         |       |       |               |       |               |       |        |         |             |            |              |            |              |                                |      |       |      |      |      |      |    |    |     |     |     |        |        |        |               |               |               |               |               |               |          |                 |                |                |                |                |                |           |                  |                 |                |                |                |                |                |           |            |            |                                |                  |
+        // No padding
+        DeviceBatchedGemmMultipleDGemmMultipleD_Wmma_CShuffleV3<      Row,      Col, ck::Tuple<Row>,      Col, ck::Tuple<Row>,      Row,    F16,    F16, ck::Tuple<F16>,    F16, ck::Tuple<F16>,    F16,     F32,      F32,  PassThrough, PassThrough, CDE0ElementOp, PassThrough, CDE1ElementOp,     GemmSpecialization::Default,    32,     16,    64,    64,    64,    64,   8,   8,    8,   16,   16,       1,       4,       4,    S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<2, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,                 4,      S<2, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               4,               2,       true,           1,           2,                  S<1, 16, 1, 2>,                8>,
+        // Fallback with padding
+        DeviceBatchedGemmMultipleDGemmMultipleD_Wmma_CShuffleV3<      Row,      Col, ck::Tuple<Row>,      Col, ck::Tuple<Row>,      Row,    F16,    F16, ck::Tuple<F16>,    F16, ck::Tuple<F16>,    F16,     F32,      F32,  PassThrough, PassThrough, CDE0ElementOp, PassThrough, CDE1ElementOp, GemmSpecialization::MNKOPadding,    32,     16,    64,    64,    64,    64,   8,   8,    8,   16,   16,       1,       4,       4,    S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,     false,      S<2, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               1,               8,      false,                 1,      S<2, 16, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               1,               2,       true,           1,           2,                  S<1, 16, 1, 2>,                1>
+        // clang-format on
+        >;
+
+void add_device_batched_gemm_add_relu_gemm_add_wmma_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance(
+    std::vector<std::unique_ptr<DeviceBatchedGemmMultipleDGemmMultipleD<Row,
+                                                                        Col,
+                                                                        ck::Tuple<Row>,
+                                                                        Col,
+                                                                        ck::Tuple<Row>,
+                                                                        Row,
+                                                                        F16,
+                                                                        F16,
+                                                                        ck::Tuple<F16>,
+                                                                        F16,
+                                                                        ck::Tuple<F16>,
+                                                                        F16,
+                                                                        PassThrough,
+                                                                        PassThrough,
+                                                                        CDE0ElementOp,
+                                                                        PassThrough,
+                                                                        CDE1ElementOp>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_batched_gemm_add_relu_gemm_add_wmma_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck