Grouped Conv Bwd Weight Direct Load (#3648)

* Grouped Conv Bwd Weight Direct Load * Update gridwise_gemm_xdl_cshuffle_conv_v3.hpp * Implement group merging for bwd_weight and add instances * Link direct load instances * builder fixes * fix * fixes * fix --------- Co-authored-by: Graner, Johannes <johannes.graner@amd.com> [ROCm/composable_kernel commit: 83b58bb0c3]
2026-05-14 02:02:46 +00:00 · 2026-01-28 22:31:54 +01:00
parent ee4e216716
commit 017d96faaa
18 changed files with 578 additions and 194 deletions
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp
@@ -101,6 +101,55 @@ using device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f16_instances = std::tuple
    // clang-format on
    >;

+template <ck::index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename ELayout,
+          ConvolutionBackwardWeightSpecialization ConvSpec,
+          BlockGemmPipelineScheduler Scheduler,
+          BlockGemmPipelineVersion PipelineVersion>
+using device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f16_direct_load_instances = std::tuple<
+    // clang-format off
+        //#########################################|     Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| Compute| Compute| Direct|
+        //#########################################|     Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|    Data|    Data|   Load|
+        //#########################################| Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|    Type|    Type|       |
+        //#########################################|        |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |        |        |       |
+        DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                 ConvSpec,   128,    16,   32,    64,   8,     16,   16,    1,    1,     S<8,  2, 8>,      S<0, 2, 1>,S<0, 2, 1>,               1,              8,              8,          0,    S<4,  4, 8>,      S<0, 2, 1>,S<0, 2, 1>,               1,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  Scheduler, PipelineVersion, F16,  F16, true>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                 ConvSpec,   128,    16,   32,    64,   8,     16,   16,    1,    1,     S<8,  2, 8>,      S<0, 2, 1>,S<0, 2, 1>,               1,              8,              8,          0,    S<4,  4, 8>,      S<0, 2, 1>,S<0, 2, 1>,               1,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  Scheduler, PipelineVersion, F16,  F16, true, 2>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                 ConvSpec,   128,    16,   64,    64,   8,     16,   16,    1,    2,     S<8,  2, 8>,      S<0, 2, 1>,S<0, 2, 1>,               1,              8,              8,          0,    S<2,  8, 8>,      S<0, 2, 1>,S<0, 2, 1>,               1,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  Scheduler, PipelineVersion, F16,  F16, true>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                 ConvSpec,   128,    16,   64,    64,   8,     16,   16,    1,    2,     S<8,  2, 8>,      S<0, 2, 1>,S<0, 2, 1>,               1,              8,              8,          0,    S<2,  8, 8>,      S<0, 2, 1>,S<0, 2, 1>,               1,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  Scheduler, PipelineVersion, F16,  F16, true, 2>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                 ConvSpec,   256,   256,   32,    64,   8,     32,   32,    2,    1,     S<4, 32, 2>,      S<0, 2, 1>,S<0, 2, 1>,               1,              8,              8,          0,    S<8,  4, 8>,      S<0, 2, 1>,S<0, 2, 1>,               1,              8,              8,          0,          1,           1,                   S<1, 64, 1, 4>,               8,  Scheduler, PipelineVersion, F16,  F16, true>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                 ConvSpec,   256,    64,  128,    64,   8,     32,   32,    1,    2,     S<4,  8, 8>,      S<0, 2, 1>,S<0, 2, 1>,               1,              8,              8,          0,    S<4, 16, 4>,      S<0, 2, 1>,S<0, 2, 1>,               1,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  Scheduler, PipelineVersion, F16,  F16, true>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                 ConvSpec,   256,    64,  128,    64,   8,     32,   32,    1,    2,     S<4,  8, 8>,      S<0, 2, 1>,S<0, 2, 1>,               1,              8,              8,          0,    S<4, 16, 4>,      S<0, 2, 1>,S<0, 2, 1>,               1,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  Scheduler, PipelineVersion, F16,  F16, true, 2>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                 ConvSpec,   256,    64,   64,    64,   8,     16,   16,    2,    2,     S<4,  8, 8>,      S<0, 2, 1>,S<0, 2, 1>,               1,              8,              8,          0,    S<4,  8, 8>,      S<0, 2, 1>,S<0, 2, 1>,               1,              8,              8,          0,          1,           1,                   S<1, 32, 1, 4>,               8,  Scheduler, PipelineVersion, F16,  F16, true>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                 ConvSpec,   256,    64,   64,    64,   8,     16,   16,    2,    2,     S<4,  8, 8>,      S<0, 2, 1>,S<0, 2, 1>,               1,              8,              8,          0,    S<4,  8, 8>,      S<0, 2, 1>,S<0, 2, 1>,               1,              8,              8,          0,          1,           1,                   S<1, 32, 1, 4>,               2,  Scheduler, PipelineVersion, F16,  F16, true>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                 ConvSpec,   256,    64,   64,    64,   8,     16,   16,    2,    2,     S<4,  32, 2>,     S<0, 2, 1>,S<0, 2, 1>,               1,              2,              1,          0,    S<4, 32, 2>,      S<0, 2, 1>,S<0, 2, 1>,               1,              2,              1,          0,          1,           1,                   S<1, 32, 1, 4>,               4,  Scheduler, PipelineVersion, F16,  F16, true>
+    // clang-format on
+    >;
+
+template <ck::index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename ELayout,
+          ConvolutionBackwardWeightSpecialization ConvSpec,
+          BlockGemmPipelineScheduler Scheduler,
+          BlockGemmPipelineVersion PipelineVersion>
+using device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_bf16_direct_load_instances = std::tuple<
+    // clang-format off
+        //#########################################|     Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| Compute| Compute| Direct|
+        //#########################################|     Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|    Data|    Data|   Load|
+        //#########################################| Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|    Type|    Type|       |
+        //#########################################|        |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |        |        |       |
+        DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                 ConvSpec,   128,    16,   32,    64,   8,     16,   16,    1,    1,     S<8,  2, 8>,      S<0, 2, 1>,S<0, 2, 1>,               1,              8,              8,          0,    S<4,  4, 8>,      S<0, 2, 1>,S<0, 2, 1>,               1,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  Scheduler, PipelineVersion, BF16, BF16, true>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                 ConvSpec,   128,    16,   64,    64,   8,     16,   16,    1,    2,     S<8,  2, 8>,      S<0, 2, 1>,S<0, 2, 1>,               1,              8,              8,          0,    S<2,  8, 8>,      S<0, 2, 1>,S<0, 2, 1>,               1,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  Scheduler, PipelineVersion, BF16, BF16, true>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                 ConvSpec,   256,   256,   32,    64,   8,     32,   32,    2,    1,     S<4, 32, 2>,      S<0, 2, 1>,S<0, 2, 1>,               1,              8,              8,          0,    S<8,  4, 8>,      S<0, 2, 1>,S<0, 2, 1>,               1,              8,              8,          0,          1,           1,                   S<1, 64, 1, 4>,               8,  Scheduler, PipelineVersion, BF16, BF16, true>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                 ConvSpec,   256,    64,  128,    64,   8,     32,   32,    1,    2,     S<4,  8, 8>,      S<0, 2, 1>,S<0, 2, 1>,               1,              8,              8,          0,    S<4, 16, 4>,      S<0, 2, 1>,S<0, 2, 1>,               1,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  Scheduler, PipelineVersion, BF16, BF16, true>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                 ConvSpec,   256,    64,   64,    64,   8,     16,   16,    2,    2,     S<4,  8, 8>,      S<0, 2, 1>,S<0, 2, 1>,               1,              8,              8,          0,    S<4,  8, 8>,      S<0, 2, 1>,S<0, 2, 1>,               1,              8,              8,          0,          1,           1,                   S<1, 32, 1, 4>,               8,  Scheduler, PipelineVersion, BF16, BF16, true>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                 ConvSpec,   256,    64,   64,    64,   8,     16,   16,    2,    2,     S<4,  8, 8>,      S<0, 2, 1>,S<0, 2, 1>,               1,              8,              8,          0,    S<4,  8, 8>,      S<0, 2, 1>,S<0, 2, 1>,               1,              8,              8,          0,          1,           1,                   S<1, 32, 1, 4>,               2,  Scheduler, PipelineVersion, BF16, BF16, true>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                 ConvSpec,   256,    64,   64,    64,   8,     16,   16,    2,    2,     S<4,  32, 2>,     S<0, 2, 1>,S<0, 2, 1>,               1,              2,              1,          0,    S<4, 32, 2>,      S<0, 2, 1>,S<0, 2, 1>,               1,              2,              1,          0,          1,           1,                   S<1, 32, 1, 4>,               4,  Scheduler, PipelineVersion, BF16, BF16, true>
+    // clang-format on
+    >;
+
 template <ck::index_t NDimSpatial,
          typename ALayout,
          typename BLayout,
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
@@ -393,6 +393,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
                        op_ptrs);

+                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_direct_load_instances(
+                        op_ptrs);
+
                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev2_instances(
                        op_ptrs);
                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev5_instances(
@@ -453,6 +456,9 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(
                        op_ptrs);

+                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_direct_load_instances(
+                        op_ptrs);
+
                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev2_instances(
                        op_ptrs);
                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev5_instances(
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc
@@ -184,6 +184,18 @@ void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pip
                                                           PassThrough,
                                                           PassThrough>>>& instances);

+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_direct_load_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
 void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev5_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
                                                           NHWGC,
@@ -389,6 +401,18 @@ void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipe
                                                           PassThrough,
                                                           PassThrough>>>& instances);

+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_direct_load_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
 void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev5_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
                                                           NHWGC,
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt
@@ -20,6 +20,8 @@ set(GROUPED_CONV2D_BWD_WEIGHT
    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev5_instance.cpp
    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev2_instance.cpp
    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev5_instance.cpp
+    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_direct_load.cpp
+    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_direct_load.cpp
    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev2_instance.cpp
    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev5_instance.cpp
    xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev2_instance.cpp
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_direct_load.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_direct_load.cpp
@@ -0,0 +1,40 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Direct-load BF16 instances for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_direct_load_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_bf16_direct_load_instances<
+            2,
+            NHWGC,
+            GKYXC,
+            NHWGK,
+            ConvBwdWeightDefault,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v1>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_direct_load.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/nhwgc_gkyxc_nhwgk/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_direct_load.cpp
@@ -0,0 +1,40 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Direct-load F16 instances for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_direct_load_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f16_direct_load_instances<
+            2,
+            NHWGC,
+            GKYXC,
+            NHWGK,
+            ConvBwdWeightDefault,
+            BlockGemmPipelineScheduler::Intrawave,
+            BlockGemmPipelineVersion::v1>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck