Add grouped conv bwd weight multi d kernel (#1237)

* Add grouped conv bwd weight multi d kernel * Reference fix * Fix cmake files * bwd weight scale only xdl * Fixes * Fix client conv fwd example
2026-05-11 17:00:18 +00:00 · 2024-04-18 23:35:04 +02:00
parent 930f889c34
commit fd923b6d86
34 changed files with 4446 additions and 966 deletions
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -25,6 +25,9 @@ template <ck::index_t NDimSpatial,
          typename InElementwiseOperation,
          typename WeiElementwiseOperation,
          typename OutElementwiseOperation,
+          ck::index_t NumAElementwiseTensor                                         = 0,
+          ck::index_t NumBElementwiseTensor                                         = 0,
+          ck::index_t NumDElementwiseTensor                                         = 0,
          typename ComputeTypeA                                                     = OutDataType,
          typename ComputeTypeB                                                     = InDataType,
          typename std::enable_if<NDimSpatial >= 1 && NDimSpatial <= 3, bool>::type = false>
@@ -33,19 +36,26 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
    // Argument
    struct Argument : public device::BaseArgument
    {
-        Argument(const Tensor<InDataType>& in_n_c_hi_wi,
-                 Tensor<WeiDataType>& wei_k_c_y_x,
-                 const Tensor<OutDataType>& out_n_k_ho_wo,
-                 std::vector<ck::index_t> conv_filter_strides,
-                 std::vector<ck::index_t> conv_filter_dilations,
-                 std::vector<ck::index_t> input_left_pads,
-                 std::vector<ck::index_t> input_right_pads,
-                 InElementwiseOperation in_element_op,
-                 WeiElementwiseOperation wei_element_op,
-                 OutElementwiseOperation out_element_op)
+        Argument(
+            const Tensor<InDataType>& in_n_c_hi_wi,
+            Tensor<WeiDataType>& wei_k_c_y_x,
+            const Tensor<OutDataType>& out_n_k_ho_wo,
+            std::vector<ck::index_t> conv_filter_strides,
+            std::vector<ck::index_t> conv_filter_dilations,
+            std::vector<ck::index_t> input_left_pads,
+            std::vector<ck::index_t> input_right_pads,
+            InElementwiseOperation in_element_op,
+            WeiElementwiseOperation wei_element_op,
+            OutElementwiseOperation out_element_op,
+            const std::array<Tensor<OutDataType>, NumAElementwiseTensor>& elementwise_a_tensors,
+            const std::array<Tensor<InDataType>, NumBElementwiseTensor>& elementwise_b_tensors,
+            const std::array<Tensor<WeiDataType>, NumDElementwiseTensor>& elementwise_d_tensors)
            : input_{in_n_c_hi_wi},
              weight_{wei_k_c_y_x},
              output_{out_n_k_ho_wo},
+              elementwise_a_tensors_{elementwise_a_tensors},
+              elementwise_b_tensors_{elementwise_b_tensors},
+              elementwise_d_tensors_{elementwise_d_tensors},
              conv_strides_{conv_filter_strides},
              conv_dilations_{conv_filter_dilations},
              in_left_pads_{input_left_pads},
@@ -60,6 +70,10 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
        Tensor<WeiDataType>& weight_;
        const Tensor<OutDataType>& output_;

+        const std::array<Tensor<OutDataType>, NumAElementwiseTensor>& elementwise_a_tensors_;
+        const std::array<Tensor<InDataType>, NumBElementwiseTensor>& elementwise_b_tensors_;
+        const std::array<Tensor<WeiDataType>, NumDElementwiseTensor>& elementwise_d_tensors_;
+
        std::vector<index_t> conv_strides_;
        std::vector<index_t> conv_dilations_;
        std::vector<index_t> in_left_pads_;
@@ -103,22 +117,43 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
                                ComputeTypeA v_out;
                                ComputeTypeB v_in;

-                                arg.out_element_op_(
-                                    v_out, ck::type_convert<float>(arg.output_(g, n, k, wo)));
-
-                                arg.in_element_op_(
-                                    v_in, ck::type_convert<float>(arg.input_(g, n, c, wi)));
+                                ExecuteElementwiseOp(
+                                    arg.out_element_op_,
+                                    arg.elementwise_a_tensors_,
+                                    Number<NumAElementwiseTensor>{},
+                                    v_out,
+                                    ck::type_convert<float>(arg.output_(g, n, k, wo)),
+                                    g,
+                                    n,
+                                    k,
+                                    wo);
+                                ExecuteElementwiseOp(
+                                    arg.in_element_op_,
+                                    arg.elementwise_b_tensors_,
+                                    Number<NumBElementwiseTensor>{},
+                                    v_in,
+                                    ck::type_convert<float>(arg.input_(g, n, c, wi)),
+                                    g,
+                                    n,
+                                    c,
+                                    wi);

                                v_acc += type_convert<float>(v_out) * type_convert<float>(v_in);
                            }
                        }
                    }

-                    float v_wei;
-
-                    arg.wei_element_op_(v_wei, v_acc);
-
-                    arg.weight_(g, k, c, x) = ck::type_convert<WeiDataType>(v_wei);
+                    WeiDataType v_acc_converted = ck::type_convert<WeiDataType>(v_acc);
+                    WeiDataType& v_wei          = arg.weight_(g, k, c, x);
+                    ExecuteElementwiseOp(arg.wei_element_op_,
+                                         arg.elementwise_d_tensors_,
+                                         Number<NumDElementwiseTensor>{},
+                                         v_wei,
+                                         v_acc_converted,
+                                         g,
+                                         k,
+                                         c,
+                                         x);
                };

                make_ParallelTensorFunctor(f_kcx,
@@ -163,12 +198,28 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
                                    ComputeTypeA v_out;
                                    ComputeTypeB v_in;

-                                    arg.out_element_op_(
+                                    ExecuteElementwiseOp(
+                                        arg.out_element_op_,
+                                        arg.elementwise_a_tensors_,
+                                        Number<NumAElementwiseTensor>{},
                                        v_out,
-                                        ck::type_convert<float>(arg.output_(g, n, k, ho, wo)));
-
-                                    arg.in_element_op_(
-                                        v_in, ck::type_convert<float>(arg.input_(g, n, c, hi, wi)));
+                                        ck::type_convert<float>(arg.output_(g, n, k, ho, wo)),
+                                        g,
+                                        n,
+                                        k,
+                                        ho,
+                                        wo);
+                                    ExecuteElementwiseOp(
+                                        arg.in_element_op_,
+                                        arg.elementwise_b_tensors_,
+                                        Number<NumBElementwiseTensor>{},
+                                        v_in,
+                                        ck::type_convert<float>(arg.input_(g, n, c, hi, wi)),
+                                        g,
+                                        n,
+                                        c,
+                                        hi,
+                                        wi);

                                    v_acc += type_convert<float>(v_out) * type_convert<float>(v_in);
                                }
@@ -176,11 +227,18 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
                        }
                    }

-                    float v_wei;
-
-                    arg.wei_element_op_(v_wei, v_acc);
-
-                    arg.weight_(g, k, c, y, x) = ck::type_convert<WeiDataType>(v_wei);
+                    WeiDataType v_acc_converted = ck::type_convert<WeiDataType>(v_acc);
+                    WeiDataType& v_wei          = arg.weight_(g, k, c, y, x);
+                    ExecuteElementwiseOp(arg.wei_element_op_,
+                                         arg.elementwise_d_tensors_,
+                                         Number<NumDElementwiseTensor>{},
+                                         v_wei,
+                                         v_acc_converted,
+                                         g,
+                                         k,
+                                         c,
+                                         y,
+                                         x);
                };

                make_ParallelTensorFunctor(f_kcyx,
@@ -231,13 +289,30 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
                                        ComputeTypeA v_out;
                                        ComputeTypeB v_in;

-                                        arg.out_element_op_(v_out,
-                                                            ck::type_convert<float>(
-                                                                arg.output_(g, n, k, do_, ho, wo)));
-
-                                        arg.in_element_op_(v_in,
-                                                           ck::type_convert<float>(
-                                                               arg.input_(g, n, c, di, hi, wi)));
+                                        ExecuteElementwiseOp(arg.out_element_op_,
+                                                             arg.elementwise_a_tensors_,
+                                                             Number<NumAElementwiseTensor>{},
+                                                             v_out,
+                                                             ck::type_convert<float>(
+                                                                 arg.output_(g, n, k, do_, ho, wo)),
+                                                             g,
+                                                             n,
+                                                             k,
+                                                             do_,
+                                                             ho,
+                                                             wo);
+                                        ExecuteElementwiseOp(arg.in_element_op_,
+                                                             arg.elementwise_b_tensors_,
+                                                             Number<NumBElementwiseTensor>{},
+                                                             v_in,
+                                                             ck::type_convert<float>(
+                                                                 arg.input_(g, n, c, di, hi, wi)),
+                                                             g,
+                                                             n,
+                                                             c,
+                                                             di,
+                                                             hi,
+                                                             wi);

                                        v_acc +=
                                            type_convert<float>(v_out) * type_convert<float>(v_in);
@@ -247,11 +322,19 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
                        }
                    }

-                    float v_wei;
-
-                    arg.wei_element_op_(v_wei, v_acc);
-
-                    arg.weight_(g, k, c, z, y, x) = ck::type_convert<WeiDataType>(v_wei);
+                    WeiDataType v_acc_converted = ck::type_convert<WeiDataType>(v_acc);
+                    WeiDataType& v_wei          = arg.weight_(g, k, c, z, y, x);
+                    ExecuteElementwiseOp(arg.wei_element_op_,
+                                         arg.elementwise_d_tensors_,
+                                         Number<NumDElementwiseTensor>{},
+                                         v_wei,
+                                         v_acc_converted,
+                                         g,
+                                         k,
+                                         c,
+                                         z,
+                                         y,
+                                         x);
                };

                make_ParallelTensorFunctor(f_kczyx,
@@ -276,6 +359,37 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
        }
    };

+    template <typename... Args, // Helper: calls elementwise_op(y, x, ...) with 0, 1 or 2 extra values read from elementwise_tensors.
+              typename ElementwiseOp,
+              typename ElementwiseTensor,
+              typename NumTensor,
+              typename Y,
+              typename X>
+    static void ExecuteElementwiseOp(ElementwiseOp& elementwise_op, // callable invoked as op(y, x[, d0[, d1]])
+                                     ElementwiseTensor& elementwise_tensors, // indexed with [0]/[1]; each element is called with dims...
+                                     NumTensor, // tag argument: only NumTensor::value is read, at compile time
+                                     Y& y, // passed by non-const reference as the first argument of elementwise_op
+                                     const X& x, // passed as the second argument of elementwise_op
+                                     Args... dims) // coordinates forwarded unchanged to every auxiliary tensor
+    {
+        if constexpr(NumTensor::value == 0) // no auxiliary tensors: elementwise_tensors is not accessed
+        {
+            elementwise_op(y, x);
+        }
+        else if constexpr(NumTensor::value == 1) // one auxiliary value, read at the same coordinates
+        {
+            elementwise_op(y, x, elementwise_tensors[0](dims...));
+        }
+        else if constexpr(NumTensor::value == 2) // two auxiliary values, both read at the same coordinates
+        {
+            elementwise_op(y, x, elementwise_tensors[0](dims...), elementwise_tensors[1](dims...));
+        }
+        else // any other count still compiles; the failure is reported only when this call executes
+        {
+            throw std::runtime_error("ElementOp not supported in reference.");
+        }
+    }
+
    static constexpr bool IsValidCompilationParameter()
    {
        // TODO: properly implement this check
@@ -284,16 +398,20 @@ struct ReferenceConvBwdWeight : public device::BaseOperator

    bool IsSupportedArgument(const device::BaseArgument*) override { return true; }

-    static auto MakeArgument(const Tensor<InDataType>& in_n_c_hi_wi,
-                             Tensor<WeiDataType>& wei_k_c_y_x,
-                             const Tensor<OutDataType>& out_n_k_ho_wo,
-                             std::vector<ck::index_t> conv_filter_strides,
-                             std::vector<ck::index_t> conv_filter_dilations,
-                             std::vector<ck::index_t> input_left_pads,
-                             std::vector<ck::index_t> input_right_pads,
-                             InElementwiseOperation in_element_op,
-                             WeiElementwiseOperation wei_element_op,
-                             OutElementwiseOperation out_element_op)
+    static auto MakeArgument(
+        const Tensor<InDataType>& in_n_c_hi_wi,
+        Tensor<WeiDataType>& wei_k_c_y_x,
+        const Tensor<OutDataType>& out_n_k_ho_wo,
+        std::vector<ck::index_t> conv_filter_strides,
+        std::vector<ck::index_t> conv_filter_dilations,
+        std::vector<ck::index_t> input_left_pads,
+        std::vector<ck::index_t> input_right_pads,
+        InElementwiseOperation in_element_op,
+        WeiElementwiseOperation wei_element_op,
+        OutElementwiseOperation out_element_op,
+        const std::array<Tensor<OutDataType>, NumAElementwiseTensor>& elementwise_a_tensors = {},
+        const std::array<Tensor<InDataType>, NumBElementwiseTensor>& elementwise_b_tensors  = {},
+        const std::array<Tensor<WeiDataType>, NumDElementwiseTensor>& elementwise_d_tensors = {})
    {
        return Argument{in_n_c_hi_wi,
                        wei_k_c_y_x,
@@ -304,7 +422,10 @@ struct ReferenceConvBwdWeight : public device::BaseOperator
                        input_right_pads,
                        in_element_op,
                        wei_element_op,
-                        out_element_op};
+                        out_element_op,
+                        elementwise_a_tensors,
+                        elementwise_b_tensors,
+                        elementwise_d_tensors};
    }

    static auto MakeInvoker() { return Invoker{}; }
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_bilinear_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_bilinear_instance.hpp
@@ -0,0 +1,185 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using namespace ck::tensor_layout::convolution; // NOTE(review): header-scope using-directive, and no include guard / #pragma once appears in the lines above - confirm intended
+
+using BF16 = ck::bhalf_t; // short data-type names used in the instance tables below
+using F16  = ck::half_t;
+using F32  = float;
+
+#ifdef CK_ENABLE_FP8
+using F8 = ck::f8_t; // only declared when CK_ENABLE_FP8 is defined
+#endif
+
+#ifdef CK_ENABLE_BF8
+using BF8 = ck::bf8_t; // only declared when CK_ENABLE_BF8 is defined
+#endif
+
+using Empty_Tuple = ck::Tuple<>;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>; // shorthand for the compile-time index sequences in the instance tables
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Bilinear    = ck::tensor_operation::element_wise::Bilinear;
+
+static constexpr auto ConvBwdWeightDefault = // named shorthand for the Default specialization enumerator
+    ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default;
+
+static constexpr auto ConvBwdWeightFilter1x1Stride1Pad0 = // named shorthand for the Filter1x1Stride1Pad0 enumerator
+    ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0;
+
+template <ck::index_t NDimSpatial, // first argument of every instance below (column "Num Dim Spatial")
+          typename ALayout, // forwarded in the column labelled InLayout
+          typename BLayout, // forwarded in the column labelled WeiLayout; also the single Ds layout (Tuple<BLayout>)
+          typename ELayout, // forwarded in the column labelled OutLayout
+          ConvolutionBackwardWeightSpecialization ConvSpec> // forwarded in the "ConvBackward Weight Specialization" column
+using device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_bilinear_instances = std::tuple< // F32 instance list: one Ds tensor (Tuple<F32>), ops PassThrough / Bilinear / PassThrough
+    // clang-format off
+        //#########################################|     Num| InLayout| WeiLayout| OutLayout| DsData| InData| WeiData| OutData| AccData| DsData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer|
+        //#########################################|     Dim|         |          |          | Layout|   Type|    Type|    Type|    Type|   Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
+        //#########################################| Spatial|         |          |          |       |       |        |        |        |       |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
+        //#########################################|        |         |          |          |       |       |        |        |        |       |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
+        // generic instance: A/B SrcScalarPerVector and CBlockTransfer ScalarPerVector are all 1 (per the column header above)
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F32,     F32,     F32,     F32, Tuple<F32>, PassThrough, Bilinear, PassThrough,  ConvSpec,    64,    64,    64,     4,  4,   32,   32,    2,    2,  S<1, 4, 16, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              1,              4,      true,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              1,              4,      true,           1,           1,   S<1, 16, 1, 4>,               1>,
+        // instances for small conv.K and conv.C (one side uses SrcScalarPerVector 1 in each of the next two rows)
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F32,     F32,     F32,     F32, Tuple<F32>, PassThrough, Bilinear, PassThrough,  ConvSpec,   128,   128,    32,     4,  4,   32,   32,    2,    1,  S<1, 4, 32, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              1,              1,      true,           1,           1,   S<1, 32, 1, 4>,               1>,
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F32,     F32,     F32,     F32, Tuple<F32>, PassThrough, Bilinear, PassThrough,  ConvSpec,    64,    32,    64,     4,  4,   32,   32,    1,    2,  S<1, 4, 8,  2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              1,              2,      true,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>,
+
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F32,     F32,     F32,     F32, Tuple<F32>, PassThrough, Bilinear, PassThrough,  ConvSpec,   256,   256,   128,     4,  4,   32,   32,    4,    2,  S<1, 4, 64, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 8>,               4>,
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F32,     F32,     F32,     F32, Tuple<F32>, PassThrough, Bilinear, PassThrough,  ConvSpec,   256,   128,   256,     4,  4,   32,   32,    2,    4,  S<1, 4, 32, 2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 64, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 8>,               4>,
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F32,     F32,     F32,     F32, Tuple<F32>, PassThrough, Bilinear, PassThrough,  ConvSpec,   128,   128,   128,     4,  4,   32,   32,    4,    2,  S<1, 4, 32, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F32,     F32,     F32,     F32, Tuple<F32>, PassThrough, Bilinear, PassThrough,  ConvSpec,   256,   128,   128,     4,  4,   32,   32,    2,    2,  S<1, 4, 32, 2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F32,     F32,     F32,     F32, Tuple<F32>, PassThrough, Bilinear, PassThrough,  ConvSpec,   128,   128,    64,     4,  4,   32,   32,    2,    2,  S<1, 4, 32, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F32,     F32,     F32,     F32, Tuple<F32>, PassThrough, Bilinear, PassThrough,  ConvSpec,   128,    64,   128,     4,  4,   32,   32,    2,    2,  S<1, 4, 16, 2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F32,     F32,     F32,     F32, Tuple<F32>, PassThrough, Bilinear, PassThrough,  ConvSpec,    64,    64,    64,     4,  4,   32,   32,    2,    2,  S<1, 4, 16, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>,
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F32,     F32,     F32,     F32, Tuple<F32>, PassThrough, Bilinear, PassThrough,  ConvSpec,   256,   128,    64,     4,  4,   32,   32,    2,    1,  S<1, 4, 32, 2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              1,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F32,     F32,     F32,     F32, Tuple<F32>, PassThrough, Bilinear, PassThrough,  ConvSpec,   256,    64,   128,     4,  4,   32,   32,    1,    2,  S<1, 4, 16, 4>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              1,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F32,     F32,     F32,     F32, Tuple<F32>, PassThrough, Bilinear, PassThrough,  ConvSpec,   128,   128,    32,     4,  4,   32,   32,    2,    1,  S<1, 4, 32, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              1,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F32,     F32,     F32,     F32, Tuple<F32>, PassThrough, Bilinear, PassThrough,  ConvSpec,   128,    32,   128,     4,  4,   32,   32,    1,    2,  S<1, 4, 8,  4>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              1,      true,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F32,     F32,     F32,     F32, Tuple<F32>, PassThrough, Bilinear, PassThrough,  ConvSpec,    64,    64,    32,     4,  4,   32,   32,    2,    1,  S<1, 4, 16, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 16, 1, 4>,               4>,
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F32,     F32,     F32,     F32, Tuple<F32>, PassThrough, Bilinear, PassThrough,  ConvSpec,    64,    32,    64,     4,  4,   32,   32,    1,    2,  S<1, 4, 8,  2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>
+    // clang-format on
+    >;
+
+template <ck::index_t NDimSpatial, // forwarded as the first ("Num Dim Spatial") argument of every instance
+          typename ALayout, // forwarded in the InLayout column
+          typename BLayout, // forwarded in the WeiLayout column and, wrapped in Tuple<>, as the Ds layout
+          typename ELayout, // forwarded in the OutLayout column
+          ConvolutionBackwardWeightSpecialization ConvSpec> // forwarded in the ConvBackwardWeightSpecialization column
+using device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_bilinear_instances = std::tuple< // tuning list: F16 in/wei/out, F32 acc, Tuple<F16> Ds, Bilinear as the Wei elementwise op
+    // clang-format off
+        //#########################################|     Num| InLayout| WeiLayout| OutLayout| DsData| InData| WeiData| OutData| AccData| DsData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer|
+        //#########################################|     Dim|         |          |          | Layout|   Type|    Type|    Type|    Type|   Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
+        //#########################################| Spatial|         |          |          |       |       |        |        |        |       |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
+        //#########################################|        |         |          |          |       |       |        |        |        |       |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
+        // generic instance: smallest A/B SrcScalarPerVector and CBlockTransfer ScalarPerVector in this list (2, 2, 2)
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              4,      true,           1,           1,   S<1, 16, 1, 4>,               2>,   
+        // instance for small conv.K
+        // for fp16, conv.K and conv.C must be divisible by 2
+        // since half_t atomic_add requires scalar_per_x_vector % 2 == 0
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              1,      true,           1,           1,   S<1, 32, 1, 4>,               2>,  
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8>,
+
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 8>,               8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,   128,    32,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,    64,    64,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 16, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8>
+    // clang-format on
+    >;
+
+template <ck::index_t NDimSpatial, // forwarded as the first ("Num Dim Spatial") argument of every instance
+          typename ALayout, // forwarded in the InLayout column
+          typename BLayout, // forwarded in the WeiLayout column and, wrapped in Tuple<>, as the Ds layout
+          typename ELayout, // forwarded in the OutLayout column
+          ConvolutionBackwardWeightSpecialization ConvSpec> // forwarded in the ConvBackwardWeightSpecialization column
+using device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_bilinear_instances = std::tuple< // tuning list: BF16 in/out, F32 wei, F32 acc, Tuple<F32> Ds, Bilinear as the Wei elementwise op
+    // clang-format off
+        //#########################################|     Num| InLayout| WeiLayout| OutLayout| DsData| InData| WeiData| OutData| AccData| DsData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer|
+        //#########################################|     Dim|         |          |          | Layout|   Type|    Type|    Type|    Type|   Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
+        //#########################################| Spatial|         |          |          |       |       |        |        |        |       |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
+        //#########################################|        |         |          |          |       |       |        |        |        |       |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
+        // generic instance: smallest A/B SrcScalarPerVector and CBlockTransfer ScalarPerVector in this list (1, 1, 1)
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,    ELayout, Tuple<BLayout>, BF16,     F32,    BF16,     F32, Tuple<F32>, PassThrough, Bilinear, PassThrough,   ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              1,              4,      true,   S<1, 4, 8,  2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              1,              4,      true,           1,           1,   S<1, 16, 1, 4>,               1>,   
+        // instance for small conv.K
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,    ELayout, Tuple<BLayout>, BF16,     F32,    BF16,     F32, Tuple<F32>, PassThrough, Bilinear, PassThrough,   ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 4,  8>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              1,              1,      true,           1,           1,   S<1, 32, 1, 4>,               1>, 
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,    ELayout, Tuple<BLayout>, BF16,     F32,    BF16,     F32, Tuple<F32>, PassThrough, Bilinear, PassThrough,   ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              1,              2,      true,   S<1, 4, 8,  2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>,
+
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,    ELayout, Tuple<BLayout>, BF16,     F32,    BF16,     F32, Tuple<F32>, PassThrough, Bilinear, PassThrough,   ConvSpec,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 16, 4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 8>,               4>,
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,    ELayout, Tuple<BLayout>, BF16,     F32,    BF16,     F32, Tuple<F32>, PassThrough, Bilinear, PassThrough,   ConvSpec,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 16, 4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 32, 2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 8>,               4>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,    ELayout, Tuple<BLayout>, BF16,     F32,    BF16,     F32, Tuple<F32>, PassThrough, Bilinear, PassThrough,   ConvSpec,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 16, 2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 16, 2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,    ELayout, Tuple<BLayout>, BF16,     F32,    BF16,     F32, Tuple<F32>, PassThrough, Bilinear, PassThrough,   ConvSpec,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 16, 4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,    ELayout, Tuple<BLayout>, BF16,     F32,    BF16,     F32, Tuple<F32>, PassThrough, Bilinear, PassThrough,   ConvSpec,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 8,  4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,    ELayout, Tuple<BLayout>, BF16,     F32,    BF16,     F32, Tuple<F32>, PassThrough, Bilinear, PassThrough,   ConvSpec,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 16, 2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,    ELayout, Tuple<BLayout>, BF16,     F32,    BF16,     F32, Tuple<F32>, PassThrough, Bilinear, PassThrough,   ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 8,  2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,    ELayout, Tuple<BLayout>, BF16,     F32,    BF16,     F32, Tuple<F32>, PassThrough, Bilinear, PassThrough,   ConvSpec,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 8,  8>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,    ELayout, Tuple<BLayout>, BF16,     F32,    BF16,     F32, Tuple<F32>, PassThrough, Bilinear, PassThrough,   ConvSpec,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 8,  8>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              1,      true,   S<1, 4, 16, 4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,    ELayout, Tuple<BLayout>, BF16,     F32,    BF16,     F32, Tuple<F32>, PassThrough, Bilinear, PassThrough,   ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 4,  8>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,    ELayout, Tuple<BLayout>, BF16,     F32,    BF16,     F32, Tuple<F32>, PassThrough, Bilinear, PassThrough,   ConvSpec,   128,    32,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  8>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              1,      true,   S<1, 4, 16, 2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,    ELayout, Tuple<BLayout>, BF16,     F32,    BF16,     F32, Tuple<F32>, PassThrough, Bilinear, PassThrough,   ConvSpec,    64,    64,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 8,  2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 4,  4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 16, 1, 4>,               4>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,    ELayout, Tuple<BLayout>, BF16,     F32,    BF16,     F32, Tuple<F32>, PassThrough, Bilinear, PassThrough,   ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 8,  2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>
+    // clang-format on
+    >;
+
+template <ck::index_t NDimSpatial, // forwarded as the first ("Num Dim Spatial") argument of every instance
+          typename ALayout, // forwarded in the InLayout column
+          typename BLayout, // forwarded in the WeiLayout column and, wrapped in Tuple<>, as the Ds layout
+          typename ELayout, // forwarded in the OutLayout column
+          ConvolutionBackwardWeightSpecialization ConvSpec> // forwarded in the ConvBackwardWeightSpecialization column
+using device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_comp_bf8_f8_bilinear_instances = std::tuple< // tuning list: F16 in/wei/out, F32 acc, Tuple<F16> Ds, Bilinear Wei op, plus trailing BF8, F8 arguments
+// clang-format off
+        //#########################################|     Num| InLayout| WeiLayout| OutLayout| DsData| InData| WeiData| OutData| AccData| DsData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer|
+        //#########################################|     Dim|         |          |          | Layout|   Type|    Type|    Type|    Type|   Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
+        //#########################################| Spatial|         |          |          |       |       |        |        |        |       |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
+        //#########################################|        |         |          |          |       |       |        |        |        |       |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
+#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8 // without both macros the tuple below has no elements
+        // generic instance (smallest ScalarPerVector values: 2, 2, 2); NOTE(review): the trailing BF8, F8 arguments have no column above - presumably ComputeTypeA/ComputeTypeB, confirm against the device op
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              4,      true,           1,           1,   S<1, 16, 1, 4>,               2,       BF8,     F8>,   
+        // instance for small conv.K
+        // for fp16, conv.K and conv.C must be divisible by 2
+        // since half_t atomic_add requires scalar_per_x_vector % 2 == 0
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              1,      true,           1,           1,   S<1, 32, 1, 4>,               2,       BF8,     F8>,  
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8,       BF8,     F8>,
+
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 8>,               8,       BF8,     F8>,
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 8>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,   128,    32,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,    64,    64,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 16, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8,       BF8,     F8>
+#endif
+    // clang-format on
+    >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_scale_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_scale_instance.hpp
@@ -0,0 +1,189 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+// Include guard: this header defines alias templates and namespace-scope constants,
+// so including it twice in one translation unit would be a redefinition error.
+#pragma once
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using namespace ck::tensor_layout::convolution; // names from this namespace are used unqualified
+
+// Short aliases for the element types that appear in the instance tables.
+using F32  = float;
+using F16  = ck::half_t;
+using BF16 = ck::bhalf_t;
+#if defined(CK_ENABLE_FP8)
+using F8 = ck::f8_t;
+#endif
+#if defined(CK_ENABLE_BF8)
+using BF8 = ck::bf8_t;
+#endif
+
+// Empty_Tuple fills the Ds layout / Ds data type slots; S<...> spells a ck::Sequence.
+using Empty_Tuple = ck::Tuple<>;
+
+template <ck::index_t... Values>
+using S = ck::Sequence<Values...>;
+
+// Elementwise operations plugged into the In / Wei / Out operation slots.
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using Scale       = ck::tensor_operation::element_wise::Scale;
+
+// Named backward-weight specializations (same enum type as the ConvSpec template parameter).
+static constexpr auto ConvBwdWeightDefault = ConvolutionBackwardWeightSpecialization::Default;
+static constexpr auto ConvBwdWeightFilter1x1Stride1Pad0 =
+    ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0;
+
+template <ck::index_t NDimSpatial, // forwarded to the "Num Dim Spatial" slot of every instance
+          typename ALayout,        // forwarded to the "InLayout" slot
+          typename BLayout,        // forwarded to the "WeiLayout" slot
+          typename ELayout,        // forwarded to the "OutLayout" slot
+          ConvolutionBackwardWeightSpecialization ConvSpec> // "ConvBackward Weight Specialization" slot
+using device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_scale_instances = std::tuple< // F32 in/wei/out/acc, no Ds, Wei op = Scale
+    // clang-format off
+        //#########################################|     Num| InLayout| WeiLayout| OutLayout| DsData| InData| WeiData| OutData| AccData| DsData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer|
+        //#########################################|     Dim|         |          |          | Layout|   Type|    Type|    Type|    Type|   Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
+        //#########################################| Spatial|         |          |          |       |       |        |        |        |       |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
+        //#########################################|        |         |          |          |       |       |        |        |        |       |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
+        // generic instance: A/B SrcScalarPerVector and CBlockTransfer ScalarPerVector are all 1
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F32,     F32,     F32,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,    64,    64,    64,     4,  4,   32,   32,    2,    2,  S<1, 4, 16, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              1,              4,      true,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              1,              4,      true,           1,           1,   S<1, 16, 1, 4>,               1>,
+        // instances for small conv.K and conv.C
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F32,     F32,     F32,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   128,   128,    32,     4,  4,   32,   32,    2,    1,  S<1, 4, 32, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              1,              1,      true,           1,           1,   S<1, 32, 1, 4>,               1>,
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F32,     F32,     F32,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,    64,    32,    64,     4,  4,   32,   32,    1,    2,  S<1, 4, 8,  2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              1,              2,      true,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>,
+
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F32,     F32,     F32,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   256,   256,   128,     4,  4,   32,   32,    4,    2,  S<1, 4, 64, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 8>,               4>,
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F32,     F32,     F32,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   256,   128,   256,     4,  4,   32,   32,    2,    4,  S<1, 4, 32, 2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 64, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 8>,               4>,
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F32,     F32,     F32,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   128,   128,   128,     4,  4,   32,   32,    4,    2,  S<1, 4, 32, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F32,     F32,     F32,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   256,   128,   128,     4,  4,   32,   32,    2,    2,  S<1, 4, 32, 2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F32,     F32,     F32,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   128,   128,    64,     4,  4,   32,   32,    2,    2,  S<1, 4, 32, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F32,     F32,     F32,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   128,    64,   128,     4,  4,   32,   32,    2,    2,  S<1, 4, 16, 2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F32,     F32,     F32,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,    64,    64,    64,     4,  4,   32,   32,    2,    2,  S<1, 4, 16, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>,
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F32,     F32,     F32,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   256,   128,    64,     4,  4,   32,   32,    2,    1,  S<1, 4, 32, 2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              1,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F32,     F32,     F32,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   256,    64,   128,     4,  4,   32,   32,    1,    2,  S<1, 4, 16, 4>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              1,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F32,     F32,     F32,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   128,   128,    32,     4,  4,   32,   32,    2,    1,  S<1, 4, 32, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              1,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F32,     F32,     F32,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   128,    32,   128,     4,  4,   32,   32,    1,    2,  S<1, 4, 8,  4>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              1,      true,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F32,     F32,     F32,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,    64,    64,    32,     4,  4,   32,   32,    2,    1,  S<1, 4, 16, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 16, 1, 4>,               4>,
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F32,     F32,     F32,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,    64,    32,    64,     4,  4,   32,   32,    1,    2,  S<1, 4, 8,  2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>
+    // clang-format on
+    >;
+
+template <ck::index_t NDimSpatial, // forwarded to the "Num Dim Spatial" slot of every instance
+          typename ALayout,        // forwarded to the "InLayout" slot
+          typename BLayout,        // forwarded to the "WeiLayout" slot
+          typename ELayout,        // forwarded to the "OutLayout" slot
+          ConvolutionBackwardWeightSpecialization ConvSpec> // "ConvBackward Weight Specialization" slot
+using device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_scale_instances = std::tuple< // F16 in/wei/out, F32 acc, no Ds, Wei op = Scale
+    // clang-format off
+        //#########################################|     Num| InLayout| WeiLayout| OutLayout| DsData| InData| WeiData| OutData| AccData| DsData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer|
+        //#########################################|     Dim|         |          |          | Layout|   Type|    Type|    Type|    Type|   Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
+        //#########################################| Spatial|         |          |          |       |       |        |        |        |       |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
+        //#########################################|        |         |          |          |       |       |        |        |        |       |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
+        // generic instance: A/B SrcScalarPerVector and CBlockTransfer ScalarPerVector are all 2
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F16,     F16,     F16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              4,      true,           1,           1,   S<1, 16, 1, 4>,               2>,   
+        // instance for small conv.K
+        // for fp16 conv.K and conv.C must be divisible by 2
+        // since half_t atomic_add require scalar_per_x_vector % 2 == 0
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F16,     F16,     F16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              1,      true,           1,           1,   S<1, 32, 1, 4>,               2>,  
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F16,     F16,     F16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8>,
+
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F16,     F16,     F16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F16,     F16,     F16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 8>,               8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F16,     F16,     F16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F16,     F16,     F16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F16,     F16,     F16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F16,     F16,     F16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F16,     F16,     F16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F16,     F16,     F16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F16,     F16,     F16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F16,     F16,     F16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F16,     F16,     F16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   128,    32,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F16,     F16,     F16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,    64,    64,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 16, 1, 4>,               8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F16,     F16,     F16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8>
+    // clang-format on
+    >;
+
+template <ck::index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename ELayout,
+          ConvolutionBackwardWeightSpecialization ConvSpec>
+using device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_scale_instances = std::tuple<
+    // clang-format off
+        //#########################################|     Num| InLayout| WeiLayout| OutLayout| DsData| InData| WeiData| OutData| AccData| DsData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer|
+        //#########################################|     Dim|         |          |          | Layout|   Type|    Type|    Type|    Type|   Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
+        //#########################################| Spatial|         |          |          |       |       |        |        |        |       |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
+        //#########################################|        |         |          |          |       |       |        |        |        |       |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
+        // generic instance
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,    ELayout, Empty_Tuple, BF16,     F32,    BF16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,       ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              1,              4,      true,   S<1, 4, 8,  2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              1,              4,      true,           1,           1,   S<1, 16, 1, 4>,               1>,   
+        // instance for small conv.K
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,    ELayout, Empty_Tuple, BF16,     F32,    BF16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,       ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 4,  8>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              1,              1,      true,           1,           1,   S<1, 32, 1, 4>,               1>, 
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,    ELayout, Empty_Tuple, BF16,     F32,    BF16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,       ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              1,              2,      true,   S<1, 4, 8,  2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>,
+
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,    ELayout, Empty_Tuple, BF16,     F32,    BF16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,       ConvSpec,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 16, 4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 8>,               4>,
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,    ELayout, Empty_Tuple, BF16,     F32,    BF16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,       ConvSpec,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 16, 4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 32, 2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 8>,               4>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,    ELayout, Empty_Tuple, BF16,     F32,    BF16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,       ConvSpec,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 16, 2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 16, 2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,    ELayout, Empty_Tuple, BF16,     F32,    BF16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,       ConvSpec,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 16, 4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,    ELayout, Empty_Tuple, BF16,     F32,    BF16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,       ConvSpec,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 8,  4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,    ELayout, Empty_Tuple, BF16,     F32,    BF16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,       ConvSpec,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 16, 2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,    ELayout, Empty_Tuple, BF16,     F32,    BF16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,       ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 8,  2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,    ELayout, Empty_Tuple, BF16,     F32,    BF16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,       ConvSpec,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 8,  8>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,    ELayout, Empty_Tuple, BF16,     F32,    BF16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,       ConvSpec,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 8,  8>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              1,      true,   S<1, 4, 16, 4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,    ELayout, Empty_Tuple, BF16,     F32,    BF16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,       ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 4,  8>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,    ELayout, Empty_Tuple, BF16,     F32,    BF16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,       ConvSpec,   128,    32,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  8>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              1,      true,   S<1, 4, 16, 2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,    ELayout, Empty_Tuple, BF16,     F32,    BF16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,       ConvSpec,    64,    64,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 8,  2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 4,  4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 16, 1, 4>,               4>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,    ELayout, Empty_Tuple, BF16,     F32,    BF16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,       ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 8,  2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>
+    // clang-format on
+    >;
+// f16 Scale instances with BF8/F8 as trailing types; empty tuple unless FP8 and BF8 are enabled.
+template <ck::index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename ELayout,
+          ConvolutionBackwardWeightSpecialization ConvSpec>
+using device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_comp_bf8_f8_scale_instances = std::tuple<
+// clang-format off
+        //#########################################|     Num| InLayout| WeiLayout| OutLayout| DsData| InData| WeiData| OutData| AccData| DsData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer|
+        //#########################################|     Dim|         |          |          | Layout|   Type|    Type|    Type|    Type|   Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
+        //#########################################| Spatial|         |          |          |       |       |        |        |        |       |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
+        //#########################################|        |         |          |          |       |       |        |        |        |       |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
+#if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
+        // generic instance: A/B SrcScalarPerVector and CBlockTransfer ScalarPerVector are all 2
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F16,     F16,     F16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              4,      true,           1,           1,   S<1, 16, 1, 4>,               2,       BF8,     F8>,   
+        // instance for small conv.K
+        // for fp16, conv.K and conv.C must be divisible by 2
+        // since half_t atomic_add requires scalar_per_x_vector % 2 == 0
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F16,     F16,     F16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              1,      true,           1,           1,   S<1, 32, 1, 4>,               2,       BF8,     F8>,  
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F16,     F16,     F16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8,       BF8,     F8>,
+        // remaining instances: A/B SrcScalarPerVector and CBlockTransfer ScalarPerVector are all 8
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F16,     F16,     F16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 8>,               8,       BF8,     F8>,
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F16,     F16,     F16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 8>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F16,     F16,     F16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F16,     F16,     F16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F16,     F16,     F16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F16,     F16,     F16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F16,     F16,     F16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F16,     F16,     F16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F16,     F16,     F16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F16,     F16,     F16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F16,     F16,     F16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,   128,    32,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F16,     F16,     F16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,    64,    64,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 16, 1, 4>,               8,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Empty_Tuple, F16,     F16,     F16,     F32, Empty_Tuple, PassThrough, Scale, PassThrough,      ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8,       BF8,     F8>
+#endif // CK_ENABLE_FP8 && CK_ENABLE_BF8
+    // clang-format on
+    >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_bilinear.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_bilinear.hpp
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+#ifdef CK_USE_XDL // declarations only; definitions are compiled into the bilinear instance library
+#ifdef CK_ENABLE_BF16 // bf16 in/out tensors, f32 weight tensor, one f32 D tensor
+void add_device_grouped_conv3d_bwd_weight_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeightMultipleD<3,
+                                                                    NDHWGC,
+                                                                    GKZYXC,
+                                                                    NDHWGK,
+                                                                    Tuple<GKZYXC>,
+                                                                    BF16,
+                                                                    F32,
+                                                                    BF16,
+                                                                    Tuple<F32>,
+                                                                    PassThrough,
+                                                                    Bilinear,
+                                                                    PassThrough>>>& instances);
+#endif // CK_ENABLE_BF16
+#ifdef CK_ENABLE_FP16 // f16 in/wei/out tensors, one f16 D tensor
+void add_device_grouped_conv3d_bwd_weight_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeightMultipleD<3,
+                                                                    NDHWGC,
+                                                                    GKZYXC,
+                                                                    NDHWGK,
+                                                                    Tuple<GKZYXC>,
+                                                                    F16,
+                                                                    F16,
+                                                                    F16,
+                                                                    Tuple<F16>,
+                                                                    PassThrough,
+                                                                    Bilinear,
+                                                                    PassThrough>>>& instances);
+#endif // CK_ENABLE_FP16
+#ifdef CK_ENABLE_FP32 // f32 in/wei/out tensors, one f32 D tensor
+void add_device_grouped_conv3d_bwd_weight_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeightMultipleD<3,
+                                                                    NDHWGC,
+                                                                    GKZYXC,
+                                                                    NDHWGK,
+                                                                    Tuple<GKZYXC>,
+                                                                    F32,
+                                                                    F32,
+                                                                    F32,
+                                                                    Tuple<F32>,
+                                                                    PassThrough,
+                                                                    Bilinear,
+                                                                    PassThrough>>>& instances);
+#endif // CK_ENABLE_FP32
+#if defined CK_ENABLE_FP16 && defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8 // f16 tensors, BF8/F8 compute types
+void add_device_grouped_conv3d_bwd_weight_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_f8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeightMultipleD<3,
+                                                                    NDHWGC,
+                                                                    GKZYXC,
+                                                                    NDHWGK,
+                                                                    Tuple<GKZYXC>,
+                                                                    F16,
+                                                                    F16,
+                                                                    F16,
+                                                                    Tuple<F16>,
+                                                                    PassThrough,
+                                                                    Bilinear,
+                                                                    PassThrough,
+                                                                    BF8,
+                                                                    F8>>>& instances);
+#endif // CK_ENABLE_FP16 && CK_ENABLE_FP8 && CK_ENABLE_BF8
+#endif // CK_USE_XDL
+// Instance factory for DeviceGroupedConvBwdWeightMultipleD with PassThrough/Bilinear/PassThrough.
+template <ck::index_t NumDimSpatial,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout,
+          typename DsLayout,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename DsDataType,
+          typename ComputeTypeA,
+          typename ComputeTypeB>
+struct DeviceOperationInstanceFactory<
+    ck::tensor_operation::device::DeviceGroupedConvBwdWeightMultipleD<
+        NumDimSpatial,
+        InLayout,
+        WeiLayout,
+        OutLayout,
+        DsLayout,
+        InDataType,
+        WeiDataType,
+        OutDataType,
+        DsDataType,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::Bilinear,
+        ck::tensor_operation::element_wise::PassThrough,
+        ComputeTypeA,
+        ComputeTypeB>>
+{
+    using DeviceOp =
+        DeviceGroupedConvBwdWeightMultipleD<NumDimSpatial,
+                                            InLayout,
+                                            WeiLayout,
+                                            OutLayout,
+                                            DsLayout,
+                                            InDataType,
+                                            WeiDataType,
+                                            OutDataType,
+                                            DsDataType,
+                                            ck::tensor_operation::element_wise::PassThrough,
+                                            ck::tensor_operation::element_wise::Bilinear,
+                                            ck::tensor_operation::element_wise::PassThrough,
+                                            ComputeTypeA,
+                                            ComputeTypeB>;
+    // Returns op_ptrs after handing it to every matching add_*_instances function; may be empty.
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+        // Only XDL instances are declared, so nothing is added unless CK_USE_XDL is defined.
+#ifdef CK_USE_XDL
+        if constexpr(NumDimSpatial == 3)
+        {
+            if constexpr(is_same_v<InLayout, NDHWGC> && is_same_v<WeiLayout, GKZYXC> &&
+                         is_same_v<OutLayout, NDHWGK>)
+            {
+#ifdef CK_ENABLE_FP32 // all tensors and both compute types are float
+                if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
+                             is_same_v<OutDataType, float> && is_same_v<ComputeTypeA, float> &&
+                             is_same_v<ComputeTypeB, float>)
+                {
+                    add_device_grouped_conv3d_bwd_weight_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+                        op_ptrs);
+                }
+#endif // CK_ENABLE_FP32
+#ifdef CK_ENABLE_FP16 // all tensors and both compute types are half_t
+                if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+                             is_same_v<OutDataType, half_t> && is_same_v<ComputeTypeA, half_t> &&
+                             is_same_v<ComputeTypeB, half_t>)
+                {
+                    add_device_grouped_conv3d_bwd_weight_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f16_instances(
+                        op_ptrs);
+                }
+#endif // CK_ENABLE_FP16
+#ifdef CK_ENABLE_BF16 // bf16 in/out tensors and compute types, but a float weight tensor
+                if constexpr(is_same_v<InDataType, ck::bhalf_t> && is_same_v<WeiDataType, float> &&
+                             is_same_v<OutDataType, ck::bhalf_t> &&
+                             is_same_v<ComputeTypeA, ck::bhalf_t> &&
+                             is_same_v<ComputeTypeB, ck::bhalf_t>)
+                {
+                    add_device_grouped_conv3d_bwd_weight_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instances(
+                        op_ptrs);
+                }
+#endif // CK_ENABLE_BF16
+#if defined CK_ENABLE_FP16 && defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8 // half_t tensors, bf8_t/f8_t compute
+                if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+                             is_same_v<OutDataType, half_t> && is_same_v<ComputeTypeA, bf8_t> &&
+                             is_same_v<ComputeTypeB, f8_t>)
+                {
+                    add_device_grouped_conv3d_bwd_weight_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_f8_instances(
+                        op_ptrs);
+                }
+#endif // CK_ENABLE_FP16 && CK_ENABLE_FP8 && CK_ENABLE_BF8
+            }
+        }
+#endif // CK_USE_XDL
+        return op_ptrs;
+    }
+};
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_scale.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_scale.hpp
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+#ifdef CK_USE_XDL // declarations only; the definitions are not part of this header
+#ifdef CK_ENABLE_BF16 // bf16 in/out tensors, f32 weight tensor, no D tensors
+void add_device_grouped_conv3d_bwd_weight_xdl_scale_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeightMultipleD<3,
+                                                                    NDHWGC,
+                                                                    GKZYXC,
+                                                                    NDHWGK,
+                                                                    Tuple<>,
+                                                                    BF16,
+                                                                    F32,
+                                                                    BF16,
+                                                                    Tuple<>,
+                                                                    PassThrough,
+                                                                    Scale,
+                                                                    PassThrough>>>& instances);
+#endif // CK_ENABLE_BF16
+#ifdef CK_ENABLE_FP16 // f16 in/wei/out tensors, no D tensors
+void add_device_grouped_conv3d_bwd_weight_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeightMultipleD<3,
+                                                                    NDHWGC,
+                                                                    GKZYXC,
+                                                                    NDHWGK,
+                                                                    Tuple<>,
+                                                                    F16,
+                                                                    F16,
+                                                                    F16,
+                                                                    Tuple<>,
+                                                                    PassThrough,
+                                                                    Scale,
+                                                                    PassThrough>>>& instances);
+#endif // CK_ENABLE_FP16
+#ifdef CK_ENABLE_FP32 // f32 in/wei/out tensors, no D tensors
+void add_device_grouped_conv3d_bwd_weight_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeightMultipleD<3,
+                                                                    NDHWGC,
+                                                                    GKZYXC,
+                                                                    NDHWGK,
+                                                                    Tuple<>,
+                                                                    F32,
+                                                                    F32,
+                                                                    F32,
+                                                                    Tuple<>,
+                                                                    PassThrough,
+                                                                    Scale,
+                                                                    PassThrough>>>& instances);
+#endif // CK_ENABLE_FP32
+#if defined CK_ENABLE_FP16 && defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8 // f16 tensors, BF8/F8 compute types
+void add_device_grouped_conv3d_bwd_weight_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_f8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeightMultipleD<3,
+                                                                    NDHWGC,
+                                                                    GKZYXC,
+                                                                    NDHWGK,
+                                                                    Tuple<>,
+                                                                    F16,
+                                                                    F16,
+                                                                    F16,
+                                                                    Tuple<>,
+                                                                    PassThrough,
+                                                                    Scale,
+                                                                    PassThrough,
+                                                                    BF8,
+                                                                    F8>>>& instances);
+#endif // CK_ENABLE_FP16 && CK_ENABLE_FP8 && CK_ENABLE_BF8
+#endif // CK_USE_XDL
+// Instance factory for DeviceGroupedConvBwdWeightMultipleD with PassThrough/Scale/PassThrough.
+template <ck::index_t NumDimSpatial,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout,
+          typename DsLayout,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename DsDataType,
+          typename ComputeTypeA,
+          typename ComputeTypeB>
+struct DeviceOperationInstanceFactory<
+    ck::tensor_operation::device::DeviceGroupedConvBwdWeightMultipleD<
+        NumDimSpatial,
+        InLayout,
+        WeiLayout,
+        OutLayout,
+        DsLayout,
+        InDataType,
+        WeiDataType,
+        OutDataType,
+        DsDataType,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::Scale,
+        ck::tensor_operation::element_wise::PassThrough,
+        ComputeTypeA,
+        ComputeTypeB>>
+{
+    using DeviceOp =
+        DeviceGroupedConvBwdWeightMultipleD<NumDimSpatial,
+                                            InLayout,
+                                            WeiLayout,
+                                            OutLayout,
+                                            DsLayout,
+                                            InDataType,
+                                            WeiDataType,
+                                            OutDataType,
+                                            DsDataType,
+                                            ck::tensor_operation::element_wise::PassThrough,
+                                            ck::tensor_operation::element_wise::Scale,
+                                            ck::tensor_operation::element_wise::PassThrough,
+                                            ComputeTypeA,
+                                            ComputeTypeB>;
+    // Returns op_ptrs after handing it to every matching add_*_instances function; may be empty.
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+        // Only XDL instances are declared, so nothing is added unless CK_USE_XDL is defined.
+#ifdef CK_USE_XDL
+        if constexpr(NumDimSpatial == 3)
+        {
+            if constexpr(is_same_v<InLayout, NDHWGC> && is_same_v<WeiLayout, GKZYXC> &&
+                         is_same_v<OutLayout, NDHWGK>)
+            {
+#ifdef CK_ENABLE_FP32 // all tensors and both compute types are float
+                if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
+                             is_same_v<OutDataType, float> && is_same_v<ComputeTypeA, float> &&
+                             is_same_v<ComputeTypeB, float>)
+                {
+                    add_device_grouped_conv3d_bwd_weight_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+                        op_ptrs);
+                }
+#endif // CK_ENABLE_FP32
+#ifdef CK_ENABLE_FP16 // all tensors and both compute types are half_t
+                if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+                             is_same_v<OutDataType, half_t> && is_same_v<ComputeTypeA, half_t> &&
+                             is_same_v<ComputeTypeB, half_t>)
+                {
+                    add_device_grouped_conv3d_bwd_weight_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f16_instances(
+                        op_ptrs);
+                }
+#endif // CK_ENABLE_FP16
+#ifdef CK_ENABLE_BF16 // bf16 in/out tensors and compute types, but a float weight tensor
+                if constexpr(is_same_v<InDataType, ck::bhalf_t> && is_same_v<WeiDataType, float> &&
+                             is_same_v<OutDataType, ck::bhalf_t> &&
+                             is_same_v<ComputeTypeA, ck::bhalf_t> &&
+                             is_same_v<ComputeTypeB, ck::bhalf_t>)
+                {
+                    add_device_grouped_conv3d_bwd_weight_xdl_scale_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instances(
+                        op_ptrs);
+                }
+#endif // CK_ENABLE_BF16
+#if defined CK_ENABLE_FP16 && defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8 // half_t tensors, bf8_t/f8_t compute
+                if constexpr(is_same_v<InDataType, half_t> && is_same_v<WeiDataType, half_t> &&
+                             is_same_v<OutDataType, half_t> && is_same_v<ComputeTypeA, bf8_t> &&
+                             is_same_v<ComputeTypeB, f8_t>)
+                {
+                    add_device_grouped_conv3d_bwd_weight_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_f8_instances(
+                        op_ptrs);
+                }
+#endif // CK_ENABLE_FP16 && CK_ENABLE_FP8 && CK_ENABLE_BF8
+            }
+        }
+#endif // CK_USE_XDL
+        return op_ptrs;
+    }
+};
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_bilinear/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_bilinear/CMakeLists.txt
@@ -0,0 +1,12 @@
+# XDL kernels only; the f16_comp_bf8_fp8 source is appended only if DTYPES is unset or has fp8, bf8 and fp16.
+set(GROUPED_CONV3D_BWD_WEIGHT_BILINEAR
+    xdl/device_grouped_conv3d_bwd_weight_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+    xdl/device_grouped_conv3d_bwd_weight_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+    xdl/device_grouped_conv3d_bwd_weight_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp)
+
+if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "bf8" AND DTYPES MATCHES "fp16") OR NOT DEFINED DTYPES)
+    list(APPEND GROUPED_CONV3D_BWD_WEIGHT_BILINEAR
+      xdl/device_grouped_conv3d_bwd_weight_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp)
+endif()
+
+add_instance_library(device_grouped_conv3d_bwd_weight_bilinear_instance ${GROUPED_CONV3D_BWD_WEIGHT_BILINEAR})
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_bilinear/xdl/device_grouped_conv3d_bwd_weight_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_bilinear/xdl/device_grouped_conv3d_bwd_weight_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_bilinear_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// 3D layouts: in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeightMultipleD<3,
+                                                                    NDHWGC,
+                                                                    GKZYXC,
+                                                                    NDHWGK,
+                                                                    Tuple<GKZYXC>,
+                                                                    BF16,
+                                                                    F32,
+                                                                    BF16,
+                                                                    Tuple<F32>,
+                                                                    PassThrough,
+                                                                    Bilinear,
+                                                                    PassThrough>>>& instances)
+{
+    // 1. Append the bf16 bilinear instance list built with ConvBwdWeightDefault
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_bilinear_instances<
+            3,
+            NDHWGC,
+            GKZYXC,
+            NDHWGK,
+            ConvBwdWeightDefault>{});
+    // 2. Append the same list built with ConvBwdWeightFilter1x1Stride1Pad0
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_bilinear_instances<
+            3,
+            NDHWGC,
+            GKZYXC,
+            NDHWGK,
+            ConvBwdWeightFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_bilinear/xdl/device_grouped_conv3d_bwd_weight_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_bilinear/xdl/device_grouped_conv3d_bwd_weight_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_bilinear_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_bwd_weight_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_f8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeightMultipleD<3,
+                                                                    NDHWGC,
+                                                                    GKZYXC,
+                                                                    NDHWGK,
+                                                                    Tuple<GKZYXC>,
+                                                                    F16,
+                                                                    F16,
+                                                                    F16,
+                                                                    Tuple<F16>,
+                                                                    PassThrough,
+                                                                    Bilinear,
+                                                                    PassThrough,
+                                                                    BF8,
+                                                                    F8>>>& instances)
+{
+    // 1. Append the f16 (comp bf8/f8) bilinear instance list built with ConvBwdWeightDefault
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_comp_bf8_f8_bilinear_instances<
+            3,
+            NDHWGC,
+            GKZYXC,
+            NDHWGK,
+            ConvBwdWeightDefault>{});
+    // 2. Append the same list built with ConvBwdWeightFilter1x1Stride1Pad0
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_comp_bf8_f8_bilinear_instances<
+            3,
+            NDHWGC,
+            GKZYXC,
+            NDHWGK,
+            ConvBwdWeightFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_bilinear/xdl/device_grouped_conv3d_bwd_weight_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_bilinear/xdl/device_grouped_conv3d_bwd_weight_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_bilinear_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// 3D layouts: in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeightMultipleD<3,
+                                                                    NDHWGC,
+                                                                    GKZYXC,
+                                                                    NDHWGK,
+                                                                    Tuple<GKZYXC>,
+                                                                    F16,
+                                                                    F16,
+                                                                    F16,
+                                                                    Tuple<F16>,
+                                                                    PassThrough,
+                                                                    Bilinear,
+                                                                    PassThrough>>>& instances)
+{
+    // 1. Append the f16 bilinear instance list built with ConvBwdWeightDefault
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_bilinear_instances<
+            3,
+            NDHWGC,
+            GKZYXC,
+            NDHWGK,
+            ConvBwdWeightDefault>{});
+    // 2. Append the same list built with ConvBwdWeightFilter1x1Stride1Pad0
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_bilinear_instances<
+            3,
+            NDHWGC,
+            GKZYXC,
+            NDHWGK,
+            ConvBwdWeightFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_bilinear/xdl/device_grouped_conv3d_bwd_weight_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_bilinear/xdl/device_grouped_conv3d_bwd_weight_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_bilinear_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// 3D layouts: in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_xdl_bilinear_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeightMultipleD<3,
+                                                                    NDHWGC,
+                                                                    GKZYXC,
+                                                                    NDHWGK,
+                                                                    Tuple<GKZYXC>,
+                                                                    F32,
+                                                                    F32,
+                                                                    F32,
+                                                                    Tuple<F32>,
+                                                                    PassThrough,
+                                                                    Bilinear,
+                                                                    PassThrough>>>& instances)
+{
+    // 1. Append the f32 bilinear instance list built with ConvBwdWeightDefault
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_bilinear_instances<
+            3,
+            NDHWGC,
+            GKZYXC,
+            NDHWGK,
+            ConvBwdWeightDefault>{});
+    // 2. Append the same list built with ConvBwdWeightFilter1x1Stride1Pad0
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_bilinear_instances<
+            3,
+            NDHWGC,
+            GKZYXC,
+            NDHWGK,
+            ConvBwdWeightFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_scale/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_scale/CMakeLists.txt
@@ -0,0 +1,12 @@
+# XDL kernels only; the f16_comp_bf8_fp8 source is appended only if DTYPES is unset or has fp8, bf8 and fp16.
+set(GROUPED_CONV3D_BWD_WEIGHT_SCALE
+    xdl/device_grouped_conv3d_bwd_weight_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+    xdl/device_grouped_conv3d_bwd_weight_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+    xdl/device_grouped_conv3d_bwd_weight_xdl_scale_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp)
+
+if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "bf8" AND DTYPES MATCHES "fp16") OR NOT DEFINED DTYPES)
+    list(APPEND GROUPED_CONV3D_BWD_WEIGHT_SCALE
+      xdl/device_grouped_conv3d_bwd_weight_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp)
+endif()
+
+add_instance_library(device_grouped_conv3d_bwd_weight_scale_instance ${GROUPED_CONV3D_BWD_WEIGHT_SCALE})
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_scale/xdl/device_grouped_conv3d_bwd_weight_xdl_scale_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_scale/xdl/device_grouped_conv3d_bwd_weight_xdl_scale_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_scale_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// 3D layouts: in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_xdl_scale_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeightMultipleD<3,
+                                                                    NDHWGC,
+                                                                    GKZYXC,
+                                                                    NDHWGK,
+                                                                    Tuple<>,
+                                                                    BF16,
+                                                                    F32,
+                                                                    BF16,
+                                                                    Tuple<>,
+                                                                    PassThrough,
+                                                                    Scale,
+                                                                    PassThrough>>>& instances)
+{
+    // 1. Append the bf16 scale instance list built with ConvBwdWeightDefault
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_scale_instances<3,
+                                                                          NDHWGC,
+                                                                          GKZYXC,
+                                                                          NDHWGK,
+                                                                          ConvBwdWeightDefault>{});
+    // 2. Append the same list built with ConvBwdWeightFilter1x1Stride1Pad0
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_scale_instances<
+            3,
+            NDHWGC,
+            GKZYXC,
+            NDHWGK,
+            ConvBwdWeightFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_scale/xdl/device_grouped_conv3d_bwd_weight_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_scale/xdl/device_grouped_conv3d_bwd_weight_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_fp8_instance.cpp
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_scale_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_grouped_conv3d_bwd_weight_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_f8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeightMultipleD<3,
+                                                                    NDHWGC,
+                                                                    GKZYXC,
+                                                                    NDHWGK,
+                                                                    Tuple<>,
+                                                                    F16,
+                                                                    F16,
+                                                                    F16,
+                                                                    Tuple<>,
+                                                                    PassThrough,
+                                                                    Scale,
+                                                                    PassThrough,
+                                                                    BF8,
+                                                                    F8>>>& instances)
+{
+    // 1. Append the f16 (comp bf8/f8) scale instance list built with ConvBwdWeightDefault
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_comp_bf8_f8_scale_instances<
+            3,
+            NDHWGC,
+            GKZYXC,
+            NDHWGK,
+            ConvBwdWeightDefault>{});
+    // 2. Append the same list built with ConvBwdWeightFilter1x1Stride1Pad0
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_comp_bf8_f8_scale_instances<
+            3,
+            NDHWGC,
+            GKZYXC,
+            NDHWGK,
+            ConvBwdWeightFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_scale/xdl/device_grouped_conv3d_bwd_weight_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_scale/xdl/device_grouped_conv3d_bwd_weight_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_scale_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// 3D layouts: in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeightMultipleD<3,
+                                                                    NDHWGC,
+                                                                    GKZYXC,
+                                                                    NDHWGK,
+                                                                    Tuple<>,
+                                                                    F16,
+                                                                    F16,
+                                                                    F16,
+                                                                    Tuple<>,
+                                                                    PassThrough,
+                                                                    Scale,
+                                                                    PassThrough>>>& instances)
+{
+    // 1. Append the f16 scale instance list built with ConvBwdWeightDefault
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_scale_instances<3,
+                                                                         NDHWGC,
+                                                                         GKZYXC,
+                                                                         NDHWGK,
+                                                                         ConvBwdWeightDefault>{});
+    // 2. Append the same list built with ConvBwdWeightFilter1x1Stride1Pad0
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_scale_instances<
+                                       3,
+                                       NDHWGC,
+                                       GKZYXC,
+                                       NDHWGK,
+                                       ConvBwdWeightFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_scale/xdl/device_grouped_conv3d_bwd_weight_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight_scale/xdl/device_grouped_conv3d_bwd_weight_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_scale_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// 3D layouts: in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_xdl_scale_ndhwgc_gkzyxc_ndhwgk_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeightMultipleD<3,
+                                                                    NDHWGC,
+                                                                    GKZYXC,
+                                                                    NDHWGK,
+                                                                    Tuple<>,
+                                                                    F32,
+                                                                    F32,
+                                                                    F32,
+                                                                    Tuple<>,
+                                                                    PassThrough,
+                                                                    Scale,
+                                                                    PassThrough>>>& instances)
+{
+    // 1. Append the f32 scale instance list built with ConvBwdWeightDefault
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_scale_instances<3,
+                                                                         NDHWGC,
+                                                                         GKZYXC,
+                                                                         NDHWGK,
+                                                                         ConvBwdWeightDefault>{});
+    // 2. Append the same list built with ConvBwdWeightFilter1x1Stride1Pad0
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_scale_instances<
+                                       3,
+                                       NDHWGC,
+                                       GKZYXC,
+                                       NDHWGK,
+                                       ConvBwdWeightFilter1x1Stride1Pad0>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck