Add client example of grouped conv2d forward (data type: fp16) (#488)

* Rename example folder for GroupedConvFwdMultipleD * Unify example codes * Change target names * Add fp16 example for multiple d instance * Re-format common.hpp * Add interface 'DeviceGroupedConvFwd' * Use simpler interface * Move common conv params out * Rename conv fwd client example folder * Add missing include directive * Update grouped conv instance implementations * Simplify ckProfiler (grouped conv forward) * Use GroupedConvFwd to implement client example * Use greater groupe count in example * Add custom target to group examples * Add extra tag param to instance factory function * Use tag to differentiate factory functions * Add missing tag argument for factory function * Remove inheritance relationship * Remove no-longer used include directive * Add license in front of file
2026-05-11 08:50:17 +00:00 · 2022-11-10 09:01:58 +08:00
parent 38470e0497
commit f49803101e
26 changed files with 1078 additions and 2533 deletions
--- a/client_example/07_conv2d_fwd/CMakeLists.txt
+++ b/client_example/07_conv2d_fwd/CMakeLists.txt
@@ -1,2 +0,0 @@
 add_executable(client_conv2d_fwd conv2d_fwd.cpp)
 target_link_libraries(client_conv2d_fwd PRIVATE composable_kernel::device_operations)
--- a/client_example/07_grouped_conv2d_fwd/CMakeLists.txt
+++ b/client_example/07_grouped_conv2d_fwd/CMakeLists.txt
@@ -0,0 +1,2 @@
 add_executable(client_grouped_conv2d_fwd grouped_conv2d_fwd.cpp)
 target_link_libraries(client_grouped_conv2d_fwd PRIVATE composable_kernel::device_operations)
--- a/client_example/07_grouped_conv2d_fwd/grouped_conv2d_fwd.cpp
+++ b/client_example/07_grouped_conv2d_fwd/grouped_conv2d_fwd.cpp
@@ -1,35 +1,38 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include <cstdlib>
 #include <iomanip>
 #include <iostream>
 #include <iterator>
 #include <numeric>
 #include <vector>
 #include "ck/ck.hpp"
-#include "ck/library/tensor_operation_instance/gpu/convolution_forward.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 using InDataType  = ck::half_t;
 using WeiDataType = ck::half_t;
 using OutDataType = ck::half_t;
-using InLayout    = ck::tensor_layout::convolution::NHWC;
+using InLayout    = ck::tensor_layout::convolution::GNHWC;
-using WeiLayout   = ck::tensor_layout::convolution::KYXC;
+using WeiLayout   = ck::tensor_layout::convolution::GKYXC;
-using OutLayout   = ck::tensor_layout::convolution::NHWK;
+using OutLayout   = ck::tensor_layout::convolution::GNHWK;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 static constexpr ck::index_t NumDimSpatial = 2;
-static constexpr ck::index_t N             = 16;
+static constexpr ck::index_t G             = 32;
-static constexpr ck::index_t K             = 32;
+static constexpr ck::index_t N             = 256;
-static constexpr ck::index_t C             = 3;
+static constexpr ck::index_t K             = 192;
 static constexpr ck::index_t C             = 192;
 static constexpr ck::index_t Y             = 3;
 static constexpr ck::index_t X             = 3;
-static constexpr ck::index_t Hi            = 224;
+static constexpr ck::index_t Hi            = 28;
-static constexpr ck::index_t Wi            = 224;
+static constexpr ck::index_t Wi            = 28;
-static constexpr ck::index_t Ho            = 113;
+static constexpr ck::index_t Ho            = 28;
-static constexpr ck::index_t Wo            = 113;
+static constexpr ck::index_t Wo            = 28;
 struct SimpleDeviceMem
 {
@@ -47,30 +50,66 @@ struct SimpleDeviceMem
    void* p_mem_;
 };
-int main(int argc, char* argv[])
+int main()
 {
-    std::vector<ck::index_t> in_spatial_lengths{Hi, Wi};
+    std::array<ck::index_t, NumDimSpatial + 3> in_lengths{G, N, Hi, Wi, C};
-    std::vector<ck::index_t> filter_spatial_lengths{Y, X};
+    std::array<ck::index_t, NumDimSpatial + 3> in_strides{0, 0, 0, 0, 1};
    std::vector<ck::index_t> out_spatial_lengths{Ho, Wo};
    std::vector<ck::index_t> filter_strides{2, 2};
    std::vector<ck::index_t> filter_dilations{1, 1};
    std::vector<ck::index_t> input_left_pads{2, 2};
    std::vector<ck::index_t> input_right_pads{2, 2};
-    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
+    std::array<ck::index_t, NumDimSpatial + 3> wei_lengths{G, K, Y, X, C};
-    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
+    std::array<ck::index_t, NumDimSpatial + 3> wei_strides{0, 0, 0, 0, 1};
-    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
+
    std::array<ck::index_t, NumDimSpatial + 3> out_lengths{G, N, Ho, Wo, K};
    std::array<ck::index_t, NumDimSpatial + 3> out_strides{0, 0, 0, 0, 1};
    std::partial_sum(rbegin(in_lengths),
                     std::prev(rend(in_lengths)),
                     std::next(rbegin(in_strides)),
                     std::multiplies<>{});
    std::partial_sum(rbegin(wei_lengths),
                     std::prev(rend(wei_lengths)),
                     std::next(rbegin(wei_strides)),
                     std::multiplies<>{});
    std::partial_sum(rbegin(out_lengths),
                     std::prev(rend(out_lengths)),
                     std::next(rbegin(out_strides)),
                     std::multiplies<>{});
    // transpose GNHWC/GKYXC/GNHWK to GNCHW/GKCYX/GNCHW
    std::rotate(
        rbegin(in_lengths), std::next(rbegin(in_lengths)), std::next(rbegin(in_lengths), 3));
    std::rotate(
        rbegin(in_strides), std::next(rbegin(in_strides)), std::next(rbegin(in_strides), 3));
    std::rotate(
        rbegin(wei_lengths), std::next(rbegin(wei_lengths)), std::next(rbegin(wei_lengths), 3));
    std::rotate(
        rbegin(wei_strides), std::next(rbegin(wei_strides)), std::next(rbegin(wei_strides), 3));
    std::rotate(
        rbegin(out_lengths), std::next(rbegin(out_lengths)), std::next(rbegin(out_lengths), 3));
    std::rotate(
        rbegin(out_strides), std::next(rbegin(out_strides)), std::next(rbegin(out_strides), 3));
    std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
    std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
    std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
    std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
    SimpleDeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C);
    SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
    SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * K);
    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
                                                                                 InLayout,
                                                                                 WeiLayout,
                                                                                 ck::Tuple<>,
                                                                                 OutLayout,
                                                                                 InDataType,
                                                                                 WeiDataType,
                                                                                 ck::Tuple<>,
                                                                                 OutDataType,
                                                                                 PassThrough,
                                                                                 PassThrough,
                                                                                 PassThrough>;
    using DeviceOp = ck::tensor_operation::device::DeviceConvFwd<NumDimSpatial,
                                                                 InLayout,
                                                                 WeiLayout,
                                                                 OutLayout,
                                                                 InDataType,
                                                                 WeiDataType,
                                                                 OutDataType,
                                                                 PassThrough,
                                                                 PassThrough,
                                                                 PassThrough>;
    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();
@@ -91,13 +130,16 @@ int main(int argc, char* argv[])
        auto& op_ptr        = op_ptrs[i];
        auto argument_ptr   = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
                                                        wei.GetDeviceBuffer(),
                                                        {},
                                                        out.GetDeviceBuffer(),
-                                                        N,
+                                                        in_lengths,
-                                                        K,
+                                                        in_strides,
-                                                        C,
+                                                        wei_lengths,
-                                                        in_spatial_lengths,
+                                                        wei_strides,
-                                                        filter_spatial_lengths,
+                                                        {},
-                                                        out_spatial_lengths,
+                                                        {},
                                                        out_lengths,
                                                        out_strides,
                                                        filter_strides,
                                                        filter_dilations,
                                                        input_left_pads,
@@ -112,10 +154,10 @@ int main(int argc, char* argv[])
        {
            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
-            std::size_t flop      = 2 * N * K * C * Ho * Wo * Y * X;
+            std::size_t flop      = std::size_t(2) * G * N * K * C * Ho * Wo * Y * X;
-            std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * C +
+            std::size_t num_bytes = sizeof(InDataType) * G * N * Hi * Wi * C +
-                                    sizeof(WeiDataType) * K * Y * X * C +
+                                    sizeof(WeiDataType) * G * K * Y * X * C +
-                                    sizeof(OutDataType) * N * Ho * Wo * K;
+                                    sizeof(OutDataType) * G * N * Ho * Wo * K;
            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
            float gb_per_sec = num_bytes / 1.E6 / avg_time;
@@ -134,10 +176,16 @@ int main(int argc, char* argv[])
        }
        else
        {
-            std::cout << op_name << " does not support this problem" << std::endl;
+            std::cerr << op_name << " does not support this problem" << std::endl;
        }
    }
    if(best_op_id < 0)
    {
        std::cerr << "no suitable instance" << std::endl;
        return EXIT_FAILURE;
    }
    std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
              << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
@@ -148,13 +196,16 @@ int main(int argc, char* argv[])
                  << std::endl;
        auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
                                                        wei.GetDeviceBuffer(),
                                                        {},
                                                        out.GetDeviceBuffer(),
-                                                        N,
+                                                        in_lengths,
-                                                        K,
+                                                        in_strides,
-                                                        C,
+                                                        wei_lengths,
-                                                        in_spatial_lengths,
+                                                        wei_strides,
-                                                        filter_spatial_lengths,
+                                                        {},
-                                                        out_spatial_lengths,
+                                                        {},
                                                        out_lengths,
                                                        out_strides,
                                                        filter_strides,
                                                        filter_dilations,
                                                        input_left_pads,
@@ -172,6 +223,4 @@ int main(int argc, char* argv[])
        std::cout << "Done" << std::endl;
    }
-
+}
    return 0;
 }
--- a/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
+++ b/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
@@ -0,0 +1,22 @@
 add_custom_target(example_grouped_conv_fwd_multiple_d)
 add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp16 grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp)
 add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp32 grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp)
 add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_bf16 grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp)
 add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int8 grouped_conv_fwd_bias_relu_add_xdl_int8.cpp)
 add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp16)
 add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp32)
 add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_bf16)
 add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int8)
 if(USE_BITINT_EXTENSION_INT4)
  add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int4 grouped_conv_fwd_bias_relu_add_xdl_int4.cpp)
  add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int4)
 endif() # USE_BITINT_EXTENSION_INT4
 add_example_executable(example_grouped_conv_fwd_xdl_fp16 grouped_conv_fwd_xdl_fp16.cpp)
 add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_xdl_fp16)
--- a/example/30_grouped_conv_fwd_multiple_d/README.md
+++ b/example/30_grouped_conv_fwd_multiple_d/README.md
@@ -0,0 +1,30 @@
 Command
 ```bash
 arg1: verification (0=no, 1=yes)
 arg2: initialization (0=no init, 1=integer value, 2=decimal value)
 arg3: time kernel (0=no, 1=yes)
 Following arguments (depending on number of spatial dims):
 Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)
 G, N, K, C,
 <filter spatial dimensions>, (ie Y, X for 2D)
 <input image spatial dimensions>, (ie Hi, Wi for 2D)
 <strides>, (ie Sy, Sx for 2D)
 <dilations>, (ie Dy, Dx for 2D)
 <left padding>, (ie LeftPy, LeftPx for 2D)
 <right padding>, (ie RightPy, RightPx for 2D)
 ./bin/example_grouped_conv_fwd_bias_relu_add_xdl_fp16 1 1 1
 ```
 Result (MI100)
 ```
 in: dim 5, lengths {1, 128, 192, 71, 71}, strides {192, 967872, 1, 13632, 192}
 wei: dim 5, lengths {1, 256, 192, 3, 3}, strides {442368, 1728, 1, 576, 192}
 bias: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
 residual: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
 out: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 331776, 1, 9216, 256}
 launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
 Warm up 1 time
 Start running 10 times...
 Perf: 1.55981 ms, 94.0927 TFlops, 213.868 GB/s, DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<256, 128, 256, 16, Default>
 ```
--- a/example/30_grouped_conv_fwd_multiple_d/common.hpp
+++ b/example/30_grouped_conv_fwd_multiple_d/common.hpp
@@ -0,0 +1,354 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include <algorithm>
 #include <array>
 #include <iostream>
 #include <string>
 #include <type_traits>
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
 using BF16 = ck::bhalf_t;
 using FP16 = ck::half_t;
 using FP32 = float;
 #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
 using I4 = ck::int4_t;
 #endif
 using I8  = std::int8_t;
 using I32 = std::int32_t;
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 static constexpr auto ConvSpec =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
 static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
 template <typename InputLay, typename WeightLay, typename OutputLay>
 struct CommonLayoutSetting
 {
    using InputLayout  = InputLay;
    using WeightLayout = WeightLay;
    using OutputLayout = OutputLay;
 };
 template <ck::index_t NDimSpatial>
 struct CommonLayoutSettingSelector;
 namespace ctl = ck::tensor_layout::convolution;
 template <>
 struct CommonLayoutSettingSelector<1> final
    : CommonLayoutSetting<ctl::G_NW_C, ctl::G_K_X_C, ctl::G_NW_K>
 {
 };
 template <>
 struct CommonLayoutSettingSelector<2> final
    : CommonLayoutSetting<ctl::G_NHW_C, ctl::G_K_YX_C, ctl::G_NHW_K>
 {
 };
 template <>
 struct CommonLayoutSettingSelector<3> final
    : CommonLayoutSetting<ctl::G_NDHW_C, ctl::G_K_ZYX_C, ctl::G_NDHW_K>
 {
 };
 template <ck::index_t NDimSpatial>
 using InputLayout = typename CommonLayoutSettingSelector<NDimSpatial>::InputLayout;
 template <ck::index_t NDimSpatial>
 using WeightLayout = typename CommonLayoutSettingSelector<NDimSpatial>::WeightLayout;
 template <ck::index_t NDimSpatial>
 using OutputLayout = typename CommonLayoutSettingSelector<NDimSpatial>::OutputLayout;
 struct ExecutionConfig final
 {
    bool do_verification = true;
    int init_method      = 1;
    bool time_kernel     = true;
 };
 #define DefaultConvParam                                                       \
    ck::utils::conv::ConvParam                                                 \
    {                                                                          \
        2, 32, 2, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, { 1, 1 } \
    }
 inline void print_help_msg()
 {
    std::cerr << "arg1: verification (0=no, 1=yes)\n"
              << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
              << "arg3: time kernel (0=no, 1=yes)\n"
              << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
 }
 inline bool parse_cmd_args(int argc,
                           char* argv[],
                           ExecutionConfig& config,
                           ck::utils::conv::ConvParam& conv_param)
 {
    constexpr int num_execution_config_args =
        3; // arguments for do_verification, init_method, time_kernel
    constexpr int num_conv_param_leading_args = 5; // arguments for num_dim_spatial_, G_, N_, K_, C_
    constexpr int threshold_to_catch_partial_args = 1 + num_execution_config_args;
    constexpr int threshold_to_catch_all_args =
        threshold_to_catch_partial_args + num_conv_param_leading_args;
    if(argc == 1)
    {
        // use default
    }
    // catch only ExecutionConfig arguments
    else if(argc == threshold_to_catch_partial_args)
    {
        config.do_verification = std::stoi(argv[1]);
        config.init_method     = std::stoi(argv[2]);
        config.time_kernel     = std::stoi(argv[3]);
    }
    // catch both ExecutionConfig & ConvParam arguments
    else if(threshold_to_catch_all_args < argc && ((argc - threshold_to_catch_all_args) % 3 == 0))
    {
        config.do_verification = std::stoi(argv[1]);
        config.init_method     = std::stoi(argv[2]);
        config.time_kernel     = std::stoi(argv[3]);
        const ck::index_t num_dim_spatial = std::stoi(argv[4]);
        conv_param                        = ck::utils::conv::parse_conv_param(
            num_dim_spatial, threshold_to_catch_partial_args, argv);
    }
    else
    {
        print_help_msg();
        return false;
    }
    return true;
 }
 inline HostTensorDescriptor make_input_descriptor(const ck::utils::conv::ConvParam& conv_param)
 {
    switch(conv_param.num_dim_spatial_)
    {
    case 1:
        return HostTensorDescriptor(
            {conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]},
            {
                conv_param.C_,                                                        // g
                conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n
                1,                                                                    // c
                conv_param.G_ * conv_param.C_                                         // wi
            });
    case 2:
        return HostTensorDescriptor(
            {conv_param.G_,
             conv_param.N_,
             conv_param.C_,
             conv_param.input_spatial_lengths_[0],
             conv_param.input_spatial_lengths_[1]},
            {
                conv_param.C_, // g
                conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
                    conv_param.G_ * conv_param.C_,                                    // n
                1,                                                                    // c
                conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi
                conv_param.G_ * conv_param.C_                                         // wi
            });
    case 3:
        return HostTensorDescriptor(
            {conv_param.G_,
             conv_param.N_,
             conv_param.C_,
             conv_param.input_spatial_lengths_[0],
             conv_param.input_spatial_lengths_[1],
             conv_param.input_spatial_lengths_[2]},
            {
                conv_param.C_, // g
                conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
                    conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n
                1,                                                                        // c
                conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] *
                    conv_param.G_ * conv_param.C_,                                    // di
                conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
                conv_param.G_ * conv_param.C_                                         // wi
            });
    }
    throw std::runtime_error("unsuppored # dim spatial");
 }
 inline HostTensorDescriptor make_weight_descriptor(const ck::utils::conv::ConvParam& conv_param)
 {
    switch(conv_param.num_dim_spatial_)
    {
    case 1:
        return HostTensorDescriptor(
            {conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
            {
                conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
                conv_param.filter_spatial_lengths_[0] * conv_param.C_,                 // k
                1,                                                                     // c
                conv_param.C_                                                          // x
            });
    case 2:
        return HostTensorDescriptor(
            {conv_param.G_,
             conv_param.K_,
             conv_param.C_,
             conv_param.filter_spatial_lengths_[0],
             conv_param.filter_spatial_lengths_[1]},
            {
                conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
                    conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
                conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
                    conv_param.C_,                                     // k
                1,                                                     // c
                conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y
                conv_param.C_                                          // x
            });
    case 3:
        return HostTensorDescriptor(
            {conv_param.G_,
             conv_param.K_,
             conv_param.C_,
             conv_param.filter_spatial_lengths_[0],
             conv_param.filter_spatial_lengths_[1],
             conv_param.filter_spatial_lengths_[2]},
            {
                conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
                    conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
                    conv_param.C_, // g
                conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
                    conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k
                1,                                                         // c
                conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
                    conv_param.C_,                                     // z
                conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
                conv_param.C_                                          // x
            });
    }
    throw std::runtime_error("unsuppored # dim spatial");
 }
 inline HostTensorDescriptor make_bias_descriptor(const ck::utils::conv::ConvParam& conv_param)
 {
    switch(conv_param.num_dim_spatial_)
    {
    case 1:
        return HostTensorDescriptor(
            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
            {
                conv_param.K_, // g
                0,             // k
                1,             // c
                0              // x
            });
    case 2:
        return HostTensorDescriptor({conv_param.G_,
                                     conv_param.N_,
                                     conv_param.K_,
                                     conv_param.output_spatial_lengths_[0],
                                     conv_param.output_spatial_lengths_[1]},
                                    {
                                        conv_param.K_, // g
                                        0,             // n
                                        1,             // k
                                        0,             // ho
                                        0              // wo
                                    });
    case 3:
        return HostTensorDescriptor({conv_param.G_,
                                     conv_param.N_,
                                     conv_param.K_,
                                     conv_param.output_spatial_lengths_[0],
                                     conv_param.output_spatial_lengths_[1],
                                     conv_param.output_spatial_lengths_[2]},
                                    {
                                        conv_param.K_, // g
                                        0,             // n
                                        1,             // k
                                        0,             // z
                                        0,             // y
                                        0              // x
                                    });
    }
    throw std::runtime_error("unsuppored # dim spatial");
 }
 inline HostTensorDescriptor make_output_descriptor(const ck::utils::conv::ConvParam& conv_param)
 {
    switch(conv_param.num_dim_spatial_)
    {
    case 1:
        return HostTensorDescriptor(
            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
            {
                conv_param.K_,                                                         // g
                conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
                1,                                                                     // k
                conv_param.G_ * conv_param.K_                                          // wo
            });
    case 2:
        return HostTensorDescriptor(
            {conv_param.G_,
             conv_param.N_,
             conv_param.K_,
             conv_param.output_spatial_lengths_[0],
             conv_param.output_spatial_lengths_[1]},
            {
                conv_param.K_, // g
                conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
                    conv_param.G_ * conv_param.K_,                                     // n
                1,                                                                     // k
                conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
                conv_param.G_ * conv_param.K_                                          // wo
            });
    case 3:
        return HostTensorDescriptor(
            {conv_param.G_,
             conv_param.N_,
             conv_param.K_,
             conv_param.output_spatial_lengths_[0],
             conv_param.output_spatial_lengths_[1],
             conv_param.output_spatial_lengths_[2]},
            {
                conv_param.K_, // g
                conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
                    conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n
                1,                                                                         // k
                conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] *
                    conv_param.G_ * conv_param.K_,                                     // do
                conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
                conv_param.G_ * conv_param.K_                                          // wo
            });
    }
    throw std::runtime_error("unsuppored # dim spatial");
 }
--- a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp
+++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp
@@ -0,0 +1,26 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include "common.hpp"
 // kernel data types
 using InKernelDataType       = BF16;
 using WeiKernelDataType      = BF16;
 using AccDataType            = FP32;
 using CShuffleDataType       = FP32;
 using BiasKernelDataType     = BF16;
 using ResidualKernelDataType = BF16;
 using OutKernelDataType      = BF16;
 // tensor data types
 using InUserDataType  = InKernelDataType;
 using WeiUserDataType = WeiKernelDataType;
 using OutUserDataType = OutKernelDataType;
 using InElementOp  = PassThrough;
 using WeiElementOp = PassThrough;
 using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
 #include "run_grouped_conv_fwd_bias_relu_add_example.inc"
 int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); }
--- a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp
+++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp
@@ -0,0 +1,26 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include "common.hpp"
 // kernel data types
 using InKernelDataType       = FP16;
 using WeiKernelDataType      = FP16;
 using AccDataType            = FP32;
 using CShuffleDataType       = FP16;
 using BiasKernelDataType     = FP16;
 using ResidualKernelDataType = FP16;
 using OutKernelDataType      = FP16;
 // tensor data types
 using InUserDataType  = InKernelDataType;
 using WeiUserDataType = WeiKernelDataType;
 using OutUserDataType = OutKernelDataType;
 using InElementOp  = PassThrough;
 using WeiElementOp = PassThrough;
 using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
 #include "run_grouped_conv_fwd_bias_relu_add_example.inc"
 int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); }
--- a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp
+++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp
@@ -0,0 +1,26 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include "common.hpp"
 // kernel data types
 using InKernelDataType       = FP32;
 using WeiKernelDataType      = FP32;
 using AccDataType            = FP32;
 using CShuffleDataType       = FP32;
 using BiasKernelDataType     = FP32;
 using ResidualKernelDataType = FP32;
 using OutKernelDataType      = FP32;
 // tensor data types
 using InUserDataType  = InKernelDataType;
 using WeiUserDataType = WeiKernelDataType;
 using OutUserDataType = OutKernelDataType;
 using InElementOp  = PassThrough;
 using WeiElementOp = PassThrough;
 using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
 #include "run_grouped_conv_fwd_bias_relu_add_example.inc"
 int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); }
--- a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int4.cpp
+++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int4.cpp
@@ -0,0 +1,31 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
 #error Should compile this file with ck::int4_t support
 #endif
 #include "common.hpp"
 // kernel data types
 using InKernelDataType       = I8;
 using WeiKernelDataType      = I8;
 using AccDataType            = I32;
 using CShuffleDataType       = I8;
 using BiasKernelDataType     = I8;
 using ResidualKernelDataType = I8;
 using OutKernelDataType      = I8;
 // tensor data types
 using InUserDataType  = I4;
 using WeiUserDataType = I4;
 using OutUserDataType = I4;
 using InElementOp  = PassThrough;
 using WeiElementOp = PassThrough;
 using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
 #define BUILD_INT4_EXAMPLE
 #include "run_grouped_conv_fwd_bias_relu_add_example.inc"
 int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); }
--- a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int8.cpp
+++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int8.cpp
@@ -0,0 +1,26 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include "common.hpp"
 // kernel data types
 using InKernelDataType       = I8;
 using WeiKernelDataType      = I8;
 using AccDataType            = I32;
 using CShuffleDataType       = I8;
 using BiasKernelDataType     = I8;
 using ResidualKernelDataType = I8;
 using OutKernelDataType      = I8;
 // tensor data types
 using InUserDataType  = InKernelDataType;
 using WeiUserDataType = WeiKernelDataType;
 using OutUserDataType = OutKernelDataType;
 using InElementOp  = PassThrough;
 using WeiElementOp = PassThrough;
 using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
 #include "run_grouped_conv_fwd_bias_relu_add_example.inc"
 int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); }
--- a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_xdl_fp16.cpp
+++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_xdl_fp16.cpp
@@ -0,0 +1,24 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include "common.hpp"
 // kernel data types
 using InKernelDataType  = FP16;
 using WeiKernelDataType = FP16;
 using AccDataType       = FP32;
 using CShuffleDataType  = FP16;
 using OutKernelDataType = FP16;
 // tensor data types
 using InUserDataType  = InKernelDataType;
 using WeiUserDataType = WeiKernelDataType;
 using OutUserDataType = OutKernelDataType;
 using InElementOp  = PassThrough;
 using WeiElementOp = PassThrough;
 using OutElementOp = PassThrough;
 #include "run_grouped_conv_fwd_example.inc"
 int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_example(argc, argv); }
--- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_common.hpp
+++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_common.hpp
@@ -1,59 +1,110 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include <cstdlib>
+template <typename BiasLay, typename ResidualLay>
-#include <iostream>
+struct LayoutSetting
 #include <numeric>
 #include <type_traits>
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
 void print_helper_msg()
 {
-    std::cout << "arg1: verification (0=no, 1=yes)\n"
+    using BiasLayout     = BiasLay;
-              << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
+    using ResidualLayout = ResidualLay;
-              << "arg3: time kernel (0=no, 1=yes)\n"
+};
              << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
 }
-template <ck::index_t NDimSpatial,
+template <ck::index_t NDimSpatial>
-          typename InKernelDataType,
+struct LayoutSettingSelector;
-          typename WeiKernelDataType,
+
-          typename CShuffleDataType,
+template <>
-          typename OutKernelDataType,
+struct LayoutSettingSelector<1> final : LayoutSetting<ctl::G_K, ctl::G_NW_K>
          typename InElementOp,
          typename WeiElementOp,
          typename OutElementOp,
          typename InUserDataType,
          typename WeiUserDataType,
          typename OutUserDataType,
          typename DeviceConvNDFwdInstance>
 int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
                                       int init_method,
                                       bool time_kernel,
                                       const ck::utils::conv::ConvParam& conv_param,
                                       const HostTensorDescriptor& in_g_n_c_wis_desc,
                                       const HostTensorDescriptor& wei_g_k_c_xs_desc,
                                       const HostTensorDescriptor& bias_g_n_k_wos_desc,
                                       const HostTensorDescriptor& residual_g_n_k_wos_desc,
                                       const HostTensorDescriptor& out_g_n_k_wos_desc,
                                       const InElementOp& in_element_op,
                                       const WeiElementOp& wei_element_op,
                                       const OutElementOp& out_element_op)
 {
 };
 template <>
 struct LayoutSettingSelector<2> final : LayoutSetting<ctl::G_K, ctl::G_NHW_K>
 {
 };
 template <>
 struct LayoutSettingSelector<3> final : LayoutSetting<ctl::G_K, ctl::G_NDHW_K>
 {
 };
 template <ck::index_t NDimSpatial>
 using BiasLayout = typename LayoutSettingSelector<NDimSpatial>::BiasLayout;
 template <ck::index_t NDimSpatial>
 using ResidualLayout = typename LayoutSettingSelector<NDimSpatial>::ResidualLayout;
 template <ck::index_t NDimSpatial>
 using DeviceConvFwdInstance =
    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
        NDimSpatial,
        InputLayout<NDimSpatial>,
        WeightLayout<NDimSpatial>,
        ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>,
        OutputLayout<NDimSpatial>,
        InKernelDataType,
        WeiKernelDataType,
        AccDataType,
        CShuffleDataType,
        ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
        OutKernelDataType,
        InElementOp,
        WeiElementOp,
        OutElementOp,
        ConvSpec,    // ConvForwardSpecialization
        GemmSpec,    // GemmSpecialization
        1,           //
        256,         // BlockSize
        128,         // MPerBlock
        256,         // NPerBlock
        16,          // KPerBlock
        4,           // AK1
        4,           // BK1
        32,          // MPerXdl
        32,          // NPerXdl
        2,           // MXdlPerWave
        4,           // NXdlPerWave
        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
        2,           // ABlockTransferSrcVectorDim
        4,           // ABlockTransferSrcScalarPerVector
        4,           // ABlockTransferDstScalarPerVector_AK1
        1,           // ABlockLdsExtraM
        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
        2,           // BBlockTransferSrcVectorDim
        4,           // BBlockTransferSrcScalarPerVector
        4,           // BBlockTransferDstScalarPerVector_BK1
        1,           // BBlockLdsExtraN
        1,
        1,
        S<1, 16, 1, 16>,
        4>;
 template <ck::index_t NDimSpatial>
 using HostConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
                                                                         InUserDataType,
                                                                         WeiUserDataType,
                                                                         CShuffleDataType,
                                                                         InElementOp,
                                                                         WeiElementOp,
                                                                         PassThrough>;
 template <ck::index_t NDimSpatial>
 bool run_grouped_conv_fwd_bias_relu_add(const ExecutionConfig& config,
                                        const ck::utils::conv::ConvParam& conv_param)
 {
    static_assert(1 <= NDimSpatial && NDimSpatial <= 3, "Unsupported NDimSpatial");
    const auto in_g_n_c_wis_desc   = make_input_descriptor(conv_param);
    const auto wei_g_k_c_xs_desc   = make_weight_descriptor(conv_param);
    const auto bias_g_n_k_wos_desc = make_bias_descriptor(conv_param);
    const auto out_g_n_k_wos_desc  = make_output_descriptor(conv_param);
    Tensor<InUserDataType> in(in_g_n_c_wis_desc);
    Tensor<WeiUserDataType> wei(wei_g_k_c_xs_desc);
    Tensor<OutUserDataType> bias(bias_g_n_k_wos_desc);
-    Tensor<OutUserDataType> residual(residual_g_n_k_wos_desc);
+    Tensor<OutUserDataType> residual(bias_g_n_k_wos_desc);
    Tensor<OutUserDataType> out_host(out_g_n_k_wos_desc);
    Tensor<OutKernelDataType> out_device(out_g_n_k_wos_desc);
@@ -63,7 +114,7 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
    std::cout << "residual: " << residual.mDesc << std::endl;
    std::cout << "out: " << out_host.mDesc << std::endl;
-    switch(init_method)
+    switch(config.init_method)
    {
    case 0: break;
    case 1:
@@ -83,7 +134,7 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
    DeviceMem residual_device_buf(sizeof(OutKernelDataType) * residual.mDesc.GetElementSpaceSize());
    DeviceMem out_device_buf(sizeof(OutKernelDataType) * out_device.mDesc.GetElementSpaceSize());
-#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+#ifdef BUILD_INT4_EXAMPLE
    const Tensor<InKernelDataType> in_converted(in);
    const Tensor<WeiKernelDataType> wei_converted(wei);
    const Tensor<OutKernelDataType> bias_converted(bias);
@@ -93,12 +144,12 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
    wei_device_buf.ToDevice(wei_converted.mData.data());
    bias_device_buf.ToDevice(bias_converted.mData.data());
    residual_device_buf.ToDevice(residual_converted.mData.data());
-#else  // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+#else
    in_device_buf.ToDevice(in.mData.data());
    wei_device_buf.ToDevice(wei.mData.data());
    bias_device_buf.ToDevice(bias.mData.data());
    residual_device_buf.ToDevice(residual.mData.data());
-#endif //  CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+#endif
    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
@@ -123,8 +174,8 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
    copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
    copy(bias_g_n_k_wos_desc.GetLengths(), d0_g_n_k_wos_lengths);
    copy(bias_g_n_k_wos_desc.GetStrides(), d0_g_n_k_wos_strides);
-    copy(residual_g_n_k_wos_desc.GetLengths(), d1_g_n_k_wos_lengths);
+    copy(bias_g_n_k_wos_desc.GetLengths(), d1_g_n_k_wos_lengths);
-    copy(residual_g_n_k_wos_desc.GetStrides(), d1_g_n_k_wos_strides);
+    copy(bias_g_n_k_wos_desc.GetStrides(), d1_g_n_k_wos_strides);
    copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
    copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
    copy(conv_param.conv_filter_strides_, conv_filter_strides);
@@ -133,7 +184,7 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
    copy(conv_param.input_right_pads_, input_right_pads);
    // do Conv
-    auto conv    = DeviceConvNDFwdInstance{};
+    auto conv    = DeviceConvFwdInstance<NDimSpatial>{};
    auto invoker = conv.MakeInvoker();
    auto argument =
        conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
@@ -155,9 +206,9 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
                          conv_filter_dilations,
                          input_left_pads,
                          input_right_pads,
-                          in_element_op,
+                          InElementOp{},
-                          wei_element_op,
+                          WeiElementOp{},
-                          out_element_op);
+                          OutElementOp{});
    if(!conv.IsSupportedArgument(argument))
    {
@@ -166,7 +217,7 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
            "not support this Conv problem");
    }
-    float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+    float avg_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
    std::size_t flop      = conv_param.GetFlops();
    std::size_t num_btype = conv_param.GetByte<InUserDataType, WeiUserDataType, OutUserDataType>();
@@ -176,20 +227,11 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
              << conv.GetTypeString() << std::endl;
-    if(do_verification)
+    if(config.do_verification)
    {
        using PassThrough = ck::tensor_operation::element_wise::PassThrough;
        Tensor<CShuffleDataType> c_host(out_g_n_k_wos_desc);
-        auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
+        auto ref_conv     = HostConvFwdInstance<NDimSpatial>{};
                                                                     InUserDataType,
                                                                     WeiUserDataType,
                                                                     CShuffleDataType,
                                                                     InElementOp,
                                                                     WeiElementOp,
                                                                     PassThrough>();
        auto ref_invoker  = ref_conv.MakeInvoker();
        auto ref_argument = ref_conv.MakeArgument(in,
                                                  wei,
@@ -198,36 +240,49 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
                                                  conv_param.conv_filter_dilations_,
                                                  conv_param.input_left_pads_,
                                                  conv_param.input_right_pads_,
-                                                  in_element_op,
+                                                  InElementOp{},
-                                                  wei_element_op,
+                                                  WeiElementOp{},
                                                  PassThrough{});
        ref_invoker.Run(ref_argument);
        // TODO: implement elementwise operation for host
        out_host.ForEach([&](auto&, auto idx) {
-            out_element_op(out_host(idx), c_host(idx), bias(idx), residual(idx));
+            OutElementOp{}(out_host(idx), c_host(idx), bias(idx), residual(idx));
        });
        out_device_buf.FromDevice(out_device.mData.data());
-#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+#ifdef BUILD_INT4_EXAMPLE
        const Tensor<OutUserDataType> out_device_converted(out_device);
        return ck::utils::check_err(out_device_converted.mData,
                                    out_host.mData,
                                    "Error: incorrect results!",
                                    1e-5f,
                                    1e-4f)
                   ? 0
                   : 1;
 #else  // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
        return ck::utils::check_err(
-                   out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f)
+            out_device_converted.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
-                   ? 0
+#else
-                   : 1;
+        return ck::utils::check_err(
-#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+            out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
 #endif
    }
-    return 0;
+    return true;
 }
 bool run_grouped_conv_fwd_bias_relu_add_example(int argc, char* argv[])
 {
    ExecutionConfig config;
    ck::utils::conv::ConvParam conv_param = DefaultConvParam;
    if(!parse_cmd_args(argc, argv, config, conv_param))
    {
        return false;
    }
    switch(conv_param.num_dim_spatial_)
    {
    case 1: return run_grouped_conv_fwd_bias_relu_add<1>(config, conv_param);
    case 2: return run_grouped_conv_fwd_bias_relu_add<2>(config, conv_param);
    case 3: return run_grouped_conv_fwd_bias_relu_add<3>(config, conv_param);
    }
    return false;
 }
--- a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_example.inc
+++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_example.inc
@@ -0,0 +1,223 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 template <ck::index_t NDimSpatial>
 using DeviceConvFwdInstance =
    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
        NDimSpatial,
        InputLayout<NDimSpatial>,
        WeightLayout<NDimSpatial>,
        ck::Tuple<>,
        OutputLayout<NDimSpatial>,
        InKernelDataType,
        WeiKernelDataType,
        AccDataType,
        CShuffleDataType,
        ck::Tuple<>,
        OutKernelDataType,
        InElementOp,
        WeiElementOp,
        OutElementOp,
        ConvSpec,    // ConvForwardSpecialization
        GemmSpec,    // GemmSpecialization
        1,           //
        256,         // BlockSize
        128,         // MPerBlock
        256,         // NPerBlock
        16,          // KPerBlock
        4,           // AK1
        4,           // BK1
        32,          // MPerXdl
        32,          // NPerXdl
        2,           // MXdlPerWave
        4,           // NXdlPerWave
        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
        2,           // ABlockTransferSrcVectorDim
        4,           // ABlockTransferSrcScalarPerVector
        4,           // ABlockTransferDstScalarPerVector_AK1
        1,           // ABlockLdsExtraM
        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
        2,           // BBlockTransferSrcVectorDim
        4,           // BBlockTransferSrcScalarPerVector
        4,           // BBlockTransferDstScalarPerVector_BK1
        1,           // BBlockLdsExtraN
        1,
        1,
        S<1, 16, 1, 16>,
        4>;
 template <ck::index_t NDimSpatial>
 using HostConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
                                                                         InUserDataType,
                                                                         WeiUserDataType,
                                                                         CShuffleDataType,
                                                                         InElementOp,
                                                                         WeiElementOp,
                                                                         PassThrough>;
 template <ck::index_t NDimSpatial>
 bool run_grouped_conv_fwd(const ExecutionConfig& config,
                          const ck::utils::conv::ConvParam& conv_param)
 {
    static_assert(1 <= NDimSpatial && NDimSpatial <= 3, "Unsupported NDimSpatial");
    const auto in_g_n_c_wis_desc  = make_input_descriptor(conv_param);
    const auto wei_g_k_c_xs_desc  = make_weight_descriptor(conv_param);
    const auto out_g_n_k_wos_desc = make_output_descriptor(conv_param);
    Tensor<InUserDataType> in(in_g_n_c_wis_desc);
    Tensor<WeiUserDataType> wei(wei_g_k_c_xs_desc);
    Tensor<OutUserDataType> out_host(out_g_n_k_wos_desc);
    Tensor<OutKernelDataType> out_device(out_g_n_k_wos_desc);
    std::cout << "in: " << in.mDesc << std::endl;
    std::cout << "wei: " << wei.mDesc << std::endl;
    std::cout << "out: " << out_host.mDesc << std::endl;
    switch(config.init_method)
    {
    case 0: break;
    case 1:
        in.GenerateTensorValue(GeneratorTensor_2<InUserDataType>{-5, 5});
        wei.GenerateTensorValue(GeneratorTensor_2<WeiUserDataType>{-5, 5});
        break;
    default:
        in.GenerateTensorValue(GeneratorTensor_3<InUserDataType>{0.0, 1.0});
        wei.GenerateTensorValue(GeneratorTensor_3<WeiUserDataType>{-0.5, 0.5});
    }
    DeviceMem in_device_buf(sizeof(InKernelDataType) * in.mDesc.GetElementSpaceSize());
    DeviceMem wei_device_buf(sizeof(WeiKernelDataType) * wei.mDesc.GetElementSpaceSize());
    DeviceMem out_device_buf(sizeof(OutKernelDataType) * out_device.mDesc.GetElementSpaceSize());
 #ifdef BUILD_INT4_EXAMPLE
    const Tensor<InKernelDataType> in_converted(in);
    const Tensor<WeiKernelDataType> wei_converted(wei);
    in_device_buf.ToDevice(in_converted.mData.data());
    wei_device_buf.ToDevice(wei_converted.mData.data());
 #else
    in_device_buf.ToDevice(in.mData.data());
    wei_device_buf.ToDevice(wei.mData.data());
 #endif
    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
    std::array<ck::index_t, NDimSpatial> input_left_pads{};
    std::array<ck::index_t, NDimSpatial> input_right_pads{};
    auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
    copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
    copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
    copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
    copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
    copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
    copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
    copy(conv_param.conv_filter_strides_, conv_filter_strides);
    copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
    copy(conv_param.input_left_pads_, input_left_pads);
    copy(conv_param.input_right_pads_, input_right_pads);
    // do Conv
    auto conv     = DeviceConvFwdInstance<NDimSpatial>{};
    auto invoker  = conv.MakeInvoker();
    auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
                                      wei_device_buf.GetDeviceBuffer(),
                                      std::array<const void*, 0>{},
                                      out_device_buf.GetDeviceBuffer(),
                                      a_g_n_c_wis_lengths,
                                      a_g_n_c_wis_strides,
                                      b_g_k_c_xs_lengths,
                                      b_g_k_c_xs_strides,
                                      std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{},
                                      std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{},
                                      e_g_n_k_wos_lengths,
                                      e_g_n_k_wos_strides,
                                      conv_filter_strides,
                                      conv_filter_dilations,
                                      input_left_pads,
                                      input_right_pads,
                                      InElementOp{},
                                      WeiElementOp{},
                                      OutElementOp{});
    if(!conv.IsSupportedArgument(argument))
    {
        throw std::runtime_error(
            "wrong! device_conv with the specified compilation parameters does "
            "not support this Conv problem");
    }
    float avg_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
    std::size_t flop      = conv_param.GetFlops();
    std::size_t num_btype = conv_param.GetByte<InUserDataType, WeiUserDataType, OutUserDataType>();
    float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
    float gb_per_sec = num_btype / 1.E6 / avg_time;
    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
              << conv.GetTypeString() << std::endl;
    if(config.do_verification)
    {
        auto ref_conv     = HostConvFwdInstance<NDimSpatial>{};
        auto ref_invoker  = ref_conv.MakeInvoker();
        auto ref_argument = ref_conv.MakeArgument(in,
                                                  wei,
                                                  out_host,
                                                  conv_param.conv_filter_strides_,
                                                  conv_param.conv_filter_dilations_,
                                                  conv_param.input_left_pads_,
                                                  conv_param.input_right_pads_,
                                                  InElementOp{},
                                                  WeiElementOp{},
                                                  OutElementOp{});
        ref_invoker.Run(ref_argument);
        out_device_buf.FromDevice(out_device.mData.data());
 #ifdef BUILD_INT4_EXAMPLE
        const Tensor<OutUserDataType> out_device_converted(out_device);
        return ck::utils::check_err(
            out_device_converted.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
 #else
        return ck::utils::check_err(
            out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
 #endif
    }
    return true;
 }
 bool run_grouped_conv_fwd_example(int argc, char* argv[])
 {
    ExecutionConfig config;
    ck::utils::conv::ConvParam conv_param = DefaultConvParam;
    if(!parse_cmd_args(argc, argv, config, conv_param))
    {
        return false;
    }
    switch(conv_param.num_dim_spatial_)
    {
    case 1: return run_grouped_conv_fwd<1>(config, conv_param);
    case 2: return run_grouped_conv_fwd<2>(config, conv_param);
    case 3: return run_grouped_conv_fwd<3>(config, conv_param);
    }
    return false;
 }
--- a/example/30_grouped_convnd_fwd_bias_relu_add/CMakeLists.txt
+++ b/example/30_grouped_convnd_fwd_bias_relu_add/CMakeLists.txt
@@ -1,11 +0,0 @@
 add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_fp16 grouped_convnd_fwd_bias_relu_add_xdl_fp16.cpp)
 add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_fp32 grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp)
 add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_bf16 grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp)
 add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_int8 grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp)
 if(USE_BITINT_EXTENSION_INT4)
  add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_int4 grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp)
 endif() # USE_BITINT_EXTENSION_INT4
--- a/example/30_grouped_convnd_fwd_bias_relu_add/README.md
+++ b/example/30_grouped_convnd_fwd_bias_relu_add/README.md
@@ -1,34 +0,0 @@
 ```bash
 #arg1: verification (0=no, 1=yes)
 #arg2: initialization (0=no init, 1=integer value, 2=decimal value)
 #arg3: time kernel (0=no, 1=yes)
 #Following arguments (depending on number of spatial dims):
 # N spatial dimensions
 # G, N, K, C,
 # <filter spatial dimensions>, (ie Y, X for 2D)
 # <input image spatial dimensions>, (ie Hi, Wi for 2D)
 # <strides>, (ie Sy, Sx for 2D)
 # <dilations>, (ie Dy, Dx for 2D)
 # <left padding>, (ie LeftPy, LeftPx for 2D)
 # <right padding>, (ie RightPy, RightPx for 2D)
 bin/example_grouped_convnd_fwd_bias_relu_add_xdl_fp16 1 1 1
 ```
 Result (MI100)
 ```
 in: dim 5, lengths {2, 128, 192, 71, 71}, strides {192, 1935744, 1, 27264, 384}
 wei: dim 5, lengths {2, 256, 192, 3, 3}, strides {442368, 1728, 1, 576, 192}
 bias: dim 5, lengths {2, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
 residual: dim 5, lengths {2, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
 out: dim 5, lengths {2, 128, 256, 36, 36}, strides {256, 663552, 1, 18432, 512}
 A[M, K]: {165888, 1728}
 B[N, K]: {256, 1728}
 Ds[M, N]: {165888, 256}
 Ds[M, N]: {165888, 256}
 E[M, N]: {165888, 256}
 launch_and_time_kernel: grid_dim {2592, 1, 1}, block_dim {256, 1, 1}
 Warm up 1 time
 Start running 10 times...
 Perf: 2.48075 ms, 118.325 TFlops, 268.946 GB/s, DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<256, 128, 256, 32, Default>
 ```
--- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp
+++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp
@@ -1,459 +0,0 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include "grouped_convnd_fwd_bias_relu_add_common.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
 // kernel data types
 using InKernelDataType       = ck::bhalf_t;
 using WeiKernelDataType      = ck::bhalf_t;
 using AccDataType            = float;
 using CShuffleDataType       = float;
 using BiasKernelDataType     = ck::bhalf_t;
 using ResidualKernelDataType = ck::bhalf_t;
 using OutKernelDataType      = ck::bhalf_t;
 // tensor data types
 using InUserDataType  = InKernelDataType;
 using WeiUserDataType = WeiKernelDataType;
 using OutUserDataType = OutKernelDataType;
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
 using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
 using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
 using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
 static constexpr auto ConvSpec =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
 static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
 template <ck::index_t NDimSpatial,
          typename InLayout,
          typename WeiLayout,
          typename BiasLayout,
          typename ResidualLayout,
          typename OutLayout>
 using DeviceGroupedConvNDFwdInstance =
    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
        NDimSpatial,
        InLayout,
        WeiLayout,
        ck::Tuple<BiasLayout, ResidualLayout>,
        OutLayout,
        InKernelDataType,
        WeiKernelDataType,
        AccDataType,
        CShuffleDataType,
        ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
        OutKernelDataType,
        InElementOp,
        WeiElementOp,
        OutElementOp,
        ConvSpec,    // ConvForwardSpecialization
        GemmSpec,    // GemmSpecialization
        1,           //
        256,         // BlockSize
        128,         // MPerBlock
        256,         // NPerBlock
        32,          // KPerBlock
        8,           // AK1
        8,           // BK1
        32,          // MPerXdl
        32,          // NPerXdl
        2,           // MXdlPerWave
        4,           // NXdlPerWave
        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
        2,           // ABlockTransferSrcVectorDim
        8,           // ABlockTransferSrcScalarPerVector
        8,           // ABlockTransferDstScalarPerVector_AK1
        1,           // ABlockLdsExtraM
        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
        2,           // BBlockTransferSrcVectorDim
        8,           // BBlockTransferSrcScalarPerVector
        8,           // BBlockTransferDstScalarPerVector_BK1
        1,           // BBlockLdsExtraN
        1,
        1,
        S<1, 32, 1, 8>,
        8>;
 int main(int argc, char* argv[])
 {
    namespace ctc = ck::tensor_layout::convolution;
    print_helper_msg();
    bool do_verification = true;
    int init_method      = 1;
    bool time_kernel     = false;
    // conventional group conv definition
    // G = 2
    // [N, C, Hi, Wi] =  [128, 384, 71, 71]
    // [K, C,  Y,  X] =  [512, 192,  3,  3]
    // [N, K, Ho, Wo] =  [128, 512, 36, 36]
    // CK group conv definition
    // [G, N, C, Hi, Wi] =  [2, 128, 192, 71, 71]
    // [G, K, C,  Y,  X] =  [2, 256, 192,  3,  3]
    // [G, N, K, Ho, Wo] =  [2, 128, 256, 36, 36]
    ck::utils::conv::ConvParam conv_param{
        2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
    if(argc == 1)
    {
        // use default
    }
    else if(argc == 4)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);
    }
    else
    {
        do_verification                   = std::stoi(argv[1]);
        init_method                       = std::stoi(argv[2]);
        time_kernel                       = std::stoi(argv[3]);
        const ck::index_t num_dim_spatial = std::stoi(argv[4]);
        conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
    }
    const auto in_element_op  = InElementOp{};
    const auto wei_element_op = WeiElementOp{};
    const auto out_element_op = OutElementOp{};
    if(conv_param.num_dim_spatial_ == 1)
    {
        using InLayout       = ctc::G_NW_C;
        using WeiLayout      = ctc::G_K_X_C;
        using BiasLayout     = ctc::G_K;
        using ResidualLayout = ctc::G_NW_K;
        using OutLayout      = ctc::G_NW_K;
        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
            {conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]},
            {
                conv_param.C_,                                                        // g
                conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n
                1,                                                                    // c
                conv_param.G_ * conv_param.C_                                         // wi
            });
        const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
            {conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
            {
                conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
                conv_param.filter_spatial_lengths_[0] * conv_param.C_,                 // k
                1,                                                                     // c
                conv_param.C_                                                          // x
            });
        const auto bias_g_n_k_wos_desc = HostTensorDescriptor(
            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
            {
                conv_param.K_, // g
                0,             // k
                1,             // c
                0              // x
            });
        const auto residual_g_n_k_wos_desc = HostTensorDescriptor(
            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
            {
                conv_param.K_, // g
                0,             // k
                1,             // c
                0              // x
            });
        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
            {
                conv_param.K_,                                                         // g
                conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
                1,                                                                     // k
                conv_param.G_ * conv_param.K_                                          // wo
            });
        return run_grouped_conv_fwd_bias_relu_add<1,
                                                  InKernelDataType,
                                                  WeiKernelDataType,
                                                  CShuffleDataType,
                                                  OutKernelDataType,
                                                  InElementOp,
                                                  WeiElementOp,
                                                  OutElementOp,
                                                  InUserDataType,
                                                  WeiUserDataType,
                                                  OutUserDataType,
                                                  DeviceGroupedConvNDFwdInstance<1,
                                                                                 InLayout,
                                                                                 WeiLayout,
                                                                                 BiasLayout,
                                                                                 ResidualLayout,
                                                                                 OutLayout>>(
            do_verification,
            init_method,
            time_kernel,
            conv_param,
            in_g_n_c_wis_desc,
            wei_g_k_c_xs_desc,
            bias_g_n_k_wos_desc,
            residual_g_n_k_wos_desc,
            out_g_n_k_wos_desc,
            in_element_op,
            wei_element_op,
            out_element_op);
    }
    else if(conv_param.num_dim_spatial_ == 2)
    {
        using InLayout       = ctc::G_NHW_C;
        using WeiLayout      = ctc::G_K_YX_C;
        using BiasLayout     = ctc::G_K;
        using ResidualLayout = ctc::G_NHW_K;
        using OutLayout      = ctc::G_NHW_K;
        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
            {conv_param.G_,
             conv_param.N_,
             conv_param.C_,
             conv_param.input_spatial_lengths_[0],
             conv_param.input_spatial_lengths_[1]},
            {
                conv_param.C_, // g
                conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
                    conv_param.G_ * conv_param.C_,                                    // n
                1,                                                                    // c
                conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi
                conv_param.G_ * conv_param.C_                                         // wi
            });
        const auto wei_g_k_c_xs_desc =
            HostTensorDescriptor({conv_param.G_,
                                  conv_param.K_,
                                  conv_param.C_,
                                  conv_param.filter_spatial_lengths_[0],
                                  conv_param.filter_spatial_lengths_[1]},
                                 {
                                     conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
                                         conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
                                     conv_param.filter_spatial_lengths_[0] *
                                         conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k
                                     1,                                                         // c
                                     conv_param.filter_spatial_lengths_[1] * conv_param.C_,     // y
                                     conv_param.C_                                              // x
                                 });
        const auto bias_g_n_k_wos_desc =
            HostTensorDescriptor({conv_param.G_,
                                  conv_param.N_,
                                  conv_param.K_,
                                  conv_param.output_spatial_lengths_[0],
                                  conv_param.output_spatial_lengths_[1]},
                                 {
                                     conv_param.K_, // g
                                     0,             // n
                                     1,             // k
                                     0,             // ho
                                     0              // wo
                                 });
        const auto residual_g_n_k_wos_desc =
            HostTensorDescriptor({conv_param.G_,
                                  conv_param.N_,
                                  conv_param.K_,
                                  conv_param.output_spatial_lengths_[0],
                                  conv_param.output_spatial_lengths_[1]},
                                 {
                                     conv_param.K_, // g
                                     0,             // n
                                     1,             // k
                                     0,             // ho
                                     0              // wo
                                 });
        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
            {conv_param.G_,
             conv_param.N_,
             conv_param.K_,
             conv_param.output_spatial_lengths_[0],
             conv_param.output_spatial_lengths_[1]},
            {
                conv_param.K_, // g
                conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
                    conv_param.G_ * conv_param.K_,                                     // n
                1,                                                                     // k
                conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
                conv_param.G_ * conv_param.K_                                          // wo
            });
        return run_grouped_conv_fwd_bias_relu_add<2,
                                                  InKernelDataType,
                                                  WeiKernelDataType,
                                                  CShuffleDataType,
                                                  OutKernelDataType,
                                                  InElementOp,
                                                  WeiElementOp,
                                                  OutElementOp,
                                                  InUserDataType,
                                                  WeiUserDataType,
                                                  OutUserDataType,
                                                  DeviceGroupedConvNDFwdInstance<2,
                                                                                 InLayout,
                                                                                 WeiLayout,
                                                                                 BiasLayout,
                                                                                 ResidualLayout,
                                                                                 OutLayout>>(
            do_verification,
            init_method,
            time_kernel,
            conv_param,
            in_g_n_c_wis_desc,
            wei_g_k_c_xs_desc,
            bias_g_n_k_wos_desc,
            residual_g_n_k_wos_desc,
            out_g_n_k_wos_desc,
            in_element_op,
            wei_element_op,
            out_element_op);
    }
    else if(conv_param.num_dim_spatial_ == 3)
    {
        using InLayout       = ctc::G_NDHW_C;
        using WeiLayout      = ctc::G_K_ZYX_C;
        using BiasLayout     = ctc::G_K;
        using ResidualLayout = ctc::G_NDHW_K;
        using OutLayout      = ctc::G_NDHW_K;
        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
            {conv_param.G_,
             conv_param.N_,
             conv_param.C_,
             conv_param.input_spatial_lengths_[0],
             conv_param.input_spatial_lengths_[1],
             conv_param.input_spatial_lengths_[2]},
            {
                conv_param.C_, // g
                conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
                    conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n
                1,                                                                        // c
                conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] *
                    conv_param.G_ * conv_param.C_,                                    // di
                conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
                conv_param.G_ * conv_param.C_                                         // wi
            });
        const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
            {conv_param.G_,
             conv_param.K_,
             conv_param.C_,
             conv_param.filter_spatial_lengths_[0],
             conv_param.filter_spatial_lengths_[1],
             conv_param.filter_spatial_lengths_[2]},
            {
                conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
                    conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
                    conv_param.C_, // g
                conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
                    conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k
                1,                                                         // c
                conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
                    conv_param.C_,                                     // z
                conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
                conv_param.C_                                          // x
            });
        const auto bias_g_n_k_wos_desc =
            HostTensorDescriptor({conv_param.G_,
                                  conv_param.N_,
                                  conv_param.K_,
                                  conv_param.output_spatial_lengths_[0],
                                  conv_param.output_spatial_lengths_[1],
                                  conv_param.output_spatial_lengths_[2]},
                                 {
                                     conv_param.K_, // g
                                     0,             // n
                                     1,             // k
                                     0,             // z
                                     0,             // y
                                     0              // x
                                 });
        const auto residual_g_n_k_wos_desc =
            HostTensorDescriptor({conv_param.G_,
                                  conv_param.N_,
                                  conv_param.K_,
                                  conv_param.output_spatial_lengths_[0],
                                  conv_param.output_spatial_lengths_[1],
                                  conv_param.output_spatial_lengths_[2]},
                                 {
                                     conv_param.K_, // g
                                     0,             // n
                                     1,             // k
                                     0,             // z
                                     0,             // y
                                     0              // x
                                 });
        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
            {conv_param.G_,
             conv_param.N_,
             conv_param.K_,
             conv_param.output_spatial_lengths_[0],
             conv_param.output_spatial_lengths_[1],
             conv_param.output_spatial_lengths_[2]},
            {
                conv_param.K_, // g
                conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
                    conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n
                1,                                                                         // k
                conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] *
                    conv_param.G_ * conv_param.K_,                                     // do
                conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
                conv_param.G_ * conv_param.K_                                          // wo
            });
        return run_grouped_conv_fwd_bias_relu_add<3,
                                                  InKernelDataType,
                                                  WeiKernelDataType,
                                                  CShuffleDataType,
                                                  OutKernelDataType,
                                                  InElementOp,
                                                  WeiElementOp,
                                                  OutElementOp,
                                                  InUserDataType,
                                                  WeiUserDataType,
                                                  OutUserDataType,
                                                  DeviceGroupedConvNDFwdInstance<3,
                                                                                 InLayout,
                                                                                 WeiLayout,
                                                                                 BiasLayout,
                                                                                 ResidualLayout,
                                                                                 OutLayout>>(
            do_verification,
            init_method,
            time_kernel,
            conv_param,
            in_g_n_c_wis_desc,
            wei_g_k_c_xs_desc,
            bias_g_n_k_wos_desc,
            residual_g_n_k_wos_desc,
            out_g_n_k_wos_desc,
            in_element_op,
            wei_element_op,
            out_element_op);
    }
    return 0;
 }
--- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp16.cpp
+++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp16.cpp
@@ -1,459 +0,0 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include "grouped_convnd_fwd_bias_relu_add_common.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
 // kernel data types
 using InKernelDataType       = ck::half_t;
 using WeiKernelDataType      = ck::half_t;
 using AccDataType            = float;
 using CShuffleDataType       = ck::half_t;
 using BiasKernelDataType     = ck::half_t;
 using ResidualKernelDataType = ck::half_t;
 using OutKernelDataType      = ck::half_t;
 // tensor data types
 using InUserDataType  = InKernelDataType;
 using WeiUserDataType = WeiKernelDataType;
 using OutUserDataType = OutKernelDataType;
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
 using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
 using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
 using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
 static constexpr auto ConvSpec =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
 static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
 template <ck::index_t NDimSpatial,
          typename InLayout,
          typename WeiLayout,
          typename BiasLayout,
          typename ResidualLayout,
          typename OutLayout>
 using DeviceGroupedConvNDFwdInstance =
    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
        NDimSpatial,
        InLayout,
        WeiLayout,
        ck::Tuple<BiasLayout, ResidualLayout>,
        OutLayout,
        InKernelDataType,
        WeiKernelDataType,
        AccDataType,
        CShuffleDataType,
        ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
        OutKernelDataType,
        InElementOp,
        WeiElementOp,
        OutElementOp,
        ConvSpec,    // ConvForwardSpecialization
        GemmSpec,    // GemmSpecialization
        1,           //
        256,         // BlockSize
        128,         // MPerBlock
        256,         // NPerBlock
        32,          // KPerBlock
        8,           // AK1
        8,           // BK1
        32,          // MPerXdl
        32,          // NPerXdl
        2,           // MXdlPerWave
        4,           // NXdlPerWave
        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
        2,           // ABlockTransferSrcVectorDim
        8,           // ABlockTransferSrcScalarPerVector
        8,           // ABlockTransferDstScalarPerVector_AK1
        1,           // ABlockLdsExtraM
        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
        2,           // BBlockTransferSrcVectorDim
        8,           // BBlockTransferSrcScalarPerVector
        8,           // BBlockTransferDstScalarPerVector_BK1
        1,           // BBlockLdsExtraN
        1,
        1,
        S<1, 32, 1, 8>,
        8>;
 int main(int argc, char* argv[])
 {
    namespace ctc = ck::tensor_layout::convolution;
    print_helper_msg();
    bool do_verification = true;
    int init_method      = 1;
    bool time_kernel     = false;
    // conventional group conv definition
    // G = 2
    // [N, C, Hi, Wi] =  [128, 384, 71, 71]
    // [K, C,  Y,  X] =  [512, 192,  3,  3]
    // [N, K, Ho, Wo] =  [128, 512, 36, 36]
    // CK group conv definition
    // [G, N, C, Hi, Wi] =  [2, 128, 192, 71, 71]
    // [G, K, C,  Y,  X] =  [2, 256, 192,  3,  3]
    // [G, N, K, Ho, Wo] =  [2, 128, 256, 36, 36]
    ck::utils::conv::ConvParam conv_param{
        2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
    if(argc == 1)
    {
        // use default
    }
    else if(argc == 4)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);
    }
    else
    {
        do_verification                   = std::stoi(argv[1]);
        init_method                       = std::stoi(argv[2]);
        time_kernel                       = std::stoi(argv[3]);
        const ck::index_t num_dim_spatial = std::stoi(argv[4]);
        conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
    }
    const auto in_element_op  = InElementOp{};
    const auto wei_element_op = WeiElementOp{};
    const auto out_element_op = OutElementOp{};
    if(conv_param.num_dim_spatial_ == 1)
    {
        using InLayout       = ctc::G_NW_C;
        using WeiLayout      = ctc::G_K_X_C;
        using BiasLayout     = ctc::G_K;
        using ResidualLayout = ctc::G_NW_K;
        using OutLayout      = ctc::G_NW_K;
        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
            {conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]},
            {
                conv_param.C_,                                                        // g
                conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n
                1,                                                                    // c
                conv_param.G_ * conv_param.C_                                         // wi
            });
        const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
            {conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
            {
                conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
                conv_param.filter_spatial_lengths_[0] * conv_param.C_,                 // k
                1,                                                                     // c
                conv_param.C_                                                          // x
            });
        const auto bias_g_n_k_wos_desc = HostTensorDescriptor(
            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
            {
                conv_param.K_, // g
                0,             // n
                1,             // k
                0              // wo
            });
        const auto residual_g_n_k_wos_desc = HostTensorDescriptor(
            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
            {
                conv_param.K_, // g
                0,             // k
                1,             // c
                0              // x
            });
        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
            {
                conv_param.K_,                                                         // g
                conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
                1,                                                                     // k
                conv_param.G_ * conv_param.K_                                          // wo
            });
        return run_grouped_conv_fwd_bias_relu_add<1,
                                                  InKernelDataType,
                                                  WeiKernelDataType,
                                                  CShuffleDataType,
                                                  OutKernelDataType,
                                                  InElementOp,
                                                  WeiElementOp,
                                                  OutElementOp,
                                                  InUserDataType,
                                                  WeiUserDataType,
                                                  OutUserDataType,
                                                  DeviceGroupedConvNDFwdInstance<1,
                                                                                 InLayout,
                                                                                 WeiLayout,
                                                                                 BiasLayout,
                                                                                 ResidualLayout,
                                                                                 OutLayout>>(
            do_verification,
            init_method,
            time_kernel,
            conv_param,
            in_g_n_c_wis_desc,
            wei_g_k_c_xs_desc,
            bias_g_n_k_wos_desc,
            residual_g_n_k_wos_desc,
            out_g_n_k_wos_desc,
            in_element_op,
            wei_element_op,
            out_element_op);
    }
    else if(conv_param.num_dim_spatial_ == 2)
    {
        using InLayout       = ctc::G_NHW_C;
        using WeiLayout      = ctc::G_K_YX_C;
        using BiasLayout     = ctc::G_K;
        using ResidualLayout = ctc::G_NHW_K;
        using OutLayout      = ctc::G_NHW_K;
        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
            {conv_param.G_,
             conv_param.N_,
             conv_param.C_,
             conv_param.input_spatial_lengths_[0],
             conv_param.input_spatial_lengths_[1]},
            {
                conv_param.C_, // g
                conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
                    conv_param.G_ * conv_param.C_,                                    // n
                1,                                                                    // c
                conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi
                conv_param.G_ * conv_param.C_                                         // wi
            });
        const auto wei_g_k_c_xs_desc =
            HostTensorDescriptor({conv_param.G_,
                                  conv_param.K_,
                                  conv_param.C_,
                                  conv_param.filter_spatial_lengths_[0],
                                  conv_param.filter_spatial_lengths_[1]},
                                 {
                                     conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
                                         conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
                                     conv_param.filter_spatial_lengths_[0] *
                                         conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k
                                     1,                                                         // c
                                     conv_param.filter_spatial_lengths_[1] * conv_param.C_,     // y
                                     conv_param.C_                                              // x
                                 });
        const auto bias_g_n_k_wos_desc =
            HostTensorDescriptor({conv_param.G_,
                                  conv_param.N_,
                                  conv_param.K_,
                                  conv_param.output_spatial_lengths_[0],
                                  conv_param.output_spatial_lengths_[1]},
                                 {
                                     conv_param.K_, // g
                                     0,             // n
                                     1,             // k
                                     0,             // ho
                                     0              // wo
                                 });
        const auto residual_g_n_k_wos_desc =
            HostTensorDescriptor({conv_param.G_,
                                  conv_param.N_,
                                  conv_param.K_,
                                  conv_param.output_spatial_lengths_[0],
                                  conv_param.output_spatial_lengths_[1]},
                                 {
                                     conv_param.K_, // g
                                     0,             // n
                                     1,             // k
                                     0,             // ho
                                     0              // wo
                                 });
        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
            {conv_param.G_,
             conv_param.N_,
             conv_param.K_,
             conv_param.output_spatial_lengths_[0],
             conv_param.output_spatial_lengths_[1]},
            {
                conv_param.K_, // g
                conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
                    conv_param.G_ * conv_param.K_,                                     // n
                1,                                                                     // k
                conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
                conv_param.G_ * conv_param.K_                                          // wo
            });
        return run_grouped_conv_fwd_bias_relu_add<2,
                                                  InKernelDataType,
                                                  WeiKernelDataType,
                                                  CShuffleDataType,
                                                  OutKernelDataType,
                                                  InElementOp,
                                                  WeiElementOp,
                                                  OutElementOp,
                                                  InUserDataType,
                                                  WeiUserDataType,
                                                  OutUserDataType,
                                                  DeviceGroupedConvNDFwdInstance<2,
                                                                                 InLayout,
                                                                                 WeiLayout,
                                                                                 BiasLayout,
                                                                                 ResidualLayout,
                                                                                 OutLayout>>(
            do_verification,
            init_method,
            time_kernel,
            conv_param,
            in_g_n_c_wis_desc,
            wei_g_k_c_xs_desc,
            bias_g_n_k_wos_desc,
            residual_g_n_k_wos_desc,
            out_g_n_k_wos_desc,
            in_element_op,
            wei_element_op,
            out_element_op);
    }
    else if(conv_param.num_dim_spatial_ == 3)
    {
        using InLayout       = ctc::G_NDHW_C;
        using WeiLayout      = ctc::G_K_ZYX_C;
        using BiasLayout     = ctc::G_K;
        using ResidualLayout = ctc::G_NDHW_K;
        using OutLayout      = ctc::G_NDHW_K;
        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
            {conv_param.G_,
             conv_param.N_,
             conv_param.C_,
             conv_param.input_spatial_lengths_[0],
             conv_param.input_spatial_lengths_[1],
             conv_param.input_spatial_lengths_[2]},
            {
                conv_param.C_, // g
                conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
                    conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n
                1,                                                                        // c
                conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] *
                    conv_param.G_ * conv_param.C_,                                    // di
                conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
                conv_param.G_ * conv_param.C_                                         // wi
            });
        const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
            {conv_param.G_,
             conv_param.K_,
             conv_param.C_,
             conv_param.filter_spatial_lengths_[0],
             conv_param.filter_spatial_lengths_[1],
             conv_param.filter_spatial_lengths_[2]},
            {
                conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
                    conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
                    conv_param.C_, // g
                conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
                    conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k
                1,                                                         // c
                conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
                    conv_param.C_,                                     // z
                conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
                conv_param.C_                                          // x
            });
        const auto bias_g_n_k_wos_desc =
            HostTensorDescriptor({conv_param.G_,
                                  conv_param.N_,
                                  conv_param.K_,
                                  conv_param.output_spatial_lengths_[0],
                                  conv_param.output_spatial_lengths_[1],
                                  conv_param.output_spatial_lengths_[2]},
                                 {
                                     conv_param.K_, // g
                                     0,             // n
                                     1,             // k
                                     0,             // z
                                     0,             // y
                                     0              // x
                                 });
        const auto residual_g_n_k_wos_desc =
            HostTensorDescriptor({conv_param.G_,
                                  conv_param.N_,
                                  conv_param.K_,
                                  conv_param.output_spatial_lengths_[0],
                                  conv_param.output_spatial_lengths_[1],
                                  conv_param.output_spatial_lengths_[2]},
                                 {
                                     conv_param.K_, // g
                                     0,             // n
                                     1,             // k
                                     0,             // z
                                     0,             // y
                                     0              // x
                                 });
        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
            {conv_param.G_,
             conv_param.N_,
             conv_param.K_,
             conv_param.output_spatial_lengths_[0],
             conv_param.output_spatial_lengths_[1],
             conv_param.output_spatial_lengths_[2]},
            {
                conv_param.K_, // g
                conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
                    conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n
                1,                                                                         // k
                conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] *
                    conv_param.G_ * conv_param.K_,                                     // do
                conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
                conv_param.G_ * conv_param.K_                                          // wo
            });
        return run_grouped_conv_fwd_bias_relu_add<3,
                                                  InKernelDataType,
                                                  WeiKernelDataType,
                                                  CShuffleDataType,
                                                  OutKernelDataType,
                                                  InElementOp,
                                                  WeiElementOp,
                                                  OutElementOp,
                                                  InUserDataType,
                                                  WeiUserDataType,
                                                  OutUserDataType,
                                                  DeviceGroupedConvNDFwdInstance<3,
                                                                                 InLayout,
                                                                                 WeiLayout,
                                                                                 BiasLayout,
                                                                                 ResidualLayout,
                                                                                 OutLayout>>(
            do_verification,
            init_method,
            time_kernel,
            conv_param,
            in_g_n_c_wis_desc,
            wei_g_k_c_xs_desc,
            bias_g_n_k_wos_desc,
            residual_g_n_k_wos_desc,
            out_g_n_k_wos_desc,
            in_element_op,
            wei_element_op,
            out_element_op);
    }
    return 0;
 }
--- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp
+++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp
@@ -1,459 +0,0 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include "grouped_convnd_fwd_bias_relu_add_common.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
 // kernel data types
 using InKernelDataType       = float;
 using WeiKernelDataType      = float;
 using AccDataType            = float;
 using CShuffleDataType       = float;
 using BiasKernelDataType     = float;
 using ResidualKernelDataType = float;
 using OutKernelDataType      = float;
 // tensor data types
 using InUserDataType  = InKernelDataType;
 using WeiUserDataType = WeiKernelDataType;
 using OutUserDataType = OutKernelDataType;
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
 using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
 using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
 using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
 static constexpr auto ConvSpec =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
 static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
 template <ck::index_t NDimSpatial,
          typename InLayout,
          typename WeiLayout,
          typename BiasLayout,
          typename ResidualLayout,
          typename OutLayout>
 using DeviceGroupedConvNDFwdInstance =
    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
        NDimSpatial,
        InLayout,
        WeiLayout,
        ck::Tuple<BiasLayout, ResidualLayout>,
        OutLayout,
        InKernelDataType,
        WeiKernelDataType,
        AccDataType,
        CShuffleDataType,
        ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
        OutKernelDataType,
        InElementOp,
        WeiElementOp,
        OutElementOp,
        ConvSpec,    // ConvForwardSpecialization
        GemmSpec,    // GemmSpecialization
        1,           //
        256,         // BlockSize
        128,         // MPerBlock
        256,         // NPerBlock
        16,          // KPerBlock
        4,           // AK1
        4,           // BK1
        32,          // MPerXdl
        32,          // NPerXdl
        2,           // MXdlPerWave
        4,           // NXdlPerWave
        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
        2,           // ABlockTransferSrcVectorDim
        4,           // ABlockTransferSrcScalarPerVector
        4,           // ABlockTransferDstScalarPerVector_AK1
        1,           // ABlockLdsExtraM
        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
        2,           // BBlockTransferSrcVectorDim
        4,           // BBlockTransferSrcScalarPerVector
        4,           // BBlockTransferDstScalarPerVector_BK1
        1,           // BBlockLdsExtraN
        1,
        1,
        S<1, 16, 1, 16>,
        4>;
 int main(int argc, char* argv[])
 {
    namespace ctc = ck::tensor_layout::convolution;
    print_helper_msg();
    bool do_verification = true;
    int init_method      = 1;
    bool time_kernel     = false;
    // conventional group conv definition
    // G = 2
    // [N, C, Hi, Wi] =  [128, 384, 71, 71]
    // [K, C,  Y,  X] =  [512, 192,  3,  3]
    // [N, K, Ho, Wo] =  [128, 512, 36, 36]
    // CK group conv definition
    // [G, N, C, Hi, Wi] =  [2, 128, 192, 71, 71]
    // [G, K, C,  Y,  X] =  [2, 256, 192,  3,  3]
    // [G, N, K, Ho, Wo] =  [2, 128, 256, 36, 36]
    ck::utils::conv::ConvParam conv_param{
        2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
    if(argc == 1)
    {
        // use default
    }
    else if(argc == 4)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);
    }
    else
    {
        do_verification                   = std::stoi(argv[1]);
        init_method                       = std::stoi(argv[2]);
        time_kernel                       = std::stoi(argv[3]);
        const ck::index_t num_dim_spatial = std::stoi(argv[4]);
        conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
    }
    const auto in_element_op  = InElementOp{};
    const auto wei_element_op = WeiElementOp{};
    const auto out_element_op = OutElementOp{};
    if(conv_param.num_dim_spatial_ == 1)
    {
        using InLayout       = ctc::G_NW_C;
        using WeiLayout      = ctc::G_K_X_C;
        using BiasLayout     = ctc::G_K;
        using ResidualLayout = ctc::G_NW_K;
        using OutLayout      = ctc::G_NW_K;
        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
            {conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]},
            {
                conv_param.C_,                                                        // g
                conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n
                1,                                                                    // c
                conv_param.G_ * conv_param.C_                                         // wi
            });
        const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
            {conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
            {
                conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
                conv_param.filter_spatial_lengths_[0] * conv_param.C_,                 // k
                1,                                                                     // c
                conv_param.C_                                                          // x
            });
        const auto bias_g_n_k_wos_desc = HostTensorDescriptor(
            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
            {
                conv_param.K_, // g
                0,             // k
                1,             // c
                0              // x
            });
        const auto residual_g_n_k_wos_desc = HostTensorDescriptor(
            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
            {
                conv_param.K_, // g
                0,             // k
                1,             // c
                0              // x
            });
        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
            {
                conv_param.K_,                                                         // g
                conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
                1,                                                                     // k
                conv_param.G_ * conv_param.K_                                          // wo
            });
        return run_grouped_conv_fwd_bias_relu_add<1,
                                                  InKernelDataType,
                                                  WeiKernelDataType,
                                                  CShuffleDataType,
                                                  OutKernelDataType,
                                                  InElementOp,
                                                  WeiElementOp,
                                                  OutElementOp,
                                                  InUserDataType,
                                                  WeiUserDataType,
                                                  OutUserDataType,
                                                  DeviceGroupedConvNDFwdInstance<1,
                                                                                 InLayout,
                                                                                 WeiLayout,
                                                                                 BiasLayout,
                                                                                 ResidualLayout,
                                                                                 OutLayout>>(
            do_verification,
            init_method,
            time_kernel,
            conv_param,
            in_g_n_c_wis_desc,
            wei_g_k_c_xs_desc,
            bias_g_n_k_wos_desc,
            residual_g_n_k_wos_desc,
            out_g_n_k_wos_desc,
            in_element_op,
            wei_element_op,
            out_element_op);
    }
    else if(conv_param.num_dim_spatial_ == 2)
    {
        using InLayout       = ctc::G_NHW_C;
        using WeiLayout      = ctc::G_K_YX_C;
        using BiasLayout     = ctc::G_K;
        using ResidualLayout = ctc::G_NHW_K;
        using OutLayout      = ctc::G_NHW_K;
        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
            {conv_param.G_,
             conv_param.N_,
             conv_param.C_,
             conv_param.input_spatial_lengths_[0],
             conv_param.input_spatial_lengths_[1]},
            {
                conv_param.C_, // g
                conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
                    conv_param.G_ * conv_param.C_,                                    // n
                1,                                                                    // c
                conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi
                conv_param.G_ * conv_param.C_                                         // wi
            });
        const auto wei_g_k_c_xs_desc =
            HostTensorDescriptor({conv_param.G_,
                                  conv_param.K_,
                                  conv_param.C_,
                                  conv_param.filter_spatial_lengths_[0],
                                  conv_param.filter_spatial_lengths_[1]},
                                 {
                                     conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
                                         conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
                                     conv_param.filter_spatial_lengths_[0] *
                                         conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k
                                     1,                                                         // c
                                     conv_param.filter_spatial_lengths_[1] * conv_param.C_,     // y
                                     conv_param.C_                                              // x
                                 });
        const auto bias_g_n_k_wos_desc =
            HostTensorDescriptor({conv_param.G_,
                                  conv_param.N_,
                                  conv_param.K_,
                                  conv_param.output_spatial_lengths_[0],
                                  conv_param.output_spatial_lengths_[1]},
                                 {
                                     conv_param.K_, // g
                                     0,             // n
                                     1,             // k
                                     0,             // ho
                                     0              // wo
                                 });
        const auto residual_g_n_k_wos_desc =
            HostTensorDescriptor({conv_param.G_,
                                  conv_param.N_,
                                  conv_param.K_,
                                  conv_param.output_spatial_lengths_[0],
                                  conv_param.output_spatial_lengths_[1]},
                                 {
                                     conv_param.K_, // g
                                     0,             // n
                                     1,             // k
                                     0,             // ho
                                     0              // wo
                                 });
        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
            {conv_param.G_,
             conv_param.N_,
             conv_param.K_,
             conv_param.output_spatial_lengths_[0],
             conv_param.output_spatial_lengths_[1]},
            {
                conv_param.K_, // g
                conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
                    conv_param.G_ * conv_param.K_,                                     // n
                1,                                                                     // k
                conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
                conv_param.G_ * conv_param.K_                                          // wo
            });
        return run_grouped_conv_fwd_bias_relu_add<2,
                                                  InKernelDataType,
                                                  WeiKernelDataType,
                                                  CShuffleDataType,
                                                  OutKernelDataType,
                                                  InElementOp,
                                                  WeiElementOp,
                                                  OutElementOp,
                                                  InUserDataType,
                                                  WeiUserDataType,
                                                  OutUserDataType,
                                                  DeviceGroupedConvNDFwdInstance<2,
                                                                                 InLayout,
                                                                                 WeiLayout,
                                                                                 BiasLayout,
                                                                                 ResidualLayout,
                                                                                 OutLayout>>(
            do_verification,
            init_method,
            time_kernel,
            conv_param,
            in_g_n_c_wis_desc,
            wei_g_k_c_xs_desc,
            bias_g_n_k_wos_desc,
            residual_g_n_k_wos_desc,
            out_g_n_k_wos_desc,
            in_element_op,
            wei_element_op,
            out_element_op);
    }
    else if(conv_param.num_dim_spatial_ == 3)
    {
        using InLayout       = ctc::G_NDHW_C;
        using WeiLayout      = ctc::G_K_ZYX_C;
        using BiasLayout     = ctc::G_K;
        using ResidualLayout = ctc::G_NDHW_K;
        using OutLayout      = ctc::G_NDHW_K;
        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
            {conv_param.G_,
             conv_param.N_,
             conv_param.C_,
             conv_param.input_spatial_lengths_[0],
             conv_param.input_spatial_lengths_[1],
             conv_param.input_spatial_lengths_[2]},
            {
                conv_param.C_, // g
                conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
                    conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n
                1,                                                                        // c
                conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] *
                    conv_param.G_ * conv_param.C_,                                    // di
                conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
                conv_param.G_ * conv_param.C_                                         // wi
            });
        const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
            {conv_param.G_,
             conv_param.K_,
             conv_param.C_,
             conv_param.filter_spatial_lengths_[0],
             conv_param.filter_spatial_lengths_[1],
             conv_param.filter_spatial_lengths_[2]},
            {
                conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
                    conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
                    conv_param.C_, // g
                conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
                    conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k
                1,                                                         // c
                conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
                    conv_param.C_,                                     // z
                conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
                conv_param.C_                                          // x
            });
        const auto bias_g_n_k_wos_desc =
            HostTensorDescriptor({conv_param.G_,
                                  conv_param.N_,
                                  conv_param.K_,
                                  conv_param.output_spatial_lengths_[0],
                                  conv_param.output_spatial_lengths_[1],
                                  conv_param.output_spatial_lengths_[2]},
                                 {
                                     conv_param.K_, // g
                                     0,             // n
                                     1,             // k
                                     0,             // z
                                     0,             // y
                                     0              // x
                                 });
        const auto residual_g_n_k_wos_desc =
            HostTensorDescriptor({conv_param.G_,
                                  conv_param.N_,
                                  conv_param.K_,
                                  conv_param.output_spatial_lengths_[0],
                                  conv_param.output_spatial_lengths_[1],
                                  conv_param.output_spatial_lengths_[2]},
                                 {
                                     conv_param.K_, // g
                                     0,             // n
                                     1,             // k
                                     0,             // z
                                     0,             // y
                                     0              // x
                                 });
        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
            {conv_param.G_,
             conv_param.N_,
             conv_param.K_,
             conv_param.output_spatial_lengths_[0],
             conv_param.output_spatial_lengths_[1],
             conv_param.output_spatial_lengths_[2]},
            {
                conv_param.K_, // g
                conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
                    conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n
                1,                                                                         // k
                conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] *
                    conv_param.G_ * conv_param.K_,                                     // do
                conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
                conv_param.G_ * conv_param.K_                                          // wo
            });
        return run_grouped_conv_fwd_bias_relu_add<3,
                                                  InKernelDataType,
                                                  WeiKernelDataType,
                                                  CShuffleDataType,
                                                  OutKernelDataType,
                                                  InElementOp,
                                                  WeiElementOp,
                                                  OutElementOp,
                                                  InUserDataType,
                                                  WeiUserDataType,
                                                  OutUserDataType,
                                                  DeviceGroupedConvNDFwdInstance<3,
                                                                                 InLayout,
                                                                                 WeiLayout,
                                                                                 BiasLayout,
                                                                                 ResidualLayout,
                                                                                 OutLayout>>(
            do_verification,
            init_method,
            time_kernel,
            conv_param,
            in_g_n_c_wis_desc,
            wei_g_k_c_xs_desc,
            bias_g_n_k_wos_desc,
            residual_g_n_k_wos_desc,
            out_g_n_k_wos_desc,
            in_element_op,
            wei_element_op,
            out_element_op);
    }
    return 0;
 }
--- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp
+++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp
@@ -1,459 +0,0 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include "grouped_convnd_fwd_bias_relu_add_common.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
 // kernel data types
 using InKernelDataType       = int8_t;
 using WeiKernelDataType      = int8_t;
 using AccDataType            = int32_t;
 using CShuffleDataType       = int8_t;
 using BiasKernelDataType     = int8_t;
 using ResidualKernelDataType = int8_t;
 using OutKernelDataType      = int8_t;
 // tensor data types
 using InUserDataType  = ck::int4_t;
 using WeiUserDataType = ck::int4_t;
 using OutUserDataType = ck::int4_t;
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
 using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
 using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
 using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
 static constexpr auto ConvSpec =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
 static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
 template <ck::index_t NDimSpatial,
          typename InLayout,
          typename WeiLayout,
          typename BiasLayout,
          typename ResidualLayout,
          typename OutLayout>
 using DeviceGroupedConvNDFwdInstance =
    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
        NDimSpatial,
        InLayout,
        WeiLayout,
        ck::Tuple<BiasLayout, ResidualLayout>,
        OutLayout,
        InKernelDataType,
        WeiKernelDataType,
        AccDataType,
        CShuffleDataType,
        ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
        OutKernelDataType,
        InElementOp,
        WeiElementOp,
        OutElementOp,
        ConvSpec,    // ConvForwardSpecialization
        GemmSpec,    // GemmSpecialization
        1,           //
        256,         // BlockSize
        128,         // MPerBlock
        256,         // NPerBlock
        64,          // KPerBlock
        16,          // AK1
        16,          // BK1
        32,          // MPerXdl
        32,          // NPerXdl
        2,           // MXdlPerWave
        4,           // NXdlPerWave
        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
        2,           // ABlockTransferSrcVectorDim
        16,          // ABlockTransferSrcScalarPerVector
        16,          // ABlockTransferDstScalarPerVector_AK1
        1,           // ABlockLdsExtraM
        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
        2,           // BBlockTransferSrcVectorDim
        16,          // BBlockTransferSrcScalarPerVector
        16,          // BBlockTransferDstScalarPerVector_BK1
        1,           // BBlockLdsExtraN
        1,
        1,
        S<1, 64, 1, 4>,
        16>;
 int main(int argc, char* argv[])
 {
    namespace ctc = ck::tensor_layout::convolution;
    print_helper_msg();
    bool do_verification = true;
    int init_method      = 1;
    bool time_kernel     = false;
    // conventional group conv definition
    // G = 2
    // [N, C, Hi, Wi] =  [128, 384, 71, 71]
    // [K, C,  Y,  X] =  [512, 192,  3,  3]
    // [N, K, Ho, Wo] =  [128, 512, 36, 36]
    // CK group conv definition
    // [G, N, C, Hi, Wi] =  [2, 128, 192, 71, 71]
    // [G, K, C,  Y,  X] =  [2, 256, 192,  3,  3]
    // [G, N, K, Ho, Wo] =  [2, 128, 256, 36, 36]
    ck::utils::conv::ConvParam conv_param{
        2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
    if(argc == 1)
    {
        // use default
    }
    else if(argc == 4)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);
    }
    else
    {
        do_verification                   = std::stoi(argv[1]);
        init_method                       = std::stoi(argv[2]);
        time_kernel                       = std::stoi(argv[3]);
        const ck::index_t num_dim_spatial = std::stoi(argv[4]);
        conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
    }
    const auto in_element_op  = InElementOp{};
    const auto wei_element_op = WeiElementOp{};
    const auto out_element_op = OutElementOp{};
    if(conv_param.num_dim_spatial_ == 1)
    {
        using InLayout       = ctc::G_NW_C;
        using WeiLayout      = ctc::G_K_X_C;
        using BiasLayout     = ctc::G_K;
        using ResidualLayout = ctc::G_NW_K;
        using OutLayout      = ctc::G_NW_K;
        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
            {conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]},
            {
                conv_param.C_,                                                        // g
                conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n
                1,                                                                    // c
                conv_param.G_ * conv_param.C_                                         // wi
            });
        const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
            {conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
            {
                conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
                conv_param.filter_spatial_lengths_[0] * conv_param.C_,                 // k
                1,                                                                     // c
                conv_param.C_                                                          // x
            });
        const auto bias_g_n_k_wos_desc = HostTensorDescriptor(
            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
            {
                conv_param.K_, // g
                0,             // k
                1,             // c
                0              // x
            });
        const auto residual_g_n_k_wos_desc = HostTensorDescriptor(
            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
            {
                conv_param.K_, // g
                0,             // k
                1,             // c
                0              // x
            });
        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
            {
                conv_param.K_,                                                         // g
                conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
                1,                                                                     // k
                conv_param.G_ * conv_param.K_                                          // wo
            });
        return run_grouped_conv_fwd_bias_relu_add<1,
                                                  InKernelDataType,
                                                  WeiKernelDataType,
                                                  CShuffleDataType,
                                                  OutKernelDataType,
                                                  InElementOp,
                                                  WeiElementOp,
                                                  OutElementOp,
                                                  InUserDataType,
                                                  WeiUserDataType,
                                                  OutUserDataType,
                                                  DeviceGroupedConvNDFwdInstance<1,
                                                                                 InLayout,
                                                                                 WeiLayout,
                                                                                 BiasLayout,
                                                                                 ResidualLayout,
                                                                                 OutLayout>>(
            do_verification,
            init_method,
            time_kernel,
            conv_param,
            in_g_n_c_wis_desc,
            wei_g_k_c_xs_desc,
            bias_g_n_k_wos_desc,
            residual_g_n_k_wos_desc,
            out_g_n_k_wos_desc,
            in_element_op,
            wei_element_op,
            out_element_op);
    }
    else if(conv_param.num_dim_spatial_ == 2)
    {
        using InLayout       = ctc::G_NHW_C;
        using WeiLayout      = ctc::G_K_YX_C;
        using BiasLayout     = ctc::G_K;
        using ResidualLayout = ctc::G_NHW_K;
        using OutLayout      = ctc::G_NHW_K;
        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
            {conv_param.G_,
             conv_param.N_,
             conv_param.C_,
             conv_param.input_spatial_lengths_[0],
             conv_param.input_spatial_lengths_[1]},
            {
                conv_param.C_, // g
                conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
                    conv_param.G_ * conv_param.C_,                                    // n
                1,                                                                    // c
                conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi
                conv_param.G_ * conv_param.C_                                         // wi
            });
        const auto wei_g_k_c_xs_desc =
            HostTensorDescriptor({conv_param.G_,
                                  conv_param.K_,
                                  conv_param.C_,
                                  conv_param.filter_spatial_lengths_[0],
                                  conv_param.filter_spatial_lengths_[1]},
                                 {
                                     conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
                                         conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
                                     conv_param.filter_spatial_lengths_[0] *
                                         conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k
                                     1,                                                         // c
                                     conv_param.filter_spatial_lengths_[1] * conv_param.C_,     // y
                                     conv_param.C_                                              // x
                                 });
        const auto bias_g_n_k_wos_desc =
            HostTensorDescriptor({conv_param.G_,
                                  conv_param.N_,
                                  conv_param.K_,
                                  conv_param.output_spatial_lengths_[0],
                                  conv_param.output_spatial_lengths_[1]},
                                 {
                                     conv_param.K_, // g
                                     0,             // n
                                     1,             // k
                                     0,             // ho
                                     0              // wo
                                 });
        const auto residual_g_n_k_wos_desc =
            HostTensorDescriptor({conv_param.G_,
                                  conv_param.N_,
                                  conv_param.K_,
                                  conv_param.output_spatial_lengths_[0],
                                  conv_param.output_spatial_lengths_[1]},
                                 {
                                     conv_param.K_, // g
                                     0,             // n
                                     1,             // k
                                     0,             // ho
                                     0              // wo
                                 });
        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
            {conv_param.G_,
             conv_param.N_,
             conv_param.K_,
             conv_param.output_spatial_lengths_[0],
             conv_param.output_spatial_lengths_[1]},
            {
                conv_param.K_, // g
                conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
                    conv_param.G_ * conv_param.K_,                                     // n
                1,                                                                     // k
                conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
                conv_param.G_ * conv_param.K_                                          // wo
            });
        return run_grouped_conv_fwd_bias_relu_add<2,
                                                  InKernelDataType,
                                                  WeiKernelDataType,
                                                  CShuffleDataType,
                                                  OutKernelDataType,
                                                  InElementOp,
                                                  WeiElementOp,
                                                  OutElementOp,
                                                  InUserDataType,
                                                  WeiUserDataType,
                                                  OutUserDataType,
                                                  DeviceGroupedConvNDFwdInstance<2,
                                                                                 InLayout,
                                                                                 WeiLayout,
                                                                                 BiasLayout,
                                                                                 ResidualLayout,
                                                                                 OutLayout>>(
            do_verification,
            init_method,
            time_kernel,
            conv_param,
            in_g_n_c_wis_desc,
            wei_g_k_c_xs_desc,
            bias_g_n_k_wos_desc,
            residual_g_n_k_wos_desc,
            out_g_n_k_wos_desc,
            in_element_op,
            wei_element_op,
            out_element_op);
    }
    else if(conv_param.num_dim_spatial_ == 3)
    {
        using InLayout       = ctc::G_NDHW_C;
        using WeiLayout      = ctc::G_K_ZYX_C;
        using BiasLayout     = ctc::G_K;
        using ResidualLayout = ctc::G_NDHW_K;
        using OutLayout      = ctc::G_NDHW_K;
        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
            {conv_param.G_,
             conv_param.N_,
             conv_param.C_,
             conv_param.input_spatial_lengths_[0],
             conv_param.input_spatial_lengths_[1],
             conv_param.input_spatial_lengths_[2]},
            {
                conv_param.C_, // g
                conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
                    conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n
                1,                                                                        // c
                conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] *
                    conv_param.G_ * conv_param.C_,                                    // di
                conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
                conv_param.G_ * conv_param.C_                                         // wi
            });
        const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
            {conv_param.G_,
             conv_param.K_,
             conv_param.C_,
             conv_param.filter_spatial_lengths_[0],
             conv_param.filter_spatial_lengths_[1],
             conv_param.filter_spatial_lengths_[2]},
            {
                conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
                    conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
                    conv_param.C_, // g
                conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
                    conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k
                1,                                                         // c
                conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
                    conv_param.C_,                                     // z
                conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
                conv_param.C_                                          // x
            });
        const auto bias_g_n_k_wos_desc =
            HostTensorDescriptor({conv_param.G_,
                                  conv_param.N_,
                                  conv_param.K_,
                                  conv_param.output_spatial_lengths_[0],
                                  conv_param.output_spatial_lengths_[1],
                                  conv_param.output_spatial_lengths_[2]},
                                 {
                                     conv_param.K_, // g
                                     0,             // n
                                     1,             // k
                                     0,             // z
                                     0,             // y
                                     0              // x
                                 });
        const auto residual_g_n_k_wos_desc =
            HostTensorDescriptor({conv_param.G_,
                                  conv_param.N_,
                                  conv_param.K_,
                                  conv_param.output_spatial_lengths_[0],
                                  conv_param.output_spatial_lengths_[1],
                                  conv_param.output_spatial_lengths_[2]},
                                 {
                                     conv_param.K_, // g
                                     0,             // n
                                     1,             // k
                                     0,             // z
                                     0,             // y
                                     0              // x
                                 });
        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
            {conv_param.G_,
             conv_param.N_,
             conv_param.K_,
             conv_param.output_spatial_lengths_[0],
             conv_param.output_spatial_lengths_[1],
             conv_param.output_spatial_lengths_[2]},
            {
                conv_param.K_, // g
                conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
                    conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n
                1,                                                                         // k
                conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] *
                    conv_param.G_ * conv_param.K_,                                     // do
                conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
                conv_param.G_ * conv_param.K_                                          // wo
            });
        return run_grouped_conv_fwd_bias_relu_add<3,
                                                  InKernelDataType,
                                                  WeiKernelDataType,
                                                  CShuffleDataType,
                                                  OutKernelDataType,
                                                  InElementOp,
                                                  WeiElementOp,
                                                  OutElementOp,
                                                  InUserDataType,
                                                  WeiUserDataType,
                                                  OutUserDataType,
                                                  DeviceGroupedConvNDFwdInstance<3,
                                                                                 InLayout,
                                                                                 WeiLayout,
                                                                                 BiasLayout,
                                                                                 ResidualLayout,
                                                                                 OutLayout>>(
            do_verification,
            init_method,
            time_kernel,
            conv_param,
            in_g_n_c_wis_desc,
            wei_g_k_c_xs_desc,
            bias_g_n_k_wos_desc,
            residual_g_n_k_wos_desc,
            out_g_n_k_wos_desc,
            in_element_op,
            wei_element_op,
            out_element_op);
    }
    return 0;
 }
--- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp
+++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp
@@ -1,459 +0,0 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 #include "grouped_convnd_fwd_bias_relu_add_common.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
 // kernel data types
 using InKernelDataType       = int8_t;
 using WeiKernelDataType      = int8_t;
 using AccDataType            = int32_t;
 using CShuffleDataType       = int8_t;
 using BiasKernelDataType     = int8_t;
 using ResidualKernelDataType = int8_t;
 using OutKernelDataType      = int8_t;
 // tensor data types
 using InUserDataType  = InKernelDataType;
 using WeiUserDataType = WeiKernelDataType;
 using OutUserDataType = OutKernelDataType;
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
 using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
 using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
 using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
 static constexpr auto ConvSpec =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
 static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
 template <ck::index_t NDimSpatial,
          typename InLayout,
          typename WeiLayout,
          typename BiasLayout,
          typename ResidualLayout,
          typename OutLayout>
 using DeviceGroupedConvNDFwdInstance =
    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
        NDimSpatial,
        InLayout,
        WeiLayout,
        ck::Tuple<BiasLayout, ResidualLayout>,
        OutLayout,
        InKernelDataType,
        WeiKernelDataType,
        AccDataType,
        CShuffleDataType,
        ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
        OutKernelDataType,
        InElementOp,
        WeiElementOp,
        OutElementOp,
        ConvSpec,    // ConvForwardSpecialization
        GemmSpec,    // GemmSpecialization
        1,           //
        256,         // BlockSize
        128,         // MPerBlock
        256,         // NPerBlock
        64,          // KPerBlock
        16,          // AK1
        16,          // BK1
        32,          // MPerXdl
        32,          // NPerXdl
        2,           // MXdlPerWave
        4,           // NXdlPerWave
        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
        2,           // ABlockTransferSrcVectorDim
        16,          // ABlockTransferSrcScalarPerVector
        16,          // ABlockTransferDstScalarPerVector_AK1
        1,           // ABlockLdsExtraM
        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
        2,           // BBlockTransferSrcVectorDim
        16,          // BBlockTransferSrcScalarPerVector
        16,          // BBlockTransferDstScalarPerVector_BK1
        1,           // BBlockLdsExtraN
        1,
        1,
        S<1, 64, 1, 4>,
        16>;
 int main(int argc, char* argv[])
 {
    namespace ctc = ck::tensor_layout::convolution;
    print_helper_msg();
    bool do_verification = true;
    int init_method      = 1;
    bool time_kernel     = false;
    // conventional group conv definition
    // G = 2
    // [N, C, Hi, Wi] =  [128, 384, 71, 71]
    // [K, C,  Y,  X] =  [512, 192,  3,  3]
    // [N, K, Ho, Wo] =  [128, 512, 36, 36]
    // CK group conv definition
    // [G, N, C, Hi, Wi] =  [2, 128, 192, 71, 71]
    // [G, K, C,  Y,  X] =  [2, 256, 192,  3,  3]
    // [G, N, K, Ho, Wo] =  [2, 128, 256, 36, 36]
    ck::utils::conv::ConvParam conv_param{
        2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
    if(argc == 1)
    {
        // use default
    }
    else if(argc == 4)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);
    }
    else
    {
        do_verification                   = std::stoi(argv[1]);
        init_method                       = std::stoi(argv[2]);
        time_kernel                       = std::stoi(argv[3]);
        const ck::index_t num_dim_spatial = std::stoi(argv[4]);
        conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
    }
    const auto in_element_op  = InElementOp{};
    const auto wei_element_op = WeiElementOp{};
    const auto out_element_op = OutElementOp{};
    if(conv_param.num_dim_spatial_ == 1)
    {
        using InLayout       = ctc::G_NW_C;
        using WeiLayout      = ctc::G_K_X_C;
        using BiasLayout     = ctc::G_K;
        using ResidualLayout = ctc::G_NW_K;
        using OutLayout      = ctc::G_NW_K;
        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
            {conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]},
            {
                conv_param.C_,                                                        // g
                conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n
                1,                                                                    // c
                conv_param.G_ * conv_param.C_                                         // wi
            });
        const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
            {conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
            {
                conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
                conv_param.filter_spatial_lengths_[0] * conv_param.C_,                 // k
                1,                                                                     // c
                conv_param.C_                                                          // x
            });
        const auto bias_g_n_k_wos_desc = HostTensorDescriptor(
            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
            {
                conv_param.K_, // g
                0,             // k
                1,             // c
                0              // x
            });
        const auto residual_g_n_k_wos_desc = HostTensorDescriptor(
            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
            {
                conv_param.K_, // g
                0,             // k
                1,             // c
                0              // x
            });
        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
            {
                conv_param.K_,                                                         // g
                conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
                1,                                                                     // k
                conv_param.G_ * conv_param.K_                                          // wo
            });
        return run_grouped_conv_fwd_bias_relu_add<1,
                                                  InKernelDataType,
                                                  WeiKernelDataType,
                                                  CShuffleDataType,
                                                  OutKernelDataType,
                                                  InElementOp,
                                                  WeiElementOp,
                                                  OutElementOp,
                                                  InUserDataType,
                                                  WeiUserDataType,
                                                  OutUserDataType,
                                                  DeviceGroupedConvNDFwdInstance<1,
                                                                                 InLayout,
                                                                                 WeiLayout,
                                                                                 BiasLayout,
                                                                                 ResidualLayout,
                                                                                 OutLayout>>(
            do_verification,
            init_method,
            time_kernel,
            conv_param,
            in_g_n_c_wis_desc,
            wei_g_k_c_xs_desc,
            bias_g_n_k_wos_desc,
            residual_g_n_k_wos_desc,
            out_g_n_k_wos_desc,
            in_element_op,
            wei_element_op,
            out_element_op);
    }
    else if(conv_param.num_dim_spatial_ == 2)
    {
        using InLayout       = ctc::G_NHW_C;
        using WeiLayout      = ctc::G_K_YX_C;
        using BiasLayout     = ctc::G_K;
        using ResidualLayout = ctc::G_NHW_K;
        using OutLayout      = ctc::G_NHW_K;
        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
            {conv_param.G_,
             conv_param.N_,
             conv_param.C_,
             conv_param.input_spatial_lengths_[0],
             conv_param.input_spatial_lengths_[1]},
            {
                conv_param.C_, // g
                conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
                    conv_param.G_ * conv_param.C_,                                    // n
                1,                                                                    // c
                conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi
                conv_param.G_ * conv_param.C_                                         // wi
            });
        const auto wei_g_k_c_xs_desc =
            HostTensorDescriptor({conv_param.G_,
                                  conv_param.K_,
                                  conv_param.C_,
                                  conv_param.filter_spatial_lengths_[0],
                                  conv_param.filter_spatial_lengths_[1]},
                                 {
                                     conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
                                         conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
                                     conv_param.filter_spatial_lengths_[0] *
                                         conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k
                                     1,                                                         // c
                                     conv_param.filter_spatial_lengths_[1] * conv_param.C_,     // y
                                     conv_param.C_                                              // x
                                 });
        const auto bias_g_n_k_wos_desc =
            HostTensorDescriptor({conv_param.G_,
                                  conv_param.N_,
                                  conv_param.K_,
                                  conv_param.output_spatial_lengths_[0],
                                  conv_param.output_spatial_lengths_[1]},
                                 {
                                     conv_param.K_, // g
                                     0,             // n
                                     1,             // k
                                     0,             // ho
                                     0              // wo
                                 });
        const auto residual_g_n_k_wos_desc =
            HostTensorDescriptor({conv_param.G_,
                                  conv_param.N_,
                                  conv_param.K_,
                                  conv_param.output_spatial_lengths_[0],
                                  conv_param.output_spatial_lengths_[1]},
                                 {
                                     conv_param.K_, // g
                                     0,             // n
                                     1,             // k
                                     0,             // ho
                                     0              // wo
                                 });
        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
            {conv_param.G_,
             conv_param.N_,
             conv_param.K_,
             conv_param.output_spatial_lengths_[0],
             conv_param.output_spatial_lengths_[1]},
            {
                conv_param.K_, // g
                conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
                    conv_param.G_ * conv_param.K_,                                     // n
                1,                                                                     // k
                conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
                conv_param.G_ * conv_param.K_                                          // wo
            });
        return run_grouped_conv_fwd_bias_relu_add<2,
                                                  InKernelDataType,
                                                  WeiKernelDataType,
                                                  CShuffleDataType,
                                                  OutKernelDataType,
                                                  InElementOp,
                                                  WeiElementOp,
                                                  OutElementOp,
                                                  InUserDataType,
                                                  WeiUserDataType,
                                                  OutUserDataType,
                                                  DeviceGroupedConvNDFwdInstance<2,
                                                                                 InLayout,
                                                                                 WeiLayout,
                                                                                 BiasLayout,
                                                                                 ResidualLayout,
                                                                                 OutLayout>>(
            do_verification,
            init_method,
            time_kernel,
            conv_param,
            in_g_n_c_wis_desc,
            wei_g_k_c_xs_desc,
            bias_g_n_k_wos_desc,
            residual_g_n_k_wos_desc,
            out_g_n_k_wos_desc,
            in_element_op,
            wei_element_op,
            out_element_op);
    }
    else if(conv_param.num_dim_spatial_ == 3)
    {
        using InLayout       = ctc::G_NDHW_C;
        using WeiLayout      = ctc::G_K_ZYX_C;
        using BiasLayout     = ctc::G_K;
        using ResidualLayout = ctc::G_NDHW_K;
        using OutLayout      = ctc::G_NDHW_K;
        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
            {conv_param.G_,
             conv_param.N_,
             conv_param.C_,
             conv_param.input_spatial_lengths_[0],
             conv_param.input_spatial_lengths_[1],
             conv_param.input_spatial_lengths_[2]},
            {
                conv_param.C_, // g
                conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
                    conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n
                1,                                                                        // c
                conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] *
                    conv_param.G_ * conv_param.C_,                                    // di
                conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
                conv_param.G_ * conv_param.C_                                         // wi
            });
        const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
            {conv_param.G_,
             conv_param.K_,
             conv_param.C_,
             conv_param.filter_spatial_lengths_[0],
             conv_param.filter_spatial_lengths_[1],
             conv_param.filter_spatial_lengths_[2]},
            {
                conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
                    conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
                    conv_param.C_, // g
                conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
                    conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k
                1,                                                         // c
                conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
                    conv_param.C_,                                     // z
                conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
                conv_param.C_                                          // x
            });
        const auto bias_g_n_k_wos_desc =
            HostTensorDescriptor({conv_param.G_,
                                  conv_param.N_,
                                  conv_param.K_,
                                  conv_param.output_spatial_lengths_[0],
                                  conv_param.output_spatial_lengths_[1],
                                  conv_param.output_spatial_lengths_[2]},
                                 {
                                     conv_param.K_, // g
                                     0,             // n
                                     1,             // k
                                     0,             // z
                                     0,             // y
                                     0              // x
                                 });
        const auto residual_g_n_k_wos_desc =
            HostTensorDescriptor({conv_param.G_,
                                  conv_param.N_,
                                  conv_param.K_,
                                  conv_param.output_spatial_lengths_[0],
                                  conv_param.output_spatial_lengths_[1],
                                  conv_param.output_spatial_lengths_[2]},
                                 {
                                     conv_param.K_, // g
                                     0,             // n
                                     1,             // k
                                     0,             // z
                                     0,             // y
                                     0              // x
                                 });
        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
            {conv_param.G_,
             conv_param.N_,
             conv_param.K_,
             conv_param.output_spatial_lengths_[0],
             conv_param.output_spatial_lengths_[1],
             conv_param.output_spatial_lengths_[2]},
            {
                conv_param.K_, // g
                conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
                    conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n
                1,                                                                         // k
                conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] *
                    conv_param.G_ * conv_param.K_,                                     // do
                conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
                conv_param.G_ * conv_param.K_                                          // wo
            });
        return run_grouped_conv_fwd_bias_relu_add<3,
                                                  InKernelDataType,
                                                  WeiKernelDataType,
                                                  CShuffleDataType,
                                                  OutKernelDataType,
                                                  InElementOp,
                                                  WeiElementOp,
                                                  OutElementOp,
                                                  InUserDataType,
                                                  WeiUserDataType,
                                                  OutUserDataType,
                                                  DeviceGroupedConvNDFwdInstance<3,
                                                                                 InLayout,
                                                                                 WeiLayout,
                                                                                 BiasLayout,
                                                                                 ResidualLayout,
                                                                                 OutLayout>>(
            do_verification,
            init_method,
            time_kernel,
            conv_param,
            in_g_n_c_wis_desc,
            wei_g_k_c_xs_desc,
            bias_g_n_k_wos_desc,
            residual_g_n_k_wos_desc,
            out_g_n_k_wos_desc,
            in_element_op,
            wei_element_op,
            out_element_op);
    }
    return 0;
 }
--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp
@@ -14,39 +14,38 @@ namespace device {
 // Convolution Forward:
 //   input : input image A[G, N, C, Hi, Wi],
 //   input : weight B[G, K, C, Y, X],
 //   input : D0[G, N, K, Ho, Wo], D1[G, N, K, Ho, Wo], ...
 //   output : output image E[G, N, K, Ho, Wo]
 //   C = a_op(A) * b_op(B)
 //   E = cde_op(C, D0, D1, ...)
 template <index_t NDimSpatial,
-          typename ALayout,
+          typename InLayout,
-          typename BLayout,
+          typename WeiLayout,
-          typename CLayout,
+          typename OutLayout,
-          typename ADataType,
+          typename InDataType,
-          typename BDataType,
+          typename WeiDataType,
-          typename CDataType,
+          typename OutDataType,
-          typename AElementwiseOperation,
+          typename InElementwiseOperation,
-          typename BElementwiseOperation,
+          typename WeiElementwiseOperation,
-          typename CElementwiseOperation>
+          typename OutElementwiseOperation>
 struct DeviceGroupedConvFwd : public BaseOperator
 {
    virtual std::unique_ptr<BaseArgument>
-    MakeArgumentPointer(const void* p_a, // input image
+    MakeArgumentPointer(const void* p_in,  // input image
-                        const void* p_b, // weight
+                        const void* p_wei, // weight
-                        void* p_c,       // output image
+                        void* p_out,       // output image
-                        const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
+                        const std::array<index_t, NDimSpatial + 3>& in_g_n_c_wis_lengths,
-                        const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
+                        const std::array<index_t, NDimSpatial + 3>& in_g_n_c_wis_strides,
-                        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
+                        const std::array<index_t, NDimSpatial + 3>& wei_g_k_c_xs_lengths,
-                        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
+                        const std::array<index_t, NDimSpatial + 3>& wei_g_k_c_xs_strides,
-                        const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
+                        const std::array<index_t, NDimSpatial + 3>& out_g_n_k_wos_lengths,
-                        const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_strides,
+                        const std::array<index_t, NDimSpatial + 3>& out_g_n_k_wos_strides,
                        const std::array<index_t, NDimSpatial>& conv_filter_strides,
                        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
                        const std::array<index_t, NDimSpatial>& input_left_pads,
                        const std::array<index_t, NDimSpatial>& input_right_pads,
-                        const AElementwiseOperation& a_element_op,
+                        const InElementwiseOperation& in_element_op,
-                        const BElementwiseOperation& b_element_op,
+                        const WeiElementwiseOperation& wei_element_op,
-                        const CElementwiseOperation& c_element_op) = 0;
+                        const OutElementwiseOperation& out_element_op) = 0;
    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };
--- a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
+++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
@@ -95,7 +95,7 @@ template <typename Activation>
 using Add_Activation_Mul_Clamp =
    ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<Activation>;
-template <typename DeviceOp>
+template <typename DeviceOp, typename Tag = void>
 struct DeviceOperationInstanceFactory;
 } // namespace instance
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
@@ -3,11 +3,9 @@
 #pragma once
 #include <cstdlib>
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dl.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dl.hpp
@@ -3,11 +3,9 @@
 #pragma once
 #include <cstdlib>
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
--- a/profiler/include/profile_grouped_conv_fwd_impl.hpp
+++ b/profiler/include/profile_grouped_conv_fwd_impl.hpp
@@ -9,12 +9,9 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dl.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -224,26 +221,25 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
        for(auto& op_ptr : op_ptrs)
        {
-            auto argument_ptr = op_ptr->MakeArgumentPointer(
+            auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
-                in_device_buf.GetDeviceBuffer(),
+                                                            wei_device_buf.GetDeviceBuffer(),
-                wei_device_buf.GetDeviceBuffer(),
+                                                            {},
-                std::array<const void*, 0>{},
+                                                            out_device_buf.GetDeviceBuffer(),
-                out_device_buf.GetDeviceBuffer(),
+                                                            a_g_n_c_wis_lengths,
-                a_g_n_c_wis_lengths,
+                                                            a_g_n_c_wis_strides,
-                a_g_n_c_wis_strides,
+                                                            b_g_k_c_xs_lengths,
-                b_g_k_c_xs_lengths,
+                                                            b_g_k_c_xs_strides,
-                b_g_k_c_xs_strides,
+                                                            {},
-                std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
+                                                            {},
-                std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
+                                                            e_g_n_k_wos_lengths,
-                e_g_n_k_wos_lengths,
+                                                            e_g_n_k_wos_strides,
-                e_g_n_k_wos_strides,
+                                                            conv_filter_strides,
-                conv_filter_strides,
+                                                            conv_filter_dilations,
-                conv_filter_dilations,
+                                                            input_left_pads,
-                input_left_pads,
+                                                            input_right_pads,
-                input_right_pads,
+                                                            in_element_op,
-                in_element_op,
+                                                            wei_element_op,
-                wei_element_op,
+                                                            out_element_op);
                out_element_op);
            run_impl(op_ptr, argument_ptr);
        }
@@ -262,8 +258,10 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
                                                                            WeiElementOp,
                                                                            OutElementOp>;
        // get device op instances
        const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
            DeviceOp>::GetInstances();
        std::cout << "dl found " << op_ptrs.size() << " instances" << std::endl;
        for(auto& op_ptr : op_ptrs)
		`@@ -1,2 +0,0 @@`
			`add_executable(client_conv2d_fwd conv2d_fwd.cpp)`
			`target_link_libraries(client_conv2d_fwd PRIVATE composable_kernel::device_operations)`
		`@@ -0,0 +1,2 @@`
							`add_executable(client_grouped_conv2d_fwd grouped_conv2d_fwd.cpp)`
							`target_link_libraries(client_grouped_conv2d_fwd PRIVATE composable_kernel::device_operations)`