From 3190273630f1da55356aec729986546d21883747 Mon Sep 17 00:00:00 2001
From: Rostyslav Geyyer <46627076+geyyer@users.noreply.github.com>
Date: Thu, 25 Aug 2022 17:08:43 -0500
Subject: [PATCH] Add int4 example for convnd_fwd_bias_relu_add (#375)

* Add int4 example for convnd_fwd_bias_relu_add

* Fix AddReluAdd for building without int4 support

* Update CMakeLists.txt

* Format

* Convert int4 tensors for int8 kernel

* Fix device memory allocation

* Format

* Format

[ROCm/composable_kernel commit: b73ae2423495a9054ceaec4d529d30db7e089743]
---
 .../CMakeLists.txt                            |   8 +-
 ...rouped_convnd_fwd_bias_relu_add_common.hpp |  73 ++-
 ...uped_convnd_fwd_bias_relu_add_xdl_bf16.cpp |  55 ++-
 ...uped_convnd_fwd_bias_relu_add_xdl_fp16.cpp |  55 ++-
 ...uped_convnd_fwd_bias_relu_add_xdl_fp32.cpp |  55 ++-
 ...uped_convnd_fwd_bias_relu_add_xdl_int4.cpp | 459 ++++++++++++++++++
 ...uped_convnd_fwd_bias_relu_add_xdl_int8.cpp |  55 ++-
 .../gpu/element/element_wise_operation.hpp    |  12 +
 8 files changed, 665 insertions(+), 107 deletions(-)
 create mode 100644 example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp
diff --git a/example/30_grouped_convnd_fwd_bias_relu_add/CMakeLists.txt b/example/30_grouped_convnd_fwd_bias_relu_add/CMakeLists.txt
index 628cb93daa..98c2211b19 100644
--- a/example/30_grouped_convnd_fwd_bias_relu_add/CMakeLists.txt
+++ b/example/30_grouped_convnd_fwd_bias_relu_add/CMakeLists.txt
@@ -1,11 +1,11 @@
 add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_fp16 grouped_convnd_fwd_bias_relu_add_xdl_fp16.cpp)
-target_link_libraries(example_grouped_convnd_fwd_bias_relu_add_xdl_fp16 PRIVATE utility)
 
 add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_fp32 grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp)
-target_link_libraries(example_grouped_convnd_fwd_bias_relu_add_xdl_fp32 PRIVATE utility)
 
 add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_bf16 grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp)
-target_link_libraries(example_grouped_convnd_fwd_bias_relu_add_xdl_bf16 PRIVATE utility)
 
 add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_int8 grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp)
-target_link_libraries(example_grouped_convnd_fwd_bias_relu_add_xdl_int8 PRIVATE utility)
\ No newline at end of file
+
+if(USE_BITINT_EXTENSION_INT4)
+  add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_int4 grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp)
+endif() # USE_BITINT_EXTENSION_INT4
diff --git a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_common.hpp b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_common.hpp
index 3fb62e77e2..a2d9c21287 100644
--- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_common.hpp
+++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_common.hpp
@@ -26,13 +26,16 @@ void print_helper_msg()
 }
 
 template <ck::index_t NDimSpatial,
-          typename InDataType,
-          typename WeiDataType,
+          typename InKernelDataType,
+          typename WeiKernelDataType,
           typename CShuffleDataType,
-          typename OutDataType,
+          typename OutKernelDataType,
           typename InElementOp,
           typename WeiElementOp,
           typename OutElementOp,
+          typename InUserDataType,
+          typename WeiUserDataType,
+          typename OutUserDataType,
           typename DeviceConvNDFwdInstance>
 int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
                                        int init_method,
@@ -47,12 +50,12 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
                                        const WeiElementOp& wei_element_op,
                                        const OutElementOp& out_element_op)
 {
-    Tensor<InDataType> in(in_g_n_c_wis_desc);
-    Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
-    Tensor<OutDataType> bias(bias_g_n_k_wos_desc);
-    Tensor<OutDataType> residual(residual_g_n_k_wos_desc);
-    Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
-    Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
+    Tensor<InUserDataType> in(in_g_n_c_wis_desc);
+    Tensor<WeiUserDataType> wei(wei_g_k_c_xs_desc);
+    Tensor<OutUserDataType> bias(bias_g_n_k_wos_desc);
+    Tensor<OutUserDataType> residual(residual_g_n_k_wos_desc);
+    Tensor<OutUserDataType> out_host(out_g_n_k_wos_desc);
+    Tensor<OutKernelDataType> out_device(out_g_n_k_wos_desc);
 
     std::cout << "in: " << in.mDesc << std::endl;
     std::cout << "wei: " << wei.mDesc << std::endl;
@@ -64,26 +67,38 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
     {
     case 0: break;
     case 1:
-        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
-        wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
-        bias.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
+        in.GenerateTensorValue(GeneratorTensor_2<InUserDataType>{-5, 5});
+        wei.GenerateTensorValue(GeneratorTensor_2<WeiUserDataType>{-5, 5});
+        bias.GenerateTensorValue(GeneratorTensor_2<OutUserDataType>{-5, 5});
         break;
     default:
-        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
-        wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
-        bias.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
+        in.GenerateTensorValue(GeneratorTensor_3<InUserDataType>{0.0, 1.0});
+        wei.GenerateTensorValue(GeneratorTensor_3<WeiUserDataType>{-0.5, 0.5});
+        bias.GenerateTensorValue(GeneratorTensor_3<OutUserDataType>{-0.5, 0.5});
     }
 
-    DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
-    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
-    DeviceMem bias_device_buf(sizeof(OutDataType) * bias.mDesc.GetElementSpaceSize());
-    DeviceMem residual_device_buf(sizeof(OutDataType) * residual.mDesc.GetElementSpaceSize());
-    DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
+    DeviceMem in_device_buf(sizeof(InKernelDataType) * in.mDesc.GetElementSpaceSize());
+    DeviceMem wei_device_buf(sizeof(WeiKernelDataType) * wei.mDesc.GetElementSpaceSize());
+    DeviceMem bias_device_buf(sizeof(OutKernelDataType) * bias.mDesc.GetElementSpaceSize());
+    DeviceMem residual_device_buf(sizeof(OutKernelDataType) * residual.mDesc.GetElementSpaceSize());
+    DeviceMem out_device_buf(sizeof(OutKernelDataType) * out_device.mDesc.GetElementSpaceSize());
 
+#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+    const Tensor<InKernelDataType> in_converted(in);
+    const Tensor<WeiKernelDataType> wei_converted(wei);
+    const Tensor<OutKernelDataType> bias_converted(bias);
+    const Tensor<OutKernelDataType> residual_converted(residual);
+
+    in_device_buf.ToDevice(in_converted.mData.data());
+    wei_device_buf.ToDevice(wei_converted.mData.data());
+    bias_device_buf.ToDevice(bias_converted.mData.data());
+    residual_device_buf.ToDevice(residual_converted.mData.data());
+#else  // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
     in_device_buf.ToDevice(in.mData.data());
     wei_device_buf.ToDevice(wei.mData.data());
     bias_device_buf.ToDevice(bias.mData.data());
     residual_device_buf.ToDevice(residual.mData.data());
+#endif //  CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
 
     std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
     std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
@@ -154,7 +169,7 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
     float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
 
     std::size_t flop      = conv_param.GetFlops();
-    std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
+    std::size_t num_btype = conv_param.GetByte<InUserDataType, WeiUserDataType, OutUserDataType>();
 
     float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
     float gb_per_sec = num_btype / 1.E6 / avg_time;
@@ -168,8 +183,8 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
         Tensor<CShuffleDataType> c_host(out_g_n_k_wos_desc);
 
         auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
-                                                                     InDataType,
-                                                                     WeiDataType,
+                                                                     InUserDataType,
+                                                                     WeiUserDataType,
                                                                      CShuffleDataType,
                                                                      InElementOp,
                                                                      WeiElementOp,
@@ -196,10 +211,22 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
 
         out_device_buf.FromDevice(out_device.mData.data());
 
+#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+        const Tensor<OutUserDataType> out_device_converted(out_device);
+
+        return ck::utils::check_err(out_device_converted.mData,
+                                    out_host.mData,
+                                    "Error: incorrect results!",
+                                    1e-5f,
+                                    1e-4f)
+                   ? 0
+                   : 1;
+#else  // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
         return ck::utils::check_err(
                    out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f)
                    ? 0
                    : 1;
+#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
     }
 
     return 0;
diff --git a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp
index 1da96b2d37..4ac996dbaa 100644
--- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp
+++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp
@@ -7,13 +7,19 @@
 
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
 
-using InDataType       = ck::bhalf_t;
-using WeiDataType      = ck::bhalf_t;
-using AccDataType      = float;
-using CShuffleDataType = float;
-using BiasDataType     = ck::bhalf_t;
-using ResidualDataType = ck::bhalf_t;
-using OutDataType      = ck::bhalf_t;
+// kernel data types
+using InKernelDataType       = ck::bhalf_t;
+using WeiKernelDataType      = ck::bhalf_t;
+using AccDataType            = float;
+using CShuffleDataType       = float;
+using BiasKernelDataType     = ck::bhalf_t;
+using ResidualKernelDataType = ck::bhalf_t;
+using OutKernelDataType      = ck::bhalf_t;
+
+// tensor data types
+using InUserDataType  = InKernelDataType;
+using WeiUserDataType = WeiKernelDataType;
+using OutUserDataType = OutKernelDataType;
 
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -40,12 +46,12 @@ using DeviceGroupedConvNDFwdInstance =
         WeiLayout,
         ck::Tuple<BiasLayout, ResidualLayout>,
         OutLayout,
-        InDataType,
-        WeiDataType,
+        InKernelDataType,
+        WeiKernelDataType,
         AccDataType,
         CShuffleDataType,
-        ck::Tuple<BiasDataType, ResidualDataType>,
-        OutDataType,
+        ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
+        OutKernelDataType,
         InElementOp,
         WeiElementOp,
         OutElementOp,
@@ -181,13 +187,16 @@ int main(int argc, char* argv[])
             });
 
         return run_grouped_conv_fwd_bias_relu_add<1,
-                                                  InDataType,
-                                                  WeiDataType,
+                                                  InKernelDataType,
+                                                  WeiKernelDataType,
                                                   CShuffleDataType,
-                                                  OutDataType,
+                                                  OutKernelDataType,
                                                   InElementOp,
                                                   WeiElementOp,
                                                   OutElementOp,
+                                                  InUserDataType,
+                                                  WeiUserDataType,
+                                                  OutUserDataType,
                                                   DeviceGroupedConvNDFwdInstance<1,
                                                                                  InLayout,
                                                                                  WeiLayout,
@@ -290,13 +299,16 @@ int main(int argc, char* argv[])
             });
 
         return run_grouped_conv_fwd_bias_relu_add<2,
-                                                  InDataType,
-                                                  WeiDataType,
+                                                  InKernelDataType,
+                                                  WeiKernelDataType,
                                                   CShuffleDataType,
-                                                  OutDataType,
+                                                  OutKernelDataType,
                                                   InElementOp,
                                                   WeiElementOp,
                                                   OutElementOp,
+                                                  InUserDataType,
+                                                  WeiUserDataType,
+                                                  OutUserDataType,
                                                   DeviceGroupedConvNDFwdInstance<2,
                                                                                  InLayout,
                                                                                  WeiLayout,
@@ -413,13 +425,16 @@ int main(int argc, char* argv[])
             });
 
         return run_grouped_conv_fwd_bias_relu_add<3,
-                                                  InDataType,
-                                                  WeiDataType,
+                                                  InKernelDataType,
+                                                  WeiKernelDataType,
                                                   CShuffleDataType,
-                                                  OutDataType,
+                                                  OutKernelDataType,
                                                   InElementOp,
                                                   WeiElementOp,
                                                   OutElementOp,
+                                                  InUserDataType,
+                                                  WeiUserDataType,
+                                                  OutUserDataType,
                                                   DeviceGroupedConvNDFwdInstance<3,
                                                                                  InLayout,
                                                                                  WeiLayout,
diff --git a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp16.cpp b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp16.cpp
index d505073f28..8846633982 100644
--- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp16.cpp
+++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp16.cpp
@@ -7,13 +7,19 @@
 
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
 
-using InDataType       = ck::half_t;
-using WeiDataType      = ck::half_t;
-using AccDataType      = float;
-using CShuffleDataType = ck::half_t;
-using BiasDataType     = ck::half_t;
-using ResidualDataType = ck::half_t;
-using OutDataType      = ck::half_t;
+// kernel data types
+using InKernelDataType       = ck::half_t;
+using WeiKernelDataType      = ck::half_t;
+using AccDataType            = float;
+using CShuffleDataType       = ck::half_t;
+using BiasKernelDataType     = ck::half_t;
+using ResidualKernelDataType = ck::half_t;
+using OutKernelDataType      = ck::half_t;
+
+// tensor data types
+using InUserDataType  = InKernelDataType;
+using WeiUserDataType = WeiKernelDataType;
+using OutUserDataType = OutKernelDataType;
 
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -40,12 +46,12 @@ using DeviceGroupedConvNDFwdInstance =
         WeiLayout,
         ck::Tuple<BiasLayout, ResidualLayout>,
         OutLayout,
-        InDataType,
-        WeiDataType,
+        InKernelDataType,
+        WeiKernelDataType,
         AccDataType,
         CShuffleDataType,
-        ck::Tuple<BiasDataType, ResidualDataType>,
-        OutDataType,
+        ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
+        OutKernelDataType,
         InElementOp,
         WeiElementOp,
         OutElementOp,
@@ -181,13 +187,16 @@ int main(int argc, char* argv[])
             });
 
         return run_grouped_conv_fwd_bias_relu_add<1,
-                                                  InDataType,
-                                                  WeiDataType,
+                                                  InKernelDataType,
+                                                  WeiKernelDataType,
                                                   CShuffleDataType,
-                                                  OutDataType,
+                                                  OutKernelDataType,
                                                   InElementOp,
                                                   WeiElementOp,
                                                   OutElementOp,
+                                                  InUserDataType,
+                                                  WeiUserDataType,
+                                                  OutUserDataType,
                                                   DeviceGroupedConvNDFwdInstance<1,
                                                                                  InLayout,
                                                                                  WeiLayout,
@@ -290,13 +299,16 @@ int main(int argc, char* argv[])
             });
 
         return run_grouped_conv_fwd_bias_relu_add<2,
-                                                  InDataType,
-                                                  WeiDataType,
+                                                  InKernelDataType,
+                                                  WeiKernelDataType,
                                                   CShuffleDataType,
-                                                  OutDataType,
+                                                  OutKernelDataType,
                                                   InElementOp,
                                                   WeiElementOp,
                                                   OutElementOp,
+                                                  InUserDataType,
+                                                  WeiUserDataType,
+                                                  OutUserDataType,
                                                   DeviceGroupedConvNDFwdInstance<2,
                                                                                  InLayout,
                                                                                  WeiLayout,
@@ -413,13 +425,16 @@ int main(int argc, char* argv[])
             });
 
         return run_grouped_conv_fwd_bias_relu_add<3,
-                                                  InDataType,
-                                                  WeiDataType,
+                                                  InKernelDataType,
+                                                  WeiKernelDataType,
                                                   CShuffleDataType,
-                                                  OutDataType,
+                                                  OutKernelDataType,
                                                   InElementOp,
                                                   WeiElementOp,
                                                   OutElementOp,
+                                                  InUserDataType,
+                                                  WeiUserDataType,
+                                                  OutUserDataType,
                                                   DeviceGroupedConvNDFwdInstance<3,
                                                                                  InLayout,
                                                                                  WeiLayout,
diff --git a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp
index 5237a9cb5a..c792ac5fe3 100644
--- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp
+++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp
@@ -7,13 +7,19 @@
 
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
 
-using InDataType       = float;
-using WeiDataType      = float;
-using AccDataType      = float;
-using CShuffleDataType = float;
-using BiasDataType     = float;
-using ResidualDataType = float;
-using OutDataType      = float;
+// kernel data types
+using InKernelDataType       = float;
+using WeiKernelDataType      = float;
+using AccDataType            = float;
+using CShuffleDataType       = float;
+using BiasKernelDataType     = float;
+using ResidualKernelDataType = float;
+using OutKernelDataType      = float;
+
+// tensor data types
+using InUserDataType  = InKernelDataType;
+using WeiUserDataType = WeiKernelDataType;
+using OutUserDataType = OutKernelDataType;
 
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -40,12 +46,12 @@ using DeviceGroupedConvNDFwdInstance =
         WeiLayout,
         ck::Tuple<BiasLayout, ResidualLayout>,
         OutLayout,
-        InDataType,
-        WeiDataType,
+        InKernelDataType,
+        WeiKernelDataType,
         AccDataType,
         CShuffleDataType,
-        ck::Tuple<BiasDataType, ResidualDataType>,
-        OutDataType,
+        ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
+        OutKernelDataType,
         InElementOp,
         WeiElementOp,
         OutElementOp,
@@ -181,13 +187,16 @@ int main(int argc, char* argv[])
             });
 
         return run_grouped_conv_fwd_bias_relu_add<1,
-                                                  InDataType,
-                                                  WeiDataType,
+                                                  InKernelDataType,
+                                                  WeiKernelDataType,
                                                   CShuffleDataType,
-                                                  OutDataType,
+                                                  OutKernelDataType,
                                                   InElementOp,
                                                   WeiElementOp,
                                                   OutElementOp,
+                                                  InUserDataType,
+                                                  WeiUserDataType,
+                                                  OutUserDataType,
                                                   DeviceGroupedConvNDFwdInstance<1,
                                                                                  InLayout,
                                                                                  WeiLayout,
@@ -290,13 +299,16 @@ int main(int argc, char* argv[])
             });
 
         return run_grouped_conv_fwd_bias_relu_add<2,
-                                                  InDataType,
-                                                  WeiDataType,
+                                                  InKernelDataType,
+                                                  WeiKernelDataType,
                                                   CShuffleDataType,
-                                                  OutDataType,
+                                                  OutKernelDataType,
                                                   InElementOp,
                                                   WeiElementOp,
                                                   OutElementOp,
+                                                  InUserDataType,
+                                                  WeiUserDataType,
+                                                  OutUserDataType,
                                                   DeviceGroupedConvNDFwdInstance<2,
                                                                                  InLayout,
                                                                                  WeiLayout,
@@ -413,13 +425,16 @@ int main(int argc, char* argv[])
             });
 
         return run_grouped_conv_fwd_bias_relu_add<3,
-                                                  InDataType,
-                                                  WeiDataType,
+                                                  InKernelDataType,
+                                                  WeiKernelDataType,
                                                   CShuffleDataType,
-                                                  OutDataType,
+                                                  OutKernelDataType,
                                                   InElementOp,
                                                   WeiElementOp,
                                                   OutElementOp,
+                                                  InUserDataType,
+                                                  WeiUserDataType,
+                                                  OutUserDataType,
                                                   DeviceGroupedConvNDFwdInstance<3,
                                                                                  InLayout,
                                                                                  WeiLayout,
diff --git a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp
new file mode 100644
index 0000000000..d989e63590
--- /dev/null
+++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp
@@ -0,0 +1,459 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "grouped_convnd_fwd_bias_relu_add_common.hpp"
+
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+
+// kernel data types
+using InKernelDataType       = int8_t;
+using WeiKernelDataType      = int8_t;
+using AccDataType            = int32_t;
+using CShuffleDataType       = int8_t;
+using BiasKernelDataType     = int8_t;
+using ResidualKernelDataType = int8_t;
+using OutKernelDataType      = int8_t;
+
+// tensor data types
+using InUserDataType  = ck::int4_t;
+using WeiUserDataType = ck::int4_t;
+using OutUserDataType = ck::int4_t;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
+using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
+
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+template <ck::index_t NDimSpatial,
+          typename InLayout,
+          typename WeiLayout,
+          typename BiasLayout,
+          typename ResidualLayout,
+          typename OutLayout>
+using DeviceGroupedConvNDFwdInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
+        NDimSpatial,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<BiasLayout, ResidualLayout>,
+        OutLayout,
+        InKernelDataType,
+        WeiKernelDataType,
+        AccDataType,
+        CShuffleDataType,
+        ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
+        OutKernelDataType,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        ConvSpec,    // ConvForwardSpecialization
+        GemmSpec,    // GemmSpecialization
+        1,           //
+        256,         // BlockSize
+        128,         // MPerBlock
+        256,         // NPerBlock
+        64,          // KPerBlock
+        16,          // AK1
+        16,          // BK1
+        32,          // MPerXdl
+        32,          // NPerXdl
+        2,           // MXdlPerWave
+        4,           // NXdlPerWave
+        S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
+        S<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+        2,           // ABlockTransferSrcVectorDim
+        16,          // ABlockTransferSrcScalarPerVector
+        16,          // ABlockTransferDstScalarPerVector_AK1
+        1,           // ABlockLdsExtraM
+        S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
+        S<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+        2,           // BBlockTransferSrcVectorDim
+        16,          // BBlockTransferSrcScalarPerVector
+        16,          // BBlockTransferDstScalarPerVector_BK1
+        1,           // BBlockLdsExtraN
+        1,
+        1,
+        S<1, 64, 1, 4>,
+        16>;
+
+int main(int argc, char* argv[])
+{
+    namespace ctc = ck::tensor_layout::convolution;
+
+    print_helper_msg();
+
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+
+    // conventional group conv definition
+    // G = 2
+    // [N, C, Hi, Wi] =  [128, 384, 71, 71]
+    // [K, C,  Y,  X] =  [512, 192,  3,  3]
+    // [N, K, Ho, Wo] =  [128, 512, 36, 36]
+    // CK group conv definition
+    // [G, N, C, Hi, Wi] =  [2, 128, 192, 71, 71]
+    // [G, K, C,  Y,  X] =  [2, 256, 192,  3,  3]
+    // [G, N, K, Ho, Wo] =  [2, 128, 256, 36, 36]
+    ck::utils::conv::ConvParam conv_param{
+        2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else
+    {
+        do_verification                   = std::stoi(argv[1]);
+        init_method                       = std::stoi(argv[2]);
+        time_kernel                       = std::stoi(argv[3]);
+        const ck::index_t num_dim_spatial = std::stoi(argv[4]);
+
+        conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
+    }
+
+    const auto in_element_op  = InElementOp{};
+    const auto wei_element_op = WeiElementOp{};
+    const auto out_element_op = OutElementOp{};
+
+    if(conv_param.num_dim_spatial_ == 1)
+    {
+        using InLayout       = ctc::G_NW_C;
+        using WeiLayout      = ctc::G_K_X_C;
+        using BiasLayout     = ctc::G_NW_K;
+        using ResidualLayout = ctc::G_NW_K;
+        using OutLayout      = ctc::G_NW_K;
+
+        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
+            {conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]},
+            {
+                conv_param.C_,                                                        // g
+                conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n
+                1,                                                                    // c
+                conv_param.G_ * conv_param.C_                                         // wi
+            });
+
+        const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
+            {conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
+            {
+                conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
+                conv_param.filter_spatial_lengths_[0] * conv_param.C_,                 // k
+                1,                                                                     // c
+                conv_param.C_                                                          // x
+            });
+
+        const auto bias_g_n_k_wos_desc = HostTensorDescriptor(
+            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
+            {
+                conv_param.K_, // g
+                0,             // k
+                1,             // c
+                0              // x
+            });
+
+        const auto residual_g_n_k_wos_desc = HostTensorDescriptor(
+            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
+            {
+                conv_param.K_, // g
+                0,             // k
+                1,             // c
+                0              // x
+            });
+
+        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
+            {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
+            {
+                conv_param.K_,                                                         // g
+                conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
+                1,                                                                     // k
+                conv_param.G_ * conv_param.K_                                          // wo
+            });
+
+        return run_grouped_conv_fwd_bias_relu_add<1,
+                                                  InKernelDataType,
+                                                  WeiKernelDataType,
+                                                  CShuffleDataType,
+                                                  OutKernelDataType,
+                                                  InElementOp,
+                                                  WeiElementOp,
+                                                  OutElementOp,
+                                                  InUserDataType,
+                                                  WeiUserDataType,
+                                                  OutUserDataType,
+                                                  DeviceGroupedConvNDFwdInstance<1,
+                                                                                 InLayout,
+                                                                                 WeiLayout,
+                                                                                 BiasLayout,
+                                                                                 ResidualLayout,
+                                                                                 OutLayout>>(
+            do_verification,
+            init_method,
+            time_kernel,
+            conv_param,
+            in_g_n_c_wis_desc,
+            wei_g_k_c_xs_desc,
+            bias_g_n_k_wos_desc,
+            residual_g_n_k_wos_desc,
+            out_g_n_k_wos_desc,
+            in_element_op,
+            wei_element_op,
+            out_element_op);
+    }
+    else if(conv_param.num_dim_spatial_ == 2)
+    {
+        using InLayout       = ctc::G_NHW_C;
+        using WeiLayout      = ctc::G_K_YX_C;
+        using BiasLayout     = ctc::G_NHW_K;
+        using ResidualLayout = ctc::G_NHW_K;
+        using OutLayout      = ctc::G_NHW_K;
+
+        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
+            {conv_param.G_,
+             conv_param.N_,
+             conv_param.C_,
+             conv_param.input_spatial_lengths_[0],
+             conv_param.input_spatial_lengths_[1]},
+            {
+                conv_param.C_, // g
+                conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
+                    conv_param.G_ * conv_param.C_,                                    // n
+                1,                                                                    // c
+                conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi
+                conv_param.G_ * conv_param.C_                                         // wi
+            });
+
+        const auto wei_g_k_c_xs_desc =
+            HostTensorDescriptor({conv_param.G_,
+                                  conv_param.K_,
+                                  conv_param.C_,
+                                  conv_param.filter_spatial_lengths_[0],
+                                  conv_param.filter_spatial_lengths_[1]},
+                                 {
+                                     conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
+                                         conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
+                                     conv_param.filter_spatial_lengths_[0] *
+                                         conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k
+                                     1,                                                         // c
+                                     conv_param.filter_spatial_lengths_[1] * conv_param.C_,     // y
+                                     conv_param.C_                                              // x
+                                 });
+
+        const auto bias_g_n_k_wos_desc =
+            HostTensorDescriptor({conv_param.G_,
+                                  conv_param.N_,
+                                  conv_param.K_,
+                                  conv_param.output_spatial_lengths_[0],
+                                  conv_param.output_spatial_lengths_[1]},
+                                 {
+                                     conv_param.K_, // g
+                                     0,             // n
+                                     1,             // k
+                                     0,             // ho
+                                     0              // wo
+                                 });
+
+        const auto residual_g_n_k_wos_desc =
+            HostTensorDescriptor({conv_param.G_,
+                                  conv_param.N_,
+                                  conv_param.K_,
+                                  conv_param.output_spatial_lengths_[0],
+                                  conv_param.output_spatial_lengths_[1]},
+                                 {
+                                     conv_param.K_, // g
+                                     0,             // n
+                                     1,             // k
+                                     0,             // ho
+                                     0              // wo
+                                 });
+
+        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
+            {conv_param.G_,
+             conv_param.N_,
+             conv_param.K_,
+             conv_param.output_spatial_lengths_[0],
+             conv_param.output_spatial_lengths_[1]},
+            {
+                conv_param.K_, // g
+                conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
+                    conv_param.G_ * conv_param.K_,                                     // n
+                1,                                                                     // k
+                conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
+                conv_param.G_ * conv_param.K_                                          // wo
+            });
+
+        return run_grouped_conv_fwd_bias_relu_add<2,
+                                                  InKernelDataType,
+                                                  WeiKernelDataType,
+                                                  CShuffleDataType,
+                                                  OutKernelDataType,
+                                                  InElementOp,
+                                                  WeiElementOp,
+                                                  OutElementOp,
+                                                  InUserDataType,
+                                                  WeiUserDataType,
+                                                  OutUserDataType,
+                                                  DeviceGroupedConvNDFwdInstance<2,
+                                                                                 InLayout,
+                                                                                 WeiLayout,
+                                                                                 BiasLayout,
+                                                                                 ResidualLayout,
+                                                                                 OutLayout>>(
+            do_verification,
+            init_method,
+            time_kernel,
+            conv_param,
+            in_g_n_c_wis_desc,
+            wei_g_k_c_xs_desc,
+            bias_g_n_k_wos_desc,
+            residual_g_n_k_wos_desc,
+            out_g_n_k_wos_desc,
+            in_element_op,
+            wei_element_op,
+            out_element_op);
+    }
+    else if(conv_param.num_dim_spatial_ == 3)
+    {
+        using InLayout       = ctc::G_NDHW_C;
+        using WeiLayout      = ctc::G_K_ZYX_C;
+        using BiasLayout     = ctc::G_NDHW_K;
+        using ResidualLayout = ctc::G_NDHW_K;
+        using OutLayout      = ctc::G_NDHW_K;
+
+        const auto in_g_n_c_wis_desc = HostTensorDescriptor(
+            {conv_param.G_,
+             conv_param.N_,
+             conv_param.C_,
+             conv_param.input_spatial_lengths_[0],
+             conv_param.input_spatial_lengths_[1],
+             conv_param.input_spatial_lengths_[2]},
+            {
+                conv_param.C_, // g
+                conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
+                    conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n
+                1,                                                                        // c
+                conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] *
+                    conv_param.G_ * conv_param.C_,                                    // di
+                conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
+                conv_param.G_ * conv_param.C_                                         // wi
+            });
+
+        const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
+            {conv_param.G_,
+             conv_param.K_,
+             conv_param.C_,
+             conv_param.filter_spatial_lengths_[0],
+             conv_param.filter_spatial_lengths_[1],
+             conv_param.filter_spatial_lengths_[2]},
+            {
+                conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
+                    conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
+                    conv_param.C_, // g
+                conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
+                    conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k
+                1,                                                         // c
+                conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
+                    conv_param.C_,                                     // z
+                conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
+                conv_param.C_                                          // x
+            });
+
+        const auto bias_g_n_k_wos_desc =
+            HostTensorDescriptor({conv_param.G_,
+                                  conv_param.N_,
+                                  conv_param.K_,
+                                  conv_param.output_spatial_lengths_[0],
+                                  conv_param.output_spatial_lengths_[1],
+                                  conv_param.output_spatial_lengths_[2]},
+                                 {
+                                     conv_param.K_, // g
+                                     0,             // n
+                                     1,             // k
+                                     0,             // z
+                                     0,             // y
+                                     0              // x
+                                 });
+
+        const auto residual_g_n_k_wos_desc =
+            HostTensorDescriptor({conv_param.G_,
+                                  conv_param.N_,
+                                  conv_param.K_,
+                                  conv_param.output_spatial_lengths_[0],
+                                  conv_param.output_spatial_lengths_[1],
+                                  conv_param.output_spatial_lengths_[2]},
+                                 {
+                                     conv_param.K_, // g
+                                     0,             // n
+                                     1,             // k
+                                     0,             // z
+                                     0,             // y
+                                     0              // x
+                                 });
+
+        const auto out_g_n_k_wos_desc = HostTensorDescriptor(
+            {conv_param.G_,
+             conv_param.N_,
+             conv_param.K_,
+             conv_param.output_spatial_lengths_[0],
+             conv_param.output_spatial_lengths_[1],
+             conv_param.output_spatial_lengths_[2]},
+            {
+                conv_param.K_, // g
+                conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
+                    conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n
+                1,                                                                         // k
+                conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] *
+                    conv_param.G_ * conv_param.K_,                                     // do
+                conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
+                conv_param.G_ * conv_param.K_                                          // wo
+            });
+
+        return run_grouped_conv_fwd_bias_relu_add<3,
+                                                  InKernelDataType,
+                                                  WeiKernelDataType,
+                                                  CShuffleDataType,
+                                                  OutKernelDataType,
+                                                  InElementOp,
+                                                  WeiElementOp,
+                                                  OutElementOp,
+                                                  InUserDataType,
+                                                  WeiUserDataType,
+                                                  OutUserDataType,
+                                                  DeviceGroupedConvNDFwdInstance<3,
+                                                                                 InLayout,
+                                                                                 WeiLayout,
+                                                                                 BiasLayout,
+                                                                                 ResidualLayout,
+                                                                                 OutLayout>>(
+            do_verification,
+            init_method,
+            time_kernel,
+            conv_param,
+            in_g_n_c_wis_desc,
+            wei_g_k_c_xs_desc,
+            bias_g_n_k_wos_desc,
+            residual_g_n_k_wos_desc,
+            out_g_n_k_wos_desc,
+            in_element_op,
+            wei_element_op,
+            out_element_op);
+    }
+
+    return 0;
+}
diff --git a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp
index 859c9cea34..9aabe86948 100644
--- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp
+++ b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp
@@ -7,13 +7,19 @@
 
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
 
-using InDataType       = int8_t;
-using WeiDataType      = int8_t;
-using AccDataType      = int32_t;
-using CShuffleDataType = int8_t;
-using BiasDataType     = int8_t;
-using ResidualDataType = int8_t;
-using OutDataType      = int8_t;
+// kernel data types
+using InKernelDataType       = int8_t;
+using WeiKernelDataType      = int8_t;
+using AccDataType            = int32_t;
+using CShuffleDataType       = int8_t;
+using BiasKernelDataType     = int8_t;
+using ResidualKernelDataType = int8_t;
+using OutKernelDataType      = int8_t;
+
+// tensor data types
+using InUserDataType  = InKernelDataType;
+using WeiUserDataType = WeiKernelDataType;
+using OutUserDataType = OutKernelDataType;
 
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -40,12 +46,12 @@ using DeviceGroupedConvNDFwdInstance =
         WeiLayout,
         ck::Tuple<BiasLayout, ResidualLayout>,
         OutLayout,
-        InDataType,
-        WeiDataType,
+        InKernelDataType,
+        WeiKernelDataType,
         AccDataType,
         CShuffleDataType,
-        ck::Tuple<BiasDataType, ResidualDataType>,
-        OutDataType,
+        ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
+        OutKernelDataType,
         InElementOp,
         WeiElementOp,
         OutElementOp,
@@ -181,13 +187,16 @@ int main(int argc, char* argv[])
             });
 
         return run_grouped_conv_fwd_bias_relu_add<1,
-                                                  InDataType,
-                                                  WeiDataType,
+                                                  InKernelDataType,
+                                                  WeiKernelDataType,
                                                   CShuffleDataType,
-                                                  OutDataType,
+                                                  OutKernelDataType,
                                                   InElementOp,
                                                   WeiElementOp,
                                                   OutElementOp,
+                                                  InUserDataType,
+                                                  WeiUserDataType,
+                                                  OutUserDataType,
                                                   DeviceGroupedConvNDFwdInstance<1,
                                                                                  InLayout,
                                                                                  WeiLayout,
@@ -290,13 +299,16 @@ int main(int argc, char* argv[])
             });
 
         return run_grouped_conv_fwd_bias_relu_add<2,
-                                                  InDataType,
-                                                  WeiDataType,
+                                                  InKernelDataType,
+                                                  WeiKernelDataType,
                                                   CShuffleDataType,
-                                                  OutDataType,
+                                                  OutKernelDataType,
                                                   InElementOp,
                                                   WeiElementOp,
                                                   OutElementOp,
+                                                  InUserDataType,
+                                                  WeiUserDataType,
+                                                  OutUserDataType,
                                                   DeviceGroupedConvNDFwdInstance<2,
                                                                                  InLayout,
                                                                                  WeiLayout,
@@ -413,13 +425,16 @@ int main(int argc, char* argv[])
             });
 
         return run_grouped_conv_fwd_bias_relu_add<3,
-                                                  InDataType,
-                                                  WeiDataType,
+                                                  InKernelDataType,
+                                                  WeiKernelDataType,
                                                   CShuffleDataType,
-                                                  OutDataType,
+                                                  OutKernelDataType,
                                                   InElementOp,
                                                   WeiElementOp,
                                                   OutElementOp,
+                                                  InUserDataType,
+                                                  WeiUserDataType,
+                                                  OutUserDataType,
                                                   DeviceGroupedConvNDFwdInstance<3,
                                                                                  InLayout,
                                                                                  WeiLayout,
diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
index 44cd5c0694..47d018095d 100644
--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
@@ -98,6 +98,18 @@ struct AddReluAdd
         int32_t c = b + x2;
         y         = c;
     }
+
+#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+    template <>
+    __host__ __device__ constexpr void operator()<int4_t, int8_t, int4_t, int4_t>(
+        int4_t& y, const int8_t& x0, const int4_t& x1, const int4_t& x2) const
+    {
+        int32_t a = x0 + x1;
+        int32_t b = a > 0 ? a : 0;
+        int32_t c = b + x2;
+        y         = c;
+    }
+#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
 };
 
 struct AddHardswishAdd