References for conv2d fwd bias relu and add (#75)

* add reference * clean up * add reference for conv * rename Co-authored-by: ltqin <letaoqin@amd.com> Co-authored-by: Chao Liu <chao.liu2@amd.com> [ROCm/composable_kernel commit: 690c75a7eb]
2026-07-17 00:58:44 +00:00 · 2022-02-04 12:29:58 +08:00
parent 569b084436
commit b5cd5f7005
7 changed files with 706 additions and 209 deletions
--- a/example/4_conv2d_fwd_xdl/conv2d_fwd_xdl.cpp
+++ b/example/4_conv2d_fwd_xdl/conv2d_fwd_xdl.cpp
@@ -11,8 +11,9 @@
 #include "host_tensor_generator.hpp"
 #include "device_tensor.hpp"
 #include "tensor_layout.hpp"
-#include "device_operation/include/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
 #include "element_wise_operation.hpp"
+#include "device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp"
+#include "reference_conv_fwd.hpp"

 using InDataType  = ck::half_t;
 using WeiDataType = ck::half_t;
@@ -33,65 +34,53 @@ using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
 static constexpr auto ConvFwdDefault =
    ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default;

+// clang-format off
 using DeviceConvFwdInstance = ck::tensor_operation::device::
-    DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
-    // clang-format off
-//      |    InData|     WeiData|     OutData|     AccData|          In|         Wei|           Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
-//      |      Type|        Type|        Type|        Type| Elementwise| Elementwise|   Elementwise| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
-//      |          |            |            |            |   Operation|   Operation|     Operation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
-//      |          |            |            |            |            |            |              |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-        <InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp, ConvFwdDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>;
+    DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
+        InDataType,                       // InDataType
+        WeiDataType,                      // WeiDataType
+        OutDataType,                      // OutDataType
+        AccDataType,                      // AccDataType
+        InElementOp,                      // InElementwiseOperation
+        WeiElementOp,                     // WeiElementwiseOperation
+        OutElementOp,                     // OutElementwiseOperation
+        ConvFwdDefault,                   // ConvForwardSpecialization
+        256,                              // BlockSize
+        128,                              // MPerBlock
+        256,                              // NPerBlock
+        4,                                // K0PerBlock
+        8,                                // K1
+        32,                               // MPerXdl
+        32,                               // NPerXdl
+        2,                                // MXdlPerWave
+        4,                                // NXdlPerWave
+        S<4, 64, 1>,                      // ABlockTransferThreadClusterLengths_K0_M_K1
+        S<1, 0, 2>,                       // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,                       // ABlockTransferSrcAccessOrder
+        2,                                // ABlockTransferSrcVectorDim
+        8,                                // ABlockTransferSrcScalarPerVector
+        8,                                // ABlockTransferDstScalarPerVector_K1
+        true,                             // ABlockLdsAddExtraM
+        S<4, 64, 1>,                      // BBlockTransferThreadClusterLengths_K0_N_K1
+        S<1, 0, 2>,                       // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,                       // BBlockTransferSrcAccessOrder
+        2,                                // BBlockTransferSrcVectorDim
+        8,                                // BBlockTransferSrcScalarPerVector
+        8,                                // BBlockTransferDstScalarPerVector_K1
+        true,                             // BBlockLdsAddExtraN
+        1,                                // CShuffleMXdlPerWavePerShuffle
+        1,                                // CShuffleNXdlPerWavePerShuffle
+        S<1, 1, 32, 1, 1, 8>,             // CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
+        8>;                               // CBlockTransferScalarPerVector_NWaveNPerXdl
 // clang-format on

-template <typename TIn,
-          typename TWei,
-          typename TOut,
-          typename InElementOp,
-          typename WeiElementOp,
-          typename OutElementOp>
-void host_verify(const Tensor<TIn>& in,
-                 const Tensor<TWei>& wei,
-                 Tensor<TOut>& out,
-                 const std::vector<ck::index_t>& conv_strides,
-                 const std::vector<ck::index_t>& conv_dilations,
-                 const std::vector<ck::index_t>& in_left_pads,
-                 const std::vector<ck::index_t>&,
-                 const InElementOp& in_element_op,
-                 const WeiElementOp& wei_element_op,
-                 const OutElementOp& out_element_op)
-{
-    auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
-        double v = 0;
-        for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c)
-        {
-            for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y)
-            {
-                int hi = ho * conv_strides[0] + y * conv_dilations[0] - in_left_pads[0];
-                for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x)
-                {
-                    int wi = wo * conv_strides[1] + x * conv_dilations[1] - in_left_pads[1];
-                    if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
-                       wi < in.mDesc.GetLengths()[3])
-                    {
-                        v += in_element_op(static_cast<const double>(in(n, c, hi, wi))) *
-                             wei_element_op(static_cast<const double>(wei(k, c, y, x)));
-                    }
-                }
-            }
-        }
-        double v2 = out(n, k, ho, wo);
-
-        out_element_op(v2, v);
-
-        out(n, k, ho, wo) = v2;
-    };
-
-    make_ParallelTensorFunctor(f_nchw,
-                               out.mDesc.GetLengths()[0],
-                               out.mDesc.GetLengths()[1],
-                               out.mDesc.GetLengths()[2],
-                               out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
-}
+using ReferenceConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
+                                                                              WeiDataType,
+                                                                              OutDataType,
+                                                                              AccDataType,
+                                                                              InElementOp,
+                                                                              WeiElementOp,
+                                                                              OutElementOp>;

 int main(int argc, char* argv[])
 {
@@ -265,16 +254,20 @@ int main(int argc, char* argv[])

    if(do_verification)
    {
-        host_verify(in_n_c_hi_wi,
-                    wei_k_c_y_x,
-                    out_n_k_ho_wo_host_result,
-                    conv_filter_strides,
-                    conv_filter_dilations,
-                    input_left_pads,
-                    input_right_pads,
-                    InElementOp{},
-                    WeiElementOp{},
-                    OutElementOp{});
+        auto refConv    = ReferenceConvFwdInstance{};
+        auto refInvoker = refConv.MakeInvoker();
+
+        auto refArgument = refConv.MakeArgument(in_n_c_hi_wi,
+                                                wei_k_c_y_x,
+                                                out_n_k_ho_wo_host_result,
+                                                conv_filter_strides,
+                                                conv_filter_dilations,
+                                                input_left_pads,
+                                                input_right_pads,
+                                                InElementOp{},
+                                                WeiElementOp{},
+                                                OutElementOp{});
+        refInvoker.Run(refArgument);

        out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());

--- a/example/5_conv2d_fwd_xdl_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
+++ b/example/5_conv2d_fwd_xdl_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
@@ -11,8 +11,9 @@
 #include "host_tensor_generator.hpp"
 #include "device_tensor.hpp"
 #include "tensor_layout.hpp"
-#include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp"
 #include "element_wise_operation.hpp"
+#include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp"
+#include "reference_conv_fwd_bias_activation.hpp"

 using InDataType  = ck::half_t;
 using WeiDataType = ck::half_t;
@@ -37,63 +38,53 @@ static constexpr auto ConvFwdDefault =

 // clang-format off
 using DeviceConvFwdInstance = ck::tensor_operation::device::
-    DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
-    // clang-format off
-//      |    InData|     WeiData|     OutData|     AccData|          In|         Wei|           Out|           Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
-//      |      Type|        Type|        Type|        Type| Elementwise| Elementwise|   Elementwise|  GlobalMemory| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
-//      |          |            |            |            |   Operation|   Operation|     Operation| DataOperation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
-//      |          |            |            |            |            |            |              |              |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-        <InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp,     MemorySet, ConvFwdDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>;
+    DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
+        InDataType,                   // InDataType
+        WeiDataType,                  // WeiDataType
+        OutDataType,                  // OutDataType
+        AccDataType,                  // AccDataType
+        InElementOp,                  // InElementwiseOperation
+        WeiElementOp,                 // WeiElementwiseOperation
+        OutElementOp,                 // OutElementwiseOperation
+        MemorySet,                    // OutGlobalMemoryDataOperation
+        ConvFwdDefault,               // ConvForwardSpecialization
+        256,                          // BlockSize
+        128,                          // MPerBlock
+        256,                          // NPerBlock
+        4,                            // K0PerBlock
+        8,                            // K1
+        32,                           // MPerXdl
+        32,                           // NPerXdl
+        2,                            // MXdlPerWave
+        4,                            // NXdlPerWave
+        S<4, 64, 1>,                  // ABlockTransferThreadClusterLengths_K0_M_K1
+        S<1, 0, 2>,                   // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,                   // ABlockTransferSrcAccessOrder
+        2,                            // ABlockTransferSrcVectorDim
+        8,                            // ABlockTransferSrcScalarPerVector
+        8,                            // ABlockTransferDstScalarPerVector_K1
+        true,                         // ABlockLdsAddExtraM
+        S<4, 64, 1>,                  // BBlockTransferThreadClusterLengths_K0_N_K1
+        S<1, 0, 2>,                   // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,                   // BBlockTransferSrcAccessOrder
+        2,                            // BBlockTransferSrcVectorDim
+        8,                            // BBlockTransferSrcScalarPerVector
+        8,                            // BBlockTransferDstScalarPerVector_K1
+        true,                         // BBlockLdsAddExtraN
+        1,                            // CShuffleMXdlPerWavePerShuffle
+        1,                            // CShuffleNXdlPerWavePerShuffle
+        S<1, 1, 32, 1, 1, 8>,         // CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
+        8>;                           // CBlockTransferScalarPerVector_NWaveNPerXdl
 // clang-format on

-template <typename TIn,
-          typename TWei,
-          typename TOut,
-          typename InElementOp,
-          typename WeiElementOp,
-          typename OutElementOp>
-void host_reference_calculation(const Tensor<TIn>& in_n_c_hi_wi,
-                                const Tensor<TWei>& wei_k_c_y_x,
-                                Tensor<TOut>& out_n_k_ho_wo,
-                                const Tensor<TOut>& bias_k,
-                                const std::vector<ck::index_t>& conv_strides,
-                                const std::vector<ck::index_t>& conv_dilations,
-                                const std::vector<ck::index_t>& in_left_pads,
-                                const std::vector<ck::index_t>& /* in_right_pads */,
-                                const InElementOp& in_element_op,
-                                const WeiElementOp& wei_element_op,
-                                const OutElementOp& out_element_op)
-{
-    auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
-        double v = 0;
-        for(int c = 0; c < wei_k_c_y_x.mDesc.GetLengths()[1]; ++c)
-        {
-            for(int y = 0; y < wei_k_c_y_x.mDesc.GetLengths()[2]; ++y)
-            {
-                int hi = ho * conv_strides[0] + y * conv_dilations[0] - in_left_pads[0];
-                for(int x = 0; x < wei_k_c_y_x.mDesc.GetLengths()[3]; ++x)
-                {
-                    int wi = wo * conv_strides[1] + x * conv_dilations[1] - in_left_pads[1];
-                    if(hi >= 0 && hi < in_n_c_hi_wi.mDesc.GetLengths()[2] && wi >= 0 &&
-                       wi < in_n_c_hi_wi.mDesc.GetLengths()[3])
-                    {
-                        v += in_element_op(static_cast<const double>(in_n_c_hi_wi(n, c, hi, wi))) *
-                             wei_element_op(static_cast<const double>(wei_k_c_y_x(k, c, y, x)));
-                    }
-                }
-            }
-        }
-
-        out_n_k_ho_wo(n, k, ho, wo) = out_element_op(v, bias_k(k));
-    };
-
-    make_ParallelTensorFunctor(f_nchw,
-                               out_n_k_ho_wo.mDesc.GetLengths()[0],
-                               out_n_k_ho_wo.mDesc.GetLengths()[1],
-                               out_n_k_ho_wo.mDesc.GetLengths()[2],
-                               out_n_k_ho_wo.mDesc.GetLengths()[3])(
-        std::thread::hardware_concurrency());
-}
+using ReferenceConvFwdInstance =
+    ck::tensor_operation::host::ReferenceConvFwd_Bias_Activation<InDataType,
+                                                                 WeiDataType,
+                                                                 OutDataType,
+                                                                 AccDataType,
+                                                                 InElementOp,
+                                                                 WeiElementOp,
+                                                                 OutElementOp>;

 int main(int argc, char* argv[])
 {
@@ -277,17 +268,21 @@ int main(int argc, char* argv[])

    if(do_verification)
    {
-        host_reference_calculation(in_n_c_hi_wi,
-                                   wei_k_c_y_x,
-                                   out_n_k_ho_wo_host_result,
-                                   bias_k,
-                                   conv_filter_strides,
-                                   conv_filter_dilations,
-                                   input_left_pads,
-                                   input_right_pads,
-                                   InElementOp{},
-                                   WeiElementOp{},
-                                   OutElementOp{});
+        auto refConv    = ReferenceConvFwdInstance{};
+        auto refInvoker = refConv.MakeInvoker();
+
+        auto refArgument = refConv.MakeArgument(in_n_c_hi_wi,
+                                                wei_k_c_y_x,
+                                                out_n_k_ho_wo_host_result,
+                                                bias_k,
+                                                conv_filter_strides,
+                                                conv_filter_dilations,
+                                                input_left_pads,
+                                                input_right_pads,
+                                                InElementOp{},
+                                                WeiElementOp{},
+                                                OutElementOp{});
+        refInvoker.Run(refArgument);

        out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());

--- a/example/6_conv2d_fwd_xdl_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp
+++ b/example/6_conv2d_fwd_xdl_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp
@@ -11,8 +11,9 @@
 #include "host_tensor_generator.hpp"
 #include "device_tensor.hpp"
 #include "tensor_layout.hpp"
-#include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp"
 #include "element_wise_operation.hpp"
+#include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp"
+#include "reference_conv_fwd_bias_activation_add.hpp"

 using InDataType  = ck::half_t;
 using WeiDataType = ck::half_t;
@@ -35,70 +36,52 @@ static constexpr auto ConvFwdDefault =

 // clang-format off
 using DeviceConvFwdInstance = ck::tensor_operation::device::
-   DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K 
-//      |    InData|     WeiData|     OutData|     AccData|          In|         Wei|           Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
-//      |      Type|        Type|        Type|        Type| Elementwise| Elementwise|   Elementwise| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
-//      |          |            |            |            |   Operation|   Operation|     Operation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
-//      |          |            |            |            |            |            |              |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
-        <InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp, ConvFwdDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>;
+    DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
+        InDataType,              // InDataType
+        WeiDataType,             // WeiDataType
+        OutDataType,             // OutDataType
+        AccDataType,             // AccDataType
+        InElementOp,             // InElementwiseOperation
+        WeiElementOp,            // WeiElementwiseOperation
+        OutElementOp,            // OutElementwiseOperation
+        ConvFwdDefault,          // ConvForwardSpecialization
+        256,                     // BlockSize
+        128,                     // MPerBlock
+        256,                     // NPerBlock
+        4,                       // K0PerBlock
+        8,                       // K1
+        32,                      // MPerXdl
+        32,                      // NPerXdl
+        2,                       // MXdlPerWave
+        4,                       // NXdlPerWave
+        S<4, 64, 1>,             // ABlockTransferThreadClusterLengths_K0_M_K1
+        S<1, 0, 2>,              // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,              // ABlockTransferSrcAccessOrder
+        2,                       // ABlockTransferSrcVectorDim
+        8,                       // ABlockTransferSrcScalarPerVector
+        8,                       // ABlockTransferDstScalarPerVector_K1
+        true,                    // ABlockLdsAddExtraM
+        S<4, 64, 1>,             // BBlockTransferThreadClusterLengths_K0_N_K1
+        S<1, 0, 2>,              // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,              // BBlockTransferSrcAccessOrder
+        2,                       // BBlockTransferSrcVectorDim
+        8,                       // BBlockTransferSrcScalarPerVector
+        8,                       // BBlockTransferDstScalarPerVector_K1
+        true,                    // BBlockLdsAddExtraN
+        1,                       // CShuffleMXdlPerWavePerShuffle
+        1,                       // CShuffleNXdlPerWavePerShuffle
+        S<1, 1, 32, 1, 1, 8>,    // CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
+        8>;                      // CBlockTransferScalarPerVector_NWaveNPerXdl
 // clang-format on

-template <typename TIn,
-          typename TWei,
-          typename TOut,
-          typename InElementOp,
-          typename WeiElementOp,
-          typename OutElementOp>
-void host_reference_calculation(const Tensor<TIn>& in_n_c_hi_wi,
-                                const Tensor<TWei>& wei_k_c_y_x,
-                                Tensor<TOut>& out_n_k_ho_wo,
-                                const Tensor<TOut>& bias_k,
-                                const Tensor<TOut>& resi_n_k_ho_wo,
-                                const std::vector<ck::index_t>& conv_strides,
-                                const std::vector<ck::index_t>& conv_dilations,
-                                const std::vector<ck::index_t>& in_left_pads,
-                                const std::vector<ck::index_t>& /* in_right_pads */,
-                                const InElementOp& in_element_op,
-                                const WeiElementOp& wei_element_op,
-                                const OutElementOp& out_element_op)
-{
-    auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
-        double v = 0;
-        for(int c = 0; c < wei_k_c_y_x.mDesc.GetLengths()[1]; ++c)
-        {
-            for(int y = 0; y < wei_k_c_y_x.mDesc.GetLengths()[2]; ++y)
-            {
-                int hi = ho * conv_strides[0] + y * conv_dilations[0] - in_left_pads[0];
-                for(int x = 0; x < wei_k_c_y_x.mDesc.GetLengths()[3]; ++x)
-                {
-                    int wi = wo * conv_strides[1] + x * conv_dilations[1] - in_left_pads[1];
-                    if(hi >= 0 && hi < in_n_c_hi_wi.mDesc.GetLengths()[2] && wi >= 0 &&
-                       wi < in_n_c_hi_wi.mDesc.GetLengths()[3])
-                    {
-                        v += in_element_op(static_cast<const double>(in_n_c_hi_wi(n, c, hi, wi))) *
-                             wei_element_op(static_cast<const double>(wei_k_c_y_x(k, c, y, x)));
-                    }
-                }
-            }
-        }
-
-        double v2 = out_n_k_ho_wo(n, k, ho, wo);
-
-        out_element_op(v2,
-                       v,
-                       static_cast<const double>(bias_k(k)),
-                       static_cast<const double>(resi_n_k_ho_wo(n, k, ho, wo)));
-
-        out_n_k_ho_wo(n, k, ho, wo) = v2;
-    };
-
-    make_ParallelTensorFunctor(f_nchw,
-                               out_n_k_ho_wo.mDesc.GetLengths()[0],
-                               out_n_k_ho_wo.mDesc.GetLengths()[1],
-                               out_n_k_ho_wo.mDesc.GetLengths()[2],
-                               out_n_k_ho_wo.mDesc.GetLengths()[3])(
-        std::thread::hardware_concurrency());
-}
+using ReferenceConvFwdInstance =
+    ck::tensor_operation::host::ReferenceConvFwd_Bias_Activation_Add<InDataType,
+                                                                     WeiDataType,
+                                                                     OutDataType,
+                                                                     AccDataType,
+                                                                     InElementOp,
+                                                                     WeiElementOp,
+                                                                     OutElementOp>;

 int main(int argc, char* argv[])
 {
@@ -292,18 +275,22 @@ int main(int argc, char* argv[])

    if(do_verification)
    {
-        host_reference_calculation(in_n_c_hi_wi,
-                                   wei_k_c_y_x,
-                                   out_n_k_ho_wo_host_result,
-                                   bias_k,
-                                   resi_n_k_ho_wo,
-                                   conv_filter_strides,
-                                   conv_filter_dilations,
-                                   input_left_pads,
-                                   input_right_pads,
-                                   InElementOp{},
-                                   WeiElementOp{},
-                                   OutElementOp{});
+        auto refConv    = ReferenceConvFwdInstance{};
+        auto refInvoker = refConv.MakeInvoker();
+
+        auto refArgument = refConv.MakeArgument(in_n_c_hi_wi,
+                                                wei_k_c_y_x,
+                                                out_n_k_ho_wo_host_result,
+                                                bias_k,
+                                                resi_n_k_ho_wo,
+                                                conv_filter_strides,
+                                                conv_filter_dilations,
+                                                input_left_pads,
+                                                input_right_pads,
+                                                InElementOp{},
+                                                WeiElementOp{},
+                                                OutElementOp{});
+        refInvoker.Run(refArgument);

        out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());

--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -2,6 +2,7 @@ include_directories(BEFORE
    ${PROJECT_SOURCE_DIR}
    ${PROJECT_SOURCE_DIR}/host/host_tensor/include
    ${PROJECT_SOURCE_DIR}/host/device/include
+    ${PROJECT_SOURCE_DIR}/host/include
    ${PROJECT_SOURCE_DIR}/device_operation/include
    ${PROJECT_SOURCE_DIR}/composable_kernel/include
    ${PROJECT_SOURCE_DIR}/composable_kernel/include/utility
--- a/host/include/reference_conv_fwd.hpp
+++ b/host/include/reference_conv_fwd.hpp
@@ -0,0 +1,166 @@
+#ifndef REFERENCE_CONV_FWD_HPP
+#define REFERENCE_CONV_FWD_HPP
+
+#include <iostream>
+#include <sstream>
+#include "device_base.hpp"
+#include "host_tensor.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace host {
+
+// out[N, K, Ho, Wo] = in[N, C, Hi, Wi] * wei[K, C, Y, X]
+template <typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename AccDataType,
+          typename InElementwiseOperation,
+          typename WeiElementwiseOperation,
+          typename OutElementwiseOperation>
+struct ReferenceConvFwd : public device::BaseOperator
+{
+    // Argument
+    struct Argument : public device::BaseArgument
+    {
+        Argument(const Tensor<InDataType>& in_n_c_hi_wi,
+                 const Tensor<WeiDataType>& wei_k_c_y_x,
+                 Tensor<OutDataType>& out_n_k_ho_wo,
+                 std::vector<ck::index_t> conv_filter_strides,
+                 std::vector<ck::index_t> conv_filter_dilations,
+                 std::vector<ck::index_t> input_left_pads,
+                 std::vector<ck::index_t> input_right_pads,
+                 InElementwiseOperation in_element_op,
+                 WeiElementwiseOperation wei_element_op,
+                 OutElementwiseOperation out_element_op)
+            : in_n_c_hi_wi_{in_n_c_hi_wi},
+              wei_k_c_y_x_{wei_k_c_y_x},
+              out_n_k_ho_wo_{out_n_k_ho_wo},
+              conv_strides_{conv_filter_strides},
+              conv_dilations_{conv_filter_dilations},
+              in_left_pads_{input_left_pads},
+              in_right_pads_{input_right_pads},
+              in_element_op_{in_element_op},
+              wei_element_op_{wei_element_op},
+              out_element_op_{out_element_op}
+        {
+        }
+
+        const Tensor<InDataType>& in_n_c_hi_wi_;
+        const Tensor<WeiDataType>& wei_k_c_y_x_;
+        Tensor<OutDataType>& out_n_k_ho_wo_;
+
+        std::vector<index_t> conv_strides_;
+        std::vector<index_t> conv_dilations_;
+        std::vector<index_t> in_left_pads_;
+        std::vector<index_t> in_right_pads_;
+
+        InElementwiseOperation in_element_op_;
+        WeiElementwiseOperation wei_element_op_;
+        OutElementwiseOperation out_element_op_;
+    };
+
+    // Invoker
+    struct Invoker : public device::BaseInvoker
+    {
+        using Argument = ReferenceConvFwd::Argument;
+
+        float Run(const Argument& arg)
+        {
+            auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
+                float v = 0;
+                for(int c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c)
+                {
+                    for(int y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y)
+                    {
+                        int hi = ho * arg.conv_strides_[0] + y * arg.conv_dilations_[0] -
+                                 arg.in_left_pads_[0];
+                        for(int x = 0; x < arg.wei_k_c_y_x_.mDesc.GetLengths()[3]; ++x)
+                        {
+                            int wi = wo * arg.conv_strides_[1] + x * arg.conv_dilations_[1] -
+                                     arg.in_left_pads_[1];
+                            if(hi >= 0 && hi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && wi >= 0 &&
+                               wi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[3])
+                            {
+                                v += arg.in_element_op_(
+                                         ck::type_convert<float>(arg.in_n_c_hi_wi_(n, c, hi, wi))) *
+                                     arg.wei_element_op_(
+                                         ck::type_convert<float>(arg.wei_k_c_y_x_(k, c, y, x)));
+                            }
+                        }
+                    }
+                }
+
+                arg.out_n_k_ho_wo_(n, k, ho, wo) =
+                    ck::type_convert<OutDataType>(arg.out_element_op_(v));
+            };
+
+            make_ParallelTensorFunctor(f_nchw,
+                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[0],
+                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[1],
+                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[2],
+                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[3])(
+                std::thread::hardware_concurrency());
+            return 0;
+        }
+
+        float Run(const device::BaseArgument* p_arg, int) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg));
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true;
+    }
+
+    bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
+
+    static auto MakeArgument(const Tensor<InDataType>& in_n_c_hi_wi,
+                             const Tensor<WeiDataType>& wei_k_c_y_x,
+                             Tensor<OutDataType>& out_n_k_ho_wo,
+                             std::vector<ck::index_t> conv_filter_strides,
+                             std::vector<ck::index_t> conv_filter_dilations,
+                             std::vector<ck::index_t> input_left_pads,
+                             std::vector<ck::index_t> input_right_pads,
+                             InElementwiseOperation in_element_op,
+                             WeiElementwiseOperation wei_element_op,
+                             OutElementwiseOperation out_element_op)
+    {
+        return Argument{in_n_c_hi_wi,
+                        wei_k_c_y_x,
+                        out_n_k_ho_wo,
+                        conv_filter_strides,
+                        conv_filter_dilations,
+                        input_left_pads,
+                        input_right_pads,
+                        in_element_op,
+                        wei_element_op,
+                        out_element_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        // clang-format off
+        str << "ReferenceConvFwd"
+            << std::endl;
+        // clang-format on
+
+        return str.str();
+    }
+};
+} // namespace host
+} // namespace tensor_operation
+} // namespace ck
+#endif
--- a/host/include/reference_conv_fwd_bias_activation.hpp
+++ b/host/include/reference_conv_fwd_bias_activation.hpp
@@ -0,0 +1,172 @@
+#ifndef REFERENCE_CONV_FWD_BIAS_ACTIVATION_HPP
+#define REFERENCE_CONV_FWD_BIAS_ACTIVATION_HPP
+
+#include <iostream>
+#include <sstream>
+#include "device_base.hpp"
+#include "host_tensor.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace host {
+
+// out[N, Ho, Wo, K] =
+//     activate(in[N, Hi, Wi, C] * wei[K, Y, X, C] + bias[K])
+template <typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename AccDataType,
+          typename InElementwiseOperation,
+          typename WeiElementwiseOperation,
+          typename OutElementwiseOperation>
+struct ReferenceConvFwd_Bias_Activation : public device::BaseOperator
+{
+    // Argument
+    struct Argument : public device::BaseArgument
+    {
+        Argument(const Tensor<InDataType>& in_n_c_hi_wi,
+                 const Tensor<WeiDataType>& wei_k_c_y_x,
+                 Tensor<OutDataType>& out_n_k_ho_wo,
+                 const Tensor<OutDataType>& bias_k,
+                 std::vector<ck::index_t> conv_filter_strides,
+                 std::vector<ck::index_t> conv_filter_dilations,
+                 std::vector<ck::index_t> input_left_pads,
+                 std::vector<ck::index_t> input_right_pads,
+                 InElementwiseOperation in_element_op,
+                 WeiElementwiseOperation wei_element_op,
+                 OutElementwiseOperation out_element_op)
+            : in_n_c_hi_wi_{in_n_c_hi_wi},
+              wei_k_c_y_x_{wei_k_c_y_x},
+              out_n_k_ho_wo_{out_n_k_ho_wo},
+              bias_k_{bias_k},
+              conv_strides_{conv_filter_strides},
+              conv_dilations_{conv_filter_dilations},
+              in_left_pads_{input_left_pads},
+              in_right_pads_{input_right_pads},
+              in_element_op_{in_element_op},
+              wei_element_op_{wei_element_op},
+              out_element_op_{out_element_op}
+        {
+        }
+
+        const Tensor<InDataType>& in_n_c_hi_wi_;
+        const Tensor<WeiDataType>& wei_k_c_y_x_;
+        Tensor<OutDataType>& out_n_k_ho_wo_;
+        const Tensor<OutDataType>& bias_k_;
+
+        std::vector<index_t> conv_strides_;
+        std::vector<index_t> conv_dilations_;
+        std::vector<index_t> in_left_pads_;
+        std::vector<index_t> in_right_pads_;
+
+        InElementwiseOperation in_element_op_;
+        WeiElementwiseOperation wei_element_op_;
+        OutElementwiseOperation out_element_op_;
+    };
+
+    // Invoker
+    struct Invoker : public device::BaseInvoker
+    {
+        using Argument = ReferenceConvFwd_Bias_Activation::Argument;
+
+        float Run(const Argument& arg)
+        {
+            auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
+                float v = 0;
+                for(int c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c)
+                {
+                    for(int y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y)
+                    {
+                        int hi = ho * arg.conv_strides_[0] + y * arg.conv_dilations_[0] -
+                                 arg.in_left_pads_[0];
+                        for(int x = 0; x < arg.wei_k_c_y_x_.mDesc.GetLengths()[3]; ++x)
+                        {
+                            int wi = wo * arg.conv_strides_[1] + x * arg.conv_dilations_[1] -
+                                     arg.in_left_pads_[1];
+                            if(hi >= 0 && hi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && wi >= 0 &&
+                               wi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[3])
+                            {
+                                v += arg.in_element_op_(
+                                         ck::type_convert<float>(arg.in_n_c_hi_wi_(n, c, hi, wi))) *
+                                     arg.wei_element_op_(
+                                         ck::type_convert<float>(arg.wei_k_c_y_x_(k, c, y, x)));
+                            }
+                        }
+                    }
+                }
+
+                arg.out_n_k_ho_wo_(n, k, ho, wo) =
+                    ck::type_convert<OutDataType>(arg.out_element_op_(v, arg.bias_k_(k)));
+            };
+
+            make_ParallelTensorFunctor(f_nchw,
+                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[0],
+                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[1],
+                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[2],
+                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[3])(
+                std::thread::hardware_concurrency());
+            return 0;
+        }
+
+        float Run(const device::BaseArgument* p_arg, int) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg));
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true;
+    }
+
+    bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
+
+    static auto MakeArgument(const Tensor<InDataType>& in_n_c_hi_wi,
+                             const Tensor<WeiDataType>& wei_k_c_y_x,
+                             Tensor<OutDataType>& out_n_k_ho_wo,
+                             const Tensor<OutDataType>& bias_k,
+                             std::vector<ck::index_t> conv_filter_strides,
+                             std::vector<ck::index_t> conv_filter_dilations,
+                             std::vector<ck::index_t> input_left_pads,
+                             std::vector<ck::index_t> input_right_pads,
+                             InElementwiseOperation in_element_op,
+                             WeiElementwiseOperation wei_element_op,
+                             OutElementwiseOperation out_element_op)
+    {
+        return Argument{in_n_c_hi_wi,
+                        wei_k_c_y_x,
+                        out_n_k_ho_wo,
+                        bias_k,
+                        conv_filter_strides,
+                        conv_filter_dilations,
+                        input_left_pads,
+                        input_right_pads,
+                        in_element_op,
+                        wei_element_op,
+                        out_element_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        // clang-format off
+        str << "ReferenceConvFwd_Bias_Activation"
+            << std::endl;
+        // clang-format on
+
+        return str.str();
+    }
+};
+} // namespace host
+} // namespace tensor_operation
+} // namespace ck
+#endif
--- a/host/include/reference_conv_fwd_bias_activation_add.hpp
+++ b/host/include/reference_conv_fwd_bias_activation_add.hpp
@@ -0,0 +1,183 @@
+#ifndef REFERENCE_CONV2D_FWD_BIAS_ACTIVATION_ADD_HPP
+#define REFERENCE_CONV2D_FWD_BIAS_ACTIVATION_ADD_HPP
+
+#include <iostream>
+#include <sstream>
+#include "device_base.hpp"
+#include "host_tensor.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace host {
+
+// out[N, Ho, Wo, K] =
+//     activate(in[N, Hi, Wi, C] * wei[K, Y, X, C] + bias[K]) + residual[N, Ho, Wo, K]
+template <typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename AccDataType,
+          typename InElementwiseOperation,
+          typename WeiElementwiseOperation,
+          typename OutElementwiseOperation>
+struct ReferenceConvFwd_Bias_Activation_Add : public device::BaseOperator
+{
+    // Argument
+    struct Argument : public device::BaseArgument
+    {
+        Argument(const Tensor<InDataType>& in_n_c_hi_wi,
+                 const Tensor<WeiDataType>& wei_k_c_y_x,
+                 Tensor<OutDataType>& out_n_k_ho_wo,
+                 const Tensor<OutDataType>& bias_k,
+                 const Tensor<OutDataType>& resi_n_k_ho_wo,
+                 std::vector<ck::index_t> conv_filter_strides,
+                 std::vector<ck::index_t> conv_filter_dilations,
+                 std::vector<ck::index_t> input_left_pads,
+                 std::vector<ck::index_t> input_right_pads,
+                 InElementwiseOperation in_element_op,
+                 WeiElementwiseOperation wei_element_op,
+                 OutElementwiseOperation out_element_op)
+            : in_n_c_hi_wi_{in_n_c_hi_wi},
+              wei_k_c_y_x_{wei_k_c_y_x},
+              out_n_k_ho_wo_{out_n_k_ho_wo},
+              bias_k_{bias_k},
+              resi_n_k_ho_wo_{resi_n_k_ho_wo},
+              conv_strides_{conv_filter_strides},
+              conv_dilations_{conv_filter_dilations},
+              in_left_pads_{input_left_pads},
+              in_right_pads_{input_right_pads},
+              in_element_op_{in_element_op},
+              wei_element_op_{wei_element_op},
+              out_element_op_{out_element_op}
+        {
+        }
+
+        const Tensor<InDataType>& in_n_c_hi_wi_;
+        const Tensor<WeiDataType>& wei_k_c_y_x_;
+        Tensor<OutDataType>& out_n_k_ho_wo_;
+        const Tensor<OutDataType>& bias_k_;
+        const Tensor<OutDataType>& resi_n_k_ho_wo_;
+
+        std::vector<index_t> conv_strides_;
+        std::vector<index_t> conv_dilations_;
+        std::vector<index_t> in_left_pads_;
+        std::vector<index_t> in_right_pads_;
+
+        InElementwiseOperation in_element_op_;
+        WeiElementwiseOperation wei_element_op_;
+        OutElementwiseOperation out_element_op_;
+    };
+
+    // Invoker
+    struct Invoker : public device::BaseInvoker
+    {
+        using Argument = ReferenceConvFwd_Bias_Activation_Add::Argument;
+
+        float Run(const Argument& arg)
+        {
+            auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
+                float v = 0;
+                for(int c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c)
+                {
+                    for(int y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y)
+                    {
+                        int hi = ho * arg.conv_strides_[0] + y * arg.conv_dilations_[0] -
+                                 arg.in_left_pads_[0];
+                        for(int x = 0; x < arg.wei_k_c_y_x_.mDesc.GetLengths()[3]; ++x)
+                        {
+                            int wi = wo * arg.conv_strides_[1] + x * arg.conv_dilations_[1] -
+                                     arg.in_left_pads_[1];
+                            if(hi >= 0 && hi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && wi >= 0 &&
+                               wi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[3])
+                            {
+                                v += arg.in_element_op_(
+                                         ck::type_convert<float>(arg.in_n_c_hi_wi_(n, c, hi, wi))) *
+                                     arg.wei_element_op_(
+                                         ck::type_convert<float>(arg.wei_k_c_y_x_(k, c, y, x)));
+                            }
+                        }
+                    }
+                }
+
+                float v2 = ck::type_convert<float>(arg.out_n_k_ho_wo_(n, k, ho, wo));
+
+                arg.out_element_op_(v2,
+                                    v,
+                                    ck::type_convert<float>(arg.bias_k_(k)),
+                                    ck::type_convert<float>(arg.resi_n_k_ho_wo_(n, k, ho, wo)));
+
+                arg.out_n_k_ho_wo_(n, k, ho, wo) = ck::type_convert<OutDataType>(v2);
+            };
+
+            make_ParallelTensorFunctor(f_nchw,
+                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[0],
+                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[1],
+                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[2],
+                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[3])(
+                std::thread::hardware_concurrency());
+            return 0;
+        }
+
+        float Run(const device::BaseArgument* p_arg, int) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg));
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true;
+    }
+
+    bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
+
+    static auto MakeArgument(const Tensor<InDataType>& in_n_c_hi_wi,
+                             const Tensor<WeiDataType>& wei_k_c_y_x,
+                             Tensor<OutDataType>& out_n_k_ho_wo,
+                             const Tensor<OutDataType>& bias_k,
+                             const Tensor<OutDataType>& resi_n_k_ho_wo,
+                             std::vector<ck::index_t> conv_filter_strides,
+                             std::vector<ck::index_t> conv_filter_dilations,
+                             std::vector<ck::index_t> input_left_pads,
+                             std::vector<ck::index_t> input_right_pads,
+                             InElementwiseOperation in_element_op,
+                             WeiElementwiseOperation wei_element_op,
+                             OutElementwiseOperation out_element_op)
+    {
+        return Argument{in_n_c_hi_wi,
+                        wei_k_c_y_x,
+                        out_n_k_ho_wo,
+                        bias_k,
+                        resi_n_k_ho_wo,
+                        conv_filter_strides,
+                        conv_filter_dilations,
+                        input_left_pads,
+                        input_right_pads,
+                        in_element_op,
+                        wei_element_op,
+                        out_element_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        // clang-format off
+        str << "ReferenceConvFwd_Bias_Activation_Add"
+            << std::endl;
+        // clang-format on
+
+        return str.str();
+    }
+};
+} // namespace host
+} // namespace tensor_operation
+} // namespace ck
+#endif