Unify Convolution FWD XDL 1D/2D implementation. (#93)

* Convolution ND * Code unification across dimensions for generating tensor descriptors. * Example * Instances * Move convnd f32 instance file to comply with repo structure. * Conv 1D tensor layouts. * Formatting and use ReferenceConv * Reference ConvFwd supporting 1D and 2D convolution. * Debug printing TensorLayout name. * Conv fwd 1D instance f32 * Refactor conv ND example. Needed to support various conv dimensions. * Rename conv nd example directory to prevent conflicts. * Refactor some common utility to single file. Plus some tests. * Refactor GetHostTensorDescriptor + UT. * Add 1D test case. * Test reference convolution 1d/2d * Remove some leftovers. * Fix convolution example error for 1D * Refactor test check errors utility function. * Test Conv2D Fwd XDL * More UT for 1D case. * Parameterize input & weight initializers. * Rename example to prevent conflicts. * Split convnd instance into separate files for 1d/2d * Address review comments. * Fix data type for flops/gbytes calculations. * Assign example number 11. Co-authored-by: Adam Osewski <aosewski@amd.com> Co-authored-by: Chao Liu <chao.liu2@amd.com> [ROCm/composable_kernel commit: 756a761727]
2026-05-18 03:49:41 +00:00 · 2022-02-23 17:44:20 +01:00
parent c2e3fa5c91
commit a2386b6bbf
17 changed files with 2698 additions and 108 deletions
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -10,6 +10,7 @@ include_directories(BEFORE
    ${PROJECT_SOURCE_DIR}/composable_kernel/include/problem_transform
    ${PROJECT_SOURCE_DIR}/external/rocm/include
    ${PROJECT_SOURCE_DIR}/reference_operation/include
+    ${PROJECT_SOURCE_DIR}/test/include
 )

 # test_magic_number_division
@@ -30,3 +31,17 @@ add_executable(test_split_k ${SPLIT_K_SOURCE})
 target_link_libraries(test_split_k PRIVATE host_tensor)
 target_link_libraries(test_split_k PRIVATE device_gemm_instance)

+# test_conv_util
+set(CONV_UTIL_SOURCE conv_util/main.cpp)
+add_executable(test_conv_util ${CONV_UTIL_SOURCE})
+target_link_libraries(test_conv_util PRIVATE host_tensor)
+
+# test_reference_conv_fwd
+set(REFERENCE_CONV_FWD_SOURCE reference_conv_fwd/main.cpp)
+add_executable(test_reference_conv_fwd ${REFERENCE_CONV_FWD_SOURCE})
+target_link_libraries(test_reference_conv_fwd PRIVATE host_tensor)
+
+# test_convnd_fwd_xdl
+set(CONVND_FWD_XDL_SOURCE convnd_fwd_xdl/main.cpp)
+add_executable(test_convnd_fwd_xdl ${CONVND_FWD_XDL_SOURCE})
+target_link_libraries(test_convnd_fwd_xdl PRIVATE host_tensor)
--- a/test/conv_util/main.cpp
+++ b/test/conv_util/main.cpp
@@ -0,0 +1,157 @@
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "config.hpp"
+#include "conv_utils.hpp"
+#include "tensor_layout.hpp"
+
+namespace {
+
+// Compares two vectors element by element with operator!=.
+// Returns true when both have the same size and no pair of elements differs.
+// Otherwise a description of the size mismatch, or of the first differing element,
+// is printed to std::cout followed by `msg`, and false is returned.
+template <typename T>
+bool cmp_vec(const std::vector<T>& out, const std::vector<T>& ref, const std::string& msg)
+{
+    const std::size_t ref_size = ref.size();
+    if(out.size() != ref_size)
+    {
+        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref_size
+                  << std::endl
+                  << msg << std::endl;
+        return false;
+    }
+
+    std::size_t idx = 0;
+    while(idx < ref_size)
+    {
+        if(out[idx] != ref[idx])
+        {
+            std::cout << "out[" << idx << "] != ref[" << idx << "]: " << out[idx]
+                      << "!=" << ref[idx] << std::endl
+                      << msg << std::endl;
+            return false;
+        }
+        ++idx;
+    }
+    return true;
+}
+
+// Verifies ck::conv_util::ConvParams::GetOutputSpatialLengths() against hand-computed
+// output lengths for several stride / padding / dilation settings: first on the
+// default-constructed parameters (2D), then on a 1D problem.
+// Every case is evaluated even after an earlier one failed, so all mismatches are
+// printed. Returns true only if every case matched.
+bool TestConvParams_GetOutputSpatialLengths()
+{
+    bool res{true};
+    // -------------------------- default 2D ------------------------------------
+    // input NCHW {128,192,71,71},
+    // weights KCYX {256,192,3,3},
+    // stride {2,2},
+    // dilations {1,1},
+    // padding {{1,1}, {1,1}}
+    ck::conv_util::ConvParams conv_params;
+    std::vector<ck::index_t> out_spatial_len = conv_params.GetOutputSpatialLengths();
+    // cmp_vec is the left operand of && so it always runs; `res` only accumulates
+    // the verdict instead of being overwritten by the latest case.
+    res = cmp_vec(out_spatial_len,
+                  std::vector<ck::index_t>{36, 36},
+                  "Error: ConvParams 2D default constructor.") &&
+          res;
+
+    conv_params.conv_filter_strides = std::vector<ck::index_t>{1, 1};
+    out_spatial_len                 = conv_params.GetOutputSpatialLengths();
+    res                             = cmp_vec(out_spatial_len,
+                  std::vector<ck::index_t>{71, 71},
+                  "Error: ConvParams 2D stride {1,1}.") &&
+          res;
+
+    conv_params.conv_filter_strides = std::vector<ck::index_t>{2, 2};
+    conv_params.input_left_pads     = std::vector<ck::index_t>{2, 2};
+    conv_params.input_right_pads    = std::vector<ck::index_t>{2, 2};
+    out_spatial_len                 = conv_params.GetOutputSpatialLengths();
+    res                             = cmp_vec(out_spatial_len,
+                  std::vector<ck::index_t>{37, 37},
+                  "Error: ConvParams 2D padding left/right {2,2}.") &&
+          res;
+
+    conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2};
+    out_spatial_len                   = conv_params.GetOutputSpatialLengths();
+    res                               = cmp_vec(out_spatial_len,
+                  std::vector<ck::index_t>{36, 36},
+                  "Error: ConvParams 2D dilation {2,2}.") &&
+          res;
+
+    conv_params.conv_filter_strides   = std::vector<ck::index_t>{3, 3};
+    conv_params.input_left_pads       = std::vector<ck::index_t>{1, 1};
+    conv_params.input_right_pads      = std::vector<ck::index_t>{1, 1};
+    conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2};
+    out_spatial_len                   = conv_params.GetOutputSpatialLengths();
+    res                               = cmp_vec(out_spatial_len,
+                  std::vector<ck::index_t>{23, 23},
+                  "Error: ConvParams 2D strides{3,3}, padding {1,1}, dilations {2,2}.") &&
+          res;
+
+    // -------------------------- 1D ------------------------------------
+    conv_params.num_dim_spatial        = 1;
+    conv_params.filter_spatial_lengths = std::vector<ck::index_t>{3};
+    conv_params.input_spatial_lengths  = std::vector<ck::index_t>{71};
+    conv_params.conv_filter_strides    = std::vector<ck::index_t>{2};
+    conv_params.conv_filter_dilations  = std::vector<ck::index_t>{1};
+    conv_params.input_left_pads        = std::vector<ck::index_t>{1};
+    conv_params.input_right_pads       = std::vector<ck::index_t>{1};
+
+    out_spatial_len = conv_params.GetOutputSpatialLengths();
+    res             = cmp_vec(out_spatial_len,
+                  std::vector<ck::index_t>{36},
+                  "Error: ConvParams 1D default constructor.") &&
+          res;
+
+    // One stride entry per spatial dimension: a 1D problem takes a single value.
+    conv_params.conv_filter_strides = std::vector<ck::index_t>{1};
+    out_spatial_len                 = conv_params.GetOutputSpatialLengths();
+    res                             = cmp_vec(out_spatial_len,
+                  std::vector<ck::index_t>{71},
+                  "Error: ConvParams 1D stride {1}.") &&
+          res;
+
+    conv_params.conv_filter_strides = std::vector<ck::index_t>{2};
+    conv_params.input_left_pads     = std::vector<ck::index_t>{2};
+    conv_params.input_right_pads    = std::vector<ck::index_t>{2};
+    out_spatial_len                 = conv_params.GetOutputSpatialLengths();
+    res                             = cmp_vec(out_spatial_len,
+                  std::vector<ck::index_t>{37},
+                  "Error: ConvParams 1D padding left/right {2}.") &&
+          res;
+
+    conv_params.conv_filter_dilations = std::vector<ck::index_t>{2};
+    out_spatial_len                   = conv_params.GetOutputSpatialLengths();
+    res                               = cmp_vec(out_spatial_len,
+                  std::vector<ck::index_t>{36},
+                  "Error: ConvParams 1D dilation {2}.") &&
+          res;
+
+    conv_params.conv_filter_strides   = std::vector<ck::index_t>{3};
+    conv_params.input_left_pads       = std::vector<ck::index_t>{1};
+    conv_params.input_right_pads      = std::vector<ck::index_t>{1};
+    conv_params.conv_filter_dilations = std::vector<ck::index_t>{2};
+    out_spatial_len                   = conv_params.GetOutputSpatialLengths();
+    res                               = cmp_vec(out_spatial_len,
+                  std::vector<ck::index_t>{23},
+                  "Error: ConvParams 1D strides{3}, padding {1}, dilations {2}.") &&
+          res;
+
+    return res;
+}
+
+// Verifies the lengths and strides of the HostTensorDescriptor produced by
+// ck::conv_util::GetHostTensorDescriptor() for the NHWC, NCHW, NWC and NCW layouts.
+// The expected values list lengths and strides in the order the dims were passed
+// (N, C, then spatial dimensions), whatever the layout.
+// Every check runs even after an earlier one failed; returns true only if all passed.
+bool TestGetHostTensorDescriptor()
+{
+    bool res{true};
+    namespace tl = ck::tensor_layout::convolution;
+    std::vector<std::size_t> dims{2, 3, 4, 5};
+    HostTensorDescriptor h = ck::conv_util::GetHostTensorDescriptor(dims, tl::NHWC{});
+    // cmp_vec is the left operand of && so it always runs; `res` accumulates the
+    // verdict instead of being overwritten by the latest check.
+    res = cmp_vec(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NHWC dimensions lengths!") && res;
+    res = cmp_vec(h.GetStrides(),
+                  {3 * 4 * 5, 1, 3 * 5, 3},
+                  "Error: wrong NHWC dimensions strides!") &&
+          res;
+
+    h   = ck::conv_util::GetHostTensorDescriptor(dims, tl::NCHW{});
+    res = cmp_vec(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NCHW dimensions lengths!") && res;
+    res = cmp_vec(h.GetStrides(),
+                  {3 * 4 * 5, 4 * 5, 5, 1},
+                  "Error: wrong NCHW dimensions strides!") &&
+          res;
+
+    dims = std::vector<std::size_t>{2, 3, 4};
+    h    = ck::conv_util::GetHostTensorDescriptor(dims, tl::NWC{});
+    res  = cmp_vec(h.GetLengths(), {2, 3, 4}, "Error: wrong NWC dimensions lengths!") && res;
+    res  = cmp_vec(h.GetStrides(), {3 * 4, 1, 3}, "Error: wrong NWC dimensions strides!") && res;
+
+    h   = ck::conv_util::GetHostTensorDescriptor(dims, tl::NCW{});
+    res = cmp_vec(h.GetLengths(), {2, 3, 4}, "Error: wrong NCW dimensions lengths!") && res;
+    res = cmp_vec(h.GetStrides(), {3 * 4, 4, 1}, "Error: wrong NCW dimensions strides!") && res;
+
+    return res;
+}
+
+} // namespace
+
+// Runs both conv_util test groups, prints one SUCCESS/FAILURE line per group and
+// returns 0 when both passed, 1 otherwise, so a failure is visible in the exit status.
+int main(void)
+{
+    const bool params_ok = TestConvParams_GetOutputSpatialLengths();
+    std::cout << "TestConvParams_GetOutputSpatialLengths ..... "
+              << (params_ok ? "SUCCESS" : "FAILURE") << std::endl;
+    const bool descriptor_ok = TestGetHostTensorDescriptor();
+    std::cout << "TestGetHostTensorDescriptor ..... " << (descriptor_ok ? "SUCCESS" : "FAILURE")
+              << std::endl;
+    return (params_ok && descriptor_ok) ? 0 : 1;
+}
--- a/test/convnd_fwd_xdl/main.cpp
+++ b/test/convnd_fwd_xdl/main.cpp
@@ -0,0 +1,262 @@
+#include <algorithm>
+#include <cstdlib>
+#include <half.hpp>
+#include <iostream>
+#include <numeric>
+#include <tuple>
+#include <vector>
+
+#include "config.hpp"
+#include "conv_utils.hpp"
+#include "device.hpp"
+#include "device_tensor.hpp"
+#include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
+#include "element_wise_operation.hpp"
+#include "host_tensor.hpp"
+#include "reference_conv_fwd.hpp"
+#include "tensor_layout.hpp"
+#include "test_util.hpp"
+
+namespace {
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
+using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto ConvFwdDefault =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default;
+
+// Device forward-convolution instance used by the tests in this file. It is
+// parameterized on the number of spatial dimensions and on the input / weight /
+// output element types; every other template argument is fixed to the value
+// listed below.
+template <ck::index_t SpatialDims, typename InDataType, typename WeiDataType, typename OutDataType>
+using DeviceConvNDFwdInstance = ck::tensor_operation::device::
+    DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
+        // clang-format off
+        InDataType,         // Input element type
+        WeiDataType,        // Weights element type
+        OutDataType,        // Output element type
+        InDataType,         // NOTE(review): presumably the accumulator type - confirm
+        InElementOp,        // Input Elementwise Operation
+        WeiElementOp,       // Weights Elementwise Operation
+        OutElementOp,       // Output Elementwise Operation
+        ConvFwdDefault,     // ConvForwardSpecialization
+        SpatialDims,        // SpatialDims
+        64,                 // BlockSize
+        16,                 // MPerBlock
+        16,                 // NPerBlock
+        4,                  // K0PerBlock
+        1,                  // K1
+        16,                 // MPerXDL
+        16,                 // NPerXDL
+        1,                  // MXdlPerWave
+        1,                  // NXdlPerWave
+        S<1, 16, 1>,        // ABlockTransferThreadClusterLengths_K0_M_K1
+        S<1, 0, 2>,         // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,         // ABlockTransferSrcAccessOrder
+        2,                  // ABlockTransferSrcVectorDim
+        1,                  // ABlockTransferSrcScalarPerVector
+        1,                  // ABlockTransferDstScalarPerVector_K1
+        true,               // ABlockLdsAddExtraM
+        S<1, 16, 1>,        // BBlockTransferThreadClusterLengths_K0_N_K1
+        S<1, 0, 2>,         // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,         // BBlockTransferSrcAccessOrder
+        2,                  // BBlockTransferSrcVectorDim
+        1,                  // BBlockTransferSrcScalarPerVector
+        1,                  // BBlockTransferDstScalarPerVector_K1
+        true,               // BBlockTransferAddExtraN
+        7,                  // CThreadTransferSrcDstVectorDim
+        1>;                 // CThreadTransferDstScalarPerVector
+// clang-format on
+
+// Creates the host tensors needed for one convolution test described by `params`.
+// Returns a tuple (input, weights, host_output, device_output):
+//  - input is filled with 0.1 * i for the i-th visited element,
+//  - weights are filled with 0.5,
+//  - both output tensors have the output shape and are zero-filled.
+// Dimension lists are built as {N, C, input spatial...}, {K, C, filter spatial...}
+// and {N, K, output spatial...} and handed to GetHostTensorDescriptor together
+// with the matching layout tag.
+template <typename InDataType  = float,
+          typename WeiDataType = float,
+          typename OutDataType = float,
+          typename InLayout    = ck::tensor_layout::convolution::NHWC,
+          typename WeiLayout   = ck::tensor_layout::convolution::KYXC,
+          typename OutLayout   = ck::tensor_layout::convolution::NHWK>
+auto GetHostTensors(const ck::conv_util::ConvParams& params)
+{
+    // Two leading extents followed by the spatial lengths.
+    const auto make_dims = [](auto lead0, auto lead1, const auto& spatial) {
+        std::vector<std::size_t> dims{static_cast<std::size_t>(lead0),
+                                      static_cast<std::size_t>(lead1)};
+        dims.insert(std::end(dims), std::begin(spatial), std::end(spatial));
+        return dims;
+    };
+
+    auto in_dims  = make_dims(params.N, params.C, params.input_spatial_lengths);
+    auto wei_dims = make_dims(params.K, params.C, params.filter_spatial_lengths);
+    auto out_dims = make_dims(params.N, params.K, params.GetOutputSpatialLengths());
+
+    Tensor<InDataType> input(ck::conv_util::GetHostTensorDescriptor(in_dims, InLayout{}));
+    Tensor<WeiDataType> weights(ck::conv_util::GetHostTensorDescriptor(wei_dims, WeiLayout{}));
+    Tensor<OutDataType> host_output(
+        ck::conv_util::GetHostTensorDescriptor(out_dims, OutLayout{}));
+    Tensor<OutDataType> device_output(
+        ck::conv_util::GetHostTensorDescriptor(out_dims, OutLayout{}));
+
+    std::generate(input.begin(), input.end(), [counter = 0]() mutable {
+        return InDataType(counter++) * InDataType(0.1f);
+    });
+    std::fill(weights.begin(), weights.end(), WeiDataType(0.5f));
+    std::fill(host_output.begin(), host_output.end(), OutDataType(0.f));
+    std::fill(device_output.begin(), device_output.end(), OutDataType(0.f));
+
+    return std::make_tuple(input, weights, host_output, device_output);
+}
+
+// Runs the host reference forward convolution (ReferenceConvFwd with NDim spatial
+// dimensions and pass-through element-wise operations) on `input` and `weights`,
+// writing the result into `output`. Strides, dilations and paddings come from `params`.
+template <ck::index_t NDim,
+          typename InDataType  = float,
+          typename WeiDataType = float,
+          typename OutDataType = float>
+void RunReferenceConv(const ck::conv_util::ConvParams& params,
+                      const Tensor<InDataType>& input,
+                      const Tensor<WeiDataType>& weights,
+                      Tensor<OutDataType>& output)
+{
+    using ReferenceOp = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
+                                                                     WeiDataType,
+                                                                     OutDataType,
+                                                                     InElementOp,
+                                                                     WeiElementOp,
+                                                                     OutElementOp,
+                                                                     NDim>;
+
+    auto reference_op = ReferenceOp();
+    auto invoker      = reference_op.MakeInvoker();
+    auto argument     = reference_op.MakeArgument(input,
+                                              weights,
+                                              output,
+                                              params.conv_filter_strides,
+                                              params.conv_filter_dilations,
+                                              params.input_left_pads,
+                                              params.input_right_pads,
+                                              InElementOp{},
+                                              WeiElementOp{},
+                                              OutElementOp{});
+
+    invoker.Run(argument);
+}
+
+// Runs the device convolution instance (DeviceConvNDFwdInstance with NDim spatial
+// dimensions) for the problem described by `params`.
+// `input` and `weights` are copied into device buffers, the operation is invoked and
+// the device output buffer is copied back into `output`.
+// Throws std::runtime_error when the instance reports the argument as unsupported.
+// NOTE(review): std::runtime_error is declared in <stdexcept>, which this file does
+// not include directly.
+template <ck::index_t NDim,
+          typename InDataType  = float,
+          typename WeiDataType = float,
+          typename OutDataType = float>
+void RunConv(const ck::conv_util::ConvParams& params,
+             const Tensor<InDataType>& input,
+             const Tensor<WeiDataType>& weights,
+             Tensor<OutDataType>& output)
+{
+    using DeviceOp = DeviceConvNDFwdInstance<NDim, InDataType, WeiDataType, OutDataType>;
+
+    DeviceMem in_dev(sizeof(InDataType) * input.mDesc.GetElementSpace());
+    DeviceMem wei_dev(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
+    DeviceMem out_dev(sizeof(OutDataType) * output.mDesc.GetElementSpace());
+
+    in_dev.ToDevice(input.mData.data());
+    wei_dev.ToDevice(weights.mData.data());
+    const std::vector<ck::index_t>& out_spatial_lengths = params.GetOutputSpatialLengths();
+
+    auto device_op = DeviceOp();
+    auto invoker   = device_op.MakeInvoker();
+    auto argument  = device_op.MakeArgument(static_cast<InDataType*>(in_dev.GetDeviceBuffer()),
+                                           static_cast<WeiDataType*>(wei_dev.GetDeviceBuffer()),
+                                           static_cast<OutDataType*>(out_dev.GetDeviceBuffer()),
+                                           params.N,
+                                           params.K,
+                                           params.C,
+                                           params.input_spatial_lengths,
+                                           params.filter_spatial_lengths,
+                                           out_spatial_lengths,
+                                           params.conv_filter_strides,
+                                           params.conv_filter_dilations,
+                                           params.input_left_pads,
+                                           params.input_right_pads,
+                                           InElementOp{},
+                                           WeiElementOp{},
+                                           OutElementOp{});
+
+    const bool supported = device_op.IsSupportedArgument(argument);
+    if(!supported)
+    {
+        throw std::runtime_error(
+            "Error! device_conv with the specified compilation parameters does "
+            "not support this Conv problem");
+    }
+
+    invoker.Run(argument);
+    out_dev.FromDevice(output.mData.data());
+}
+
+// 2D forward convolution check: N=2, K=16, C=4, 16x16 input, stride {1,1}; all other
+// ConvParams fields keep their default-constructed values.
+// The device result is compared with the host reference result using check_err with
+// rtol = 1e-5 and atol = 1e-4. Returns true when they agree.
+bool TestConv2DNHWC()
+{
+    ck::conv_util::ConvParams conv_params;
+    conv_params.N                     = 2;
+    conv_params.K                     = 16;
+    conv_params.C                     = 4;
+    conv_params.input_spatial_lengths = std::vector<ck::index_t>{16, 16};
+    conv_params.conv_filter_strides   = std::vector<ck::index_t>{1, 1};
+
+    auto tensors                  = GetHostTensors(conv_params);
+    const Tensor<float>& in       = std::get<0>(tensors);
+    const Tensor<float>& wei      = std::get<1>(tensors);
+    Tensor<float>& reference_out  = std::get<2>(tensors);
+    Tensor<float>& device_out     = std::get<3>(tensors);
+
+    RunReferenceConv<2>(conv_params, in, wei, reference_out);
+    RunConv<2>(conv_params, in, wei, device_out);
+
+    return test_util::check_err(
+        device_out.mData, reference_out.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
+}
+
+// 1D forward convolution check: N=2, K=16, C=4, input length 16, filter length 3,
+// stride 1, dilation 1, left/right padding 1, using the NWC / KXC / NWK layouts.
+// The device result is compared with the host reference result using check_err with
+// rtol = 1e-5 and atol = 1e-4. Returns true when they agree.
+bool TestConv1DNWC()
+{
+    namespace layout = ck::tensor_layout::convolution;
+
+    ck::conv_util::ConvParams conv_params;
+    conv_params.num_dim_spatial        = 1;
+    conv_params.N                      = 2;
+    conv_params.K                      = 16;
+    conv_params.C                      = 4;
+    conv_params.filter_spatial_lengths = std::vector<ck::index_t>{3};
+    conv_params.input_spatial_lengths  = std::vector<ck::index_t>{16};
+    conv_params.conv_filter_strides    = std::vector<ck::index_t>{1};
+    conv_params.conv_filter_dilations  = std::vector<ck::index_t>{1};
+    conv_params.input_left_pads        = std::vector<ck::index_t>{1};
+    conv_params.input_right_pads       = std::vector<ck::index_t>{1};
+
+    auto tensors =
+        GetHostTensors<float, float, float, layout::NWC, layout::KXC, layout::NWK>(conv_params);
+    const Tensor<float>& in      = std::get<0>(tensors);
+    const Tensor<float>& wei     = std::get<1>(tensors);
+    Tensor<float>& reference_out = std::get<2>(tensors);
+    Tensor<float>& device_out    = std::get<3>(tensors);
+
+    RunReferenceConv<1>(conv_params, in, wei, reference_out);
+    RunConv<1>(conv_params, in, wei, device_out);
+
+    return test_util::check_err(
+        device_out.mData, reference_out.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
+}
+
+} // anonymous namespace
+
+// Runs the 1D and 2D device convolution tests, prints one SUCCESS/FAILURE line per
+// test and returns 0 when both passed, 1 otherwise, so a failure is visible in the
+// exit status.
+int main()
+{
+    const bool conv1d_ok = TestConv1DNWC();
+    std::cout << "TestConv1DNWC ..... " << (conv1d_ok ? "SUCCESS" : "FAILURE") << std::endl;
+    const bool conv2d_ok = TestConv2DNHWC();
+    std::cout << "TestConv2DNHWC ..... " << (conv2d_ok ? "SUCCESS" : "FAILURE") << std::endl;
+    return (conv1d_ok && conv2d_ok) ? 0 : 1;
+}
--- a/test/include/test_util.hpp
+++ b/test/include/test_util.hpp
@@ -0,0 +1,84 @@
+#ifndef TEST_UTIL_HPP
+#define TEST_UTIL_HPP
+
+#include <cmath>
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+namespace test_util {
+
+// Compares floating-point results against a reference, element by element.
+//
+// An element counts as an error when |out[i] - ref[i]| > atol + rtol * |ref[i]|, or
+// when out[i] or ref[i] is not finite. The first four offending elements are printed
+// to std::cout, each followed by `msg`; after the scan the largest offending absolute
+// difference is printed.
+//
+// out  - values under test
+// ref  - reference values
+// msg  - text printed after each reported mismatch
+// rtol - relative tolerance, scaled by |ref[i]|
+// atol - absolute tolerance
+//
+// Returns true when the sizes match and no element is in error; a size mismatch is
+// reported and returns false without comparing any element.
+template <typename T>
+typename std::enable_if<std::is_floating_point<T>::value, bool>::type
+check_err(const std::vector<T>& out,
+          const std::vector<T>& ref,
+          const std::string& msg,
+          T rtol = static_cast<T>(1e-5),
+          T atol = static_cast<T>(1e-8))
+{
+    if(out.size() != ref.size())
+    {
+        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
+                  << std::endl
+                  << msg << std::endl;
+        return false;
+    }
+
+    bool res{true};
+    int err_count = 0;
+    // Absolute differences are never negative, so zero is the neutral starting value.
+    // std::numeric_limits<T>::min() is the smallest positive normalized value and
+    // would be printed as a bogus "max err" when every error is a non-finite value.
+    T max_err = 0;
+    for(std::size_t i = 0; i < ref.size(); ++i)
+    {
+        const T err = std::abs(out[i] - ref[i]);
+        if(err > atol + rtol * std::abs(ref[i]) || !std::isfinite(out[i]) || !std::isfinite(ref[i]))
+        {
+            // A NaN difference compares false here and leaves max_err unchanged.
+            max_err = err > max_err ? err : max_err;
+            err_count++;
+            if(err_count < 5)
+            {
+                std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
+                          << i << "]: " << out[i] << "!=" << ref[i] << std::endl
+                          << msg << std::endl;
+            }
+            res = false;
+        }
+    }
+    if(!res)
+    {
+        std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
+    }
+    return res;
+}
+
+// Exact comparison for integral element types.
+// Returns true when `out` and `ref` have the same size and identical elements.
+// Otherwise the size mismatch, or the first differing element, is printed to
+// std::cout followed by `msg`, and false is returned.
+// The two trailing unnamed parameters are ignored.
+template <typename T>
+typename std::enable_if<std::is_integral<T>::value, bool>::type check_err(
+    const std::vector<T>& out, const std::vector<T>& ref, const std::string& msg, T = 0, T = 0)
+{
+    const std::size_t ref_size = ref.size();
+    if(out.size() != ref_size)
+    {
+        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref_size
+                  << std::endl
+                  << msg << std::endl;
+        return false;
+    }
+
+    for(std::size_t idx = 0; idx < ref_size; ++idx)
+    {
+        if(!(out[idx] != ref[idx]))
+        {
+            continue;
+        }
+        std::cout << "out[" << idx << "] != ref[" << idx << "]: " << out[idx] << "!=" << ref[idx]
+                  << std::endl
+                  << msg << std::endl;
+        return false;
+    }
+    return true;
+}
+
+} // namespace test_util
+
+#endif
--- a/test/reference_conv_fwd/main.cpp
+++ b/test/reference_conv_fwd/main.cpp
@@ -0,0 +1,333 @@
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <half.hpp>
+#include <numeric>
+#include <type_traits>
+#include <vector>
+
+#include "config.hpp"
+#include "conv_utils.hpp"
+#include "element_wise_operation.hpp"
+#include "host_tensor.hpp"
+#include "reference_conv_fwd.hpp"
+#include "tensor_layout.hpp"
+#include "test_util.hpp"
+
+namespace {
+using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
+using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
+using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
+
+// Fill functor: writes m_init_value, m_init_value + 1, ... into [first, last),
+// incrementing the written value once per element.
+template <typename T>
+struct FillMonotonicSeq
+{
+    T m_init_value{0};
+
+    template <typename ForwardIter>
+    void operator()(ForwardIter first, ForwardIter last) const
+    {
+        T next_value = m_init_value;
+        for(; first != last; ++first)
+        {
+            *first = next_value;
+            ++next_value;
+        }
+    }
+};
+
+// Fill functor: assigns m_value to every element of [first, last).
+template <typename T>
+struct FillConstant
+{
+    T m_value{0};
+
+    template <typename ForwardIter>
+    void operator()(ForwardIter first, ForwardIter last) const
+    {
+        for(; first != last; ++first)
+        {
+            *first = m_value;
+        }
+    }
+};
+
+// Builds host tensors for the problem described by `params`, fills them and runs
+// the host reference forward convolution (ReferenceConvFwd with NDim spatial
+// dimensions and pass-through element-wise operations).
+//
+// fill_input_op / fill_weights_op are called with the begin/end iterators of the
+// input / weight tensor; by default the input gets a monotonic sequence starting
+// at 0 and the weights the constant 0.5. The output tensor is zero-filled before
+// the run and returned by value.
+template <ck::index_t NDim,
+          typename InDataType    = float,
+          typename WeiDataType   = float,
+          typename OutDataType   = float,
+          typename InLayout      = ck::tensor_layout::convolution::NHWC,
+          typename WeiLayout     = ck::tensor_layout::convolution::KYXC,
+          typename OutLayout     = ck::tensor_layout::convolution::NHWK,
+          typename FillInputOp   = FillMonotonicSeq<InDataType>,
+          typename FillWeightsOp = FillConstant<WeiDataType>>
+Tensor<OutDataType> RunReferenceConv(const ck::conv_util::ConvParams& params,
+                                     const FillInputOp& fill_input_op     = FillInputOp{0},
+                                     const FillWeightsOp& fill_weights_op = FillWeightsOp{0.5f})
+{
+    using ReferenceOp = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
+                                                                     WeiDataType,
+                                                                     OutDataType,
+                                                                     InElementOp,
+                                                                     WeiElementOp,
+                                                                     OutElementOp,
+                                                                     NDim>;
+
+    // Two leading extents followed by the spatial lengths.
+    const auto make_dims = [](auto lead0, auto lead1, const auto& spatial) {
+        std::vector<std::size_t> dims{static_cast<std::size_t>(lead0),
+                                      static_cast<std::size_t>(lead1)};
+        dims.insert(std::end(dims), std::begin(spatial), std::end(spatial));
+        return dims;
+    };
+
+    auto in_dims  = make_dims(params.N, params.C, params.input_spatial_lengths);
+    auto wei_dims = make_dims(params.K, params.C, params.filter_spatial_lengths);
+    auto out_dims = make_dims(params.N, params.K, params.GetOutputSpatialLengths());
+
+    Tensor<InDataType> input(ck::conv_util::GetHostTensorDescriptor(in_dims, InLayout{}));
+    Tensor<WeiDataType> weights(ck::conv_util::GetHostTensorDescriptor(wei_dims, WeiLayout{}));
+    Tensor<OutDataType> host_output(
+        ck::conv_util::GetHostTensorDescriptor(out_dims, OutLayout{}));
+
+    fill_input_op(input.begin(), input.end());
+    fill_weights_op(weights.begin(), weights.end());
+    std::fill(host_output.begin(), host_output.end(), OutDataType(0.f));
+
+    auto reference_op = ReferenceOp();
+    auto invoker      = reference_op.MakeInvoker();
+    auto argument     = reference_op.MakeArgument(input,
+                                              weights,
+                                              host_output,
+                                              params.conv_filter_strides,
+                                              params.conv_filter_dilations,
+                                              params.input_left_pads,
+                                              params.input_right_pads,
+                                              InElementOp{},
+                                              WeiElementOp{},
+                                              OutElementOp{});
+
+    invoker.Run(argument);
+    return host_output;
+}
+
+// Checks the 2D host reference convolution against precomputed results for two
+// problems, using RunReferenceConv's default fill operations:
+//  1. N=1, K=1, C=2, 6x6 input, 3x3 filter, stride 1, dilation 1, no padding;
+//  2. N=1, K=2, C=2, 12x12 input, 3x3 filter, stride 2, dilation 2, padding 1.
+// For each problem the output dimensions are checked first and the data is only
+// compared when they match; once a check has failed no later check is evaluated.
+bool TestConv2DNHWC()
+{
+    const auto matches_reference = [](const Tensor<float>& result,
+                                      const std::vector<std::size_t>& expected_dims,
+                                      const std::vector<float>& expected_data) {
+        return test_util::check_err(result.mDesc.GetLengths(),
+                                    expected_dims,
+                                    "Error: wrong output tensor dimensions!") &&
+               test_util::check_err(result.mData, expected_data, "Error: incorrect results!");
+    };
+
+    ck::conv_util::ConvParams params;
+    params.N                      = 1;
+    params.K                      = 1;
+    params.C                      = 2;
+    params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3};
+    params.input_spatial_lengths  = std::vector<ck::index_t>{6, 6};
+    params.conv_filter_strides    = std::vector<ck::index_t>{1, 1};
+    params.conv_filter_dilations  = std::vector<ck::index_t>{1, 1};
+    params.input_left_pads        = std::vector<ck::index_t>{0, 0};
+    params.input_right_pads       = std::vector<ck::index_t>{0, 0};
+
+    const auto first_out = RunReferenceConv<2>(params);
+    const std::vector<std::size_t> first_dims{1, 1, 4, 4};
+    const std::vector<float> first_data{130.5, 148.5, 166.5, 184.5,
+                                        238.5, 256.5, 274.5, 292.5,
+                                        346.5, 364.5, 382.5, 400.5,
+                                        454.5, 472.5, 490.5, 508.5};
+    bool res = matches_reference(first_out, first_dims, first_data);
+
+    params.N                      = 1;
+    params.K                      = 2;
+    params.C                      = 2;
+    params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3};
+    params.input_spatial_lengths  = std::vector<ck::index_t>{12, 12};
+    params.conv_filter_strides    = std::vector<ck::index_t>{2, 2};
+    params.conv_filter_dilations  = std::vector<ck::index_t>{2, 2};
+    params.input_left_pads        = std::vector<ck::index_t>{1, 1};
+    params.input_right_pads       = std::vector<ck::index_t>{1, 1};
+
+    const auto second_out = RunReferenceConv<2>(params);
+    const std::vector<std::size_t> second_dims{1, 2, 5, 5};
+    const std::vector<float> second_data{
+        210.,  210.,  327.,   327.,   351.,   351.,   375.,   375.,   399.,   399.,
+        459.,  459.,  706.5,  706.5,  742.5,  742.5,  778.5,  778.5,  814.5,  814.5,
+        747.,  747.,  1138.5, 1138.5, 1174.5, 1174.5, 1210.5, 1210.5, 1246.5, 1246.5,
+        1035., 1035., 1570.5, 1570.5, 1606.5, 1606.5, 1642.5, 1642.5, 1678.5, 1678.5,
+        1323., 1323., 2002.5, 2002.5, 2038.5, 2038.5, 2074.5, 2074.5, 2110.5, 2110.5};
+    res = res && matches_reference(second_out, second_dims, second_data);
+
+    return res;
+}
+
+// Exercises the reference forward convolution on 1D problems using the
+// NWC / KXC / NWK layout tags and compares both the output lengths and the
+// output values against hard-coded expectations.
+//
+// Three problem configurations are run one after another; the last one also
+// passes a custom initializer callback to RunReferenceConv.
+//
+// Returns true only if every comparison succeeded.
+bool TestConv1DNWC()
+{
+    namespace layout = ck::tensor_layout::convolution;
+    using Lengths    = std::vector<ck::index_t>;
+
+    bool res{true};
+
+    // Compares the lengths and then the data of `tensor` with the expectations.
+    // Once `res` is false the remaining comparisons are short-circuited.
+    const auto verify = [&res](const auto& tensor,
+                               const std::vector<std::size_t>& expected_dims,
+                               const std::vector<float>& expected_data) {
+        res = res && test_util::check_err(tensor.mDesc.GetLengths(),
+                                          expected_dims,
+                                          "Error: wrong output tensor dimensions!");
+        res = res && test_util::check_err(tensor.mData, expected_data, "Error: incorrect results!");
+    };
+
+    ck::conv_util::ConvParams params;
+
+    // Case 1: unit stride and dilation, no padding.
+    params.num_dim_spatial        = 1;
+    params.N                      = 1;
+    params.K                      = 1;
+    params.C                      = 2;
+    params.filter_spatial_lengths = Lengths{3};
+    params.input_spatial_lengths  = Lengths{6};
+    params.conv_filter_strides    = Lengths{1};
+    params.conv_filter_dilations  = Lengths{1};
+    params.input_left_pads        = Lengths{0};
+    params.input_right_pads       = Lengths{0};
+
+    auto out_basic =
+        RunReferenceConv<1, float, float, float, layout::NWC, layout::KXC, layout::NWK>(params);
+    const std::vector<std::size_t> dims_basic{1, 1, 4};
+    const std::vector<float> data_basic{7.5, 13.5, 19.5, 25.5};
+    verify(out_basic, dims_basic, data_basic);
+
+    // Case 2: stride 2, dilation 2, one element of padding on each side.
+    params.num_dim_spatial        = 1;
+    params.N                      = 1;
+    params.K                      = 2;
+    params.C                      = 2;
+    params.filter_spatial_lengths = Lengths{3};
+    params.input_spatial_lengths  = Lengths{12};
+    params.conv_filter_strides    = Lengths{2};
+    params.conv_filter_dilations  = Lengths{2};
+    params.input_left_pads        = Lengths{1};
+    params.input_right_pads       = Lengths{1};
+
+    auto out_strided =
+        RunReferenceConv<1, float, float, float, layout::NWC, layout::KXC, layout::NWK>(params);
+    const std::vector<std::size_t> dims_strided{1, 2, 5};
+    const std::vector<float> data_strided{9., 9., 19.5, 19.5, 31.5, 31.5, 43.5, 43.5, 55.5, 55.5};
+    verify(out_strided, dims_strided, data_strided);
+
+    // Case 3: larger problem with unit stride/dilation and padding of one.
+    params.num_dim_spatial        = 1;
+    params.N                      = 2;
+    params.K                      = 16;
+    params.C                      = 4;
+    params.filter_spatial_lengths = Lengths{3};
+    params.input_spatial_lengths  = Lengths{16};
+    params.conv_filter_strides    = Lengths{1};
+    params.conv_filter_dilations  = Lengths{1};
+    params.input_left_pads        = Lengths{1};
+    params.input_right_pads       = Lengths{1};
+
+    // The callback fills the range it is given with 0, 0.1, 0.2, ...
+    const auto ramp_init = [](auto first, auto last) {
+        std::generate(first, last, [n = 0]() mutable { return static_cast<float>(n++) * 0.1f; });
+    };
+    auto out_large =
+        RunReferenceConv<1, float, float, float, layout::NWC, layout::KXC, layout::NWK>(params,
+                                                                                         ramp_init);
+
+    // The expected buffer is made of 32 distinct values, each appearing 16 times
+    // in a row, so only the distinct values are spelled out and then expanded.
+    // NOTE(review): the run length equals K; presumably each run spans the
+    // output channels of one output position -- confirm against the layout.
+    const std::size_t run_length = 16;
+    const std::vector<float> distinct_values{
+        1.4,  3.3,       5.7,       8.1,       10.5,      12.900001, 15.3, 17.7,
+        20.1, 22.5,      24.900002, 27.300001, 29.7,      32.100002, 34.5, 23.8,
+        27.,  41.7,      44.100002, 46.5,      48.899998, 51.3,      53.7, 56.100002,
+        58.5, 60.899998, 63.3,      65.7,      68.1,      70.5,      72.9, 49.4};
+
+    std::vector<float> data_large;
+    data_large.reserve(distinct_values.size() * run_length);
+    for(const float value : distinct_values)
+    {
+        data_large.insert(data_large.end(), run_length, value);
+    }
+
+    const std::vector<std::size_t> dims_large{2, 16, 16};
+    verify(out_large, dims_large, data_large);
+
+    return res;
+}
+
+} // anonymous namespace
+
+// Runs every reference-convolution test defined in this file and prints one
+// SUCCESS/FAILURE line per test.
+//
+// Returns 0 when all tests passed and 1 when at least one failed, so that a
+// test driver looking at the process exit status can detect failures.
+int main(void)
+{
+    // Each test is executed unconditionally so that every result is reported,
+    // even when an earlier test has already failed.
+    const bool conv2d_ok = TestConv2DNHWC();
+    std::cout << "TestConv2DNHWC ..... " << (conv2d_ok ? "SUCCESS" : "FAILURE") << std::endl;
+
+    const bool conv1d_ok = TestConv1DNWC();
+    std::cout << "TestConv1DNWC ..... " << (conv1d_ok ? "SUCCESS" : "FAILURE") << std::endl;
+
+    const bool all_passed = conv2d_ok && conv1d_ok;
+    return all_passed ? 0 : 1;
+}