Mirror of https://github.com/ROCm/composable_kernel.git
backward data (#7)
* enabled atomic add in tensor copy
* added gridwise GEMM
* added backward data conv using GEMM + atomic
* added backward data conv using GEMM, no atomic
@@ -2,39 +2,7 @@
#define CONV_COMMON_HPP

#include "ConstantTensorDescriptor_deprecated.hpp"

// this is ugly, only for 4d
template <class InDesc, class WeiDesc>
constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc, WeiDesc)
{
    using namespace ck;

    constexpr auto in_desc  = InDesc{};
    constexpr auto wei_desc = WeiDesc{};

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    static_assert(in_desc.GetNumOfDimension() == 4, "input nDim is not 4");
    static_assert(wei_desc.GetNumOfDimension() == 4, "weight nDim is not 4");
    static_assert(in_desc.GetLength(I1) == wei_desc.GetLength(I1),
                  "input & weight dimension not consistent");

    constexpr auto N  = in_desc.GetLength(I0);
    constexpr auto HI = in_desc.GetLength(I2);
    constexpr auto WI = in_desc.GetLength(I3);

    constexpr auto K = wei_desc.GetLength(I0);
    constexpr auto Y = wei_desc.GetLength(I2);
    constexpr auto X = wei_desc.GetLength(I3);

    constexpr auto HO = HI + 1 - Y;
    constexpr auto WO = WI + 1 - X;

    return make_ConstantTensorDescriptor_packed(Sequence<N, K, HO, WO>{});
}
#include "tensor_descriptor.hpp"

template <class InDesc,
          class WeiDesc,
@@ -42,7 +10,7 @@ template <class InDesc,
          class ConvDilations,
          class LowerPads,
          class UpperPads>
constexpr auto get_convolution_with_padding_output_default_4d_tensor_descriptor(
constexpr auto get_convolution_output_default_4d_tensor_descriptor_deprecated(
    InDesc, WeiDesc, ConvStrides, ConvDilations, LowerPads, UpperPads)
{
    using namespace ck;
@@ -83,6 +51,53 @@ constexpr auto get_convolution_with_padding_output_default_4d_tensor_descriptor(
    return make_ConstantTensorDescriptor_packed(Sequence<N, K, Ho, Wo>{});
}

template <class InDesc,
          class WeiDesc,
          class ConvStrides,
          class ConvDilations,
          class LowerPads,
          class UpperPads>
constexpr auto get_convolution_output_default_4d_tensor_descriptor(
    InDesc, WeiDesc, ConvStrides, ConvDilations, LowerPads, UpperPads)
{
    using namespace ck;

    constexpr auto in_desc  = InDesc{};
    constexpr auto wei_desc = WeiDesc{};

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    static_assert(in_desc.GetNumOfDimension() == 4, "input nDim is not 4");
    static_assert(wei_desc.GetNumOfDimension() == 4, "weight nDim is not 4");
    static_assert(in_desc.GetLength(I1) == wei_desc.GetLength(I1),
                  "input & weight dimension not consistent");

    constexpr index_t N  = in_desc.GetLength(I0);
    constexpr index_t Hi = in_desc.GetLength(I2);
    constexpr index_t Wi = in_desc.GetLength(I3);

    constexpr index_t K = wei_desc.GetLength(I0);
    constexpr index_t Y = wei_desc.GetLength(I2);
    constexpr index_t X = wei_desc.GetLength(I3);

    constexpr index_t HPadLow = LowerPads{}.Get(I0);
    constexpr index_t WPadLow = LowerPads{}.Get(I1);

    constexpr index_t HPadUp = UpperPads{}.Get(I0);
    constexpr index_t WPadUp = UpperPads{}.Get(I1);

    constexpr index_t YEff = (Y - 1) * ConvDilations{}[0] + 1;
    constexpr index_t XEff = (X - 1) * ConvDilations{}[1] + 1;

    constexpr index_t Ho = (Hi + HPadLow + HPadUp - YEff) / ConvStrides{}[0] + 1;
    constexpr index_t Wo = (Wi + WPadLow + WPadUp - XEff) / ConvStrides{}[1] + 1;

    return make_native_tensor_descriptor_packed(Sequence<N, K, Ho, Wo>{});
}

template <class InDesc, class WeiDesc, class OutDesc>
constexpr std::size_t calculate_convolution_flops(InDesc, WeiDesc, OutDesc)
{
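As a quick, hedged illustration (not part of the commit; the sizes below are made up), the padded output-size arithmetic above reduces to the usual (size + padLow + padUp - effective_filter) / stride + 1:

    // Standalone sanity check of the Ho/Wo formula used above (illustrative values only).
    constexpr int conv_out_size(int in, int pad_low, int pad_up, int filter, int dilation, int stride)
    {
        // the effective filter span ((filter - 1) * dilation + 1) plays the role of YEff / XEff
        return (in + pad_low + pad_up - ((filter - 1) * dilation + 1)) / stride + 1;
    }
    static_assert(conv_out_size(28, 1, 1, 3, 1, 1) == 28, "3x3 filter, pad 1, stride 1 keeps the size");
    static_assert(conv_out_size(28, 1, 1, 3, 1, 2) == 14, "stride 2 halves it");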
driver/include/device_col2im_eb_nchw.hpp (new file, 108 lines)
@@ -0,0 +1,108 @@
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "gridwise_col2im_eb_nchw.hpp"

template <typename T, typename ColDesc, typename ImgDesc, typename FilterSizes,
          typename OutputSizes, typename ConvStrides, typename ConvDilations,
          typename LeftPads, typename RightPads>
void device_col2im_eb_nchw(ColDesc, const Tensor<T>& col_eb, ImgDesc, Tensor<T>& img_nchw,
                           FilterSizes, OutputSizes, ConvStrides, ConvDilations,
                           LeftPads, RightPads, std::size_t nrepeat)
{
    using namespace ck;

    constexpr auto col_eb_desc   = ColDesc{};
    constexpr auto img_nchw_desc = ImgDesc{};

    constexpr index_t N  = img_nchw_desc.GetLengths()[0];
    constexpr index_t C  = img_nchw_desc.GetLengths()[1];
    constexpr index_t Hi = img_nchw_desc.GetLengths()[2];
    constexpr index_t Wi = img_nchw_desc.GetLengths()[3];

    constexpr index_t E = col_eb_desc.GetLengths()[0];
    constexpr index_t B = col_eb_desc.GetLengths()[1];

    std::size_t data_sz = sizeof(T);
    DeviceMem col_eb_device_buf(data_sz * col_eb.mDesc.GetElementSpace());
    DeviceMem img_nchw_device_buf(data_sz * img_nchw.mDesc.GetElementSpace());

    col_eb_device_buf.ToDevice(col_eb.mData.data());
    img_nchw_device_buf.ToDevice(img_nchw.mData.data());

#if 1
    constexpr index_t BlockSize = 256;

    constexpr index_t EPerBlock = 128;
    constexpr index_t BPerBlock = 128;

    using BlockCopySubLengths_E_B            = Sequence<8, 8>;
    using BlockCopyClusterLengths_E_B        = Sequence<16, 16>;
    using BlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
    using BlockCopySrcAccessOrder            = Sequence<0, 1>; // [E, B]
    using BlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, B]

    constexpr index_t BlockCopyDataPerAccess_B = 1;
#endif

    constexpr index_t GridSize =
        ((E + EPerBlock - 1) / EPerBlock) * ((B + BPerBlock - 1) / BPerBlock);

    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

    constexpr auto gridwise_col2im = GridwiseCol2Im_eb_nchw<
        GridSize, BlockSize, T, ColDesc, ImgDesc, FilterSizes, OutputSizes, ConvStrides,
        ConvDilations, LeftPads, RightPads, EPerBlock, BPerBlock, BlockCopySubLengths_E_B,
        BlockCopyClusterLengths_E_B, BlockCopyThreadClusterArrangeOrder,
        BlockCopySrcAccessOrder, BlockCopyDstAccessOrder, BlockCopyDataPerAccess_B>{};

    for(index_t i = 0; i < nrepeat; ++i)
    {
        float time = launch_kernel(
            run_gridwise_operation<decltype(gridwise_col2im),
                                   const T* const __restrict__,
                                   T* const __restrict__>,
            dim3(GridSize),
            dim3(BlockSize),
            0,
            gridwise_col2im,
            const_cast<const T* const __restrict__>(
                static_cast<T*>(col_eb_device_buf.GetDeviceBuffer())),
            const_cast<T* const __restrict__>(
                static_cast<T*>(img_nchw_device_buf.GetDeviceBuffer())));

        printf("Elapsed time : %f ms\n", time);
        usleep(std::min(time * 1000, float(10000)));
    }

    img_nchw_device_buf.FromDevice(img_nchw.mData.data());
}
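The driver above treats col2im as a copy from an E x B column matrix back into the N x C x Hi x Wi image. As a rough sketch (names hypothetical, not from the commit), the two lengths follow the usual im2col convention:

    // Hypothetical helper: assumed E x B shape of the column buffer consumed by device_col2im_eb_nchw.
    struct Col2ImShape
    {
        int N, C, Y, X, Ho, Wo;               // image, filter and output sizes
        int E() const { return C * Y * X; }   // one row per (c, y, x) filter tap
        int B() const { return N * Ho * Wo; } // one column per output pixel
    };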
@@ -0,0 +1,143 @@
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp"

template <typename T, typename InDesc, typename WeiDesc, typename OutDesc,
          typename ConvStrides, typename ConvDilations, typename LeftPads, typename RightPads>
void device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw(
    InDesc in_nchw_desc, Tensor<T>& in_nchw,
    WeiDesc wei_kcyx_desc, const Tensor<T>& wei_kcyx,
    OutDesc out_nkhw_desc, const Tensor<T>& out_nkhw,
    ConvStrides, ConvDilations, LeftPads, RightPads, std::size_t nrepeat)
{
    using namespace ck;

    constexpr index_t N  = out_nkhw_desc.GetLengths()[0];
    constexpr index_t K  = out_nkhw_desc.GetLengths()[1];
    constexpr index_t Ho = out_nkhw_desc.GetLengths()[2];
    constexpr index_t Wo = out_nkhw_desc.GetLengths()[3];

    constexpr index_t C = wei_kcyx_desc.GetLengths()[1];
    constexpr index_t Y = wei_kcyx_desc.GetLengths()[2];
    constexpr index_t X = wei_kcyx_desc.GetLengths()[3];

    std::size_t data_sz = sizeof(T);
    DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
    DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
    DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());

    in_nchw_device_buf.ToDevice(in_nchw.mData.data());
    wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

#if 1
    // BlockSize = 256, each thread holds 64 data
    constexpr index_t BlockSize = 256;

    constexpr index_t GemmMPerBlock = 128;
    constexpr index_t GemmNPerBlock = 128;
    constexpr index_t GemmKPerBlock = 8;
    constexpr index_t GemmMPerThreadSubC = 4;
    constexpr index_t GemmNPerThreadSubC = 4;
    constexpr index_t GemmMLevel0Cluster = 4;
    constexpr index_t GemmNLevel0Cluster = 4;
    constexpr index_t GemmMLevel1Cluster = 4;
    constexpr index_t GemmNLevel1Cluster = 4;
    constexpr index_t GemmKPerThreadLoop = 1;
    constexpr index_t GemmThreadGemmDataPerReadM = 4;
    constexpr index_t GemmThreadGemmDataPerReadN = 4;

    using GemmABlockCopySubLengths     = Sequence<1, 4>;  // Gemm-K, Gemm-M
    using GemmABlockCopyClusterLengths = Sequence<8, 32>; // Gemm-K, Gemm-M

    constexpr index_t GemmABlockCopyDataPerAccess = 4; // Gemm-M

    using GemmBBlockCopySubLengths     = Sequence<4, 1>;   // Gemm-K, Gemm-N
    using GemmBBlockCopyClusterLengths = Sequence<2, 128>; // Gemm-K, Gemm-N

    constexpr index_t GemmBBlockCopyDataPerAccess = 1; // Gemm-N

    constexpr index_t GemmCThreadCopyDataPerAccess = 1; // Gemm-N
#endif

    constexpr index_t GemmM = C * Y * X;
    constexpr index_t GemmN = N * Ho * Wo;

    constexpr index_t GridSize = ((GemmM + GemmMPerBlock - 1) / GemmMPerBlock) *
                                 ((GemmN + GemmNPerBlock - 1) / GemmNPerBlock);

    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

    constexpr auto gridwise_conv = GridwiseConvolutionBackwardDataImplicitGemm_v1r1_nchw_kcyx_nkhw<
        GridSize, BlockSize, T, T,
        decltype(in_nchw_desc), decltype(wei_kcyx_desc), decltype(out_nkhw_desc),
        ConvStrides, ConvDilations, LeftPads, RightPads,
        GemmMPerBlock, GemmNPerBlock, GemmKPerBlock,
        GemmMPerThreadSubC, GemmNPerThreadSubC,
        GemmMLevel0Cluster, GemmNLevel0Cluster, GemmMLevel1Cluster, GemmNLevel1Cluster,
        GemmKPerThreadLoop, GemmThreadGemmDataPerReadM, GemmThreadGemmDataPerReadN,
        GemmABlockCopySubLengths, GemmABlockCopyClusterLengths, GemmABlockCopyDataPerAccess,
        GemmBBlockCopySubLengths, GemmBBlockCopyClusterLengths, GemmBBlockCopyDataPerAccess,
        GemmCThreadCopyDataPerAccess>{};

    for(index_t i = 0; i < nrepeat; ++i)
    {
        float time = launch_kernel(
            run_gridwise_operation<decltype(gridwise_conv),
                                   T* const __restrict__,
                                   const T* const __restrict__,
                                   const T* const __restrict__>,
            dim3(GridSize),
            dim3(BlockSize),
            0,
            gridwise_conv,
            const_cast<T* const __restrict__>(
                static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer())),
            const_cast<const T* const __restrict__>(
                static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer())),
            const_cast<const T* const __restrict__>(
                static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer())));

        printf("Elapsed time : %f ms, %f TFlop/s\n",
               time,
               (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
                   (std::size_t(1000) * 1000 * 1000) / time);
        usleep(std::min(time * 1000, float(10000)));
    }

    in_nchw_device_buf.FromDevice(in_nchw.mData.data());
}
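In this v1r1 driver, the backward-data problem is mapped onto one implicit GEMM with GemmM = C * Y * X and GemmN = N * Ho * Wo (both products appear directly in the code above), and the grid tiles that output 128 x 128. A hedged illustration of the grid-size arithmetic with made-up layer sizes:

    // Illustrative only: the grid-size arithmetic used by the driver above, with assumed sizes.
    constexpr int idiv_ceil(int a, int b) { return (a + b - 1) / b; }

    constexpr int C = 256, Y = 3, X = 3;    // assumed weight shape K x C x Y x X
    constexpr int N = 64, Ho = 14, Wo = 14; // assumed output shape N x K x Ho x Wo

    constexpr int GemmM    = C * Y * X;   // 2304
    constexpr int GemmN    = N * Ho * Wo; // 12544
    constexpr int GridSize = idiv_ceil(GemmM, 128) * idiv_ceil(GemmN, 128); // 18 * 98 = 1764 workgroups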
@@ -0,0 +1,155 @@
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "gridwise_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw_lds_double_buffer.hpp"

template <typename T, typename InDesc, typename WeiDesc, typename OutDesc,
          typename ConvStrides, typename ConvDilations, typename LeftPads, typename RightPads>
void device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw(
    InDesc in_nchw_desc, Tensor<T>& in_nchw,
    WeiDesc wei_kcyx_desc, const Tensor<T>& wei_kcyx,
    OutDesc out_nkhw_desc, const Tensor<T>& out_nkhw,
    ConvStrides, ConvDilations, LeftPads, RightPads, std::size_t nrepeat)
{
    using namespace ck;

    constexpr index_t N  = out_nkhw_desc.GetLengths()[0];
    constexpr index_t K  = out_nkhw_desc.GetLengths()[1];
    constexpr index_t Ho = out_nkhw_desc.GetLengths()[2];
    constexpr index_t Wo = out_nkhw_desc.GetLengths()[3];

    constexpr index_t C = wei_kcyx_desc.GetLengths()[1];
    constexpr index_t Y = wei_kcyx_desc.GetLengths()[2];
    constexpr index_t X = wei_kcyx_desc.GetLengths()[3];

    std::size_t data_sz = sizeof(T);
    DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
    DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
    DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());

    in_nchw_device_buf.ToDevice(in_nchw.mData.data());
    wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

#if 1
    // BlockSize = 256, each thread holds 64 data
    constexpr index_t BlockSize = 256;

    constexpr index_t BPerBlock = 32;
    constexpr index_t EPerBlock = 32;
    constexpr index_t KPerBlock = 8;

    constexpr index_t GemmMPerThreadSubC = 4;
    constexpr index_t GemmNPerThreadSubC = 4;
    constexpr index_t GemmMLevel0Cluster = 4;
    constexpr index_t GemmNLevel0Cluster = 4;
    constexpr index_t GemmMLevel1Cluster = 4;
    constexpr index_t GemmNLevel1Cluster = 4;
    constexpr index_t GemmKPerThreadLoop = 1;
    constexpr index_t GemmDataPerReadA = 4;
    constexpr index_t GemmDataPerReadB = 4;

    using OutBlockCopySubLengths_K_B_N0     = Sequence<1, 1, 4>;
    using OutBlockCopyClusterLengths_K_B_N0 = Sequence<8, 32, 1>;

    constexpr index_t OutBlockCopySrcDataPerRead_B   = 1;
    constexpr index_t OutBlockCopyDstDataPerWrite_N0 = 4;

    using WeiBlockCopySubLengths_K_E_C0     = Sequence<1, 4, 1>;
    using WeiBlockCopyClusterLengths_K_E_C0 = Sequence<8, 8, 4>;

    constexpr index_t WeiBlockCopySrcDataPerRead_E   = 4;
    constexpr index_t WeiBlockCopyDstDataPerWrite_C0 = 1;

    constexpr index_t InThreadCopyDstDataPerWrite_B = 1;
#endif

    constexpr index_t C0 = GemmMPerThreadSubC;
    constexpr index_t N0 = GemmNPerThreadSubC;

    constexpr index_t C1 = C / C0;
    constexpr index_t N1 = N / N0;

    constexpr index_t E = C1 * Y * X;
    constexpr index_t B = (N1 * Ho * Wo);

    constexpr index_t GridSize =
        ((E + EPerBlock - 1) / EPerBlock) * ((B + BPerBlock - 1) / BPerBlock);

    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

    constexpr auto gridwise_conv =
        GridwiseConvolutionBackwardDataImplicitGemm_v1r2_nchw_kcyx_nkhw_lds_double_buffer<
            GridSize, BlockSize, T, T,
            decltype(in_nchw_desc), decltype(wei_kcyx_desc), decltype(out_nkhw_desc),
            ConvStrides, ConvDilations, LeftPads, RightPads,
            EPerBlock, BPerBlock, KPerBlock,
            GemmMPerThreadSubC, GemmNPerThreadSubC,
            GemmMLevel0Cluster, GemmNLevel0Cluster, GemmMLevel1Cluster, GemmNLevel1Cluster,
            GemmKPerThreadLoop, GemmDataPerReadA, GemmDataPerReadB,
            OutBlockCopySubLengths_K_B_N0, OutBlockCopyClusterLengths_K_B_N0,
            OutBlockCopySrcDataPerRead_B, OutBlockCopyDstDataPerWrite_N0,
            WeiBlockCopySubLengths_K_E_C0, WeiBlockCopyClusterLengths_K_E_C0,
            WeiBlockCopySrcDataPerRead_E, WeiBlockCopyDstDataPerWrite_C0,
            InThreadCopyDstDataPerWrite_B>{};

    for(index_t i = 0; i < nrepeat; ++i)
    {
        float time = launch_kernel(
            run_gridwise_operation<decltype(gridwise_conv),
                                   T* const __restrict__,
                                   const T* const __restrict__,
                                   const T* const __restrict__>,
            dim3(GridSize),
            dim3(BlockSize),
            0,
            gridwise_conv,
            const_cast<T* const __restrict__>(
                static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer())),
            const_cast<const T* const __restrict__>(
                static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer())),
            const_cast<const T* const __restrict__>(
                static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer())));

        printf("Elapsed time : %f ms, %f TFlop/s\n",
               time,
               (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
                   (std::size_t(1000) * 1000 * 1000) / time);
        usleep(std::min(time * 1000, float(10000)));
    }

    in_nchw_device_buf.FromDevice(in_nchw.mData.data());
}
@@ -0,0 +1,195 @@
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "gridwise_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp"

template <typename T, typename InDesc, typename WeiDesc, typename OutDesc,
          typename ConvStrides, typename ConvDilations, typename LeftPads, typename RightPads>
void device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw(
    InDesc in_nchw_desc, Tensor<T>& in_nchw,
    WeiDesc wei_kcyx_desc, const Tensor<T>& wei_kcyx,
    OutDesc out_nkhw_desc, const Tensor<T>& out_nkhw,
    ConvStrides, ConvDilations, LeftPads, RightPads, std::size_t nrepeat)
{
    using namespace ck;

    constexpr index_t N  = out_nkhw_desc.GetLengths()[0];
    constexpr index_t K  = out_nkhw_desc.GetLengths()[1];
    constexpr index_t Ho = out_nkhw_desc.GetLengths()[2];
    constexpr index_t Wo = out_nkhw_desc.GetLengths()[3];

    constexpr index_t C = wei_kcyx_desc.GetLengths()[1];
    constexpr index_t Y = wei_kcyx_desc.GetLengths()[2];
    constexpr index_t X = wei_kcyx_desc.GetLengths()[3];

    constexpr index_t ConvStrideH = ConvStrides{}[0];
    constexpr index_t ConvStrideW = ConvStrides{}[1];

    constexpr index_t ConvDilationH = ConvDilations{}[0];
    constexpr index_t ConvDilationW = ConvDilations{}[1];

    std::size_t data_sz = sizeof(T);
    DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
    DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
    DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());

    in_nchw_device_buf.ToDevice(in_nchw.mData.data());
    wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

#if 1
    // BlockSize = 256, each thread holds 64 data
    constexpr index_t BlockSize = 256;

    constexpr index_t GemmMPerBlock = 128;
    constexpr index_t GemmNPerBlock = 128;
    constexpr index_t GemmKPerBlock = 8;
    constexpr index_t GemmMPerThreadSubC = 4;
    constexpr index_t GemmNPerThreadSubC = 4;
    constexpr index_t GemmMLevel0Cluster = 4;
    constexpr index_t GemmNLevel0Cluster = 4;
    constexpr index_t GemmMLevel1Cluster = 4;
    constexpr index_t GemmNLevel1Cluster = 4;
    constexpr index_t GemmKPerThreadLoop = 1;
    constexpr index_t GemmThreadGemmDataPerReadM = 4;
    constexpr index_t GemmThreadGemmDataPerReadN = 4;

    using GemmABlockCopySubLengths     = Sequence<4, 1>;   // Gemm-K, Gemm-M
    using GemmABlockCopyClusterLengths = Sequence<2, 128>; // Gemm-K, Gemm-M

    constexpr index_t GemmABlockCopyDataPerAccess = 1; // Gemm-M

    using GemmBBlockCopySubLengths     = Sequence<4, 1>;   // Gemm-K, Gemm-N
    using GemmBBlockCopyClusterLengths = Sequence<2, 128>; // Gemm-K, Gemm-N

    constexpr index_t GemmBBlockCopyDataPerAccess = 1; // Gemm-N

    constexpr index_t GemmCThreadCopyDataPerAccess = 1; // Gemm-N
#elif 0
    // BlockSize = 256, each thread holds 64 data
    constexpr index_t BlockSize = 256;

    constexpr index_t GemmMPerBlock = 128;
    constexpr index_t GemmNPerBlock = 128;
    constexpr index_t GemmKPerBlock = 8;
    constexpr index_t GemmMPerThreadSubC = 4;
    constexpr index_t GemmNPerThreadSubC = 4;
    constexpr index_t GemmMLevel0Cluster = 4;
    constexpr index_t GemmNLevel0Cluster = 4;
    constexpr index_t GemmMLevel1Cluster = 4;
    constexpr index_t GemmNLevel1Cluster = 4;
    constexpr index_t GemmKPerThreadLoop = 1;
    constexpr index_t GemmThreadGemmDataPerReadM = 4;
    constexpr index_t GemmThreadGemmDataPerReadN = 4;

    using GemmABlockCopySubLengths     = Sequence<1, 4>;  // Gemm-K, Gemm-M
    using GemmABlockCopyClusterLengths = Sequence<8, 32>; // Gemm-K, Gemm-M

    constexpr index_t GemmABlockCopyDataPerAccess = 4; // Gemm-M

    using GemmBBlockCopySubLengths     = Sequence<4, 1>;   // Gemm-K, Gemm-N
    using GemmBBlockCopyClusterLengths = Sequence<2, 128>; // Gemm-K, Gemm-N

    constexpr index_t GemmBBlockCopyDataPerAccess = 1; // Gemm-N

    constexpr index_t GemmCThreadCopyDataPerAccess = 1; // Gemm-N
#endif

    // TODO: this algorithm supports any stride and dilation, but for now keep them at 1 for
    // simplicity
    constexpr index_t hcf_stride_dilation_h = math::hcf(ConvStrideH, ConvDilationH);
    constexpr index_t hcf_stride_dilation_w = math::hcf(ConvStrideW, ConvDilationW);

    constexpr index_t Ytilda = ConvStrideH / hcf_stride_dilation_h; // may be wrong
    constexpr index_t Xtilda = ConvStrideW / hcf_stride_dilation_w; // may be wrong

    constexpr index_t Ydot = math::integer_divide_ceil(Y, Ytilda);
    constexpr index_t Xdot = math::integer_divide_ceil(X, Xtilda);

    constexpr index_t right_pad_ho = (ConvDilationH / hcf_stride_dilation_h) * (Y - Ytilda);
    constexpr index_t right_pad_wo = (ConvDilationW / hcf_stride_dilation_w) * (X - Xtilda);

    constexpr index_t Htilda = Ho + right_pad_ho;
    constexpr index_t Wtilda = Wo + right_pad_wo;

    constexpr index_t GemmK = K * Ydot * Xdot;
    constexpr index_t GemmM = C * Ytilda * Xtilda;
    constexpr index_t GemmN = N * Htilda * Wtilda;

    constexpr index_t GridSize = ((GemmM + GemmMPerBlock - 1) / GemmMPerBlock) *
                                 ((GemmN + GemmNPerBlock - 1) / GemmNPerBlock);

    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

    constexpr auto gridwise_conv = GridwiseConvolutionBackwardDataImplicitGemm_v2r1_nchw_kcyx_nkhw<
        GridSize, BlockSize, T, T,
        decltype(in_nchw_desc), decltype(wei_kcyx_desc), decltype(out_nkhw_desc),
        ConvStrides, ConvDilations, LeftPads, RightPads,
        GemmMPerBlock, GemmNPerBlock, GemmKPerBlock,
        GemmMPerThreadSubC, GemmNPerThreadSubC,
        GemmMLevel0Cluster, GemmNLevel0Cluster, GemmMLevel1Cluster, GemmNLevel1Cluster,
        GemmKPerThreadLoop, GemmThreadGemmDataPerReadM, GemmThreadGemmDataPerReadN,
        GemmABlockCopySubLengths, GemmABlockCopyClusterLengths, GemmABlockCopyDataPerAccess,
        GemmBBlockCopySubLengths, GemmBBlockCopyClusterLengths, GemmBBlockCopyDataPerAccess,
        GemmCThreadCopyDataPerAccess>{};

    for(index_t i = 0; i < nrepeat; ++i)
    {
        float time = launch_kernel(
            run_gridwise_operation<decltype(gridwise_conv),
                                   T* const __restrict__,
                                   const T* const __restrict__,
                                   const T* const __restrict__>,
            dim3(GridSize),
            dim3(BlockSize),
            0,
            gridwise_conv,
            const_cast<T* const __restrict__>(
                static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer())),
            const_cast<const T* const __restrict__>(
                static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer())),
            const_cast<const T* const __restrict__>(
                static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer())));

        printf("Elapsed time : %f ms, %f TFlop/s\n",
               time,
               (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
                   (std::size_t(1000) * 1000 * 1000) / time);
        usleep(std::min(time * 1000, float(10000)));
    }

    in_nchw_device_buf.FromDevice(in_nchw.mData.data());
}
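The v2r1 driver splits the filter into stride-aligned "tilde" sub-filters before forming the GEMM; the code above flags its Ytilda/Xtilda formulas as possibly wrong, so the following is only a numeric walk-through of what they compute, with assumed values:

    // Walk-through of the tilde decomposition for the H dimension (assumed values, one dimension only).
    constexpr int ConvStrideH = 2, ConvDilationH = 1, Y = 3, Ho = 14;

    constexpr int hcf_h  = 1;                         // math::hcf(2, 1)
    constexpr int Ytilda = ConvStrideH / hcf_h;       // 2 sub-filter phases
    constexpr int Ydot   = (Y + Ytilda - 1) / Ytilda; // ceil(3 / 2) = 2 taps per phase
    constexpr int pad_ho = (ConvDilationH / hcf_h) * (Y - Ytilda); // 1 extra row of output padding
    constexpr int Htilda = Ho + pad_ho;               // 15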
@@ -2,7 +2,7 @@
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "convolution_common.hpp"
#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp"

@@ -54,8 +54,8 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
    wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

#if 1
    // BlockSize = 256, each thread holds 64 data
#if 0
    // BlockSize = 256, EPerBlock = 8, each thread holds 64 data
    constexpr index_t BlockSize = 256;

    constexpr index_t BPerBlock = 16;
@@ -89,6 +89,43 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
    using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
    using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]

    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 4;
    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
#elif 1
    // BlockSize = 256, EPerBlock = 16, each thread holds 64 data
    constexpr index_t BlockSize = 256;

    constexpr index_t BPerBlock = 16;
    constexpr index_t KPerBlock = 128;
    constexpr index_t EPerBlock = 16;

    constexpr index_t GemmNRepeat = 2;

    constexpr index_t GemmMPerThreadSubC = 4;
    constexpr index_t GemmNPerThreadSubC = 4;
    constexpr index_t GemmMLevel0Cluster = 4;
    constexpr index_t GemmNLevel0Cluster = 4;
    constexpr index_t GemmMLevel1Cluster = 4;
    constexpr index_t GemmNLevel1Cluster = 4;
    constexpr index_t GemmKPerThreadLoop = 1;
    constexpr index_t GemmDataPerReadA = 4;
    constexpr index_t GemmDataPerReadB = 4;

    using InBlockCopySubLengths_E_N1_B_N2      = Sequence<1, 2, 1, 4>;
    using InBlockCopyClusterLengths_E_N1_B_N2  = Sequence<16, 1, 16, 1>;
    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
    using InBlockCopySrcAccessOrder            = Sequence<0, 2, 1, 3>; // [E, B, N1, N2]
    using InBlockCopyDstAccessOrder            = Sequence<0, 1, 2, 3>; // [E, N1, B, N2]

    constexpr index_t InBlockCopySrcDataPerRead_B   = 1;
    constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4;

    using WeiBlockCopySubLengths_E_K            = Sequence<4, 2>;
    using WeiBlockCopyClusterLengths_E_K        = Sequence<4, 64>;
    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]

    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 4;
    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
#elif 0
@@ -221,13 +258,20 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,

    for(index_t i = 0; i < nrepeat; ++i)
    {
        float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
        float time = launch_kernel(run_gridwise_operation<decltype(gridwise_conv),
                                                          const T* const __restrict__,
                                                          const T* const __restrict__,
                                                          T* const __restrict__>,
                                   dim3(GridSize),
                                   dim3(BlockSize),
                                   0,
                                   static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
                                   static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
                                   static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
                                   gridwise_conv,
                                   const_cast<const T* const __restrict__>(
                                       static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer())),
                                   const_cast<const T* const __restrict__>(
                                       static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer())),
                                   const_cast<T* const __restrict__>(
                                       static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer())));

        printf("Elapsed time : %f ms, %f TFlop/s\n",
               time,
@@ -46,7 +46,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(InDesc,
    wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

#if 1
#if 0
    // BlockSize = 256, blockwise-GEMM 128x128, each thread holds 64 data
    constexpr index_t BlockSize = 256;

@@ -120,7 +120,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(InDesc,

    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 4;
    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
#elif 1
#elif 0
    // BlockSize = 256, blockwise-GEMM 64x128, each thread holds 32 data
    constexpr index_t BlockSize = 256;

@@ -157,6 +157,42 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(InDesc,

    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 2;
    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
#elif 1
    constexpr index_t BlockSize = 64;

    constexpr index_t BPerBlock = 16;
    constexpr index_t KPerBlock = 32;
    constexpr index_t EPerBlock = 4;

    constexpr index_t GemmNRepeat = 2;

    constexpr index_t GemmMPerThreadSubC = 4;
    constexpr index_t GemmNPerThreadSubC = 4;
    constexpr index_t GemmMLevel0Cluster = 1;
    constexpr index_t GemmNLevel0Cluster = 4;
    constexpr index_t GemmMLevel1Cluster = 4;
    constexpr index_t GemmNLevel1Cluster = 4;
    constexpr index_t GemmKPerThreadLoop = 1;
    constexpr index_t GemmDataPerReadA = 4;
    constexpr index_t GemmDataPerReadB = 4;

    using InBlockCopySubLengths_E_N1_B_N2      = Sequence<1, 2, 1, 4>;
    using InBlockCopyClusterLengths_E_N1_B_N2  = Sequence<4, 1, 16, 1>;
    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
    using InBlockCopySrcAccessOrder            = Sequence<0, 2, 1, 3>; // [E, B, N1, N2]
    using InBlockCopyDstAccessOrder            = Sequence<0, 1, 2, 3>; // [E, N1, B, N2]

    constexpr index_t InBlockCopySrcDataPerRead_B   = 1;
    constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4;

    using WeiBlockCopySubLengths_E_K            = Sequence<1, 2>;
    using WeiBlockCopyClusterLengths_E_K        = Sequence<4, 16>;
    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]

    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 1;
    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 2;
#endif

    constexpr index_t N1 = GemmNRepeat;
@@ -51,6 +51,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

#if 1
    // BlockSize = 256, EPerBlock = 8
    constexpr index_t BlockSize = 256;

    constexpr index_t BPerBlock = 128;
@@ -85,7 +86,8 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;

    constexpr index_t OutThreadCopyDataPerAccess_B = 1;
#elif 1
#elif 0
    // BlockSize = 256, EPerBlock = 8
    // 1x1 filter, 8x8 image
    constexpr index_t BlockSize = 256;

@@ -122,6 +124,43 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,

    constexpr index_t OutThreadCopyDataPerAccess_B = 4;
#elif 0
    // BlockSize = 256, EPerBlock = 16
    // 1x1 filter, 8x8 image
    constexpr index_t BlockSize = 256;

    constexpr index_t BPerBlock = 128;
    constexpr index_t KPerBlock = 128;
    constexpr index_t EPerBlock = 16;

    constexpr index_t GemmMPerThreadSubC = 4;
    constexpr index_t GemmNPerThreadSubC = 4;
    constexpr index_t GemmMLevel0Cluster = 4;
    constexpr index_t GemmNLevel0Cluster = 4;
    constexpr index_t GemmMLevel1Cluster = 4;
    constexpr index_t GemmNLevel1Cluster = 4;
    constexpr index_t GemmKPerThreadLoop = 1;
    constexpr index_t GemmDataPerReadA = 4;
    constexpr index_t GemmDataPerReadB = 4;

    using InBlockCopySubLengths_E_B            = Sequence<2, 4>;
    using InBlockCopyClusterLengths_E_B        = Sequence<8, 32>;
    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
    using InBlockCopySrcAccessOrder            = Sequence<0, 1>; // [E, B]
    using InBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, B]

    constexpr index_t InBlockCopyDataPerAccess_B = 4;

    using WeiBlockCopySubLengths_E_K            = Sequence<4, 2>;
    using WeiBlockCopyClusterLengths_E_K        = Sequence<4, 64>;
    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]

    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 4;
    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;

    constexpr index_t OutThreadCopyDataPerAccess_B = 4;
#elif 1
    // 1x1 filter, 14x14 image
    constexpr index_t BlockSize = 256;

@@ -167,47 +206,43 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

    constexpr auto gridwise_conv =
#if 0
        GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded
#else
        GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
#endif
        <GridSize, BlockSize, T,
         decltype(in_nchw_desc), decltype(wei_kcyx_desc), decltype(out_nkhw_desc),
         ConvStrides, ConvDilations, LeftPads, RightPads,
         BPerBlock, KPerBlock, EPerBlock,
         GemmMPerThreadSubC, GemmNPerThreadSubC,
         GemmMLevel0Cluster, GemmNLevel0Cluster, GemmMLevel1Cluster, GemmNLevel1Cluster,
         GemmKPerThreadLoop, GemmDataPerReadA, GemmDataPerReadB,
         InBlockCopySubLengths_E_B, InBlockCopyClusterLengths_E_B,
         InBlockCopyThreadClusterArrangeOrder, InBlockCopySrcAccessOrder,
         InBlockCopyDstAccessOrder, InBlockCopyDataPerAccess_B,
         WeiBlockCopySubLengths_E_K, WeiBlockCopyClusterLengths_E_K,
         WeiBlockCopyThreadClusterArrangeOrder, WeiBlockCopySrcAccessOrder,
         WeiBlockCopyDstAccessOrder, WeiBlockCopySrcDataPerRead_E,
         WeiBlockCopyDstDataPerWrite_K, OutThreadCopyDataPerAccess_B>{};
        GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer<
            GridSize, BlockSize, T,
            decltype(in_nchw_desc), decltype(wei_kcyx_desc), decltype(out_nkhw_desc),
            ConvStrides, ConvDilations, LeftPads, RightPads,
            BPerBlock, KPerBlock, EPerBlock,
            GemmMPerThreadSubC, GemmNPerThreadSubC,
            GemmMLevel0Cluster, GemmNLevel0Cluster, GemmMLevel1Cluster, GemmNLevel1Cluster,
            GemmKPerThreadLoop, GemmDataPerReadA, GemmDataPerReadB,
            InBlockCopySubLengths_E_B, InBlockCopyClusterLengths_E_B,
            InBlockCopyThreadClusterArrangeOrder, InBlockCopySrcAccessOrder,
            InBlockCopyDstAccessOrder, InBlockCopyDataPerAccess_B,
            WeiBlockCopySubLengths_E_K, WeiBlockCopyClusterLengths_E_K,
            WeiBlockCopyThreadClusterArrangeOrder, WeiBlockCopySrcAccessOrder,
            WeiBlockCopyDstAccessOrder, WeiBlockCopySrcDataPerRead_E,
            WeiBlockCopyDstDataPerWrite_K, OutThreadCopyDataPerAccess_B>{};

    for(index_t i = 0; i < nrepeat; ++i)
    {
driver/include/device_tensor.hpp (new file, 28 lines)
@@ -0,0 +1,28 @@
#pragma once
#include "tensor.hpp"
#include "common_header.hpp"
#include "ConstantTensorDescriptor_deprecated.hpp"
#include "tensor_descriptor.hpp"

template <typename ConstTensorDesc, std::size_t... Is>
auto make_TensorDescriptor_impl(ConstTensorDesc, std::integer_sequence<std::size_t, Is...>)
{
    std::initializer_list<std::size_t> lengths = {ConstTensorDesc::GetLengths()[Is]...};
    std::initializer_list<std::size_t> strides = {ConstTensorDesc::GetStrides()[Is]...};

    return TensorDescriptor(lengths, strides);
}

template <typename ConstTensorDesc>
auto make_TensorDescriptor(ConstTensorDesc)
{
    return make_TensorDescriptor_impl(
        ConstTensorDesc{},
        std::make_integer_sequence<std::size_t, ConstTensorDesc::GetNumOfDimension()>{});
}

template <typename ConstTensorDesc>
void ostream_ConstantTensorDescriptor(ConstTensorDesc, std::ostream& os = std::cout)
{
    ostream_TensorDescriptor(make_TensorDescriptor(ConstTensorDesc{}), os);
}
driver/include/host_col2im.hpp (new file, 71 lines)
@@ -0,0 +1,71 @@
#pragma once
#include "tensor.hpp"

template <typename T, typename FilterSizes, typename OutputSizes, typename ConvStrides,
          typename ConvDilations, typename LeftPads, typename RightPads>
void host_col2im(const Tensor<T>& in_eb,
                 Tensor<T>& in_nchw,
                 FilterSizes, OutputSizes, ConvStrides, ConvDilations, LeftPads, RightPads)
{
    using namespace ck;

    int N  = in_nchw.mDesc.GetLengths()[0];
    int C  = in_nchw.mDesc.GetLengths()[1];
    int HI = in_nchw.mDesc.GetLengths()[2];
    int WI = in_nchw.mDesc.GetLengths()[3];

    int Y = FilterSizes{}[0];
    int X = FilterSizes{}[1];

    int HO = OutputSizes{}[0];
    int WO = OutputSizes{}[1];

    auto f = [&](auto n, auto c, auto hi, auto wi) {
        double v = 0;

        for(int y = 0; y < Y; ++y)
        {
            int h_tmp = hi + LeftPads{}[0] - y * ConvDilations{}[0];

            if(h_tmp >= 0 && h_tmp < HI && h_tmp % ConvStrides{}[0] == 0)
            {
                int ho = h_tmp / ConvStrides{}[0];

                for(int x = 0; x < X; ++x)
                {
                    int w_tmp = wi + LeftPads{}[1] - x * ConvDilations{}[1];

                    if(w_tmp >= 0 && w_tmp < WI && w_tmp % ConvStrides{}[1] == 0)
                    {
                        int wo = w_tmp / ConvStrides{}[1];

                        int e = c * (Y * X) + y * X + x;
                        int b = n * (HO * WO) + ho * WO + wo;

                        v += in_eb(e, b);
                    }
                }
            }
        }

        in_nchw(n, c, hi, wi) = v;
    };

    auto f_par = make_ParallelTensorFunctor(f,
                                            in_nchw.mDesc.GetLengths()[0],
                                            in_nchw.mDesc.GetLengths()[1],
                                            in_nchw.mDesc.GetLengths()[2],
                                            in_nchw.mDesc.GetLengths()[3]);

    f_par(std::thread::hardware_concurrency());
}
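The reference loop above accumulates every column entry that maps onto a given input pixel; the linear indices it uses are the packed layout e = c * (Y * X) + y * X + x and b = n * (HO * WO) + ho * WO + wo. A small hedged pair of helpers restating that convention (not part of the commit):

    // Hypothetical helpers mirroring the (e, b) indexing convention used in host_col2im.
    inline int col_e_index(int c, int y, int x, int Y, int X) { return c * (Y * X) + y * X + x; }
    inline int col_b_index(int n, int ho, int wo, int HO, int WO) { return n * (HO * WO) + ho * WO + wo; }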
@@ -1,49 +1,5 @@
#pragma once
#include "tensor.hpp"
#include "common_header.hpp"
#include "ConstantTensorDescriptor_deprecated.hpp"

// this is ugly, only for 4d
template <class TConstTensorDesc>
void ostream_ConstantTensorDescriptor(TConstTensorDesc, std::ostream& os = std::cout)
{
    using namespace ck;

    static_assert(TConstTensorDesc::nDim == 4, "nDim is not 4");

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
    constexpr auto desc = TConstTensorDesc{};

    os << "Lengths: {" << desc.GetLength(I0) << ", " << desc.GetLength(I1) << ", "
       << desc.GetLength(I2) << ", " << desc.GetLength(I3) << "}, "
       << "Strides: {" << desc.GetStride(I0) << ", " << desc.GetStride(I1) << ", "
       << desc.GetStride(I2) << ", " << desc.GetStride(I3) << "}" << std::endl;
}

// this is ugly, only for 4d
template <class TConstTensorDesc>
auto make_TensorDescriptor(TConstTensorDesc)
{
    using namespace ck;

    static_assert(TConstTensorDesc::nDim == 4, "nDim is not 4");

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};
    constexpr auto desc = TConstTensorDesc{};

    std::initializer_list<index_t> lengths = {
        desc.GetLength(I0), desc.GetLength(I1), desc.GetLength(I2), desc.GetLength(I3)};
    std::initializer_list<index_t> strides = {
        desc.GetStride(I0), desc.GetStride(I1), desc.GetStride(I2), desc.GetStride(I3)};

    return TensorDescriptor(lengths, strides);
}

template <class TIn,
          class TWei,
@@ -331,25 +287,3 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
    make_ParallelTensorFunctor(f_out_hold, N, K, HTile, WTile)(num_thread);
    make_ParallelTensorFunctor(f_out, N, K, HTile, WTile)(num_thread);
}

template <class T>
void check_error(const Tensor<T>& ref, const Tensor<T>& result)
{
    float error = 0;
    float max_diff = -1;
    float ref_value = 0, result_value = 0;
    for(int i = 0; i < ref.mData.size(); ++i)
    {
        error += std::abs(double(ref.mData[i]) - double(result.mData[i]));
        float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
        if(max_diff < diff)
        {
            max_diff = diff;
            ref_value = ref.mData[i];
            result_value = result.mData[i];
        }
    }

    std::cout << "error: " << error << std::endl;
    std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl;
}
driver/include/host_conv_bwd_data.hpp (new file, 77 lines)
@@ -0,0 +1,77 @@
#pragma once
#include "tensor.hpp"

template <typename TIn, typename TWei, typename TOut, typename ConvStrides,
          typename ConvDilations, typename LeftPads, typename RightPads>
void host_direct_convolution_backward_data(Tensor<TIn>& in_nchw,
                                           const Tensor<TWei>& wei_kcyx,
                                           const Tensor<TOut>& out_nkhw,
                                           ConvStrides, ConvDilations, LeftPads, RightPads)
{
    using namespace ck;

    int N  = in_nchw.mDesc.GetLengths()[0];
    int C  = in_nchw.mDesc.GetLengths()[1];
    int HI = in_nchw.mDesc.GetLengths()[2];
    int WI = in_nchw.mDesc.GetLengths()[3];

    std::size_t K = wei_kcyx.mDesc.GetLengths()[0];
    std::size_t Y = wei_kcyx.mDesc.GetLengths()[2];
    std::size_t X = wei_kcyx.mDesc.GetLengths()[3];

    std::size_t HO = out_nkhw.mDesc.GetLengths()[2];
    std::size_t WO = out_nkhw.mDesc.GetLengths()[3];

    auto f = [&](auto n, auto c, auto hi, auto wi) {
        double v = 0;

        for(int y = 0; y < Y; ++y)
        {
            int h_tmp = hi + LeftPads{}[0] - y * ConvDilations{}[0];

            if(h_tmp % ConvStrides{}[0] == 0)
            {
                int ho = h_tmp / ConvStrides{}[0];

                if(ho >= 0 && ho < HO)
                {
                    for(int x = 0; x < X; ++x)
                    {
                        int w_tmp = wi + LeftPads{}[1] - x * ConvDilations{}[1];

                        if(w_tmp % ConvStrides{}[1] == 0)
                        {
                            int wo = w_tmp / ConvStrides{}[1];

                            if(wo >= 0 && wo < WO)
                            {
                                for(int k = 0; k < K; ++k)
                                {
                                    v += out_nkhw(n, k, ho, wo) * wei_kcyx(k, c, y, x);
                                }
                            }
                        }
                    }
                }
            }
        }

        in_nchw(n, c, hi, wi) = v;
    };

    auto f_par = make_ParallelTensorFunctor(f,
                                            in_nchw.mDesc.GetLengths()[0],
                                            in_nchw.mDesc.GetLengths()[1],
                                            in_nchw.mDesc.GetLengths()[2],
                                            in_nchw.mDesc.GetLengths()[3]);

    f_par(std::thread::hardware_concurrency());
}
@@ -68,10 +68,12 @@ auto construct_f_unpack_args(F, T args)
struct TensorDescriptor
{
    TensorDescriptor() = delete;
    TensorDescriptor(std::initializer_list<std::size_t> lens);
    TensorDescriptor(std::initializer_list<std::size_t> lens,
                     std::initializer_list<std::size_t> strides);
    TensorDescriptor(std::vector<std::size_t> lens, std::vector<std::size_t> strides);

    template <typename X>
    TensorDescriptor(std::vector<X> lens);

    template <typename X, typename Y>
    TensorDescriptor(std::vector<X> lens, std::vector<Y> strides);

    void CalculateStrides();

@@ -269,4 +271,39 @@ struct Tensor
    std::vector<T> mData;
};

void ostream_TensorDescriptor(const TensorDescriptor& desc, std::ostream& os = std::cout)
{
    os << "dim " << desc.GetNumOfDimension() << ", ";

    os << "lengths {";
    LogRange(os, desc.GetLengths(), ", ");
    os << "}, ";

    os << "strides {";
    LogRange(os, desc.GetStrides(), ", ");
    os << "}" << std::endl;
}

template <class T>
void check_error(const Tensor<T>& ref, const Tensor<T>& result)
{
    float error = 0;
    float max_diff = -1;
    float ref_value = 0, result_value = 0;
    for(int i = 0; i < ref.mData.size(); ++i)
    {
        error += std::abs(double(ref.mData[i]) - double(result.mData[i]));
        float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
        if(max_diff < diff)
        {
            max_diff = diff;
            ref_value = ref.mData[i];
            result_value = result.mData[i];
        }
    }

    std::cout << "error: " << error << std::endl;
    std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl;
}

#endif
driver/include/tensor_generator.hpp (new file, 57 lines)
@@ -0,0 +1,57 @@
#ifndef TENSOR_GENERATOR_HPP
#define TENSOR_GENERATOR_HPP

#include "config.hpp"

struct GeneratorTensor_1
{
    int value = 1;

    template <class... Is>
    double operator()(Is... is)
    {
        return value;
    }
};

struct GeneratorTensor_2
{
    int min_value = 0;
    int max_value = 1;

    template <class... Is>
    double operator()(Is...)
    {
        return (std::rand() % (max_value - min_value)) + min_value;
    }
};

struct GeneratorTensor_3
{
    template <class... Is>
    double operator()(Is... is)
    {
        std::array<ck::index_t, sizeof...(Is)> dims = {{static_cast<ck::index_t>(is)...}};

        auto f_acc = [](auto a, auto b) { return 10 * a + b; };

        return std::accumulate(dims.begin(), dims.end(), ck::index_t(0), f_acc);
    }
};

struct GeneratorTensor_Checkboard
{
    template <class... Ts>
    double operator()(Ts... Xs) const
    {
        std::array<ck::index_t, sizeof...(Ts)> dims = {{Xs...}};
        return std::accumulate(dims.begin(),
                               dims.end(),
                               true,
                               [](bool init, ck::index_t x) -> int { return init != (x % 2); })
                   ? 1
                   : -1;
    }
};

#endif