From 39775d484c4d15a5b895edfc9d2323f05ab2d3d4 Mon Sep 17 00:00:00 2001
From: Chao Liu <lc.roy86@gmail.com>
Date: Tue, 18 Dec 2018 03:22:12 -0600
Subject: [PATCH] another version of direct conv

---
 driver/conv.cu                                |  13 +-
 driver/device_direct_convolution_1.cuh        |   2 +-
 driver/device_direct_convolution_2.cuh        |   6 +-
 driver/device_direct_convolution_3.cuh        | 120 ++++++++++
 src/include/blockwise_direct_convolution.cuh  |  24 +-
 src/include/blockwise_tensor_op.cuh           |  93 ++++----
 src/include/constant_tensor_descriptor.cuh    |   2 +-
 src/include/gridwise_direct_convolution_1.cuh |  12 +-
 src/include/gridwise_direct_convolution_2.cuh |  78 +++----
 src/include/gridwise_direct_convolution_3.cuh | 208 ++++++++++++++++++
 src/include/threadwise_direct_convolution.cuh | 127 ++++++++++-
 src/include/threadwise_tensor_op.cuh          |  68 +++---
 12 files changed, 596 insertions(+), 157 deletions(-)
 create mode 100644 driver/device_direct_convolution_3.cuh
 create mode 100644 src/include/gridwise_direct_convolution_3.cuh

diff --git a/driver/conv.cu b/driver/conv.cu
index f486d9ad51..80e5057aaf 100644
--- a/driver/conv.cu
+++ b/driver/conv.cu
@@ -7,6 +7,7 @@
 #include "constant_tensor_descriptor.cuh"
 #include "device_direct_convolution_1.cuh"
 #include "device_direct_convolution_2.cuh"
+#include "device_direct_convolution_3.cuh"
 //#include "device_winograd_convolution.cuh"
 
 struct GeneratorTensor_1
@@ -338,7 +339,7 @@ int main()
     constexpr unsigned K = 1;
     constexpr unsigned S = 3;
     constexpr unsigned R = 3;
-#elif 0
+#elif 1
     constexpr unsigned N = 64;
     constexpr unsigned C = 256;
     constexpr unsigned HI = 34;
@@ -346,7 +347,7 @@ int main()
     constexpr unsigned K = 64;
     constexpr unsigned S = 3;
     constexpr unsigned R = 3;
-#elif 1
+#elif 0
     constexpr unsigned N = 64;
     constexpr unsigned C = 64;
     constexpr unsigned HI = 56;
@@ -387,11 +388,15 @@ int main()
     wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
 #endif
 
-    for(int i = 0; i < 20; ++i)
+    for(int i = 0; i < 40; ++i)
     {
 #if 1
         device_direct_convolution_1(in_desc, in, wei_desc, wei, out_desc, out_device);
-#else
+#elif 0
+        device_direct_convolution_2(in_desc, in, wei_desc, wei, out_desc, out_device);
+#elif 0
+        device_direct_convolution_3(in_desc, in, wei_desc, wei, out_desc, out_device);
+#elif 0
         device_winograd_convolution(in_desc, in, wei_desc, wei, out_desc, out_device);
 #endif
     }
diff --git a/driver/device_direct_convolution_1.cuh b/driver/device_direct_convolution_1.cuh
index fbd1b98d41..68dc000173 100644
--- a/driver/device_direct_convolution_1.cuh
+++ b/driver/device_direct_convolution_1.cuh
@@ -25,7 +25,7 @@ void device_direct_convolution_1(
     constexpr auto wei_desc = WeiDesc{};
     constexpr auto out_desc = OutDesc{};
 
-#if 0
+#if 1
     constexpr unsigned OutTileSizeH = 2;
     constexpr unsigned OutTileSizeW = 2;
     constexpr unsigned NPerBlock    = 2;
diff --git a/driver/device_direct_convolution_2.cuh b/driver/device_direct_convolution_2.cuh
index 5c08870a26..9e3d8b2d92 100644
--- a/driver/device_direct_convolution_2.cuh
+++ b/driver/device_direct_convolution_2.cuh
@@ -25,7 +25,7 @@ void device_direct_convolution_2(
     constexpr auto wei_desc = WeiDesc{};
     constexpr auto out_desc = OutDesc{};
 
-#if 0
+#if 1
     constexpr unsigned OutTileSizeH = 2;
     constexpr unsigned OutTileSizeW = 2;
     constexpr unsigned NPerBlock    = 2;
@@ -39,7 +39,7 @@ void device_direct_convolution_2(
     constexpr unsigned CPerThread = 2;
 
     constexpr unsigned BlockSize = 128;
-#elif 1
+#elif 0
     constexpr unsigned OutTileSizeH = 2;
     constexpr unsigned OutTileSizeW = 2;
     constexpr unsigned NPerBlock    = 2;
@@ -53,7 +53,7 @@ void device_direct_convolution_2(
     constexpr unsigned CPerThread = 2;
 
     constexpr unsigned BlockSize = 216;
-#elif 1
+#elif 0
     constexpr unsigned OutTileSizeH = 2;
     constexpr unsigned OutTileSizeW = 2;
     constexpr unsigned NPerBlock    = 2;
diff --git a/driver/device_direct_convolution_3.cuh b/driver/device_direct_convolution_3.cuh
new file mode 100644
index 0000000000..884b3bf38e
--- /dev/null
+++ b/driver/device_direct_convolution_3.cuh
@@ -0,0 +1,120 @@
+#pragma once
+#include "gridwise_direct_convolution_3.cuh"
+
+template <class T, class InDesc, class WeiDesc, class OutDesc>
+void device_direct_convolution_3(
+    InDesc, const Tensor<T>& in, WeiDesc, const Tensor<T>& wei, OutDesc, Tensor<T>& out)
+{
+    std::size_t data_sz = sizeof(T);
+    DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace());
+    DeviceMem wei_device_buf(data_sz * wei.mDesc.GetElementSpace());
+    DeviceMem out_device_buf(data_sz * out.mDesc.GetElementSpace());
+
+    int num_thread = std::thread::hardware_concurrency();
+
+    in_device_buf.ToDevice(in.mData.data());
+    wei_device_buf.ToDevice(wei.mData.data());
+    out_device_buf.ToDevice(out.mData.data());
+
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    constexpr auto in_desc  = InDesc{};
+    constexpr auto wei_desc = WeiDesc{};
+    constexpr auto out_desc = OutDesc{};
+
+#if 1
+    constexpr unsigned OutTileSizeH = 2;
+    constexpr unsigned OutTileSizeW = 2;
+    constexpr unsigned NPerBlock    = 2;
+    constexpr unsigned KPerBlock    = 32;
+    constexpr unsigned CPerBlock    = 4;
+    constexpr unsigned YPerBlock    = 1;
+    constexpr unsigned XPerBlock    = 16;
+
+    constexpr unsigned NPerThread = 2;
+    constexpr unsigned KPerThread = 4;
+    constexpr unsigned CPerThread = 2;
+
+    constexpr unsigned BlockSize = 128;
+#elif 0
+    constexpr unsigned OutTileSizeH = 2;
+    constexpr unsigned OutTileSizeW = 2;
+    constexpr unsigned NPerBlock    = 2;
+    constexpr unsigned KPerBlock    = 32;
+    constexpr unsigned CPerBlock    = 4;
+    constexpr unsigned YPerBlock    = 1;
+    constexpr unsigned XPerBlock    = 27;
+
+    constexpr unsigned NPerThread = 2;
+    constexpr unsigned KPerThread = 4;
+    constexpr unsigned CPerThread = 2;
+
+    constexpr unsigned BlockSize = 216;
+#elif 0
+    constexpr unsigned OutTileSizeH = 2;
+    constexpr unsigned OutTileSizeW = 2;
+    constexpr unsigned NPerBlock    = 2;
+    constexpr unsigned KPerBlock    = 32;
+    constexpr unsigned CPerBlock    = 4;
+    constexpr unsigned YPerBlock    = 1;
+    constexpr unsigned XPerBlock    = 32;
+
+    constexpr unsigned NPerThread = 2;
+    constexpr unsigned KPerThread = 4;
+    constexpr unsigned CPerThread = 2;
+
+    constexpr unsigned BlockSize = 256;
+#endif
+
+    constexpr unsigned GridSize = (out_desc.GetLength(I0) / NPerBlock) *
+                                  (out_desc.GetLength(I1) / KPerBlock) *
+                                  (out_desc.GetLength(I2) / (OutTileSizeH * YPerBlock)) *
+                                  (out_desc.GetLength(I3) / (OutTileSizeW * XPerBlock));
+
+    dim3 block_dim(BlockSize);
+    dim3 grid_dim(GridSize);
+
+    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
+
+    cudaEvent_t start, stop;
+    float elapsedTime;
+
+    cudaEventCreate(&start);
+    cudaEventRecord(start, 0);
+
+    gridwise_direct_convolution_3<T,
+                                  InDesc,
+                                  WeiDesc,
+                                  OutDesc,
+                                  OutTileSizeH,
+                                  OutTileSizeW,
+                                  NPerBlock,
+                                  KPerBlock,
+                                  CPerBlock,
+                                  YPerBlock,
+                                  XPerBlock,
+                                  NPerThread,
+                                  KPerThread,
+                                  CPerThread,
+                                  BlockSize,
+                                  GridSize>
+        <<<grid_dim, block_dim>>>(InDesc{},
+                                  static_cast<T*>(in_device_buf.GetDeviceBuffer()),
+                                  WeiDesc{},
+                                  static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
+                                  OutDesc{},
+                                  static_cast<T*>(out_device_buf.GetDeviceBuffer()));
+
+    cudaEventCreate(&stop);
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);
+
+    cudaEventElapsedTime(&elapsedTime, start, stop);
+    printf("Elapsed time : %f ms\n", elapsedTime);
+
+    checkCudaErrors(cudaGetLastError());
+    out_device_buf.FromDevice(out.mData.data());
+}
diff --git a/src/include/blockwise_direct_convolution.cuh b/src/include/blockwise_direct_convolution.cuh
index 879e682cc4..95005af765 100644
--- a/src/include/blockwise_direct_convolution.cuh
+++ b/src/include/blockwise_direct_convolution.cuh
@@ -102,7 +102,8 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
                                                                           ho_thread_data_begin,
                                                                           wo_thread_data_begin),
                                   out_thread_desc,
-                                  p_out_thread);
+                                  p_out_thread,
+                                  out_thread_desc);
 
         for(unsigned c_thread_data_begin = 0; c_thread_data_begin < in_block_desc.GetLength(I1);
             c_thread_data_begin += CPerThread)
@@ -114,7 +115,8 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
                                                                             hi_thread_data_begin,
                                                                             wi_thread_data_begin),
                                       in_thread_desc,
-                                      p_in_thread);
+                                      p_in_thread,
+                                      in_thread_desc);
 
             // copy weight into register
             threadwise_4d_tensor_copy(
@@ -122,15 +124,16 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
                 p_wei_block +
                     wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data_begin, 0, 0),
                 wei_thread_desc,
-                p_wei_thread);
+                p_wei_thread,
+                wei_thread_desc);
 
             // threadwise convolution
-            threadwise_direct_convolution(in_thread_desc,
-                                          p_in_thread,
-                                          wei_thread_desc,
-                                          p_wei_thread,
-                                          out_thread_desc,
-                                          p_out_thread);
+            threadwise_direct_convolution_2(in_thread_desc,
+                                            p_in_thread,
+                                            wei_thread_desc,
+                                            p_wei_thread,
+                                            out_thread_desc,
+                                            p_out_thread);
         }
 
         // copy output into LDS
@@ -140,6 +143,7 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
                                   p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
                                                                           k_thread_data_begin,
                                                                           ho_thread_data_begin,
-                                                                          wo_thread_data_begin));
+                                                                          wo_thread_data_begin),
+                                  out_thread_desc);
     }
 }
diff --git a/src/include/blockwise_tensor_op.cuh b/src/include/blockwise_tensor_op.cuh
index 3b4bc58fdc..f404c1d2dc 100644
--- a/src/include/blockwise_tensor_op.cuh
+++ b/src/include/blockwise_tensor_op.cuh
@@ -75,84 +75,82 @@ __device__ void blockwise_4d_tensor_pointwise_op_unary(DstDesc, TFloat* __restri
     }
 }
 
-template <class TFloat, class SrcDesc, class DstDesc, class F, unsigned BlockSize>
+template <class TFloat, class DescA, class DescB, class DescRef, class F, unsigned BlockSize>
 __device__ void blockwise_4d_tensor_pointwise_op_binary(
-    SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f)
+    DescA, TFloat* const __restrict__ p_a, DescB, TFloat* __restrict__ p_b, DescRef, F f)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
     constexpr auto I2 = Number<2>{};
     constexpr auto I3 = Number<3>{};
 
-    constexpr auto src_desc = SrcDesc{};
-    constexpr auto dst_desc = DstDesc{};
-
-    static_assert(is_same<decltype(src_desc.GetLengths()), decltype(dst_desc.GetLengths())>::value);
-
-    constexpr auto desc = make_ConstantTensorDescriptor(src_desc.GetLengths());
+    constexpr auto desc_a   = DescA{};
+    constexpr auto desc_b   = DescB{};
+    constexpr auto desc_ref = DescRef{};
 
 #if 0
     if(threadIdx.x == 0)
     {
-        print_ConstantTensorDescriptor(src_desc, "blockwise_4d_tensor_op_binary: src_desc: ");
-        print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op_binary: dst_desc: ");
+        print_ConstantTensorDescriptor(desc_a, "blockwise_4d_tensor_op_binary: desc_a: ");
+        print_ConstantTensorDescriptor(desc_b, "blockwise_4d_tensor_op_binary: desc_b: ");
+        print_ConstantTensorDescriptor(desc_ref, "blockwise_4d_tensor_op_binary: desc_ref: ");
     }
 #endif
 
-    constexpr unsigned NLoop = desc.GetElementSize() / BlockSize;
+    constexpr unsigned NLoop = desc_ref.GetElementSize() / BlockSize;
 
     for(unsigned iloop = 0; iloop < NLoop; ++iloop)
     {
         unsigned is = threadIdx.x + iloop * BlockSize;
 
-        const unsigned did0 = is / desc.GetStride(I0);
+        const unsigned did0 = is / desc_ref.GetStride(I0);
 
-        is -= did0 * desc.GetStride(I0);
+        is -= did0 * desc_ref.GetStride(I0);
 
-        const unsigned did1 = is / desc.GetStride(I1);
+        const unsigned did1 = is / desc_ref.GetStride(I1);
 
-        is -= did1 * desc.GetStride(I1);
+        is -= did1 * desc_ref.GetStride(I1);
 
-        const unsigned did2 = is / desc.GetStride(I2);
+        const unsigned did2 = is / desc_ref.GetStride(I2);
 
-        is -= did2 * desc.GetStride(I2);
+        is -= did2 * desc_ref.GetStride(I2);
 
-        const unsigned did3 = is / desc.GetStride(I3);
+        const unsigned did3 = is / desc_ref.GetStride(I3);
 
-        const unsigned sindex = src_desc.Get1dIndex(did0, did1, did2, did3);
+        const unsigned aindex = desc_a.Get1dIndex(did0, did1, did2, did3);
 
-        const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);
+        const unsigned bindex = desc_b.Get1dIndex(did0, did1, did2, did3);
 
-        f(p_src[sindex], p_dst[dindex]);
+        f(p_a[aindex], p_b[bindex]);
     }
 
-    constexpr bool has_tail = (desc.GetElementSize() > NLoop * BlockSize);
+    constexpr bool has_tail = (desc_ref.GetElementSize() > NLoop * BlockSize);
 
     if(has_tail)
     {
         unsigned is = threadIdx.x + NLoop * BlockSize;
 
-        if(is < desc.GetElementSize())
+        if(is < desc_ref.GetElementSize())
         {
-            const unsigned did0 = is / desc.GetStride(I0);
+            const unsigned did0 = is / desc_ref.GetStride(I0);
 
-            is -= did0 * desc.GetStride(I0);
+            is -= did0 * desc_ref.GetStride(I0);
 
-            const unsigned did1 = is / desc.GetStride(I1);
+            const unsigned did1 = is / desc_ref.GetStride(I1);
 
-            is -= did1 * desc.GetStride(I1);
+            is -= did1 * desc_ref.GetStride(I1);
 
-            const unsigned did2 = is / desc.GetStride(I2);
+            const unsigned did2 = is / desc_ref.GetStride(I2);
 
-            is -= did2 * desc.GetStride(I2);
+            is -= did2 * desc_ref.GetStride(I2);
 
-            const unsigned did3 = is / desc.GetStride(I3);
+            const unsigned did3 = is / desc_ref.GetStride(I3);
 
-            const unsigned sindex = src_desc.Get1dIndex(did0, did1, did2, did3);
+            const unsigned aindex = desc_a.Get1dIndex(did0, did1, did2, did3);
 
-            const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);
+            const unsigned bindex = desc_b.Get1dIndex(did0, did1, did2, did3);
 
-            f(p_src[sindex], p_dst[dindex]);
+            f(p_a[aindex], p_b[bindex]);
         }
     }
 }
@@ -166,26 +164,17 @@ __device__ void blockwise_4d_tensor_set_zero(DstDesc, TFloat* __restrict__ p_dst
         DstDesc{}, p_dst, f_set_zero);
 }
 
-template <class TFloat, class SrcDesc, class DstDesc, unsigned BlockSize>
-__device__ void blockwise_4d_tensor_copy(SrcDesc,
-                                         TFloat* const __restrict__ p_src,
-                                         DstDesc,
-                                         TFloat* __restrict__ p_dst)
+template <class TFloat, class SrcDesc, class DstDesc, class RefDesc, unsigned BlockSize>
+__device__ void blockwise_4d_tensor_copy(
+    SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, RefDesc)
 {
     auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
 
-    blockwise_4d_tensor_pointwise_op_binary<TFloat, SrcDesc, DstDesc, decltype(f_copy), BlockSize>(
-        SrcDesc{}, p_src, DstDesc{}, p_dst, f_copy);
+    blockwise_4d_tensor_pointwise_op_binary<TFloat,
+                                            SrcDesc,
+                                            DstDesc,
+                                            RefDesc,
+                                            decltype(f_copy),
+                                            BlockSize>(
+        SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, f_copy);
 }
-
-template <class TFloat, class SrcDesc, class DstDesc, unsigned BlockSize>
-__device__ void blockwise_4d_tensor_accumulate(SrcDesc,
-                                               TFloat* const __restrict__ p_src,
-                                               DstDesc,
-                                               TFloat* __restrict__ p_dst)
-{
-    auto f_accum = [](const TFloat& src, TFloat& dst) { dst += src; };
-
-    blockwise_4d_tensor_pointwise_op_binary<TFloat, SrcDesc, DstDesc, decltype(f_accum), BlockSize>(
-        SrcDesc{}, p_src, DstDesc{}, p_dst, f_accum);
-}
\ No newline at end of file
diff --git a/src/include/constant_tensor_descriptor.cuh b/src/include/constant_tensor_descriptor.cuh
index 11c5c30e54..5ce3f0deaa 100644
--- a/src/include/constant_tensor_descriptor.cuh
+++ b/src/include/constant_tensor_descriptor.cuh
@@ -4,7 +4,7 @@
 template <class T, T N>
 struct Constant
 {
-    const T mValue = N;
+    static const T mValue = N;
 };
 
 template <unsigned N>
diff --git a/src/include/gridwise_direct_convolution_1.cuh b/src/include/gridwise_direct_convolution_1.cuh
index 5c77f0c9d2..8a339b14e4 100644
--- a/src/include/gridwise_direct_convolution_1.cuh
+++ b/src/include/gridwise_direct_convolution_1.cuh
@@ -130,6 +130,7 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
         blockwise_4d_tensor_copy<TFloat,
                                  decltype(in_block_src_desc),
                                  decltype(in_block_desc),
+                                 decltype(in_block_desc),
                                  BlockSize>(in_block_src_desc,
                                             p_in_global +
                                                 in_global_desc.Get1dIndex(n_block_work_begin,
@@ -137,17 +138,20 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
                                                                           hi_block_work_begin,
                                                                           wi_block_work_begin),
                                             in_block_desc,
-                                            p_in_block);
+                                            p_in_block,
+                                            in_block_desc);
 
         // copy weight tensor to LDS
         blockwise_4d_tensor_copy<TFloat,
                                  decltype(wei_block_src_desc),
                                  decltype(wei_block_desc),
+                                 decltype(wei_block_desc),
                                  BlockSize>(
             wei_block_src_desc,
             p_wei_global + wei_global_desc.Get1dIndex(k_block_work_begin, c_block_work_begin, 0, 0),
             wei_block_desc,
-            p_wei_block);
+            p_wei_block,
+            wei_block_desc);
 
         __syncthreads();
 
@@ -171,11 +175,13 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
     blockwise_4d_tensor_copy<TFloat,
                              decltype(out_block_desc),
                              decltype(out_block_src_desc),
+                             decltype(out_block_desc),
                              BlockSize>(
         out_block_desc,
         p_out_block,
         out_block_src_desc,
         p_out_global +
             out_global_desc.Get1dIndex(
-                n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin));
+                n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin),
+        out_block_desc);
 }
\ No newline at end of file
diff --git a/src/include/gridwise_direct_convolution_2.cuh b/src/include/gridwise_direct_convolution_2.cuh
index 008d5be695..d2040f1f90 100644
--- a/src/include/gridwise_direct_convolution_2.cuh
+++ b/src/include/gridwise_direct_convolution_2.cuh
@@ -46,15 +46,11 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
     constexpr unsigned HiPerBlock = YPerBlock * OutTileSizeH + S - 1;
     constexpr unsigned WiPerBlock = XPerBlock * OutTileSizeW + R - 1;
 
-    constexpr auto in_block_global_desc = make_ConstantTensorDescriptor(
-        Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{}, in_global_desc.GetStrides());
+    constexpr auto in_block_desc =
+        make_ConstantTensorDescriptor(Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{});
 
-    constexpr auto wei_block_global_desc = make_ConstantTensorDescriptor(
-        Sequence<KPerBlock, CPerBlock, S, R>{}, wei_global_desc.GetStrides());
-
-    constexpr auto in_block_desc = make_ConstantTensorDescriptor(in_block_global_desc.GetLengths());
     constexpr auto wei_block_desc =
-        make_ConstantTensorDescriptor(wei_block_global_desc.GetLengths());
+        make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlock, S, R>{});
 
     // shared mem
     constexpr unsigned in_block_size  = in_block_desc.GetElementSpace();
@@ -67,30 +63,19 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
     constexpr unsigned InTileSizeH = OutTileSizeH + S - 1;
     constexpr unsigned InTileSizeW = OutTileSizeW + R - 1;
 
-    constexpr auto in_thread_block_desc = make_ConstantTensorDescriptor(
-        Sequence<NPerThread, CPerThread, InTileSizeH, InTileSizeW>{}, in_block_desc.GetStrides());
-
-    constexpr auto wei_thread_block_desc = make_ConstantTensorDescriptor(
-        Sequence<KPerThread, CPerThread, S, R>{}, wei_block_desc.GetStrides());
-
     constexpr auto in_thread_desc =
-        make_ConstantTensorDescriptor(in_thread_block_desc.GetLengths());
+        make_ConstantTensorDescriptor(Sequence<NPerThread, CPerThread, InTileSizeH, InTileSizeW>{});
+
     constexpr auto wei_thread_desc =
-        make_ConstantTensorDescriptor(wei_thread_block_desc.GetLengths());
+        make_ConstantTensorDescriptor(Sequence<KPerThread, CPerThread, S, R>{});
+
     constexpr auto out_thread_desc =
         get_output_4d_tensor_descriptor(in_thread_desc, wei_thread_desc);
 
-    constexpr auto out_thread_global_desc =
-        make_ConstantTensorDescriptor(out_thread_desc.GetLengths(), out_global_desc.GetStrides());
-
     // register
-    constexpr unsigned in_thread_size  = in_thread_desc.GetElementSpace();
-    constexpr unsigned wei_thread_size = wei_thread_desc.GetElementSpace();
-    constexpr unsigned out_thread_size = out_thread_desc.GetElementSpace();
-
-    TFloat p_in_thread[in_thread_size];
-    TFloat p_wei_thread[wei_thread_size];
-    TFloat p_out_thread[out_thread_size];
+    TFloat p_in_thread[in_thread_desc.GetElementSpace()];
+    TFloat p_wei_thread[wei_thread_desc.GetElementSpace()];
+    TFloat p_out_thread[out_thread_desc.GetElementSpace()];
 
     // divide block work
     constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
@@ -169,54 +154,60 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
     {
         // copy input tensor to LDS
         blockwise_4d_tensor_copy<TFloat,
-                                 decltype(in_block_global_desc),
+                                 decltype(in_global_desc),
                                  decltype(in_block_desc),
-                                 BlockSize>(in_block_global_desc,
+                                 decltype(in_block_desc),
+                                 BlockSize>(in_global_desc,
                                             p_in_global +
                                                 in_global_desc.Get1dIndex(n_block_data_begin,
                                                                           c_block_data_begin,
                                                                           hi_block_data_begin,
                                                                           wi_block_data_begin),
                                             in_block_desc,
-                                            p_in_block);
+                                            p_in_block,
+                                            in_block_desc);
 
         // copy weight tensor to LDS
         blockwise_4d_tensor_copy<TFloat,
-                                 decltype(wei_block_global_desc),
+                                 decltype(wei_global_desc),
+                                 decltype(wei_block_desc),
                                  decltype(wei_block_desc),
                                  BlockSize>(
-            wei_block_global_desc,
+            wei_global_desc,
             p_wei_global + wei_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
             wei_block_desc,
-            p_wei_block);
+            p_wei_block,
+            wei_block_desc);
 
         __syncthreads();
 
         for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
         {
             // copy input tensor into register
-            threadwise_4d_tensor_copy(in_thread_block_desc,
+            threadwise_4d_tensor_copy(in_block_desc,
                                       p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
                                                                             c_thread_data,
                                                                             hi_thread_data_begin,
                                                                             wi_thread_data_begin),
                                       in_thread_desc,
-                                      p_in_thread);
+                                      p_in_thread,
+                                      in_thread_desc);
 
             // copy weight tensor into register
             threadwise_4d_tensor_copy(
-                wei_thread_block_desc,
+                wei_block_desc,
                 p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
                 wei_thread_desc,
-                p_wei_thread);
+                p_wei_thread,
+                wei_thread_desc);
 
             // threadwise convolution
-            threadwise_direct_convolution(in_thread_desc,
-                                          p_in_thread,
-                                          wei_thread_desc,
-                                          p_wei_thread,
-                                          out_thread_desc,
-                                          p_out_thread);
+            threadwise_direct_convolution_1(in_thread_desc,
+                                            p_in_thread,
+                                            wei_thread_desc,
+                                            p_wei_thread,
+                                            out_thread_desc,
+                                            p_out_thread);
         }
     }
 
@@ -224,9 +215,10 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
     threadwise_4d_tensor_copy(
         out_thread_desc,
         p_out_thread,
-        out_thread_global_desc,
+        out_global_desc,
         p_out_global + out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
                                                   k_block_data_begin + k_thread_data_begin,
                                                   ho_block_data_begin + ho_thread_data_begin,
-                                                  wo_block_data_begin + wo_thread_data_begin));
+                                                  wo_block_data_begin + wo_thread_data_begin),
+        out_thread_desc);
 }
diff --git a/src/include/gridwise_direct_convolution_3.cuh b/src/include/gridwise_direct_convolution_3.cuh
new file mode 100644
index 0000000000..f8bf42a011
--- /dev/null
+++ b/src/include/gridwise_direct_convolution_3.cuh
@@ -0,0 +1,208 @@
+#pragma once
+#include "constant_tensor_descriptor.cuh"
+#include "blockwise_tensor_op.cuh"
+#include "blockwise_direct_convolution.cuh"
+#include "threadwise_tensor_op.cuh"
+#include "threadwise_direct_convolution.cuh"
+
+template <class TFloat,
+          class InGlobalDesc,
+          class WeiGlobalDesc,
+          class OutGlobalDesc,
+          unsigned OutTileSizeH,
+          unsigned OutTileSizeW,
+          unsigned NPerBlock,
+          unsigned KPerBlock,
+          unsigned CPerBlock,
+          unsigned YPerBlock,
+          unsigned XPerBlock,
+          unsigned NPerThread,
+          unsigned KPerThread,
+          unsigned CPerThread,
+          unsigned BlockSize,
+          unsigned GridSize>
+__global__ void gridwise_direct_convolution_3(InGlobalDesc,
+                                              TFloat* const __restrict__ p_in_global,
+                                              WeiGlobalDesc,
+                                              TFloat* const __restrict__ p_wei_global,
+                                              OutGlobalDesc,
+                                              TFloat* __restrict__ p_out_global)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    constexpr auto in_global_desc  = InGlobalDesc{};
+    constexpr auto wei_global_desc = WeiGlobalDesc{};
+    constexpr auto out_global_desc = OutGlobalDesc{};
+
+    constexpr unsigned S = wei_global_desc.GetLength(I2);
+    constexpr unsigned R = wei_global_desc.GetLength(I3);
+
+    constexpr unsigned HoPerBlock = OutTileSizeH * YPerBlock;
+    constexpr unsigned WoPerBlock = OutTileSizeW * XPerBlock;
+
+    constexpr unsigned HiPerBlock = YPerBlock * OutTileSizeH + S - 1;
+    constexpr unsigned WiPerBlock = XPerBlock * OutTileSizeW + R - 1;
+
+    constexpr auto in_block_desc =
+        make_ConstantTensorDescriptor(Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{});
+
+    constexpr auto wei_block_desc =
+        make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlock, S, R>{});
+
+    // shared mem
+    constexpr unsigned in_block_size  = in_block_desc.GetElementSpace();
+    constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace();
+
+    __shared__ TFloat p_in_block[in_block_size];
+    __shared__ TFloat p_wei_block[wei_block_size];
+
+    // threadwise tensors
+    constexpr unsigned InTileSizeH = OutTileSizeH + S - 1;
+    constexpr unsigned InTileSizeW = OutTileSizeW + R - 1;
+
+    constexpr auto in_thread_block_desc = make_ConstantTensorDescriptor(
+        Sequence<NPerThread, CPerThread, InTileSizeH, InTileSizeW>{}, in_block_desc.GetStrides());
+
+    constexpr auto wei_thread_block_desc = make_ConstantTensorDescriptor(
+        Sequence<KPerThread, CPerThread, S, R>{}, wei_block_desc.GetStrides());
+
+    constexpr auto out_thread_desc =
+        get_output_4d_tensor_descriptor(in_thread_block_desc, wei_thread_block_desc);
+
+    // register
+    TFloat p_out_thread[out_thread_desc.GetElementSpace()];
+
+    // divide block work
+    constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
+    constexpr unsigned KBlockWork = (out_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
+    constexpr unsigned YBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
+    constexpr unsigned XBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
+
+    const unsigned block_id = blockIdx.x;
+
+    unsigned itmp                  = block_id;
+    const unsigned n_block_work_id = itmp / (KBlockWork * YBlockWork * XBlockWork);
+    itmp -= n_block_work_id * (KBlockWork * YBlockWork * XBlockWork);
+    const unsigned k_block_work_id = itmp / (YBlockWork * XBlockWork);
+    itmp -= k_block_work_id * (YBlockWork * XBlockWork);
+    const unsigned y_block_work_id = itmp / XBlockWork;
+    const unsigned x_block_work_id = itmp - y_block_work_id * XBlockWork;
+
+    const unsigned n_block_data_begin = n_block_work_id * NPerBlock;
+    const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
+    const unsigned y_block_data_begin = y_block_work_id * YPerBlock;
+    const unsigned x_block_data_begin = x_block_work_id * XPerBlock;
+
+    const unsigned ho_block_data_begin = y_block_data_begin * OutTileSizeH;
+    const unsigned wo_block_data_begin = x_block_data_begin * OutTileSizeW;
+
+    const unsigned hi_block_data_begin = ho_block_data_begin; // minus padding
+    const unsigned wi_block_data_begin = wo_block_data_begin; // minus padding
+
+    // divide thread work
+    constexpr unsigned NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread;
+    constexpr unsigned KThreadWork = (KPerBlock + KPerThread - 1) / KPerThread;
+    constexpr unsigned YThreadWork = YPerBlock;
+    constexpr unsigned XThreadWork = XPerBlock;
+
+    const unsigned thread_id = threadIdx.x;
+
+    itmp                            = thread_id;
+    const unsigned n_thread_work_id = itmp / (KThreadWork * YThreadWork * XThreadWork);
+    itmp -= n_thread_work_id * (KThreadWork * YThreadWork * XThreadWork);
+    const unsigned k_thread_work_id = itmp / (YThreadWork * XThreadWork);
+    itmp -= k_thread_work_id * (YThreadWork * XThreadWork);
+    const unsigned y_thread_work_id = itmp / XThreadWork;
+    const unsigned x_thread_work_id = itmp - y_thread_work_id * XThreadWork;
+
+    const unsigned n_thread_data_begin  = n_thread_work_id * NPerThread;
+    const unsigned k_thread_data_begin  = k_thread_work_id * KPerThread;
+    const unsigned ho_thread_data_begin = y_thread_work_id * OutTileSizeH;
+    const unsigned wo_thread_data_begin = x_thread_work_id * OutTileSizeW;
+
+    const unsigned hi_thread_data_begin = ho_thread_data_begin;
+    const unsigned wi_thread_data_begin = wo_thread_data_begin;
+
+#if 0
+    if(threadIdx.x == 0)
+    {
+        print_ConstantTensorDescriptor(in_global_desc, "gridwise_convolution:  in_global_desc: ");
+        print_ConstantTensorDescriptor(wei_global_desc, "gridwise_convolution: wei_global_desc: ");
+        print_ConstantTensorDescriptor(out_global_desc, "gridwise_convolution: out_global_desc: ");
+    }
+
+    printf("threadIdx.x %u \t"
+           "n_thread_data_begin %u, k_thread_data_begin %u, ho_thread_data_begin %u, "
+           "wo_thread_data_begin %u\n",
+           threadIdx.x,
+           n_thread_data_begin,
+           k_thread_data_begin,
+           ho_thread_data_begin,
+           wo_thread_data_begin);
+#endif
+
+    // set threadwise output tensor to 0
+    threadwise_4d_tensor_set_zero(out_thread_desc, p_out_thread);
+
+    for(unsigned c_block_data_begin = 0; c_block_data_begin < in_global_desc.GetLength(I1);
+        c_block_data_begin += CPerBlock, __syncthreads())
+    {
+        // copy input tensor to LDS
+        blockwise_4d_tensor_copy<TFloat,
+                                 decltype(in_global_desc),
+                                 decltype(in_block_desc),
+                                 decltype(in_block_desc),
+                                 BlockSize>(in_global_desc,
+                                            p_in_global +
+                                                in_global_desc.Get1dIndex(n_block_data_begin,
+                                                                          c_block_data_begin,
+                                                                          hi_block_data_begin,
+                                                                          wi_block_data_begin),
+                                            in_block_desc,
+                                            p_in_block,
+                                            in_block_desc);
+
+        // copy weight tensor to LDS
+        blockwise_4d_tensor_copy<TFloat,
+                                 decltype(wei_global_desc),
+                                 decltype(wei_block_desc),
+                                 decltype(wei_block_desc),
+                                 BlockSize>(
+            wei_global_desc,
+            p_wei_global + wei_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
+            wei_block_desc,
+            p_wei_block,
+            wei_block_desc);
+
+        __syncthreads();
+
+        for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
+        {
+            // threadwise convolution
+            threadwise_direct_convolution_2(
+                in_thread_block_desc,
+                p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
+                                                      c_thread_data,
+                                                      hi_thread_data_begin,
+                                                      wi_thread_data_begin),
+                wei_thread_block_desc,
+                p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
+                out_thread_desc,
+                p_out_thread);
+        }
+    }
+
+    // copy output tensor from register to global mem
+    threadwise_4d_tensor_copy(
+        out_thread_desc,
+        p_out_thread,
+        out_global_desc,
+        p_out_global + out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
+                                                  k_block_data_begin + k_thread_data_begin,
+                                                  ho_block_data_begin + ho_thread_data_begin,
+                                                  wo_block_data_begin + wo_thread_data_begin),
+        out_thread_desc);
+}
diff --git a/src/include/threadwise_direct_convolution.cuh b/src/include/threadwise_direct_convolution.cuh
index 1a00dfde51..ffd4f47849 100644
--- a/src/include/threadwise_direct_convolution.cuh
+++ b/src/include/threadwise_direct_convolution.cuh
@@ -1,13 +1,14 @@
 #pragma once
 #include "constant_tensor_descriptor.cuh"
 
+// optimized for scenario if p_in, p_wei, p_out are in register
 template <class TFloat, class InDesc, class WeiDesc, class OutDesc>
-__device__ void threadwise_direct_convolution(InDesc,
-                                              TFloat* const __restrict__ p_in,
-                                              WeiDesc,
-                                              TFloat* const __restrict__ p_wei,
-                                              OutDesc,
-                                              TFloat* __restrict__ p_out)
+__device__ void threadwise_direct_convolution_1(InDesc,
+                                                TFloat* const __restrict__ p_in,
+                                                WeiDesc,
+                                                TFloat* const __restrict__ p_wei,
+                                                OutDesc,
+                                                TFloat* __restrict__ p_out)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
@@ -77,3 +78,117 @@ __device__ void threadwise_direct_convolution(InDesc,
         }
     }
 }
+
+// optimized for scenario where p_in and p_wei are in LDS, p_out is in register
+// break down a non-1x1 convolution into a sequence of 1x1 convolutions,
+// load 1x1 weight into register, and do 1x1 convolution in register.
+template <class TFloat, class InDesc, class WeiDesc, class OutDesc>
+__device__ void threadwise_direct_convolution_2(InDesc,
+                                                TFloat* const __restrict__ p_in,
+                                                WeiDesc,
+                                                TFloat* const __restrict__ p_wei,
+                                                OutDesc,
+                                                TFloat* __restrict__ p_out)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
+
+    constexpr auto in_desc_lds  = InDesc{};
+    constexpr auto wei_desc_lds = WeiDesc{};
+    constexpr auto out_desc_reg = OutDesc{};
+
+    constexpr auto in_desc_reg =
+        make_ConstantTensorDescriptor(Sequence<in_desc_lds.GetLength(I0),
+                                               in_desc_lds.GetLength(I1),
+                                               out_desc_reg.GetLength(I2),
+                                               out_desc_reg.GetLength(I3)>{});
+
+    constexpr auto wei_desc_reg = make_ConstantTensorDescriptor(
+        Sequence<wei_desc_lds.GetLength(I0), wei_desc_lds.GetLength(I1), 1, 1>{});
+
+    TFloat p_in_reg[in_desc_reg.GetElementSpace()];
+    TFloat p_wei_reg[wei_desc_reg.GetElementSpace()];
+
+    constexpr unsigned in_w_new_read = 1;
+
+    constexpr auto in_desc_reg_new_read =
+        make_ConstantTensorDescriptor(Sequence<in_desc_reg.GetLength(I0),
+                                               in_desc_reg.GetLength(I1),
+                                               in_desc_reg.GetLength(I2),
+                                               in_w_new_read>{});
+
+    // loop over vertical direction
+    for(unsigned s = 0; s < wei_desc_lds.GetLength(I2); ++s)
+    {
+#if 1
+        // read first input
+        threadwise_4d_tensor_copy(in_desc_lds,
+                                  p_in + in_desc_lds.Get1dIndex(0, 0, s, 0),
+                                  in_desc_reg,
+                                  p_in_reg,
+                                  in_desc_reg);
+
+        // read first 1x1 weight
+        threadwise_4d_tensor_copy(wei_desc_lds,
+                                  p_wei + wei_desc_lds.Get1dIndex(0, 0, s, 0),
+                                  wei_desc_reg,
+                                  p_wei_reg,
+                                  wei_desc_reg);
+
+        // do first 1x1 conv
+        threadwise_direct_convolution_1(
+            in_desc_reg, p_in_reg, wei_desc_reg, p_wei_reg, out_desc_reg, p_out);
+
+        // loop over horizontal direction
+        for(unsigned r = 1; r < wei_desc_lds.GetLength(I3); ++r)
+        {
+            // read new weight
+            threadwise_4d_tensor_copy(wei_desc_lds,
+                                      p_wei + wei_desc_lds.Get1dIndex(0, 0, s, r),
+                                      wei_desc_reg,
+                                      p_wei_reg,
+                                      wei_desc_reg);
+
+            // shift old input to the left
+            threadwise_4d_tensor_shift_down(in_desc_reg, p_in_reg, I3, Number<in_w_new_read>{});
+
+            // read new input
+            threadwise_4d_tensor_copy(
+                in_desc_lds,
+                p_in + in_desc_lds.Get1dIndex(0, 0, s, in_desc_reg.GetLength(I3) + r - 1),
+                in_desc_reg,
+                p_in_reg +
+                    in_desc_reg.Get1dIndex(0, 0, 0, in_desc_reg.GetLength(I3) - in_w_new_read),
+                in_desc_reg_new_read);
+
+            // do 1x1 conv
+            threadwise_direct_convolution_1(
+                in_desc_reg, p_in_reg, wei_desc_reg, p_wei_reg, out_desc_reg, p_out);
+        }
+#elif 1
+        // loop over horizontal direction
+        for(unsigned r = 0; r < wei_desc_lds.GetLength(I3); ++r)
+        {
+            // read new weight
+            threadwise_4d_tensor_copy(wei_desc_lds,
+                                      p_wei + wei_desc_lds.Get1dIndex(0, 0, s, r),
+                                      wei_desc_reg,
+                                      p_wei_reg,
+                                      wei_desc_reg);
+
+            // read new input
+            threadwise_4d_tensor_copy(in_desc_lds,
+                                      p_in + in_desc_lds.Get1dIndex(0, 0, s, r),
+                                      in_desc_reg,
+                                      p_in_reg,
+                                      in_desc_reg);
+
+            // do 1x1 conv
+            threadwise_direct_convolution_1(
+                in_desc_reg, p_in_reg, wei_desc_reg, p_wei_reg, out_desc_reg, p_out);
+        }
+#endif
+    }
+}
diff --git a/src/include/threadwise_tensor_op.cuh b/src/include/threadwise_tensor_op.cuh
index 99701d795f..17cf533dec 100644
--- a/src/include/threadwise_tensor_op.cuh
+++ b/src/include/threadwise_tensor_op.cuh
@@ -35,41 +35,41 @@ __device__ void threadwise_4d_tensor_pointwise_op_unary(Desc, TFloat* __restrict
     }
 }
 
-template <class TFloat, class SrcDesc, class DstDesc, class F>
+template <class TFloat, class DescA, class DescB, class DescRef, class F>
 __device__ void threadwise_4d_tensor_pointwise_op_binary(
-    SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f)
+    DescA, TFloat* const __restrict__ p_a, DescB, TFloat* __restrict__ p_b, DescRef, F f)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
     constexpr auto I2 = Number<2>{};
     constexpr auto I3 = Number<3>{};
 
-    constexpr auto src_desc = SrcDesc{};
-    constexpr auto dst_desc = DstDesc{};
-
-    static_assert(is_same<decltype(src_desc.GetLengths()), decltype(dst_desc.GetLengths())>::value);
+    constexpr auto desc_a   = DescA{};
+    constexpr auto desc_b   = DescB{};
+    constexpr auto desc_ref = DescRef{};
 
 #if 0
     if(threadIdx.x == 0)
     {
-        print_ConstantTensorDescriptor(src_desc, "threadwise_4d_tensor_op_binary: src_desc: ");
-        print_ConstantTensorDescriptor(dst_desc, "threadwise_4d_tensor_op_binary: dst_desc: ");
+        print_ConstantTensorDescriptor(desc_a, "threadwise_4d_tensor_op_binary: desc_a: ");
+        print_ConstantTensorDescriptor(desc_b, "threadwise_4d_tensor_op_binary: desc_b: ");
+        print_ConstantTensorDescriptor(desc_ref, "threadwise_4d_tensor_op_binary: desc_ref: ");
     }
 #endif
 
-    for(unsigned did0 = 0; did0 < src_desc.GetLength(I0); ++did0)
+    for(unsigned did0 = 0; did0 < desc_ref.GetLength(I0); ++did0)
     {
-        for(unsigned did1 = 0; did1 < src_desc.GetLength(I1); ++did1)
+        for(unsigned did1 = 0; did1 < desc_ref.GetLength(I1); ++did1)
         {
-            for(unsigned did2 = 0; did2 < src_desc.GetLength(I2); ++did2)
+            for(unsigned did2 = 0; did2 < desc_ref.GetLength(I2); ++did2)
             {
-                for(unsigned did3 = 0; did3 < src_desc.GetLength(I3); ++did3)
+                for(unsigned did3 = 0; did3 < desc_ref.GetLength(I3); ++did3)
                 {
-                    const unsigned sindex = src_desc.Get1dIndex(did0, did1, did2, did3);
+                    const unsigned aindex = desc_a.Get1dIndex(did0, did1, did2, did3);
 
-                    const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);
+                    const unsigned bindex = desc_b.Get1dIndex(did0, did1, did2, did3);
 
-                    f(p_src[sindex], p_dst[dindex]);
+                    f(p_a[aindex], p_b[bindex]);
                 }
             }
         }
@@ -85,20 +85,18 @@ __device__ void threadwise_4d_tensor_set_zero(Desc, TFloat* __restrict__ p)
         Desc{}, p, f_set_zero);
 }
 
-template <class TFloat, class SrcDesc, class DstDesc>
-__device__ void threadwise_4d_tensor_copy(SrcDesc,
-                                          TFloat* const __restrict__ p_src,
-                                          DstDesc,
-                                          TFloat* __restrict__ p_dst)
+template <class TFloat, class SrcDesc, class DstDesc, class RefDesc>
+__device__ void threadwise_4d_tensor_copy(
+    SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, RefDesc)
 {
     auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
 
-    threadwise_4d_tensor_pointwise_op_binary<TFloat, SrcDesc, DstDesc, decltype(f_copy)>(
-        SrcDesc{}, p_src, DstDesc{}, p_dst, f_copy);
+    threadwise_4d_tensor_pointwise_op_binary<TFloat, SrcDesc, DstDesc, RefDesc, decltype(f_copy)>(
+        SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, f_copy);
 }
 
-template <class TFloat, class Desc, class IDim>
-__device__ void threadwise_4d_tensor_shift_down(Desc, TFloat* __restrict__ p, IDim, unsigned shift)
+template <class TFloat, class Desc, class IDim, class NShift>
+__device__ void threadwise_4d_tensor_shift_down(Desc, TFloat* __restrict__ p, IDim, NShift)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
@@ -114,17 +112,19 @@ __device__ void threadwise_4d_tensor_shift_down(Desc, TFloat* __restrict__ p, ID
     }
 #endif
 
-    const unsigned did0_end =
-        is_same<decltype(I0), IDim>::value ? desc.GetLength(I0) - shift : desc.GetLength(I0);
+    constexpr unsigned nshift = NShift::mValue;
 
-    const unsigned did1_end =
-        is_same<decltype(I1), IDim>::value ? desc.GetLength(I1) - shift : desc.GetLength(I1);
+    constexpr unsigned did0_end =
+        is_same<decltype(I0), IDim>::value ? desc.GetLength(I0) - nshift : desc.GetLength(I0);
 
-    const unsigned did2_end =
-        is_same<decltype(I2), IDim>::value ? desc.GetLength(I2) - shift : desc.GetLength(I2);
+    constexpr unsigned did1_end =
+        is_same<decltype(I1), IDim>::value ? desc.GetLength(I1) - nshift : desc.GetLength(I1);
 
-    const unsigned did3_end =
-        is_same<decltype(I3), IDim>::value ? desc.GetLength(I3) - shift : desc.GetLength(I3);
+    constexpr unsigned did2_end =
+        is_same<decltype(I2), IDim>::value ? desc.GetLength(I2) - nshift : desc.GetLength(I2);
+
+    constexpr unsigned did3_end =
+        is_same<decltype(I3), IDim>::value ? desc.GetLength(I3) - nshift : desc.GetLength(I3);
 
     for(unsigned did0 = 0; did0 < did0_end; ++did0)
     {
@@ -136,11 +136,11 @@ __device__ void threadwise_4d_tensor_shift_down(Desc, TFloat* __restrict__ p, ID
                 {
                     const unsigned dindex = desc.Get1dIndex(did0, did1, did2, did3);
 
-                    const unsigned sindex = dindex + shift * desc.GetStride(IDim{});
+                    const unsigned sindex = dindex + nshift * desc.GetStride(IDim{});
 
                     p[dindex] = p[sindex];
                 }
             }
         }
     }
-}
\ No newline at end of file
+}