refactor

2026-05-12 17:26:00 +00:00 · 2018-12-28 17:25:07 -06:00
parent 39775d484c
commit 057c10e57e
7 changed files with 143 additions and 459 deletions
--- a/src/include/blockwise_direct_convolution.cuh
+++ b/src/include/blockwise_direct_convolution.cuh
@@ -58,8 +58,8 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
    constexpr auto wei_thread_desc =
        make_ConstantTensorDescriptor(Sequence<KPerThread, CPerThread, S, R>{});

-    constexpr auto out_thread_desc = make_ConstantTensorDescriptor(
-        Sequence<NPerThread, KPerThread, OutTileSizeH, OutTileSizeW>{});
+    constexpr auto out_thread_desc =
+        get_output_4d_tensor_descriptor(in_thread_desc, wei_thread_desc);

    constexpr auto in_thread_block_desc =
        make_ConstantTensorDescriptor(in_thread_desc.GetLengths(), in_block_desc.GetStrides());
@@ -92,11 +92,9 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
        unsigned hi_thread_data_begin = ho_thread_data_begin; // minus padding
        unsigned wi_thread_data_begin = wo_thread_data_begin; // minus padding

-        TFloat p_in_thread[in_thread_desc.GetElementSpace()];
-        TFloat p_wei_thread[wei_thread_desc.GetElementSpace()];
        TFloat p_out_thread[out_thread_desc.GetElementSpace()];

-        threadwise_4d_tensor_copy(out_thread_block_desc,
+        threadwise_4d_tensor_copy(out_block_desc,
                                  p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
                                                                          k_thread_data_begin,
                                                                          ho_thread_data_begin,
@@ -108,38 +106,24 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
        for(unsigned c_thread_data_begin = 0; c_thread_data_begin < in_block_desc.GetLength(I1);
            c_thread_data_begin += CPerThread)
        {
-            // copy input into register
-            threadwise_4d_tensor_copy(in_thread_block_desc,
-                                      p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
-                                                                            c_thread_data_begin,
-                                                                            hi_thread_data_begin,
-                                                                            wi_thread_data_begin),
-                                      in_thread_desc,
-                                      p_in_thread,
-                                      in_thread_desc);
-
-            // copy weight into register
-            threadwise_4d_tensor_copy(
+            // threadwise convolution
+            threadwise_direct_convolution_2(
+                in_thread_block_desc,
+                p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
+                                                      c_thread_data_begin,
+                                                      hi_thread_data_begin,
+                                                      wi_thread_data_begin),
                wei_thread_block_desc,
                p_wei_block +
                    wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data_begin, 0, 0),
-                wei_thread_desc,
-                p_wei_thread,
-                wei_thread_desc);
-
-            // threadwise convolution
-            threadwise_direct_convolution_2(in_thread_desc,
-                                            p_in_thread,
-                                            wei_thread_desc,
-                                            p_wei_thread,
-                                            out_thread_desc,
-                                            p_out_thread);
+                out_thread_desc,
+                p_out_thread);
        }

        // copy output into LDS
        threadwise_4d_tensor_copy(out_thread_desc,
                                  p_out_thread,
-                                  out_thread_block_desc,
+                                  out_block_desc,
                                  p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
                                                                          k_thread_data_begin,
                                                                          ho_thread_data_begin,
--- a/src/include/gridwise_direct_convolution_1.cuh
+++ b/src/include/gridwise_direct_convolution_1.cuh
@@ -49,18 +49,20 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
    constexpr unsigned YBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
    constexpr unsigned XBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;

-    constexpr auto in_block_src_desc = make_ConstantTensorDescriptor(
+    constexpr auto in_block_global_desc = make_ConstantTensorDescriptor(
        Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{}, in_global_desc.GetStrides());

-    constexpr auto wei_block_src_desc = make_ConstantTensorDescriptor(
+    constexpr auto wei_block_global_desc = make_ConstantTensorDescriptor(
        Sequence<KPerBlock, CPerBlock, S, R>{}, wei_global_desc.GetStrides());

-    constexpr auto out_block_src_desc = make_ConstantTensorDescriptor(
+    constexpr auto out_block_global_desc = make_ConstantTensorDescriptor(
        Sequence<NPerBlock, KPerBlock, HoPerBlock, WoPerBlock>{}, out_global_desc.GetStrides());

-    constexpr auto in_block_desc  = make_ConstantTensorDescriptor(in_block_src_desc.GetLengths());
-    constexpr auto wei_block_desc = make_ConstantTensorDescriptor(wei_block_src_desc.GetLengths());
-    constexpr auto out_block_desc = make_ConstantTensorDescriptor(out_block_src_desc.GetLengths());
+    constexpr auto in_block_desc = make_ConstantTensorDescriptor(in_block_global_desc.GetLengths());
+    constexpr auto wei_block_desc =
+        make_ConstantTensorDescriptor(wei_block_global_desc.GetLengths());
+    constexpr auto out_block_desc =
+        make_ConstantTensorDescriptor(out_block_global_desc.GetLengths());

    constexpr unsigned in_block_size  = in_block_desc.GetElementSpace();
    constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace();
@@ -97,9 +99,9 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
        print_ConstantTensorDescriptor( in_global_desc, "gridwise_convolution:  in_global_desc: ");
        print_ConstantTensorDescriptor(wei_global_desc, "gridwise_convolution: wei_global_desc: ");
        print_ConstantTensorDescriptor(out_global_desc, "gridwise_convolution: out_global_desc: ");
-        print_ConstantTensorDescriptor( in_block_src_desc, "gridwise_convolution:  in_block_src_desc: ");
-        print_ConstantTensorDescriptor(wei_block_src_desc, "gridwise_convolution: wei_block_src_desc: ");
-        print_ConstantTensorDescriptor(out_block_src_desc, "gridwise_convolution: out_block_src_desc: ");
+        print_ConstantTensorDescriptor( in_block_global_desc, "gridwise_convolution:  in_block_global_desc: ");
+        print_ConstantTensorDescriptor(wei_block_global_desc, "gridwise_convolution: wei_block_global_desc: ");
+        print_ConstantTensorDescriptor(out_block_global_desc, "gridwise_convolution: out_block_global_desc: ");
        print_ConstantTensorDescriptor( in_block_desc, "gridwise_convolution:  in_block_desc: ");
        print_ConstantTensorDescriptor(wei_block_desc, "gridwise_convolution: wei_block_desc: ");
        print_ConstantTensorDescriptor(out_block_desc, "gridwise_convolution: out_block_desc: ");
@@ -128,10 +130,10 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
    {
        // copy input tensor to LDS
        blockwise_4d_tensor_copy<TFloat,
-                                 decltype(in_block_src_desc),
+                                 decltype(in_block_global_desc),
                                 decltype(in_block_desc),
                                 decltype(in_block_desc),
-                                 BlockSize>(in_block_src_desc,
+                                 BlockSize>(in_block_global_desc,
                                            p_in_global +
                                                in_global_desc.Get1dIndex(n_block_work_begin,
                                                                          c_block_work_begin,
@@ -143,11 +145,11 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,

        // copy weight tensor to LDS
        blockwise_4d_tensor_copy<TFloat,
-                                 decltype(wei_block_src_desc),
+                                 decltype(wei_block_global_desc),
                                 decltype(wei_block_desc),
                                 decltype(wei_block_desc),
                                 BlockSize>(
-            wei_block_src_desc,
+            wei_block_global_desc,
            p_wei_global + wei_global_desc.Get1dIndex(k_block_work_begin, c_block_work_begin, 0, 0),
            wei_block_desc,
            p_wei_block,
@@ -174,12 +176,12 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
    // copy output tensor from LDS to device mem
    blockwise_4d_tensor_copy<TFloat,
                             decltype(out_block_desc),
-                             decltype(out_block_src_desc),
+                             decltype(out_block_global_desc),
                             decltype(out_block_desc),
                             BlockSize>(
        out_block_desc,
        p_out_block,
-        out_block_src_desc,
+        out_block_global_desc,
        p_out_global +
            out_global_desc.Get1dIndex(
                n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin),
--- a/src/include/gridwise_direct_convolution_2.cuh
+++ b/src/include/gridwise_direct_convolution_2.cuh
@@ -63,18 +63,16 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
    constexpr unsigned InTileSizeH = OutTileSizeH + S - 1;
    constexpr unsigned InTileSizeW = OutTileSizeW + R - 1;

-    constexpr auto in_thread_desc =
-        make_ConstantTensorDescriptor(Sequence<NPerThread, CPerThread, InTileSizeH, InTileSizeW>{});
+    constexpr auto in_thread_block_desc = make_ConstantTensorDescriptor(
+        Sequence<NPerThread, CPerThread, InTileSizeH, InTileSizeW>{}, in_block_desc.GetStrides());

-    constexpr auto wei_thread_desc =
-        make_ConstantTensorDescriptor(Sequence<KPerThread, CPerThread, S, R>{});
+    constexpr auto wei_thread_block_desc = make_ConstantTensorDescriptor(
+        Sequence<KPerThread, CPerThread, S, R>{}, wei_block_desc.GetStrides());

    constexpr auto out_thread_desc =
-        get_output_4d_tensor_descriptor(in_thread_desc, wei_thread_desc);
+        get_output_4d_tensor_descriptor(in_thread_block_desc, wei_thread_block_desc);

    // register
-    TFloat p_in_thread[in_thread_desc.GetElementSpace()];
-    TFloat p_wei_thread[wei_thread_desc.GetElementSpace()];
    TFloat p_out_thread[out_thread_desc.GetElementSpace()];

    // divide block work
@@ -183,31 +181,30 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,

        for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
        {
-            // copy input tensor into register
-            threadwise_4d_tensor_copy(in_block_desc,
-                                      p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
-                                                                            c_thread_data,
-                                                                            hi_thread_data_begin,
-                                                                            wi_thread_data_begin),
-                                      in_thread_desc,
-                                      p_in_thread,
-                                      in_thread_desc);
-
-            // copy weight tensor into register
-            threadwise_4d_tensor_copy(
-                wei_block_desc,
-                p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
-                wei_thread_desc,
-                p_wei_thread,
-                wei_thread_desc);
-
            // threadwise convolution
-            threadwise_direct_convolution_1(in_thread_desc,
-                                            p_in_thread,
-                                            wei_thread_desc,
-                                            p_wei_thread,
-                                            out_thread_desc,
-                                            p_out_thread);
+#if 1
+            threadwise_direct_convolution_2(
+                in_thread_block_desc,
+                p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
+                                                      c_thread_data,
+                                                      hi_thread_data_begin,
+                                                      wi_thread_data_begin),
+                wei_thread_block_desc,
+                p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
+                out_thread_desc,
+                p_out_thread);
+#elif 1
+            threadwise_direct_convolution_3(
+                in_thread_block_desc,
+                p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
+                                                      c_thread_data,
+                                                      hi_thread_data_begin,
+                                                      wi_thread_data_begin),
+                wei_thread_block_desc,
+                p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
+                out_thread_desc,
+                p_out_thread);
+#endif
        }
    }

--- a/src/include/gridwise_direct_convolution_3.cuh
+++ b/src/include/gridwise_direct_convolution_3.cuh
@@ -1,208 +0,0 @@
-#pragma once
-#include "constant_tensor_descriptor.cuh"
-#include "blockwise_tensor_op.cuh"
-#include "blockwise_direct_convolution.cuh"
-#include "threadwise_tensor_op.cuh"
-#include "threadwise_direct_convolution.cuh"
-
-template <class TFloat,
-          class InGlobalDesc,
-          class WeiGlobalDesc,
-          class OutGlobalDesc,
-          unsigned OutTileSizeH,
-          unsigned OutTileSizeW,
-          unsigned NPerBlock,
-          unsigned KPerBlock,
-          unsigned CPerBlock,
-          unsigned YPerBlock,
-          unsigned XPerBlock,
-          unsigned NPerThread,
-          unsigned KPerThread,
-          unsigned CPerThread,
-          unsigned BlockSize,
-          unsigned GridSize>
-__global__ void gridwise_direct_convolution_3(InGlobalDesc,
-                                              TFloat* const __restrict__ p_in_global,
-                                              WeiGlobalDesc,
-                                              TFloat* const __restrict__ p_wei_global,
-                                              OutGlobalDesc,
-                                              TFloat* __restrict__ p_out_global)
-{
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-
-    constexpr auto in_global_desc  = InGlobalDesc{};
-    constexpr auto wei_global_desc = WeiGlobalDesc{};
-    constexpr auto out_global_desc = OutGlobalDesc{};
-
-    constexpr unsigned S = wei_global_desc.GetLength(I2);
-    constexpr unsigned R = wei_global_desc.GetLength(I3);
-
-    constexpr unsigned HoPerBlock = OutTileSizeH * YPerBlock;
-    constexpr unsigned WoPerBlock = OutTileSizeW * XPerBlock;
-
-    constexpr unsigned HiPerBlock = YPerBlock * OutTileSizeH + S - 1;
-    constexpr unsigned WiPerBlock = XPerBlock * OutTileSizeW + R - 1;
-
-    constexpr auto in_block_desc =
-        make_ConstantTensorDescriptor(Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{});
-
-    constexpr auto wei_block_desc =
-        make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlock, S, R>{});
-
-    // shared mem
-    constexpr unsigned in_block_size  = in_block_desc.GetElementSpace();
-    constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace();
-
-    __shared__ TFloat p_in_block[in_block_size];
-    __shared__ TFloat p_wei_block[wei_block_size];
-
-    // threadwise tensors
-    constexpr unsigned InTileSizeH = OutTileSizeH + S - 1;
-    constexpr unsigned InTileSizeW = OutTileSizeW + R - 1;
-
-    constexpr auto in_thread_block_desc = make_ConstantTensorDescriptor(
-        Sequence<NPerThread, CPerThread, InTileSizeH, InTileSizeW>{}, in_block_desc.GetStrides());
-
-    constexpr auto wei_thread_block_desc = make_ConstantTensorDescriptor(
-        Sequence<KPerThread, CPerThread, S, R>{}, wei_block_desc.GetStrides());
-
-    constexpr auto out_thread_desc =
-        get_output_4d_tensor_descriptor(in_thread_block_desc, wei_thread_block_desc);
-
-    // register
-    TFloat p_out_thread[out_thread_desc.GetElementSpace()];
-
-    // divide block work
-    constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
-    constexpr unsigned KBlockWork = (out_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
-    constexpr unsigned YBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
-    constexpr unsigned XBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
-
-    const unsigned block_id = blockIdx.x;
-
-    unsigned itmp                  = block_id;
-    const unsigned n_block_work_id = itmp / (KBlockWork * YBlockWork * XBlockWork);
-    itmp -= n_block_work_id * (KBlockWork * YBlockWork * XBlockWork);
-    const unsigned k_block_work_id = itmp / (YBlockWork * XBlockWork);
-    itmp -= k_block_work_id * (YBlockWork * XBlockWork);
-    const unsigned y_block_work_id = itmp / XBlockWork;
-    const unsigned x_block_work_id = itmp - y_block_work_id * XBlockWork;
-
-    const unsigned n_block_data_begin = n_block_work_id * NPerBlock;
-    const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
-    const unsigned y_block_data_begin = y_block_work_id * YPerBlock;
-    const unsigned x_block_data_begin = x_block_work_id * XPerBlock;
-
-    const unsigned ho_block_data_begin = y_block_data_begin * OutTileSizeH;
-    const unsigned wo_block_data_begin = x_block_data_begin * OutTileSizeW;
-
-    const unsigned hi_block_data_begin = ho_block_data_begin; // minus padding
-    const unsigned wi_block_data_begin = wo_block_data_begin; // minus padding
-
-    // divide thread work
-    constexpr unsigned NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread;
-    constexpr unsigned KThreadWork = (KPerBlock + KPerThread - 1) / KPerThread;
-    constexpr unsigned YThreadWork = YPerBlock;
-    constexpr unsigned XThreadWork = XPerBlock;
-
-    const unsigned thread_id = threadIdx.x;
-
-    itmp                            = thread_id;
-    const unsigned n_thread_work_id = itmp / (KThreadWork * YThreadWork * XThreadWork);
-    itmp -= n_thread_work_id * (KThreadWork * YThreadWork * XThreadWork);
-    const unsigned k_thread_work_id = itmp / (YThreadWork * XThreadWork);
-    itmp -= k_thread_work_id * (YThreadWork * XThreadWork);
-    const unsigned y_thread_work_id = itmp / XThreadWork;
-    const unsigned x_thread_work_id = itmp - y_thread_work_id * XThreadWork;
-
-    const unsigned n_thread_data_begin  = n_thread_work_id * NPerThread;
-    const unsigned k_thread_data_begin  = k_thread_work_id * KPerThread;
-    const unsigned ho_thread_data_begin = y_thread_work_id * OutTileSizeH;
-    const unsigned wo_thread_data_begin = x_thread_work_id * OutTileSizeW;
-
-    const unsigned hi_thread_data_begin = ho_thread_data_begin;
-    const unsigned wi_thread_data_begin = wo_thread_data_begin;
-
-#if 0
-    if(threadIdx.x == 0)
-    {
-        print_ConstantTensorDescriptor(in_global_desc, "gridwise_convolution:  in_global_desc: ");
-        print_ConstantTensorDescriptor(wei_global_desc, "gridwise_convolution: wei_global_desc: ");
-        print_ConstantTensorDescriptor(out_global_desc, "gridwise_convolution: out_global_desc: ");
-    }
-
-    printf("threadIdx.x %u \t"
-           "n_thread_data_begin %u, k_thread_data_begin %u, ho_thread_data_begin %u, "
-           "wo_thread_data_begin %u\n",
-           threadIdx.x,
-           n_thread_data_begin,
-           k_thread_data_begin,
-           ho_thread_data_begin,
-           wo_thread_data_begin);
-#endif
-
-    // set threadwise output tensor to 0
-    threadwise_4d_tensor_set_zero(out_thread_desc, p_out_thread);
-
-    for(unsigned c_block_data_begin = 0; c_block_data_begin < in_global_desc.GetLength(I1);
-        c_block_data_begin += CPerBlock, __syncthreads())
-    {
-        // copy input tensor to LDS
-        blockwise_4d_tensor_copy<TFloat,
-                                 decltype(in_global_desc),
-                                 decltype(in_block_desc),
-                                 decltype(in_block_desc),
-                                 BlockSize>(in_global_desc,
-                                            p_in_global +
-                                                in_global_desc.Get1dIndex(n_block_data_begin,
-                                                                          c_block_data_begin,
-                                                                          hi_block_data_begin,
-                                                                          wi_block_data_begin),
-                                            in_block_desc,
-                                            p_in_block,
-                                            in_block_desc);
-
-        // copy weight tensor to LDS
-        blockwise_4d_tensor_copy<TFloat,
-                                 decltype(wei_global_desc),
-                                 decltype(wei_block_desc),
-                                 decltype(wei_block_desc),
-                                 BlockSize>(
-            wei_global_desc,
-            p_wei_global + wei_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
-            wei_block_desc,
-            p_wei_block,
-            wei_block_desc);
-
-        __syncthreads();
-
-        for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
-        {
-            // threadwise convolution
-            threadwise_direct_convolution_2(
-                in_thread_block_desc,
-                p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
-                                                      c_thread_data,
-                                                      hi_thread_data_begin,
-                                                      wi_thread_data_begin),
-                wei_thread_block_desc,
-                p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
-                out_thread_desc,
-                p_out_thread);
-        }
-    }
-
-    // copy output tensor from register to global mem
-    threadwise_4d_tensor_copy(
-        out_thread_desc,
-        p_out_thread,
-        out_global_desc,
-        p_out_global + out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
-                                                  k_block_data_begin + k_thread_data_begin,
-                                                  ho_block_data_begin + ho_thread_data_begin,
-                                                  wo_block_data_begin + wo_thread_data_begin),
-        out_thread_desc);
-}
--- a/src/include/threadwise_direct_convolution.cuh
+++ b/src/include/threadwise_direct_convolution.cuh
@@ -79,11 +79,43 @@ __device__ void threadwise_direct_convolution_1(InDesc,
    }
 }

+// Optimized for the scenario where p_in and p_wei are in LDS and p_out is in register.
+// Copies in and wei into local register buffers, then calls threadwise_direct_convolution_1.
+template <class TFloat, class InDesc, class WeiDesc, class OutDesc>
+__device__ void threadwise_direct_convolution_2(InDesc,
+                                                TFloat* const __restrict__ p_in,
+                                                WeiDesc,
+                                                TFloat* const __restrict__ p_wei,
+                                                OutDesc,
+                                                TFloat* __restrict__ p_out)
+{
+    constexpr auto in_desc  = InDesc{};
+    constexpr auto wei_desc = WeiDesc{};
+    constexpr auto out_desc = OutDesc{};
+
+    constexpr auto in_reg_desc  = make_ConstantTensorDescriptor(in_desc.GetLengths());
+    constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(wei_desc.GetLengths());
+
+    // local buffers for in and wei (intended as register storage), sized by the *_reg_desc above
+    TFloat p_in_reg[in_reg_desc.GetElementSpace()];
+    TFloat p_wei_reg[wei_reg_desc.GetElementSpace()];
+
+    // copy input tensor from p_in (laid out per in_desc) into p_in_reg
+    threadwise_4d_tensor_copy(in_desc, p_in, in_reg_desc, p_in_reg, in_reg_desc);
+
+    // copy weight tensor from p_wei (laid out per wei_desc) into p_wei_reg
+    threadwise_4d_tensor_copy(wei_desc, p_wei, wei_reg_desc, p_wei_reg, wei_reg_desc);
+
+    // do convolution on the local copies, with p_out (out_desc) as the output
+    threadwise_direct_convolution_1(
+        in_reg_desc, p_in_reg, wei_reg_desc, p_wei_reg, out_desc, p_out);
+}
+
 // optimized for scenario where p_in and p_wei are in LDS, p_out is in register
 // break down a non-1x1 convolution into a sequence of 1x1 convolutions,
 // load 1x1 weight into register, and do 1x1 convolution in register.
 template <class TFloat, class InDesc, class WeiDesc, class OutDesc>
-__device__ void threadwise_direct_convolution_2(InDesc,
+__device__ void threadwise_direct_convolution_3(InDesc,
                                                TFloat* const __restrict__ p_in,
                                                WeiDesc,
                                                TFloat* const __restrict__ p_wei,
@@ -95,100 +127,100 @@ __device__ void threadwise_direct_convolution_2(InDesc,
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

-    constexpr auto in_desc_lds  = InDesc{};
-    constexpr auto wei_desc_lds = WeiDesc{};
-    constexpr auto out_desc_reg = OutDesc{};
+    constexpr auto in_desc  = InDesc{};
+    constexpr auto wei_desc = WeiDesc{};
+    constexpr auto out_desc = OutDesc{};

-    constexpr auto in_desc_reg =
-        make_ConstantTensorDescriptor(Sequence<in_desc_lds.GetLength(I0),
-                                               in_desc_lds.GetLength(I1),
-                                               out_desc_reg.GetLength(I2),
-                                               out_desc_reg.GetLength(I3)>{});
+    constexpr auto in_reg_desc = make_ConstantTensorDescriptor(Sequence<in_desc.GetLength(I0),
+                                                                        in_desc.GetLength(I1),
+                                                                        out_desc.GetLength(I2),
+                                                                        out_desc.GetLength(I3)>{});

-    constexpr auto wei_desc_reg = make_ConstantTensorDescriptor(
-        Sequence<wei_desc_lds.GetLength(I0), wei_desc_lds.GetLength(I1), 1, 1>{});
+    constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(
+        Sequence<wei_desc.GetLength(I0), wei_desc.GetLength(I1), 1, 1>{});

-    TFloat p_in_reg[in_desc_reg.GetElementSpace()];
-    TFloat p_wei_reg[wei_desc_reg.GetElementSpace()];
+    TFloat p_in_reg[in_reg_desc.GetElementSpace()];
+    TFloat p_wei_reg[wei_reg_desc.GetElementSpace()];

    constexpr unsigned in_w_new_read = 1;

    constexpr auto in_desc_reg_new_read =
-        make_ConstantTensorDescriptor(Sequence<in_desc_reg.GetLength(I0),
-                                               in_desc_reg.GetLength(I1),
-                                               in_desc_reg.GetLength(I2),
+        make_ConstantTensorDescriptor(Sequence<in_reg_desc.GetLength(I0),
+                                               in_reg_desc.GetLength(I1),
+                                               in_reg_desc.GetLength(I2),
                                               in_w_new_read>{});

+#if 0
    // loop over vertical direction
-    for(unsigned s = 0; s < wei_desc_lds.GetLength(I2); ++s)
+    for(unsigned s = 0; s < wei_desc.GetLength(I2); ++s)
    {
-#if 1
        // read first input
-        threadwise_4d_tensor_copy(in_desc_lds,
-                                  p_in + in_desc_lds.Get1dIndex(0, 0, s, 0),
-                                  in_desc_reg,
+        threadwise_4d_tensor_copy(in_desc,
+                                  p_in + in_desc.Get1dIndex(0, 0, s, 0),
+                                  in_reg_desc,
                                  p_in_reg,
-                                  in_desc_reg);
+                                  in_reg_desc);

        // read first 1x1 weight
-        threadwise_4d_tensor_copy(wei_desc_lds,
-                                  p_wei + wei_desc_lds.Get1dIndex(0, 0, s, 0),
-                                  wei_desc_reg,
+        threadwise_4d_tensor_copy(wei_desc,
+                                  p_wei + wei_desc.Get1dIndex(0, 0, s, 0),
+                                  wei_reg_desc,
                                  p_wei_reg,
-                                  wei_desc_reg);
+                                  wei_reg_desc);

        // do first 1x1 conv
        threadwise_direct_convolution_1(
-            in_desc_reg, p_in_reg, wei_desc_reg, p_wei_reg, out_desc_reg, p_out);
+            in_reg_desc, p_in_reg, wei_reg_desc, p_wei_reg, out_desc, p_out);

        // loop over horizontal direction
-        for(unsigned r = 1; r < wei_desc_lds.GetLength(I3); ++r)
+        for(unsigned r = 1; r < wei_desc.GetLength(I3); ++r)
        {
            // read new weight
-            threadwise_4d_tensor_copy(wei_desc_lds,
-                                      p_wei + wei_desc_lds.Get1dIndex(0, 0, s, r),
-                                      wei_desc_reg,
+            threadwise_4d_tensor_copy(wei_desc,
+                                      p_wei + wei_desc.Get1dIndex(0, 0, s, r),
+                                      wei_reg_desc,
                                      p_wei_reg,
-                                      wei_desc_reg);
+                                      wei_reg_desc);

            // shift old input to the left
-            threadwise_4d_tensor_shift_down(in_desc_reg, p_in_reg, I3, Number<in_w_new_read>{});
+            threadwise_4d_tensor_shift_down(in_reg_desc, p_in_reg, I3, Number<in_w_new_read>{});

            // read new input
            threadwise_4d_tensor_copy(
-                in_desc_lds,
-                p_in + in_desc_lds.Get1dIndex(0, 0, s, in_desc_reg.GetLength(I3) + r - 1),
-                in_desc_reg,
+                in_desc,
+                p_in + in_desc.Get1dIndex(0, 0, s, r + in_reg_desc.GetLength(I3) - 1),
+                in_reg_desc,
                p_in_reg +
-                    in_desc_reg.Get1dIndex(0, 0, 0, in_desc_reg.GetLength(I3) - in_w_new_read),
+                    in_reg_desc.Get1dIndex(0, 0, 0, in_reg_desc.GetLength(I3) - in_w_new_read),
                in_desc_reg_new_read);

            // do 1x1 conv
            threadwise_direct_convolution_1(
-                in_desc_reg, p_in_reg, wei_desc_reg, p_wei_reg, out_desc_reg, p_out);
+                in_reg_desc, p_in_reg, wei_reg_desc, p_wei_reg, out_desc, p_out);
        }
+    }
 #elif 1
+    // loop over vertical direction
+    for(unsigned s = 0; s < wei_desc.GetLength(I2); ++s)
+    {
        // loop over horizontal direction
-        for(unsigned r = 0; r < wei_desc_lds.GetLength(I3); ++r)
+        for(unsigned r = 0; r < wei_desc.GetLength(I3); ++r)
        {
            // read new weight
-            threadwise_4d_tensor_copy(wei_desc_lds,
-                                      p_wei + wei_desc_lds.Get1dIndex(0, 0, s, r),
-                                      wei_desc_reg,
+            threadwise_4d_tensor_copy(wei_desc,
+                                      p_wei + wei_desc.Get1dIndex(0, 0, s, r),
+                                      wei_reg_desc,
                                      p_wei_reg,
-                                      wei_desc_reg);
+                                      wei_reg_desc);

            // read new input
-            threadwise_4d_tensor_copy(in_desc_lds,
-                                      p_in + in_desc_lds.Get1dIndex(0, 0, s, r),
-                                      in_desc_reg,
-                                      p_in_reg,
-                                      in_desc_reg);
+            threadwise_4d_tensor_copy(
+                in_desc, p_in + in_desc.Get1dIndex(0, 0, s, r), in_reg_desc, p_in_reg, in_reg_desc);

            // do 1x1 conv
            threadwise_direct_convolution_1(
-                in_desc_reg, p_in_reg, wei_desc_reg, p_wei_reg, out_desc_reg, p_out);
+                in_reg_desc, p_in_reg, wei_reg_desc, p_wei_reg, out_desc, p_out);
        }
-#endif
    }
-}
+#endif
+}