diff --git a/driver/conv.cu b/driver/conv.cu
index 80e5057aaf..17ef79f405 100644
--- a/driver/conv.cu
+++ b/driver/conv.cu
@@ -7,7 +7,6 @@
 #include "constant_tensor_descriptor.cuh"
 #include "device_direct_convolution_1.cuh"
 #include "device_direct_convolution_2.cuh"
-#include "device_direct_convolution_3.cuh"
 //#include "device_winograd_convolution.cuh"
 
 struct GeneratorTensor_1
@@ -390,12 +389,10 @@ int main()
 
     for(int i = 0; i < 40; ++i)
     {
-#if 1
+#if 0
         device_direct_convolution_1(in_desc, in, wei_desc, wei, out_desc, out_device);
-#elif 0
+#elif 1
         device_direct_convolution_2(in_desc, in, wei_desc, wei, out_desc, out_device);
-#elif 0
-        device_direct_convolution_3(in_desc, in, wei_desc, wei, out_desc, out_device);
 #elif 0
         device_winograd_convolution(in_desc, in, wei_desc, wei, out_desc, out_device);
 #endif
diff --git a/driver/device_direct_convolution_3.cuh b/driver/device_direct_convolution_3.cuh
deleted file mode 100644
index 884b3bf38e..0000000000
--- a/driver/device_direct_convolution_3.cuh
+++ /dev/null
@@ -1,120 +0,0 @@
-#pragma once
-#include "gridwise_direct_convolution_3.cuh"
-
-template <class T, class InDesc, class WeiDesc, class OutDesc>
-void device_direct_convolution_3(
-    InDesc, const Tensor<T>& in, WeiDesc, const Tensor<T>& wei, OutDesc, Tensor<T>& out)
-{
-    std::size_t data_sz = sizeof(T);
-    DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace());
-    DeviceMem wei_device_buf(data_sz * wei.mDesc.GetElementSpace());
-    DeviceMem out_device_buf(data_sz * out.mDesc.GetElementSpace());
-
-    int num_thread = std::thread::hardware_concurrency();
-
-    in_device_buf.ToDevice(in.mData.data());
-    wei_device_buf.ToDevice(wei.mData.data());
-    out_device_buf.ToDevice(out.mData.data());
-
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-
-    constexpr auto in_desc  = InDesc{};
-    constexpr auto wei_desc = WeiDesc{};
-    constexpr auto out_desc = OutDesc{};
-
-#if 1
-    constexpr unsigned OutTileSizeH = 2;
-    constexpr unsigned OutTileSizeW = 2;
-    constexpr unsigned NPerBlock    = 2;
-    constexpr unsigned KPerBlock    = 32;
-    constexpr unsigned CPerBlock    = 4;
-    constexpr unsigned YPerBlock    = 1;
-    constexpr unsigned XPerBlock    = 16;
-
-    constexpr unsigned NPerThread = 2;
-    constexpr unsigned KPerThread = 4;
-    constexpr unsigned CPerThread = 2;
-
-    constexpr unsigned BlockSize = 128;
-#elif 0
-    constexpr unsigned OutTileSizeH = 2;
-    constexpr unsigned OutTileSizeW = 2;
-    constexpr unsigned NPerBlock    = 2;
-    constexpr unsigned KPerBlock    = 32;
-    constexpr unsigned CPerBlock    = 4;
-    constexpr unsigned YPerBlock    = 1;
-    constexpr unsigned XPerBlock    = 27;
-
-    constexpr unsigned NPerThread = 2;
-    constexpr unsigned KPerThread = 4;
-    constexpr unsigned CPerThread = 2;
-
-    constexpr unsigned BlockSize = 216;
-#elif 0
-    constexpr unsigned OutTileSizeH = 2;
-    constexpr unsigned OutTileSizeW = 2;
-    constexpr unsigned NPerBlock    = 2;
-    constexpr unsigned KPerBlock    = 32;
-    constexpr unsigned CPerBlock    = 4;
-    constexpr unsigned YPerBlock    = 1;
-    constexpr unsigned XPerBlock    = 32;
-
-    constexpr unsigned NPerThread = 2;
-    constexpr unsigned KPerThread = 4;
-    constexpr unsigned CPerThread = 2;
-
-    constexpr unsigned BlockSize = 256;
-#endif
-
-    constexpr unsigned GridSize = (out_desc.GetLength(I0) / NPerBlock) *
-                                  (out_desc.GetLength(I1) / KPerBlock) *
-                                  (out_desc.GetLength(I2) / (OutTileSizeH * YPerBlock)) *
-                                  (out_desc.GetLength(I3) / (OutTileSizeW * XPerBlock));
-
-    dim3 block_dim(BlockSize);
-    dim3 grid_dim(GridSize);
-
-    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
-
-    cudaEvent_t start, stop;
-    float elapsedTime;
-
-    cudaEventCreate(&start);
-    cudaEventRecord(start, 0);
-
-    gridwise_direct_convolution_3<T,
-                                  InDesc,
-                                  WeiDesc,
-                                  OutDesc,
-                                  OutTileSizeH,
-                                  OutTileSizeW,
-                                  NPerBlock,
-                                  KPerBlock,
-                                  CPerBlock,
-                                  YPerBlock,
-                                  XPerBlock,
-                                  NPerThread,
-                                  KPerThread,
-                                  CPerThread,
-                                  BlockSize,
-                                  GridSize>
-        <<<grid_dim, block_dim>>>(InDesc{},
-                                  static_cast<T*>(in_device_buf.GetDeviceBuffer()),
-                                  WeiDesc{},
-                                  static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
-                                  OutDesc{},
-                                  static_cast<T*>(out_device_buf.GetDeviceBuffer()));
-
-    cudaEventCreate(&stop);
-    cudaEventRecord(stop, 0);
-    cudaEventSynchronize(stop);
-
-    cudaEventElapsedTime(&elapsedTime, start, stop);
-    printf("Elapsed time : %f ms\n", elapsedTime);
-
-    checkCudaErrors(cudaGetLastError());
-    out_device_buf.FromDevice(out.mData.data());
-}
diff --git a/src/include/blockwise_direct_convolution.cuh b/src/include/blockwise_direct_convolution.cuh
index 95005af765..fec85eb09c 100644
--- a/src/include/blockwise_direct_convolution.cuh
+++ b/src/include/blockwise_direct_convolution.cuh
@@ -58,8 +58,8 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
     constexpr auto wei_thread_desc =
         make_ConstantTensorDescriptor(Sequence<KPerThread, CPerThread, S, R>{});
 
-    constexpr auto out_thread_desc = make_ConstantTensorDescriptor(
-        Sequence<NPerThread, KPerThread, OutTileSizeH, OutTileSizeW>{});
+    constexpr auto out_thread_desc =
+        get_output_4d_tensor_descriptor(in_thread_desc, wei_thread_desc);
 
     constexpr auto in_thread_block_desc =
         make_ConstantTensorDescriptor(in_thread_desc.GetLengths(), in_block_desc.GetStrides());
@@ -92,11 +92,9 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
         unsigned hi_thread_data_begin = ho_thread_data_begin; // minus padding
         unsigned wi_thread_data_begin = wo_thread_data_begin; // minus padding
 
-        TFloat p_in_thread[in_thread_desc.GetElementSpace()];
-        TFloat p_wei_thread[wei_thread_desc.GetElementSpace()];
         TFloat p_out_thread[out_thread_desc.GetElementSpace()];
 
-        threadwise_4d_tensor_copy(out_thread_block_desc,
+        threadwise_4d_tensor_copy(out_block_desc,
                                   p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
                                                                           k_thread_data_begin,
                                                                           ho_thread_data_begin,
@@ -108,38 +106,24 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
         for(unsigned c_thread_data_begin = 0; c_thread_data_begin < in_block_desc.GetLength(I1);
             c_thread_data_begin += CPerThread)
         {
-            // copy input into register
-            threadwise_4d_tensor_copy(in_thread_block_desc,
-                                      p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
-                                                                            c_thread_data_begin,
-                                                                            hi_thread_data_begin,
-                                                                            wi_thread_data_begin),
-                                      in_thread_desc,
-                                      p_in_thread,
-                                      in_thread_desc);
-
-            // copy weight into register
-            threadwise_4d_tensor_copy(
+            // threadwise convolution
+            threadwise_direct_convolution_2(
+                in_thread_block_desc,
+                p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
+                                                      c_thread_data_begin,
+                                                      hi_thread_data_begin,
+                                                      wi_thread_data_begin),
                 wei_thread_block_desc,
                 p_wei_block +
                     wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data_begin, 0, 0),
-                wei_thread_desc,
-                p_wei_thread,
-                wei_thread_desc);
-
-            // threadwise convolution
-            threadwise_direct_convolution_2(in_thread_desc,
-                                            p_in_thread,
-                                            wei_thread_desc,
-                                            p_wei_thread,
-                                            out_thread_desc,
-                                            p_out_thread);
+                out_thread_desc,
+                p_out_thread);
         }
 
         // copy output into LDS
         threadwise_4d_tensor_copy(out_thread_desc,
                                   p_out_thread,
-                                  out_thread_block_desc,
+                                  out_block_desc,
                                   p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
                                                                           k_thread_data_begin,
                                                                           ho_thread_data_begin,
diff --git a/src/include/gridwise_direct_convolution_1.cuh b/src/include/gridwise_direct_convolution_1.cuh
index 8a339b14e4..7dd36dd966 100644
--- a/src/include/gridwise_direct_convolution_1.cuh
+++ b/src/include/gridwise_direct_convolution_1.cuh
@@ -49,18 +49,20 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
     constexpr unsigned YBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
     constexpr unsigned XBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
 
-    constexpr auto in_block_src_desc = make_ConstantTensorDescriptor(
+    constexpr auto in_block_global_desc = make_ConstantTensorDescriptor(
         Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{}, in_global_desc.GetStrides());
 
-    constexpr auto wei_block_src_desc = make_ConstantTensorDescriptor(
+    constexpr auto wei_block_global_desc = make_ConstantTensorDescriptor(
         Sequence<KPerBlock, CPerBlock, S, R>{}, wei_global_desc.GetStrides());
 
-    constexpr auto out_block_src_desc = make_ConstantTensorDescriptor(
+    constexpr auto out_block_global_desc = make_ConstantTensorDescriptor(
         Sequence<NPerBlock, KPerBlock, HoPerBlock, WoPerBlock>{}, out_global_desc.GetStrides());
 
-    constexpr auto in_block_desc  = make_ConstantTensorDescriptor(in_block_src_desc.GetLengths());
-    constexpr auto wei_block_desc = make_ConstantTensorDescriptor(wei_block_src_desc.GetLengths());
-    constexpr auto out_block_desc = make_ConstantTensorDescriptor(out_block_src_desc.GetLengths());
+    constexpr auto in_block_desc = make_ConstantTensorDescriptor(in_block_global_desc.GetLengths());
+    constexpr auto wei_block_desc =
+        make_ConstantTensorDescriptor(wei_block_global_desc.GetLengths());
+    constexpr auto out_block_desc =
+        make_ConstantTensorDescriptor(out_block_global_desc.GetLengths());
 
     constexpr unsigned in_block_size  = in_block_desc.GetElementSpace();
     constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace();
@@ -97,9 +99,9 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
         print_ConstantTensorDescriptor( in_global_desc, "gridwise_convolution:  in_global_desc: ");
         print_ConstantTensorDescriptor(wei_global_desc, "gridwise_convolution: wei_global_desc: ");
         print_ConstantTensorDescriptor(out_global_desc, "gridwise_convolution: out_global_desc: ");
-        print_ConstantTensorDescriptor( in_block_src_desc, "gridwise_convolution:  in_block_src_desc: ");
-        print_ConstantTensorDescriptor(wei_block_src_desc, "gridwise_convolution: wei_block_src_desc: ");
-        print_ConstantTensorDescriptor(out_block_src_desc, "gridwise_convolution: out_block_src_desc: ");
+        print_ConstantTensorDescriptor( in_block_global_desc, "gridwise_convolution:  in_block_global_desc: ");
+        print_ConstantTensorDescriptor(wei_block_global_desc, "gridwise_convolution: wei_block_global_desc: ");
+        print_ConstantTensorDescriptor(out_block_global_desc, "gridwise_convolution: out_block_global_desc: ");
         print_ConstantTensorDescriptor( in_block_desc, "gridwise_convolution:  in_block_desc: ");
         print_ConstantTensorDescriptor(wei_block_desc, "gridwise_convolution: wei_block_desc: ");
         print_ConstantTensorDescriptor(out_block_desc, "gridwise_convolution: out_block_desc: ");
@@ -128,10 +130,10 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
     {
         // copy input tensor to LDS
         blockwise_4d_tensor_copy<TFloat,
-                                 decltype(in_block_src_desc),
+                                 decltype(in_block_global_desc),
                                  decltype(in_block_desc),
                                  decltype(in_block_desc),
-                                 BlockSize>(in_block_src_desc,
+                                 BlockSize>(in_block_global_desc,
                                             p_in_global +
                                                 in_global_desc.Get1dIndex(n_block_work_begin,
                                                                           c_block_work_begin,
@@ -143,11 +145,11 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
 
         // copy weight tensor to LDS
         blockwise_4d_tensor_copy<TFloat,
-                                 decltype(wei_block_src_desc),
+                                 decltype(wei_block_global_desc),
                                  decltype(wei_block_desc),
                                  decltype(wei_block_desc),
                                  BlockSize>(
-            wei_block_src_desc,
+            wei_block_global_desc,
             p_wei_global + wei_global_desc.Get1dIndex(k_block_work_begin, c_block_work_begin, 0, 0),
             wei_block_desc,
             p_wei_block,
@@ -174,12 +176,12 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
     // copy output tensor from LDS to device mem
     blockwise_4d_tensor_copy<TFloat,
                              decltype(out_block_desc),
-                             decltype(out_block_src_desc),
+                             decltype(out_block_global_desc),
                              decltype(out_block_desc),
                              BlockSize>(
         out_block_desc,
         p_out_block,
-        out_block_src_desc,
+        out_block_global_desc,
         p_out_global +
             out_global_desc.Get1dIndex(
                 n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin),
diff --git a/src/include/gridwise_direct_convolution_2.cuh b/src/include/gridwise_direct_convolution_2.cuh
index d2040f1f90..3f8d1e50ff 100644
--- a/src/include/gridwise_direct_convolution_2.cuh
+++ b/src/include/gridwise_direct_convolution_2.cuh
@@ -63,18 +63,16 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
     constexpr unsigned InTileSizeH = OutTileSizeH + S - 1;
     constexpr unsigned InTileSizeW = OutTileSizeW + R - 1;
 
-    constexpr auto in_thread_desc =
-        make_ConstantTensorDescriptor(Sequence<NPerThread, CPerThread, InTileSizeH, InTileSizeW>{});
+    constexpr auto in_thread_block_desc = make_ConstantTensorDescriptor(
+        Sequence<NPerThread, CPerThread, InTileSizeH, InTileSizeW>{}, in_block_desc.GetStrides());
 
-    constexpr auto wei_thread_desc =
-        make_ConstantTensorDescriptor(Sequence<KPerThread, CPerThread, S, R>{});
+    constexpr auto wei_thread_block_desc = make_ConstantTensorDescriptor(
+        Sequence<KPerThread, CPerThread, S, R>{}, wei_block_desc.GetStrides());
 
     constexpr auto out_thread_desc =
-        get_output_4d_tensor_descriptor(in_thread_desc, wei_thread_desc);
+        get_output_4d_tensor_descriptor(in_thread_block_desc, wei_thread_block_desc);
 
     // register
-    TFloat p_in_thread[in_thread_desc.GetElementSpace()];
-    TFloat p_wei_thread[wei_thread_desc.GetElementSpace()];
     TFloat p_out_thread[out_thread_desc.GetElementSpace()];
 
     // divide block work
@@ -183,31 +181,30 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
 
         for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
         {
-            // copy input tensor into register
-            threadwise_4d_tensor_copy(in_block_desc,
-                                      p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
-                                                                            c_thread_data,
-                                                                            hi_thread_data_begin,
-                                                                            wi_thread_data_begin),
-                                      in_thread_desc,
-                                      p_in_thread,
-                                      in_thread_desc);
-
-            // copy weight tensor into register
-            threadwise_4d_tensor_copy(
-                wei_block_desc,
-                p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
-                wei_thread_desc,
-                p_wei_thread,
-                wei_thread_desc);
-
             // threadwise convolution
-            threadwise_direct_convolution_1(in_thread_desc,
-                                            p_in_thread,
-                                            wei_thread_desc,
-                                            p_wei_thread,
-                                            out_thread_desc,
-                                            p_out_thread);
+#if 1
+            threadwise_direct_convolution_2(
+                in_thread_block_desc,
+                p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
+                                                      c_thread_data,
+                                                      hi_thread_data_begin,
+                                                      wi_thread_data_begin),
+                wei_thread_block_desc,
+                p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
+                out_thread_desc,
+                p_out_thread);
+#elif 1
+            threadwise_direct_convolution_3(
+                in_thread_block_desc,
+                p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
+                                                      c_thread_data,
+                                                      hi_thread_data_begin,
+                                                      wi_thread_data_begin),
+                wei_thread_block_desc,
+                p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
+                out_thread_desc,
+                p_out_thread);
+#endif
         }
     }
 
diff --git a/src/include/gridwise_direct_convolution_3.cuh b/src/include/gridwise_direct_convolution_3.cuh
deleted file mode 100644
index f8bf42a011..0000000000
--- a/src/include/gridwise_direct_convolution_3.cuh
+++ /dev/null
@@ -1,208 +0,0 @@
-#pragma once
-#include "constant_tensor_descriptor.cuh"
-#include "blockwise_tensor_op.cuh"
-#include "blockwise_direct_convolution.cuh"
-#include "threadwise_tensor_op.cuh"
-#include "threadwise_direct_convolution.cuh"
-
-template <class TFloat,
-          class InGlobalDesc,
-          class WeiGlobalDesc,
-          class OutGlobalDesc,
-          unsigned OutTileSizeH,
-          unsigned OutTileSizeW,
-          unsigned NPerBlock,
-          unsigned KPerBlock,
-          unsigned CPerBlock,
-          unsigned YPerBlock,
-          unsigned XPerBlock,
-          unsigned NPerThread,
-          unsigned KPerThread,
-          unsigned CPerThread,
-          unsigned BlockSize,
-          unsigned GridSize>
-__global__ void gridwise_direct_convolution_3(InGlobalDesc,
-                                              TFloat* const __restrict__ p_in_global,
-                                              WeiGlobalDesc,
-                                              TFloat* const __restrict__ p_wei_global,
-                                              OutGlobalDesc,
-                                              TFloat* __restrict__ p_out_global)
-{
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-
-    constexpr auto in_global_desc  = InGlobalDesc{};
-    constexpr auto wei_global_desc = WeiGlobalDesc{};
-    constexpr auto out_global_desc = OutGlobalDesc{};
-
-    constexpr unsigned S = wei_global_desc.GetLength(I2);
-    constexpr unsigned R = wei_global_desc.GetLength(I3);
-
-    constexpr unsigned HoPerBlock = OutTileSizeH * YPerBlock;
-    constexpr unsigned WoPerBlock = OutTileSizeW * XPerBlock;
-
-    constexpr unsigned HiPerBlock = YPerBlock * OutTileSizeH + S - 1;
-    constexpr unsigned WiPerBlock = XPerBlock * OutTileSizeW + R - 1;
-
-    constexpr auto in_block_desc =
-        make_ConstantTensorDescriptor(Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{});
-
-    constexpr auto wei_block_desc =
-        make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlock, S, R>{});
-
-    // shared mem
-    constexpr unsigned in_block_size  = in_block_desc.GetElementSpace();
-    constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace();
-
-    __shared__ TFloat p_in_block[in_block_size];
-    __shared__ TFloat p_wei_block[wei_block_size];
-
-    // threadwise tensors
-    constexpr unsigned InTileSizeH = OutTileSizeH + S - 1;
-    constexpr unsigned InTileSizeW = OutTileSizeW + R - 1;
-
-    constexpr auto in_thread_block_desc = make_ConstantTensorDescriptor(
-        Sequence<NPerThread, CPerThread, InTileSizeH, InTileSizeW>{}, in_block_desc.GetStrides());
-
-    constexpr auto wei_thread_block_desc = make_ConstantTensorDescriptor(
-        Sequence<KPerThread, CPerThread, S, R>{}, wei_block_desc.GetStrides());
-
-    constexpr auto out_thread_desc =
-        get_output_4d_tensor_descriptor(in_thread_block_desc, wei_thread_block_desc);
-
-    // register
-    TFloat p_out_thread[out_thread_desc.GetElementSpace()];
-
-    // divide block work
-    constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
-    constexpr unsigned KBlockWork = (out_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
-    constexpr unsigned YBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
-    constexpr unsigned XBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
-
-    const unsigned block_id = blockIdx.x;
-
-    unsigned itmp                  = block_id;
-    const unsigned n_block_work_id = itmp / (KBlockWork * YBlockWork * XBlockWork);
-    itmp -= n_block_work_id * (KBlockWork * YBlockWork * XBlockWork);
-    const unsigned k_block_work_id = itmp / (YBlockWork * XBlockWork);
-    itmp -= k_block_work_id * (YBlockWork * XBlockWork);
-    const unsigned y_block_work_id = itmp / XBlockWork;
-    const unsigned x_block_work_id = itmp - y_block_work_id * XBlockWork;
-
-    const unsigned n_block_data_begin = n_block_work_id * NPerBlock;
-    const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
-    const unsigned y_block_data_begin = y_block_work_id * YPerBlock;
-    const unsigned x_block_data_begin = x_block_work_id * XPerBlock;
-
-    const unsigned ho_block_data_begin = y_block_data_begin * OutTileSizeH;
-    const unsigned wo_block_data_begin = x_block_data_begin * OutTileSizeW;
-
-    const unsigned hi_block_data_begin = ho_block_data_begin; // minus padding
-    const unsigned wi_block_data_begin = wo_block_data_begin; // minus padding
-
-    // divide thread work
-    constexpr unsigned NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread;
-    constexpr unsigned KThreadWork = (KPerBlock + KPerThread - 1) / KPerThread;
-    constexpr unsigned YThreadWork = YPerBlock;
-    constexpr unsigned XThreadWork = XPerBlock;
-
-    const unsigned thread_id = threadIdx.x;
-
-    itmp                            = thread_id;
-    const unsigned n_thread_work_id = itmp / (KThreadWork * YThreadWork * XThreadWork);
-    itmp -= n_thread_work_id * (KThreadWork * YThreadWork * XThreadWork);
-    const unsigned k_thread_work_id = itmp / (YThreadWork * XThreadWork);
-    itmp -= k_thread_work_id * (YThreadWork * XThreadWork);
-    const unsigned y_thread_work_id = itmp / XThreadWork;
-    const unsigned x_thread_work_id = itmp - y_thread_work_id * XThreadWork;
-
-    const unsigned n_thread_data_begin  = n_thread_work_id * NPerThread;
-    const unsigned k_thread_data_begin  = k_thread_work_id * KPerThread;
-    const unsigned ho_thread_data_begin = y_thread_work_id * OutTileSizeH;
-    const unsigned wo_thread_data_begin = x_thread_work_id * OutTileSizeW;
-
-    const unsigned hi_thread_data_begin = ho_thread_data_begin;
-    const unsigned wi_thread_data_begin = wo_thread_data_begin;
-
-#if 0
-    if(threadIdx.x == 0)
-    {
-        print_ConstantTensorDescriptor(in_global_desc, "gridwise_convolution:  in_global_desc: ");
-        print_ConstantTensorDescriptor(wei_global_desc, "gridwise_convolution: wei_global_desc: ");
-        print_ConstantTensorDescriptor(out_global_desc, "gridwise_convolution: out_global_desc: ");
-    }
-
-    printf("threadIdx.x %u \t"
-           "n_thread_data_begin %u, k_thread_data_begin %u, ho_thread_data_begin %u, "
-           "wo_thread_data_begin %u\n",
-           threadIdx.x,
-           n_thread_data_begin,
-           k_thread_data_begin,
-           ho_thread_data_begin,
-           wo_thread_data_begin);
-#endif
-
-    // set threadwise output tensor to 0
-    threadwise_4d_tensor_set_zero(out_thread_desc, p_out_thread);
-
-    for(unsigned c_block_data_begin = 0; c_block_data_begin < in_global_desc.GetLength(I1);
-        c_block_data_begin += CPerBlock, __syncthreads())
-    {
-        // copy input tensor to LDS
-        blockwise_4d_tensor_copy<TFloat,
-                                 decltype(in_global_desc),
-                                 decltype(in_block_desc),
-                                 decltype(in_block_desc),
-                                 BlockSize>(in_global_desc,
-                                            p_in_global +
-                                                in_global_desc.Get1dIndex(n_block_data_begin,
-                                                                          c_block_data_begin,
-                                                                          hi_block_data_begin,
-                                                                          wi_block_data_begin),
-                                            in_block_desc,
-                                            p_in_block,
-                                            in_block_desc);
-
-        // copy weight tensor to LDS
-        blockwise_4d_tensor_copy<TFloat,
-                                 decltype(wei_global_desc),
-                                 decltype(wei_block_desc),
-                                 decltype(wei_block_desc),
-                                 BlockSize>(
-            wei_global_desc,
-            p_wei_global + wei_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
-            wei_block_desc,
-            p_wei_block,
-            wei_block_desc);
-
-        __syncthreads();
-
-        for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
-        {
-            // threadwise convolution
-            threadwise_direct_convolution_2(
-                in_thread_block_desc,
-                p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
-                                                      c_thread_data,
-                                                      hi_thread_data_begin,
-                                                      wi_thread_data_begin),
-                wei_thread_block_desc,
-                p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
-                out_thread_desc,
-                p_out_thread);
-        }
-    }
-
-    // copy output tensor from register to global mem
-    threadwise_4d_tensor_copy(
-        out_thread_desc,
-        p_out_thread,
-        out_global_desc,
-        p_out_global + out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
-                                                  k_block_data_begin + k_thread_data_begin,
-                                                  ho_block_data_begin + ho_thread_data_begin,
-                                                  wo_block_data_begin + wo_thread_data_begin),
-        out_thread_desc);
-}
diff --git a/src/include/threadwise_direct_convolution.cuh b/src/include/threadwise_direct_convolution.cuh
index ffd4f47849..a728ad68b2 100644
--- a/src/include/threadwise_direct_convolution.cuh
+++ b/src/include/threadwise_direct_convolution.cuh
@@ -79,11 +79,43 @@ __device__ void threadwise_direct_convolution_1(InDesc,
     }
 }
 
+// Optimized for scenario if p_in and p_wei are in LDS, p_out are in register
+// Copy in and wei into register before doing convolution
+template <class TFloat, class InDesc, class WeiDesc, class OutDesc>
+__device__ void threadwise_direct_convolution_2(InDesc,
+                                                TFloat* const __restrict__ p_in,
+                                                WeiDesc,
+                                                TFloat* const __restrict__ p_wei,
+                                                OutDesc,
+                                                TFloat* __restrict__ p_out)
+{
+    constexpr auto in_desc  = InDesc{};
+    constexpr auto wei_desc = WeiDesc{};
+    constexpr auto out_desc = OutDesc{};
+
+    constexpr auto in_reg_desc  = make_ConstantTensorDescriptor(in_desc.GetLengths());
+    constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(wei_desc.GetLengths());
+
+    // register
+    TFloat p_in_reg[in_reg_desc.GetElementSpace()];
+    TFloat p_wei_reg[wei_reg_desc.GetElementSpace()];
+
+    // copy input tensor into register
+    threadwise_4d_tensor_copy(in_desc, p_in, in_reg_desc, p_in_reg, in_reg_desc);
+
+    // copy input tensor into register
+    threadwise_4d_tensor_copy(wei_desc, p_wei, wei_reg_desc, p_wei_reg, wei_reg_desc);
+
+    // do convolution
+    threadwise_direct_convolution_1(
+        in_reg_desc, p_in_reg, wei_reg_desc, p_wei_reg, out_desc, p_out);
+}
+
 // optimized for scenario where p_in and p_wei are in LDS, p_out is in register
 // break down a non-1x1 convolution into a sequence of 1x1 convolutions,
 // load 1x1 weight into register, and do 1x1 convolution in register.
 template <class TFloat, class InDesc, class WeiDesc, class OutDesc>
-__device__ void threadwise_direct_convolution_2(InDesc,
+__device__ void threadwise_direct_convolution_3(InDesc,
                                                 TFloat* const __restrict__ p_in,
                                                 WeiDesc,
                                                 TFloat* const __restrict__ p_wei,
@@ -95,100 +127,100 @@ __device__ void threadwise_direct_convolution_2(InDesc,
     constexpr auto I2 = Number<2>{};
     constexpr auto I3 = Number<3>{};
 
-    constexpr auto in_desc_lds  = InDesc{};
-    constexpr auto wei_desc_lds = WeiDesc{};
-    constexpr auto out_desc_reg = OutDesc{};
+    constexpr auto in_desc  = InDesc{};
+    constexpr auto wei_desc = WeiDesc{};
+    constexpr auto out_desc = OutDesc{};
 
-    constexpr auto in_desc_reg =
-        make_ConstantTensorDescriptor(Sequence<in_desc_lds.GetLength(I0),
-                                               in_desc_lds.GetLength(I1),
-                                               out_desc_reg.GetLength(I2),
-                                               out_desc_reg.GetLength(I3)>{});
+    constexpr auto in_reg_desc = make_ConstantTensorDescriptor(Sequence<in_desc.GetLength(I0),
+                                                                        in_desc.GetLength(I1),
+                                                                        out_desc.GetLength(I2),
+                                                                        out_desc.GetLength(I3)>{});
 
-    constexpr auto wei_desc_reg = make_ConstantTensorDescriptor(
-        Sequence<wei_desc_lds.GetLength(I0), wei_desc_lds.GetLength(I1), 1, 1>{});
+    constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(
+        Sequence<wei_desc.GetLength(I0), wei_desc.GetLength(I1), 1, 1>{});
 
-    TFloat p_in_reg[in_desc_reg.GetElementSpace()];
-    TFloat p_wei_reg[wei_desc_reg.GetElementSpace()];
+    TFloat p_in_reg[in_reg_desc.GetElementSpace()];
+    TFloat p_wei_reg[wei_reg_desc.GetElementSpace()];
 
     constexpr unsigned in_w_new_read = 1;
 
     constexpr auto in_desc_reg_new_read =
-        make_ConstantTensorDescriptor(Sequence<in_desc_reg.GetLength(I0),
-                                               in_desc_reg.GetLength(I1),
-                                               in_desc_reg.GetLength(I2),
+        make_ConstantTensorDescriptor(Sequence<in_reg_desc.GetLength(I0),
+                                               in_reg_desc.GetLength(I1),
+                                               in_reg_desc.GetLength(I2),
                                                in_w_new_read>{});
 
+#if 0
     // loop over vertical direction
-    for(unsigned s = 0; s < wei_desc_lds.GetLength(I2); ++s)
+    for(unsigned s = 0; s < wei_desc.GetLength(I2); ++s)
     {
-#if 1
         // read first input
-        threadwise_4d_tensor_copy(in_desc_lds,
-                                  p_in + in_desc_lds.Get1dIndex(0, 0, s, 0),
-                                  in_desc_reg,
+        threadwise_4d_tensor_copy(in_desc,
+                                  p_in + in_desc.Get1dIndex(0, 0, s, 0),
+                                  in_reg_desc,
                                   p_in_reg,
-                                  in_desc_reg);
+                                  in_reg_desc);
 
         // read first 1x1 weight
-        threadwise_4d_tensor_copy(wei_desc_lds,
-                                  p_wei + wei_desc_lds.Get1dIndex(0, 0, s, 0),
-                                  wei_desc_reg,
+        threadwise_4d_tensor_copy(wei_desc,
+                                  p_wei + wei_desc.Get1dIndex(0, 0, s, 0),
+                                  wei_reg_desc,
                                   p_wei_reg,
-                                  wei_desc_reg);
+                                  wei_reg_desc);
 
         // do first 1x1 conv
         threadwise_direct_convolution_1(
-            in_desc_reg, p_in_reg, wei_desc_reg, p_wei_reg, out_desc_reg, p_out);
+            in_reg_desc, p_in_reg, wei_reg_desc, p_wei_reg, out_desc, p_out);
 
         // loop over horizontal direction
-        for(unsigned r = 1; r < wei_desc_lds.GetLength(I3); ++r)
+        for(unsigned r = 1; r < wei_desc.GetLength(I3); ++r)
         {
             // read new weight
-            threadwise_4d_tensor_copy(wei_desc_lds,
-                                      p_wei + wei_desc_lds.Get1dIndex(0, 0, s, r),
-                                      wei_desc_reg,
+            threadwise_4d_tensor_copy(wei_desc,
+                                      p_wei + wei_desc.Get1dIndex(0, 0, s, r),
+                                      wei_reg_desc,
                                       p_wei_reg,
-                                      wei_desc_reg);
+                                      wei_reg_desc);
 
             // shift old input to the left
-            threadwise_4d_tensor_shift_down(in_desc_reg, p_in_reg, I3, Number<in_w_new_read>{});
+            threadwise_4d_tensor_shift_down(in_reg_desc, p_in_reg, I3, Number<in_w_new_read>{});
 
             // read new input
             threadwise_4d_tensor_copy(
-                in_desc_lds,
-                p_in + in_desc_lds.Get1dIndex(0, 0, s, in_desc_reg.GetLength(I3) + r - 1),
-                in_desc_reg,
+                in_desc,
+                p_in + in_desc.Get1dIndex(0, 0, s, r + in_reg_desc.GetLength(I3) - 1),
+                in_reg_desc,
                 p_in_reg +
-                    in_desc_reg.Get1dIndex(0, 0, 0, in_desc_reg.GetLength(I3) - in_w_new_read),
+                    in_reg_desc.Get1dIndex(0, 0, 0, in_reg_desc.GetLength(I3) - in_w_new_read),
                 in_desc_reg_new_read);
 
             // do 1x1 conv
             threadwise_direct_convolution_1(
-                in_desc_reg, p_in_reg, wei_desc_reg, p_wei_reg, out_desc_reg, p_out);
+                in_reg_desc, p_in_reg, wei_reg_desc, p_wei_reg, out_desc, p_out);
         }
+    }
 #elif 1
+    // loop over vertical direction
+    for(unsigned s = 0; s < wei_desc.GetLength(I2); ++s)
+    {
         // loop over horizontal direction
-        for(unsigned r = 0; r < wei_desc_lds.GetLength(I3); ++r)
+        for(unsigned r = 0; r < wei_desc.GetLength(I3); ++r)
         {
             // read new weight
-            threadwise_4d_tensor_copy(wei_desc_lds,
-                                      p_wei + wei_desc_lds.Get1dIndex(0, 0, s, r),
-                                      wei_desc_reg,
+            threadwise_4d_tensor_copy(wei_desc,
+                                      p_wei + wei_desc.Get1dIndex(0, 0, s, r),
+                                      wei_reg_desc,
                                       p_wei_reg,
-                                      wei_desc_reg);
+                                      wei_reg_desc);
 
             // read new input
-            threadwise_4d_tensor_copy(in_desc_lds,
-                                      p_in + in_desc_lds.Get1dIndex(0, 0, s, r),
-                                      in_desc_reg,
-                                      p_in_reg,
-                                      in_desc_reg);
+            threadwise_4d_tensor_copy(
+                in_desc, p_in + in_desc.Get1dIndex(0, 0, s, r), in_reg_desc, p_in_reg, in_reg_desc);
 
             // do 1x1 conv
             threadwise_direct_convolution_1(
-                in_desc_reg, p_in_reg, wei_desc_reg, p_wei_reg, out_desc_reg, p_out);
+                in_reg_desc, p_in_reg, wei_reg_desc, p_wei_reg, out_desc, p_out);
         }
-#endif
     }
-}
+#endif
+}
\ No newline at end of file