diff --git a/driver/conv.cu b/driver/conv.cu
index ca8684464a..d2df8ae023 100644
--- a/driver/conv.cu
+++ b/driver/conv.cu
@@ -7,7 +7,6 @@
 #include "constant_tensor_descriptor.cuh"
 #include "device_direct_convolution_1.cuh"
 #include "device_direct_convolution_2.cuh"
-#include "device_winograd_convolution.cuh"
 
 struct GeneratorConstant
 {
@@ -61,10 +60,10 @@ void ostream_ConstantTensorDescriptor(TConstTensorDesc, std::ostream& os = std::
 {
     static_assert(TConstTensorDesc::nDim == 4, "nDim is not 4");
 
-    constexpr auto I0   = Index<0>{};
-    constexpr auto I1   = Index<1>{};
-    constexpr auto I2   = Index<2>{};
-    constexpr auto I3   = Index<3>{};
+    constexpr auto I0   = Number<0>{};
+    constexpr auto I1   = Number<1>{};
+    constexpr auto I2   = Number<2>{};
+    constexpr auto I3   = Number<3>{};
     constexpr auto desc = TConstTensorDesc{};
 
     os << "Lengths: {" << desc.GetLength(I0) << ", " << desc.GetLength(I1) << ", "
@@ -79,10 +78,10 @@ auto make_TensorDescriptor(TConstTensorDesc)
 {
     static_assert(TConstTensorDesc::nDim == 4, "nDim is not 4");
 
-    constexpr auto I0   = Index<0>{};
-    constexpr auto I1   = Index<1>{};
-    constexpr auto I2   = Index<2>{};
-    constexpr auto I3   = Index<3>{};
+    constexpr auto I0   = Number<0>{};
+    constexpr auto I1   = Number<1>{};
+    constexpr auto I2   = Number<2>{};
+    constexpr auto I3   = Number<3>{};
     constexpr auto desc = TConstTensorDesc{};
 
     std::initializer_list<unsigned> lengths = {
@@ -396,7 +395,7 @@ int main()
     Tensor<float> out_host(make_TensorDescriptor(out_desc));
     Tensor<float> out_device(make_TensorDescriptor(out_desc));
 
-#if 0
+#if 1
     std::size_t num_thread = std::thread::hardware_concurrency();
     in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
     wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
@@ -405,17 +404,17 @@ int main()
     for(int i = 0; i < 20; ++i)
     {
 #if 1
-        device_direct_convolution_2(in_desc, in, wei_desc, wei, out_desc, out_device);
+        device_direct_convolution_1(in_desc, in, wei_desc, wei, out_desc, out_device);
 #else
         device_winograd_convolution(in_desc, in, wei_desc, wei, out_desc, out_device);
 #endif
     }
 
-#if 0
-    host_direct_convolution(in, wei, out_host);
+#if 1
+    host_winograd_3x3_convolution(in, wei, out_host);
     check_error(out_host, out_device);
 #elif 0
-    host_winograd_3x3_convolution(in, wei, out_host);
+    host_direct_convolution(in, wei, out_host);
     check_error(out_host, out_device);
 #endif
 
diff --git a/driver/device_direct_convolution_1.cuh b/driver/device_direct_convolution_1.cuh
index 5527dd3946..070a76dabc 100644
--- a/driver/device_direct_convolution_1.cuh
+++ b/driver/device_direct_convolution_1.cuh
@@ -16,10 +16,10 @@ void device_direct_convolution_1(
     wei_device_buf.ToDevice(wei.mData.data());
     out_device_buf.ToDevice(out.mData.data());
 
-    constexpr auto I0 = Index<0>{};
-    constexpr auto I1 = Index<1>{};
-    constexpr auto I2 = Index<2>{};
-    constexpr auto I3 = Index<3>{};
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
 
     constexpr auto in_desc          = InDesc{};
     constexpr auto wei_desc         = WeiDesc{};
diff --git a/driver/device_direct_convolution_2.cuh b/driver/device_direct_convolution_2.cuh
index 53384ee2af..f37af98086 100644
--- a/driver/device_direct_convolution_2.cuh
+++ b/driver/device_direct_convolution_2.cuh
@@ -16,10 +16,10 @@ void device_direct_convolution_2(
     wei_device_buf.ToDevice(wei.mData.data());
     out_device_buf.ToDevice(out.mData.data());
 
-    constexpr auto I0 = Index<0>{};
-    constexpr auto I1 = Index<1>{};
-    constexpr auto I2 = Index<2>{};
-    constexpr auto I3 = Index<3>{};
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
 
     constexpr auto in_desc          = InDesc{};
     constexpr auto wei_desc         = WeiDesc{};
@@ -36,11 +36,6 @@ void device_direct_convolution_2(
     constexpr unsigned KPerThread = 4;
     constexpr unsigned CPerThread = 2;
 
-    constexpr unsigned NBlockOpLen0 = 1;
-    constexpr unsigned NBlockOpLen1 = 1;
-    constexpr unsigned NBlockOpLen2 = 4;
-    constexpr unsigned NBlockOpLen3 = 32;
-
     constexpr unsigned BlockSize = 128;
 
     constexpr unsigned GridSize = (out_desc.GetLength(I0) / NPerBlock) *
@@ -73,10 +68,6 @@ void device_direct_convolution_2(
                                   NPerThread,
                                   KPerThread,
                                   CPerThread,
-                                  NBlockOpLen0,
-                                  NBlockOpLen1,
-                                  NBlockOpLen2,
-                                  NBlockOpLen3,
                                   BlockSize,
                                   GridSize>
         <<<grid_dim, block_dim>>>(InDesc{},
diff --git a/driver/device_winograd_convolution.cuh b/driver/device_winograd_convolution.cuh
deleted file mode 100644
index f233c39613..0000000000
--- a/driver/device_winograd_convolution.cuh
+++ /dev/null
@@ -1,89 +0,0 @@
-#pragma once
-#include "gridwise_winograd_convolution.cuh"
-
-template <class T, class InDesc, class WeiDesc, class OutDesc>
-void device_winograd_convolution(
-    InDesc, const Tensor<T>& in, WeiDesc, const Tensor<T>& wei, OutDesc, Tensor<T>& out)
-{
-    std::size_t data_sz = sizeof(T);
-    DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace());
-    DeviceMem wei_device_buf(data_sz * wei.mDesc.GetElementSpace());
-    DeviceMem out_device_buf(data_sz * out.mDesc.GetElementSpace());
-
-    int num_thread = std::thread::hardware_concurrency();
-
-    in_device_buf.ToDevice(in.mData.data());
-    wei_device_buf.ToDevice(wei.mData.data());
-    out_device_buf.ToDevice(out.mData.data());
-
-    constexpr auto I0 = Index<0>{};
-    constexpr auto I1 = Index<1>{};
-    constexpr auto I2 = Index<2>{};
-    constexpr auto I3 = Index<3>{};
-
-    constexpr auto in_desc          = InDesc{};
-    constexpr auto wei_desc         = WeiDesc{};
-    constexpr auto out_desc         = OutDesc{};
-    constexpr unsigned OutTileSizeH = 2;
-    constexpr unsigned OutTileSizeW = 2;
-    constexpr unsigned NPerBlock    = 2;
-    constexpr unsigned KPerBlock    = 16;
-    constexpr unsigned CPerBlock    = 4;
-    constexpr unsigned YPerBlock    = 1;
-    constexpr unsigned XPerBlock    = 16;
-
-    constexpr unsigned NPerThread = 2;
-    constexpr unsigned KPerThread = 2;
-    constexpr unsigned CPerThread = 2;
-
-    constexpr unsigned BlockSize = 128;
-
-    constexpr unsigned GridSize = (out_desc.GetLength(I0) / NPerBlock) *
-                                  (out_desc.GetLength(I1) / KPerBlock) *
-                                  (out_desc.GetLength(I2) / (OutTileSizeH * YPerBlock)) *
-                                  (out_desc.GetLength(I3) / (OutTileSizeW * XPerBlock));
-
-    dim3 block_dim(BlockSize);
-    dim3 grid_dim(GridSize);
-
-    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
-
-    cudaEvent_t start, stop;
-    float elapsedTime;
-
-    cudaEventCreate(&start);
-    cudaEventRecord(start, 0);
-
-    gridwise_winograd_convolution<T,
-                                  InDesc,
-                                  WeiDesc,
-                                  OutDesc,
-                                  OutTileSizeH,
-                                  OutTileSizeW,
-                                  NPerBlock,
-                                  KPerBlock,
-                                  CPerBlock,
-                                  YPerBlock,
-                                  XPerBlock,
-                                  NPerThread,
-                                  KPerThread,
-                                  CPerThread,
-                                  BlockSize,
-                                  GridSize>
-        <<<grid_dim, block_dim>>>(InDesc{},
-                                  static_cast<T*>(in_device_buf.GetDeviceBuffer()),
-                                  WeiDesc{},
-                                  static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
-                                  OutDesc{},
-                                  static_cast<T*>(out_device_buf.GetDeviceBuffer()));
-
-    cudaEventCreate(&stop);
-    cudaEventRecord(stop, 0);
-    cudaEventSynchronize(stop);
-
-    cudaEventElapsedTime(&elapsedTime, start, stop);
-    printf("Elapsed time : %f ms\n", elapsedTime);
-
-    checkCudaErrors(cudaGetLastError());
-    out_device_buf.FromDevice(out.mData.data());
-}
\ No newline at end of file
diff --git a/src/include/blockwise_direct_convolution.cuh b/src/include/blockwise_direct_convolution.cuh
index 40a6be6aac..d70c45254b 100644
--- a/src/include/blockwise_direct_convolution.cuh
+++ b/src/include/blockwise_direct_convolution.cuh
@@ -17,10 +17,10 @@ __device__ void blockwise_convolution(InBlockDesc,
                                       OutBlockDesc,
                                       TFloat* __restrict__ p_out_block)
 {
-    constexpr auto I0 = Index<0>{};
-    constexpr auto I1 = Index<1>{};
-    constexpr auto I2 = Index<2>{};
-    constexpr auto I3 = Index<3>{};
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
 
     constexpr auto in_block_desc  = InBlockDesc{};
     constexpr auto wei_block_desc = WeiBlockDesc{};
@@ -88,72 +88,50 @@ __device__ void blockwise_convolution(InBlockDesc,
         TFloat p_wei_thread[wei_thread_src_desc.GetElementSpace()];
         TFloat p_out_thread[out_thread_src_desc.GetElementSpace()];
 
-        auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
-
         // copy input tensor into register
-        threadwise_4d_tensor_op_binary<TFloat,
-                                       decltype(in_thread_src_desc),
-                                       decltype(in_thread_dst_desc),
-                                       decltype(f_copy)>(
+        threadwise_4d_tensor_copy(
             in_thread_src_desc,
             p_in_block + in_block_desc.Get1dIndex(
                              n_thread_work_begin, 0, hi_thread_work_begin, wi_thread_work_begin),
             in_thread_dst_desc,
-            p_in_thread,
-            f_copy);
+            p_in_thread);
 
         for(unsigned k_thread_work_begin = 0; k_thread_work_begin < KPerBlock;
             ++k_thread_work_begin)
         {
             // copy weight tensor into register
-            threadwise_4d_tensor_op_binary<TFloat,
-                                           decltype(wei_thread_src_desc),
-                                           decltype(wei_thread_dst_desc),
-                                           decltype(f_copy)>(
-                wei_thread_src_desc,
-                p_wei_block + wei_block_desc.Get1dIndex(k_thread_work_begin, 0, 0, 0),
-                wei_thread_dst_desc,
-                p_wei_thread,
-                f_copy);
+            threadwise_4d_tensor_copy(wei_thread_src_desc,
+                                      p_wei_block +
+                                          wei_block_desc.Get1dIndex(k_thread_work_begin, 0, 0, 0),
+                                      wei_thread_dst_desc,
+                                      p_wei_thread);
 
             // copy output tensor into register
-            threadwise_4d_tensor_op_binary<TFloat,
-                                           decltype(out_thread_src_desc),
-                                           decltype(out_thread_dst_desc),
-                                           decltype(f_copy)>(
-                out_thread_src_desc,
-                p_out_block + out_block_desc.Get1dIndex(n_thread_work_begin,
-                                                        k_thread_work_begin,
-                                                        ho_thread_work_begin,
-                                                        wo_thread_work_begin),
-                out_thread_dst_desc,
-                p_out_thread,
-                f_copy);
+            threadwise_4d_tensor_copy(out_thread_src_desc,
+                                      p_out_block + out_block_desc.Get1dIndex(n_thread_work_begin,
+                                                                              k_thread_work_begin,
+                                                                              ho_thread_work_begin,
+                                                                              wo_thread_work_begin),
+                                      out_thread_dst_desc,
+                                      p_out_thread);
 
             // threadwise convolution
-            threadwise_direct_convolution<TFloat,
-                                          decltype(in_thread_dst_desc),
-                                          decltype(wei_thread_dst_desc),
-                                          decltype(out_thread_dst_desc)>(in_thread_dst_desc,
-                                                                         p_in_thread,
-                                                                         wei_thread_dst_desc,
-                                                                         p_wei_thread,
-                                                                         out_thread_dst_desc,
-                                                                         p_out_thread);
+            threadwise_direct_convolution(in_thread_dst_desc,
+                                          p_in_thread,
+                                          wei_thread_dst_desc,
+                                          p_wei_thread,
+                                          out_thread_dst_desc,
+                                          p_out_thread);
 
             // accumulate output tensor into LDS
-            threadwise_4d_tensor_op_binary<TFloat,
-                                           decltype(out_thread_dst_desc),
-                                           decltype(out_thread_src_desc),
-                                           decltype(f_copy)>(
-                out_thread_dst_desc,
-                p_out_thread,
-                out_thread_src_desc,
-                p_out_block + out_block_desc.Get1dIndex(n_thread_work_begin,
-                                                        k_thread_work_begin,
-                                                        ho_thread_work_begin,
-                                                        wo_thread_work_begin),
-                f_copy);
+            threadwise_4d_tensor_copy(out_thread_dst_desc,
+                                      p_out_thread,
+                                      out_thread_src_desc,
+                                      p_out_block +
+                                          out_block_desc.Get1dIndex(n_thread_work_begin,
+                                                                    k_thread_work_begin,
+                                                                    ho_thread_work_begin,
+                                                                    wo_thread_work_begin));
         }
     }
 }
diff --git a/src/include/blockwise_tensor_op.cuh b/src/include/blockwise_tensor_op.cuh
index 65a1e08c38..3b4bc58fdc 100644
--- a/src/include/blockwise_tensor_op.cuh
+++ b/src/include/blockwise_tensor_op.cuh
@@ -1,44 +1,31 @@
 #pragma once
 #include "constant_tensor_descriptor.cuh"
 
-#define BLOCKWISE_TENSOR_OP_METHOD 12
-
-#if BLOCKWISE_TENSOR_OP_METHOD == 11
-template <class TFloat,
-          class SrcDesc,
-          class DstDesc,
-          unsigned NBlockOpLen0,
-          unsigned NBlockOpLen1,
-          unsigned NBlockOpLen2,
-          unsigned NBlockOpLen3,
-          class F,
-          unsigned BlockSize>
-__device__ void blockwise_4d_tensor_op_binary(
-    SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f)
+template <class TFloat, class DstDesc, class F, unsigned BlockSize>
+__device__ void blockwise_4d_tensor_pointwise_op_unary(DstDesc, TFloat* __restrict__ p_dst, F f)
 {
-    constexpr auto I0 = Index<0>{};
-    constexpr auto I1 = Index<1>{};
-    constexpr auto I2 = Index<2>{};
-    constexpr auto I3 = Index<3>{};
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
 
-    constexpr auto src_desc = SrcDesc{};
     constexpr auto dst_desc = DstDesc{};
 
-    static_assert(is_same<decltype(src_desc.GetLengths()), decltype(dst_desc.GetLengths())>::value);
-
-    constexpr auto desc = make_ConstantTensorDescriptor(src_desc.GetLengths());
+    constexpr auto desc = make_ConstantTensorDescriptor(dst_desc.GetLengths());
 
 #if 0
     if(threadIdx.x == 0)
     {
-        print_ConstantTensorDescriptor(src_desc, "blockwise_4d_tensor_op_binary: src_desc: ");
-        print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op_binary: dst_desc: ");
+        print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op_unary: dst_desc: ");
+        print_ConstantTensorDescriptor(desc, "blockwise_4d_tensor_op_unary: desc: ");
     }
 #endif
 
-    for(unsigned i = threadIdx.x; i < desc.GetElementSize(); i += BlockSize)
+    constexpr unsigned NLoop = desc.GetElementSize() / BlockSize;
+
+    for(unsigned iloop = 0; iloop < NLoop; ++iloop)
     {
-        unsigned is = i;
+        unsigned is = threadIdx.x + iloop * BlockSize;
 
         const unsigned did0 = is / desc.GetStride(I0);
 
@@ -54,32 +41,48 @@ __device__ void blockwise_4d_tensor_op_binary(
 
         const unsigned did3 = is / desc.GetStride(I3);
 
-        const unsigned sindex = src_desc.Get1dIndex(did0, did1, did2, did3);
-
         const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);
 
-        f(p_src[sindex], p_dst[dindex]);
+        f(p_dst[dindex]);
+    }
+
+    constexpr bool has_tail = (desc.GetElementSize() > NLoop * BlockSize);
+
+    if(has_tail)
+    {
+        unsigned is = threadIdx.x + NLoop * BlockSize;
+
+        if(is < desc.GetElementSize())
+        {
+            const unsigned did0 = is / desc.GetStride(I0);
+
+            is -= did0 * desc.GetStride(I0);
+
+            const unsigned did1 = is / desc.GetStride(I1);
+
+            is -= did1 * desc.GetStride(I1);
+
+            const unsigned did2 = is / desc.GetStride(I2);
+
+            is -= did2 * desc.GetStride(I2);
+
+            const unsigned did3 = is / desc.GetStride(I3);
+
+            const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);
+
+            f(p_dst[dindex]);
+        }
     }
 }
-#endif
 
-#if BLOCKWISE_TENSOR_OP_METHOD == 12
-template <class TFloat,
-          class SrcDesc,
-          class DstDesc,
-          unsigned NBlockOpLen0,
-          unsigned NBlockOpLen1,
-          unsigned NBlockOpLen2,
-          unsigned NBlockOpLen3,
-          class F,
-          unsigned BlockSize>
-__device__ void blockwise_4d_tensor_op_binary(
+template <class TFloat, class SrcDesc, class DstDesc, class F, unsigned BlockSize>
+__device__ void blockwise_4d_tensor_pointwise_op_binary(
     SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f)
 {
-    constexpr auto I0 = Index<0>{};
-    constexpr auto I1 = Index<1>{};
-    constexpr auto I2 = Index<2>{};
-    constexpr auto I3 = Index<3>{};
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
 
     constexpr auto src_desc = SrcDesc{};
     constexpr auto dst_desc = DstDesc{};
@@ -154,386 +157,35 @@ __device__ void blockwise_4d_tensor_op_binary(
     }
 }
 
-template <class TFloat,
-          class DstDesc,
-          unsigned NBlockOpLen0,
-          unsigned NBlockOpLen1,
-          unsigned NBlockOpLen2,
-          unsigned NBlockOpLen3,
-          class F,
-          unsigned BlockSize>
-__device__ void blockwise_4d_tensor_op_unary(DstDesc, TFloat* __restrict__ p_dst, F f)
+template <class TFloat, class DstDesc, unsigned BlockSize>
+__device__ void blockwise_4d_tensor_set_zero(DstDesc, TFloat* __restrict__ p_dst)
 {
-    constexpr auto I0 = Index<0>{};
-    constexpr auto I1 = Index<1>{};
-    constexpr auto I2 = Index<2>{};
-    constexpr auto I3 = Index<3>{};
+    auto f_set_zero = [](TFloat& v) { v = TFloat(0); };
 
-    constexpr auto dst_desc = DstDesc{};
-
-    constexpr auto desc = make_ConstantTensorDescriptor(dst_desc.GetLengths());
-
-#if 0
-    if(threadIdx.x == 0)
-    {
-        print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op_unary: dst_desc: ");
-        print_ConstantTensorDescriptor(desc, "blockwise_4d_tensor_op_unary: desc: ");
-    }
-#endif
-
-    constexpr unsigned NLoop = desc.GetElementSize() / BlockSize;
-
-    for(unsigned iloop = 0; iloop < NLoop; ++iloop)
-    {
-        unsigned is = threadIdx.x + iloop * BlockSize;
-
-        const unsigned did0 = is / desc.GetStride(I0);
-
-        is -= did0 * desc.GetStride(I0);
-
-        const unsigned did1 = is / desc.GetStride(I1);
-
-        is -= did1 * desc.GetStride(I1);
-
-        const unsigned did2 = is / desc.GetStride(I2);
-
-        is -= did2 * desc.GetStride(I2);
-
-        const unsigned did3 = is / desc.GetStride(I3);
-
-        const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);
-
-        f(p_dst[dindex]);
-    }
-
-    constexpr bool has_tail = (desc.GetElementSize() > NLoop * BlockSize);
-
-    if(has_tail)
-    {
-        unsigned is = threadIdx.x + NLoop * BlockSize;
-
-        if(is < desc.GetElementSize())
-        {
-            const unsigned did0 = is / desc.GetStride(I0);
-
-            is -= did0 * desc.GetStride(I0);
-
-            const unsigned did1 = is / desc.GetStride(I1);
-
-            is -= did1 * desc.GetStride(I1);
-
-            const unsigned did2 = is / desc.GetStride(I2);
-
-            is -= did2 * desc.GetStride(I2);
-
-            const unsigned did3 = is / desc.GetStride(I3);
-
-            const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);
-
-            f(p_dst[dindex]);
-        }
-    }
+    blockwise_4d_tensor_pointwise_op_unary<TFloat, DstDesc, decltype(f_set_zero), BlockSize>(
+        DstDesc{}, p_dst, f_set_zero);
 }
-#endif
 
-#if BLOCKWISE_TENSOR_OP_METHOD == 21
-template <class TFloat,
-          class SrcDesc,
-          class DstDesc,
-          unsigned NBlockOpLen0,
-          unsigned NBlockOpLen1,
-          unsigned NBlockOpLen2,
-          unsigned NBlockOpLen3,
-          class F,
-          unsigned BlockSize>
-__device__ void blockwise_4d_tensor_op_binary(
-    SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f)
+template <class TFloat, class SrcDesc, class DstDesc, unsigned BlockSize>
+__device__ void blockwise_4d_tensor_copy(SrcDesc,
+                                         TFloat* const __restrict__ p_src,
+                                         DstDesc,
+                                         TFloat* __restrict__ p_dst)
 {
-    constexpr auto I0 = Index<0>{};
-    constexpr auto I1 = Index<1>{};
-    constexpr auto I2 = Index<2>{};
-    constexpr auto I3 = Index<3>{};
+    auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
 
-    constexpr auto src_desc = SrcDesc{};
-    constexpr auto dst_desc = DstDesc{};
-
-    static_assert(is_same<decltype(src_desc.GetLengths()), decltype(dst_desc.GetLengths())>::value);
-
-    constexpr unsigned NBlockOpStride3 = 1;
-    constexpr unsigned NBlockOpStride2 = NBlockOpLen3 * NBlockOpStride3;
-    constexpr unsigned NBlockOpStride1 = NBlockOpLen2 * NBlockOpStride2;
-    constexpr unsigned NBlockOpStride0 = NBlockOpLen1 * NBlockOpStride1;
-
-    unsigned itmp = threadIdx.x;
-
-    const unsigned did0_begin = itmp / NBlockOpStride0;
-
-    itmp -= did0_begin * NBlockOpStride0;
-
-    const unsigned did1_begin = itmp / NBlockOpStride1;
-
-    itmp -= did1_begin * NBlockOpStride1;
-
-    const unsigned did2_begin = itmp / NBlockOpStride2;
-
-    itmp -= did2_begin * NBlockOpStride2;
-
-    const unsigned did3_begin = itmp / NBlockOpStride3;
-
-    for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NBlockOpLen0)
-    {
-        for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NBlockOpLen1)
-        {
-            for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NBlockOpLen2)
-            {
-                for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NBlockOpLen3)
-                {
-                    const unsigned sindex = src_desc.Get1dIndex(did0, did1, did2, did3);
-
-                    const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);
-
-                    f(p_src[sindex], p_dst[dindex]);
-                }
-            }
-        }
-    }
+    blockwise_4d_tensor_pointwise_op_binary<TFloat, SrcDesc, DstDesc, decltype(f_copy), BlockSize>(
+        SrcDesc{}, p_src, DstDesc{}, p_dst, f_copy);
 }
-#endif
 
-#if BLOCKWISE_TENSOR_OP_METHOD == 22
-template <class TFloat,
-          class SrcDesc,
-          class DstDesc,
-          unsigned NBlockOpLen0,
-          unsigned NBlockOpLen1,
-          unsigned NBlockOpLen2,
-          unsigned NBlockOpLen3,
-          class F,
-          unsigned BlockSize>
-__device__ void blockwise_4d_tensor_op_binary(
-    SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f)
+template <class TFloat, class SrcDesc, class DstDesc, unsigned BlockSize>
+__device__ void blockwise_4d_tensor_accumulate(SrcDesc,
+                                               TFloat* const __restrict__ p_src,
+                                               DstDesc,
+                                               TFloat* __restrict__ p_dst)
 {
-    constexpr auto I0 = Index<0>{};
-    constexpr auto I1 = Index<1>{};
-    constexpr auto I2 = Index<2>{};
-    constexpr auto I3 = Index<3>{};
+    auto f_accum = [](const TFloat& src, TFloat& dst) { dst += src; };
 
-    constexpr auto src_desc = SrcDesc{};
-    constexpr auto dst_desc = DstDesc{};
-
-    static_assert(is_same<decltype(src_desc.GetLengths()), decltype(dst_desc.GetLengths())>::value);
-
-    constexpr unsigned NBlockOpStride3 = 1;
-    constexpr unsigned NBlockOpStride2 = NBlockOpLen3 * NBlockOpStride3;
-    constexpr unsigned NBlockOpStride1 = NBlockOpLen2 * NBlockOpStride2;
-    constexpr unsigned NBlockOpStride0 = NBlockOpLen1 * NBlockOpStride1;
-
-    unsigned itmp = threadIdx.x;
-
-    const unsigned did0_begin = itmp / NBlockOpStride0;
-
-    itmp -= did0_begin * NBlockOpStride0;
-
-    const unsigned did1_begin = itmp / NBlockOpStride1;
-
-    itmp -= did1_begin * NBlockOpStride1;
-
-    const unsigned did2_begin = itmp / NBlockOpStride2;
-
-    itmp -= did2_begin * NBlockOpStride2;
-
-    const unsigned did3_begin = itmp / NBlockOpStride3;
-
-    unsigned sindex = src_desc.Get1dIndex(did0_begin, did1_begin, did2_begin, did3_begin);
-    unsigned dindex = dst_desc.Get1dIndex(did0_begin, did1_begin, did2_begin, did3_begin);
-
-    for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NBlockOpLen0)
-    {
-        const unsigned sindex_save0 = sindex;
-        const unsigned dindex_save0 = dindex;
-
-        for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NBlockOpLen1)
-        {
-            const unsigned sindex_save1 = sindex;
-            const unsigned dindex_save1 = dindex;
-
-            for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NBlockOpLen2)
-            {
-                const unsigned sindex_save2 = sindex;
-                const unsigned dindex_save2 = dindex;
-
-                for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NBlockOpLen3)
-                {
-                    f(p_src[sindex], p_dst[dindex]);
-
-                    sindex += NBlockOpLen3 * src_desc.GetStride(I3);
-                    dindex += NBlockOpLen3 * dst_desc.GetStride(I3);
-                }
-
-                sindex = sindex_save2 + NBlockOpLen2 * src_desc.GetStride(I2);
-                dindex = dindex_save2 + NBlockOpLen2 * dst_desc.GetStride(I2);
-            }
-
-            sindex = sindex_save1 + NBlockOpLen1 * src_desc.GetStride(I1);
-            dindex = dindex_save1 + NBlockOpLen1 * dst_desc.GetStride(I1);
-        }
-
-        sindex = sindex_save0 + NBlockOpLen0 * src_desc.GetStride(I0);
-        dindex = dindex_save0 + NBlockOpLen0 * dst_desc.GetStride(I0);
-    }
-}
-#endif
-
-#if BLOCKWISE_TENSOR_OP_METHOD == 23
-template <class TFloat,
-          class SrcDesc,
-          class DstDesc,
-          unsigned NBlockOpLen0,
-          unsigned NBlockOpLen1,
-          unsigned NBlockOpLen2,
-          unsigned NBlockOpLen3,
-          class F,
-          unsigned BlockSize>
-__device__ void blockwise_4d_tensor_op_binary(
-    SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f)
-{
-    constexpr auto I0 = Index<0>{};
-    constexpr auto I1 = Index<1>{};
-    constexpr auto I2 = Index<2>{};
-    constexpr auto I3 = Index<3>{};
-
-    constexpr auto src_desc = SrcDesc{};
-    constexpr auto dst_desc = DstDesc{};
-
-    static_assert(is_same<decltype(src_desc.GetLengths()), decltype(dst_desc.GetLengths())>::value);
-
-    constexpr unsigned NBlockOpStride3 = 1;
-    constexpr unsigned NBlockOpStride2 = NBlockOpLen3 * NBlockOpStride3;
-    constexpr unsigned NBlockOpStride1 = NBlockOpLen2 * NBlockOpStride2;
-    constexpr unsigned NBlockOpStride0 = NBlockOpLen1 * NBlockOpStride1;
-
-    unsigned itmp = threadIdx.x;
-
-    const unsigned did0_begin = itmp / NBlockOpStride0;
-
-    itmp -= did0_begin * NBlockOpStride0;
-
-    const unsigned did1_begin = itmp / NBlockOpStride1;
-
-    itmp -= did1_begin * NBlockOpStride1;
-
-    const unsigned did2_begin = itmp / NBlockOpStride2;
-
-    itmp -= did2_begin * NBlockOpStride2;
-
-    const unsigned did3_begin = itmp / NBlockOpStride3;
-
-    unsigned sindex = src_desc.Get1dIndex(did0_begin, did1_begin, did2_begin, did3_begin);
-    unsigned dindex = dst_desc.Get1dIndex(did0_begin, did1_begin, did2_begin, did3_begin);
-
-    for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NBlockOpLen0)
-    {
-        unsigned i1 = 0;
-        for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NBlockOpLen1)
-        {
-            unsigned i2 = 0;
-            for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NBlockOpLen2)
-            {
-                unsigned i3 = 0;
-                for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NBlockOpLen3)
-                {
-                    f(p_src[sindex], p_dst[dindex]);
-
-                    sindex += NBlockOpLen3 * src_desc.GetStride(I3);
-                    dindex += NBlockOpLen3 * dst_desc.GetStride(I3);
-
-                    ++i3;
-                }
-
-                sindex += NBlockOpLen2 * src_desc.GetStride(I2) -
-                          i3 * NBlockOpLen3 * src_desc.GetStride(I3);
-                dindex += NBlockOpLen2 * dst_desc.GetStride(I2) -
-                          i3 * NBlockOpLen3 * dst_desc.GetStride(I3);
-
-                ++i2;
-            }
-
-            sindex +=
-                NBlockOpLen1 * src_desc.GetStride(I1) - i2 * NBlockOpLen2 * src_desc.GetStride(I2);
-            dindex +=
-                NBlockOpLen1 * dst_desc.GetStride(I1) - i2 * NBlockOpLen2 * dst_desc.GetStride(I2);
-
-            ++i1;
-        }
-
-        sindex +=
-            NBlockOpLen0 * src_desc.GetStride(I0) - i1 * NBlockOpLen1 * src_desc.GetStride(I1);
-        dindex +=
-            NBlockOpLen0 * dst_desc.GetStride(I0) - i1 * NBlockOpLen1 * dst_desc.GetStride(I1);
-    }
-}
-#endif
-
-#if BLOCKWISE_TENSOR_OP_METHOD == 31
-template <class TFloat,
-          class SrcDesc,
-          class DstDesc,
-          unsigned NBlockOpLen0,
-          unsigned NBlockOpLen1,
-          unsigned NBlockOpLen2,
-          unsigned NBlockOpLen3,
-          class F,
-          unsigned BlockSize>
-__device__ void blockwise_4d_tensor_op_binary(
-    SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f)
-{
-    constexpr auto I0 = Index<0>{};
-    constexpr auto I1 = Index<1>{};
-    constexpr auto I2 = Index<2>{};
-    constexpr auto I3 = Index<3>{};
-
-    constexpr auto src_desc = SrcDesc{};
-    constexpr auto dst_desc = DstDesc{};
-
-    static_assert(is_same<decltype(src_desc.GetLengths()), decltype(dst_desc.GetLengths())>::value);
-
-    constexpr unsigned NBlockOpStride3 = 1;
-    constexpr unsigned NBlockOpStride2 = NBlockOpLen3 * NBlockOpStride3;
-    constexpr unsigned NBlockOpStride1 = NBlockOpLen2 * NBlockOpStride2;
-    constexpr unsigned NBlockOpStride0 = NBlockOpLen1 * NBlockOpStride1;
-
-    unsigned itmp = threadIdx.x;
-
-    const unsigned did0_begin = itmp / NBlockOpStride0;
-
-    itmp -= did0_begin * NBlockOpStride0;
-
-    const unsigned did1_begin = itmp / NBlockOpStride1;
-
-    itmp -= did1_begin * NBlockOpStride1;
-
-    const unsigned did2_begin = itmp / NBlockOpStride2;
-
-    itmp -= did2_begin * NBlockOpStride2;
-
-    const unsigned did3_begin = itmp / NBlockOpStride3;
-
-    for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NBlockOpLen0)
-    {
-        for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NBlockOpLen1)
-        {
-            for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NBlockOpLen2)
-            {
-                for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NBlockOpLen3)
-                {
-                    const unsigned sindex = src_desc.Get1dIndex(did0, did1, did2, did3);
-
-                    const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);
-
-                    f(p_src[sindex], p_dst[dindex]);
-                }
-            }
-        }
-    }
-}
-#endif
+    blockwise_4d_tensor_pointwise_op_binary<TFloat, SrcDesc, DstDesc, decltype(f_accum), BlockSize>(
+        SrcDesc{}, p_src, DstDesc{}, p_dst, f_accum);
+}
\ No newline at end of file
diff --git a/src/include/blockwise_winograd_transform.cuh b/src/include/blockwise_winograd_transform.cuh
deleted file mode 100644
index 347d403d83..0000000000
--- a/src/include/blockwise_winograd_transform.cuh
+++ /dev/null
@@ -1,36 +0,0 @@
-#pragma once
-#include "constant_tensor_descriptor.cuh"
-
-template <class TFloat,
-          unsigned InTileSizeH,
-          unsigned InTileSizeW,
-          unsigned S,
-          unsigned R,
-          unsigned OutTileSizeH,
-          unsigned OutTileSizeW,
-          unsigned NPerBlock,
-          unsigned CPerBlock,
-          unsigned YPerBlock,
-          unsigned XPerBlock,
-          unsigned BlockSize>
-__device__ void blockwise_winograd_transform_input(TFloat* const __restrict__ p_in,
-                                                   TFloat* __restrict__ p_in_transform)
-{
-    p_in_transform[0] = 1;
-}
-
-template <class TFloat,
-          unsigned InTileSizeH,
-          unsigned InTileSizeW,
-          unsigned S,
-          unsigned R,
-          unsigned OutTileSizeH,
-          unsigned OutTileSizeW,
-          unsigned KPerBlock,
-          unsigned CPerBlock,
-          unsigned BlockSize>
-__device__ void blockwise_winograd_transform_weight(TFloat* const __restrict__ p_wei,
-                                                    TFloat* __restrict__ p_wei_transform)
-{
-    p_wei_transform[0] = 1;
-}
\ No newline at end of file
diff --git a/src/include/constant_tensor_descriptor.cuh b/src/include/constant_tensor_descriptor.cuh
index 8593d517b6..11c5c30e54 100644
--- a/src/include/constant_tensor_descriptor.cuh
+++ b/src/include/constant_tensor_descriptor.cuh
@@ -7,8 +7,8 @@ struct Constant
     const T mValue = N;
 };
 
-template <unsigned I>
-using Index = Constant<unsigned, I>;
+template <unsigned N>
+using Number = Constant<unsigned, N>;
 
 template <unsigned... Is>
 struct Sequence
@@ -18,7 +18,7 @@ struct Sequence
     const unsigned mData[nDim] = {Is...};
 
     template <unsigned I>
-    __host__ __device__ constexpr unsigned Get(Index<I>) const
+    __host__ __device__ constexpr unsigned Get(Number<I>) const
     {
         return mData[I];
     }
@@ -28,7 +28,7 @@ template <class Lengths, class Strides>
 struct ConstantTensorDescriptor
 {
     static constexpr unsigned nDim = Lengths::nDim;
-    using NDimConstant             = Index<nDim>;
+    using NDimConstant             = Number<nDim>;
 
     __host__ __device__ constexpr ConstantTensorDescriptor()
     {
@@ -42,15 +42,15 @@ struct ConstantTensorDescriptor
     __host__ __device__ constexpr Strides GetStrides() const { return Strides{}; }
 
     template <unsigned I>
-    __host__ __device__ constexpr unsigned GetLength(Index<I>) const
+    __host__ __device__ constexpr unsigned GetLength(Number<I>) const
     {
-        return Lengths{}.Get(Index<I>{});
+        return Lengths{}.Get(Number<I>{});
     }
 
     template <unsigned I>
-    __host__ __device__ constexpr unsigned GetStride(Index<I>) const
+    __host__ __device__ constexpr unsigned GetStride(Number<I>) const
     {
-        return Strides{}.Get(Index<I>{});
+        return Strides{}.Get(Number<I>{});
     }
 
     // this is ugly, only for 4d
@@ -58,10 +58,10 @@ struct ConstantTensorDescriptor
     {
         static_assert(nDim == 4, "nDim is not 4");
 
-        constexpr auto I0 = Index<0>{};
-        constexpr auto I1 = Index<1>{};
-        constexpr auto I2 = Index<2>{};
-        constexpr auto I3 = Index<3>{};
+        constexpr auto I0 = Number<0>{};
+        constexpr auto I1 = Number<1>{};
+        constexpr auto I2 = Number<2>{};
+        constexpr auto I3 = Number<3>{};
 
         return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3);
     }
@@ -71,10 +71,10 @@ struct ConstantTensorDescriptor
     {
         static_assert(nDim == 4, "nDim is not 4");
 
-        constexpr auto I0 = Index<0>{};
-        constexpr auto I1 = Index<1>{};
-        constexpr auto I2 = Index<2>{};
-        constexpr auto I3 = Index<3>{};
+        constexpr auto I0 = Number<0>{};
+        constexpr auto I1 = Number<1>{};
+        constexpr auto I2 = Number<2>{};
+        constexpr auto I3 = Number<3>{};
 
         return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) +
                (GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) + 1;
@@ -83,10 +83,10 @@ struct ConstantTensorDescriptor
     // this is ugly, only for 4d
     __host__ __device__ unsigned Get1dIndex(unsigned n, unsigned c, unsigned h, unsigned w) const
     {
-        constexpr auto I0 = Index<0>{};
-        constexpr auto I1 = Index<1>{};
-        constexpr auto I2 = Index<2>{};
-        constexpr auto I3 = Index<3>{};
+        constexpr auto I0 = Number<0>{};
+        constexpr auto I1 = Number<1>{};
+        constexpr auto I2 = Number<2>{};
+        constexpr auto I3 = Number<3>{};
 
         static_assert(nDim == 4, "nDim is not 4");
         return n * GetStride(I0) + c * GetStride(I1) + h * GetStride(I2) + w * GetStride(I3);
@@ -120,10 +120,10 @@ __host__ __device__ constexpr auto get_output_4d_tensor_descriptor(InDesc, WeiDe
     constexpr auto in_desc  = InDesc{};
     constexpr auto wei_desc = WeiDesc{};
 
-    constexpr auto I0 = Index<0>{};
-    constexpr auto I1 = Index<1>{};
-    constexpr auto I2 = Index<2>{};
-    constexpr auto I3 = Index<3>{};
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
 
     static_assert(in_desc.GetDimension() == 4, "input nDim is not 4");
     static_assert(wei_desc.GetDimension() == 4, "weight nDim is not 4");
@@ -150,10 +150,10 @@ __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s)
 {
     constexpr auto desc = TDesc{};
 
-    constexpr auto I0 = Index<0>{};
-    constexpr auto I1 = Index<1>{};
-    constexpr auto I2 = Index<2>{};
-    constexpr auto I3 = Index<3>{};
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
 
     static_assert(desc.GetDimension() == 4, "dim is not 4");
 
diff --git a/src/include/gridwise_direct_convolution_1.cuh b/src/include/gridwise_direct_convolution_1.cuh
index 52949b49c1..77675ee67e 100644
--- a/src/include/gridwise_direct_convolution_1.cuh
+++ b/src/include/gridwise_direct_convolution_1.cuh
@@ -27,10 +27,10 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
                                               OutGlobalDesc,
                                               TFloat* __restrict__ p_out_global)
 {
-    constexpr auto I0 = Index<0>{};
-    constexpr auto I1 = Index<1>{};
-    constexpr auto I2 = Index<2>{};
-    constexpr auto I3 = Index<3>{};
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
 
     constexpr auto in_global_desc  = InGlobalDesc{};
     constexpr auto wei_global_desc = WeiGlobalDesc{};
@@ -120,62 +120,38 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
     }
 #endif
 
-    auto f_set0 = [](TFloat& v) { v = TFloat(0); };
-    auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
-    auto f_accu = [](const TFloat& src, TFloat& dst) { dst += src; };
-
     // set output tensor in LDS to 0
-    blockwise_4d_tensor_op_unary<TFloat,
-                                 decltype(out_block_desc),
-                                 NBlockOpLen0,
-                                 NBlockOpLen1,
-                                 NBlockOpLen2,
-                                 NBlockOpLen3,
-                                 decltype(f_set0),
-                                 BlockSize>(out_block_desc, p_out_block, f_set0);
+    blockwise_4d_tensor_set_zero<TFloat, decltype(out_block_desc), BlockSize>(out_block_desc,
+                                                                              p_out_block);
 
     for(unsigned c_block_work_begin = 0; c_block_work_begin < in_global_desc.GetLength(I1);
-        c_block_work_begin += CPerBlock)
+        c_block_work_begin += CPerBlock, __syncthreads())
     {
 
         // copy input tensor to LDS
-        blockwise_4d_tensor_op_binary<TFloat,
-                                      decltype(in_block_src_desc),
-                                      decltype(in_block_desc),
-                                      NBlockOpLen0,
-                                      NBlockOpLen1,
-                                      NBlockOpLen2,
-                                      NBlockOpLen3,
-                                      decltype(f_copy),
-                                      BlockSize>(in_block_src_desc,
-                                                 p_in_global +
-                                                     in_global_desc.Get1dIndex(n_block_work_begin,
-                                                                               c_block_work_begin,
-                                                                               hi_block_work_begin,
-                                                                               wi_block_work_begin),
-                                                 in_block_desc,
-                                                 p_in_block,
-                                                 f_copy);
+        blockwise_4d_tensor_copy<TFloat,
+                                 decltype(in_block_src_desc),
+                                 decltype(in_block_desc),
+                                 BlockSize>(in_block_src_desc,
+                                            p_in_global +
+                                                in_global_desc.Get1dIndex(n_block_work_begin,
+                                                                          c_block_work_begin,
+                                                                          hi_block_work_begin,
+                                                                          wi_block_work_begin),
+                                            in_block_desc,
+                                            p_in_block);
 
         // copy weight tensor to LDS
-        blockwise_4d_tensor_op_binary<TFloat,
-                                      decltype(wei_block_src_desc),
-                                      decltype(wei_block_desc),
-                                      NBlockOpLen0,
-                                      NBlockOpLen1,
-                                      NBlockOpLen2,
-                                      NBlockOpLen3,
-                                      decltype(f_copy),
-                                      BlockSize>(
+        blockwise_4d_tensor_copy<TFloat,
+                                 decltype(wei_block_src_desc),
+                                 decltype(wei_block_desc),
+                                 BlockSize>(
             wei_block_src_desc,
             p_wei_global + wei_global_desc.Get1dIndex(k_block_work_begin, c_block_work_begin, 0, 0),
             wei_block_desc,
-            p_wei_block,
-            f_copy);
+            p_wei_block);
 
-#if 1
         __syncthreads();
-#endif
 
         // blockwise convolution
         blockwise_convolution<TFloat,
@@ -186,27 +162,17 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
                               OutTileSizeW,
                               BlockSize>(
             in_block_desc, p_in_block, wei_block_desc, p_wei_block, out_block_desc, p_out_block);
-
-#if 1
-        __syncthreads();
-#endif
     }
 
     // copy output tensor from LDS to device mem
-    blockwise_4d_tensor_op_binary<TFloat,
-                                  decltype(out_block_desc),
-                                  decltype(out_block_src_desc),
-                                  NBlockOpLen0,
-                                  NBlockOpLen1,
-                                  NBlockOpLen2,
-                                  NBlockOpLen3,
-                                  decltype(f_copy),
-                                  BlockSize>(
+    blockwise_4d_tensor_copy<TFloat,
+                             decltype(out_block_desc),
+                             decltype(out_block_src_desc),
+                             BlockSize>(
         out_block_desc,
         p_out_block,
         out_block_src_desc,
         p_out_global +
             out_global_desc.Get1dIndex(
-                n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin),
-        f_copy);
-}
+                n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin));
+}
\ No newline at end of file
diff --git a/src/include/gridwise_direct_convolution_2.cuh b/src/include/gridwise_direct_convolution_2.cuh
index 64a62466d3..f53577c981 100644
--- a/src/include/gridwise_direct_convolution_2.cuh
+++ b/src/include/gridwise_direct_convolution_2.cuh
@@ -19,10 +19,6 @@ template <class TFloat,
           unsigned NPerThread,
           unsigned KPerThread,
           unsigned CPerThread,
-          unsigned NBlockOpLen0,
-          unsigned NBlockOpLen1,
-          unsigned NBlockOpLen2,
-          unsigned NBlockOpLen3,
           unsigned BlockSize,
           unsigned GridSize>
 __global__ void gridwise_direct_convolution_2(InGlobalDesc,
@@ -32,10 +28,10 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
                                               OutGlobalDesc,
                                               TFloat* __restrict__ p_out_global)
 {
-    constexpr auto I0 = Index<0>{};
-    constexpr auto I1 = Index<1>{};
-    constexpr auto I2 = Index<2>{};
-    constexpr auto I3 = Index<3>{};
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
 
     constexpr auto in_global_desc  = InGlobalDesc{};
     constexpr auto wei_global_desc = WeiGlobalDesc{};
@@ -147,10 +143,6 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
     const unsigned hi_thread_data_offset = ho_thread_data_offset;
     const unsigned wi_thread_data_offset = wo_thread_data_offset;
 
-    // op
-    auto f_set0 = [](TFloat& v) { v = TFloat(0); };
-    auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
-
 #if 0
     if(threadIdx.x == 0)
     {
@@ -170,76 +162,54 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
 #endif
 
     // set threadwise output tensor to 0
-    threadwise_4d_tensor_op_unary<TFloat, decltype(out_thread_desc), decltype(f_set0)>(
-        out_thread_desc, p_out_thread, f_set0);
+    threadwise_4d_tensor_set_zero(out_thread_desc, p_out_thread);
 
     for(unsigned c_block_data_offset = 0; c_block_data_offset < in_global_desc.GetLength(I1);
         c_block_data_offset += CPerBlock, __syncthreads())
     {
         // copy input tensor to LDS
-        blockwise_4d_tensor_op_binary<TFloat,
-                                      decltype(in_block_global_desc),
-                                      decltype(in_block_desc),
-                                      NBlockOpLen0,
-                                      NBlockOpLen1,
-                                      NBlockOpLen2,
-                                      NBlockOpLen3,
-                                      decltype(f_copy),
-                                      BlockSize>(
-            in_block_global_desc,
-            p_in_global + in_global_desc.Get1dIndex(n_block_data_offset,
-                                                    c_block_data_offset,
-                                                    hi_block_data_offset,
-                                                    wi_block_data_offset),
-            in_block_desc,
-            p_in_block,
-            f_copy);
+        blockwise_4d_tensor_copy<TFloat,
+                                 decltype(in_block_global_desc),
+                                 decltype(in_block_desc),
+                                 BlockSize>(in_block_global_desc,
+                                            p_in_global +
+                                                in_global_desc.Get1dIndex(n_block_data_offset,
+                                                                          c_block_data_offset,
+                                                                          hi_block_data_offset,
+                                                                          wi_block_data_offset),
+                                            in_block_desc,
+                                            p_in_block);
 
         // copy weight tensor to LDS
-        blockwise_4d_tensor_op_binary<TFloat,
-                                      decltype(wei_block_global_desc),
-                                      decltype(wei_block_desc),
-                                      NBlockOpLen0,
-                                      NBlockOpLen1,
-                                      NBlockOpLen2,
-                                      NBlockOpLen3,
-                                      decltype(f_copy),
-                                      BlockSize>(
+        blockwise_4d_tensor_copy<TFloat,
+                                 decltype(wei_block_global_desc),
+                                 decltype(wei_block_desc),
+                                 BlockSize>(
             wei_block_global_desc,
             p_wei_global +
                 wei_global_desc.Get1dIndex(k_block_data_offset, c_block_data_offset, 0, 0),
             wei_block_desc,
-            p_wei_block,
-            f_copy);
+            p_wei_block);
 
         __syncthreads();
 
         for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
         {
             // copy input tensor into register
-            threadwise_4d_tensor_op_binary<TFloat,
-                                           decltype(in_thread_block_desc),
-                                           decltype(in_thread_desc),
-                                           decltype(f_copy)>(
-                in_thread_block_desc,
-                p_in_block + in_block_desc.Get1dIndex(n_thread_data_offset,
-                                                      c_thread_data,
-                                                      hi_thread_data_offset,
-                                                      wi_thread_data_offset),
-                in_thread_desc,
-                p_in_thread,
-                f_copy);
+            threadwise_4d_tensor_copy(in_thread_block_desc,
+                                      p_in_block + in_block_desc.Get1dIndex(n_thread_data_offset,
+                                                                            c_thread_data,
+                                                                            hi_thread_data_offset,
+                                                                            wi_thread_data_offset),
+                                      in_thread_desc,
+                                      p_in_thread);
 
             // copy weight tensor into register
-            threadwise_4d_tensor_op_binary<TFloat,
-                                           decltype(wei_thread_block_desc),
-                                           decltype(wei_thread_desc),
-                                           decltype(f_copy)>(
+            threadwise_4d_tensor_copy(
                 wei_thread_block_desc,
                 p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_offset, c_thread_data, 0, 0),
                 wei_thread_desc,
-                p_wei_thread,
-                f_copy);
+                p_wei_thread);
 
             // threadwise convolution
             threadwise_direct_convolution<TFloat,
@@ -255,16 +225,12 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
     }
 
     // copy output tensor from register to global mem
-    threadwise_4d_tensor_op_binary<TFloat,
-                                   decltype(out_thread_desc),
-                                   decltype(out_thread_global_desc),
-                                   decltype(f_copy)>(
+    threadwise_4d_tensor_copy(
         out_thread_desc,
         p_out_thread,
         out_thread_global_desc,
         p_out_global + out_global_desc.Get1dIndex(n_block_data_offset + n_thread_data_offset,
                                                   k_block_data_offset + k_thread_data_offset,
                                                   ho_block_data_offset + ho_thread_data_offset,
-                                                  wo_block_data_offset + wo_thread_data_offset),
-        f_copy);
+                                                  wo_block_data_offset + wo_thread_data_offset));
 }
diff --git a/src/include/gridwise_winograd_convolution.cuh b/src/include/gridwise_winograd_convolution.cuh
deleted file mode 100644
index 52e5b5059a..0000000000
--- a/src/include/gridwise_winograd_convolution.cuh
+++ /dev/null
@@ -1,246 +0,0 @@
-#pragma once
-#include "constant_tensor_descriptor.cuh"
-#include "blockwise_winograd_transform.cuh"
-#include "threadwise_winograd_transform.cuh"
-
-template <class TFloat,
-          class InGlobalDesc,
-          class WeiGlobalDesc,
-          class OutGlobalDesc,
-          unsigned OutTileSizeH,
-          unsigned OutTileSizeW,
-          unsigned NPerBlock,
-          unsigned KPerBlock,
-          unsigned CPerBlock,
-          unsigned YPerBlock,
-          unsigned XPerBlock,
-          unsigned NPerThread,
-          unsigned KPerThread,
-          unsigned CPerThread,
-          unsigned BlockSize,
-          unsigned GridSize>
-__global__ void gridwise_winograd_convolution(InGlobalDesc,
-                                              TFloat* const __restrict__ p_in_global,
-                                              WeiGlobalDesc,
-                                              TFloat* const __restrict__ p_wei_global,
-                                              OutGlobalDesc,
-                                              TFloat* __restrict__ p_out_global)
-{
-    constexpr auto I0 = Index<0>{};
-    constexpr auto I1 = Index<1>{};
-    constexpr auto I2 = Index<2>{};
-    constexpr auto I3 = Index<3>{};
-
-    constexpr auto in_global_desc  = InGlobalDesc{};
-    constexpr auto wei_global_desc = WeiGlobalDesc{};
-    constexpr auto out_global_desc = OutGlobalDesc{};
-
-    constexpr unsigned S = wei_global_desc.GetLength(I2);
-    constexpr unsigned R = wei_global_desc.GetLength(I3);
-
-    constexpr unsigned HoPerBlock = OutTileSizeH * YPerBlock;
-    constexpr unsigned WoPerBlock = OutTileSizeW * XPerBlock;
-
-    constexpr unsigned HiPerBlock = YPerBlock * OutTileSizeH + S - 1;
-    constexpr unsigned WiPerBlock = XPerBlock * OutTileSizeW + R - 1;
-
-    constexpr unsigned InTileSizeH = OutTileSizeH + S - 1;
-    constexpr unsigned InTileSizeW = OutTileSizeW + R - 1;
-
-    // divide block work
-    constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock;
-    constexpr unsigned KBlockWork = (out_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock;
-    constexpr unsigned YBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock;
-    constexpr unsigned XBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock;
-
-    const unsigned block_id = blockIdx.x;
-
-    unsigned itmp                  = block_id;
-    const unsigned n_block_work_id = itmp / (KBlockWork * YBlockWork * XBlockWork);
-    itmp -= n_block_work_id * (KBlockWork * YBlockWork * XBlockWork);
-    const unsigned k_block_work_id = itmp / (YBlockWork * XBlockWork);
-    itmp -= k_block_work_id * (YBlockWork * XBlockWork);
-    const unsigned y_block_work_id = itmp / XBlockWork;
-    const unsigned x_block_work_id = itmp - y_block_work_id * XBlockWork;
-
-    const unsigned n_block_data_offset = n_block_work_id * NPerBlock;
-    const unsigned k_block_data_offset = k_block_work_id * KPerBlock;
-    const unsigned y_block_data_offset = y_block_work_id * YPerBlock;
-    const unsigned x_block_data_offset = x_block_work_id * XPerBlock;
-
-    const unsigned ho_block_data_offset = y_block_data_offset * OutTileSizeH;
-    const unsigned wo_block_data_offset = x_block_data_offset * OutTileSizeW;
-
-    const unsigned hi_block_data_offset = ho_block_data_offset; // minus padding
-    const unsigned wi_block_data_offset = wo_block_data_offset; // minus padding
-
-    // divide thread work
-    constexpr unsigned NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread;
-    constexpr unsigned KThreadWork = (KPerBlock + KPerThread - 1) / KPerThread;
-    constexpr unsigned YThreadWork = YPerBlock;
-    constexpr unsigned XThreadWork = XPerBlock;
-
-    const unsigned thread_id = threadIdx.x;
-
-    itmp                            = thread_id;
-    const unsigned n_thread_work_id = itmp / (KThreadWork * YThreadWork * XThreadWork);
-    itmp -= n_thread_work_id * (KThreadWork * YThreadWork * XThreadWork);
-    const unsigned k_thread_work_id = itmp / (YThreadWork * XThreadWork);
-    itmp -= k_thread_work_id * (YThreadWork * XThreadWork);
-    const unsigned y_thread_work_id = itmp / XThreadWork;
-    const unsigned x_thread_work_id = itmp - y_thread_work_id * XThreadWork;
-
-    const unsigned n_thread_data_offset = n_thread_work_id * NPerThread;
-    const unsigned k_thread_data_offset = k_thread_work_id * KPerThread;
-    const unsigned y_thread_data_offset = y_thread_work_id;
-    const unsigned x_thread_data_offset = x_thread_work_id;
-
-    // op
-    auto f_set0 = [](TFloat& v) { v = TFloat(0); };
-    auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
-
-    // block data
-    constexpr auto in_transform_block_desc = make_ConstantTensorDescriptor(
-        Sequence<NPerBlock, CPerBlock, YPerBlock * InTileSizeH, XPerBlock * InTileSizeW>{});
-
-    constexpr auto wei_transform_block_desc =
-        make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlock, InTileSizeH, InTileSizeW>{});
-
-    constexpr unsigned in_transform_block_size  = in_transform_block_desc.GetElementSpace();
-    constexpr unsigned wei_transform_block_size = wei_transform_block_desc.GetElementSpace();
-
-    __shared__ TFloat p_in_transform_block[in_transform_block_size];
-    __shared__ TFloat p_wei_transform_block[wei_transform_block_size];
-
-    // thread data
-    constexpr auto in_transform_thread_block_desc =
-        make_ConstantTensorDescriptor(Sequence<NPerThread, CPerThread, InTileSizeH, InTileSizeW>{},
-                                      in_transform_block_desc.GetStrides());
-
-    constexpr auto wei_transform_thread_block_desc =
-        make_ConstantTensorDescriptor(Sequence<KPerThread, CPerThread, InTileSizeH, InTileSizeW>{},
-                                      wei_transform_block_desc.GetStrides());
-
-    constexpr auto out_transform_thread_desc =
-        make_ConstantTensorDescriptor(Sequence<NPerThread, KPerThread, InTileSizeH, InTileSizeW>{});
-
-    constexpr auto out_thread_desc = make_ConstantTensorDescriptor(
-        Sequence<NPerThread, KPerThread, OutTileSizeH, OutTileSizeW>{});
-
-    constexpr auto out_thread_global_desc =
-        make_ConstantTensorDescriptor(out_thread_desc.GetLengths(), out_global_desc.GetStrides());
-
-    constexpr unsigned out_transform_thread_size = out_transform_thread_desc.GetElementSpace();
-    constexpr unsigned out_thread_size           = out_thread_desc.GetElementSpace();
-
-    TFloat p_out_transform_thread[out_transform_thread_size];
-    TFloat p_out_thread[out_thread_size];
-
-#if 0
-    if(blockIdx.x == 0 && threadIdx.x == 0)
-    {
-        printf("in_transform_block_size %u, wei_transform_block_size %u, out_transform_thread_size "
-               "%u, out_thread_size %u \n",
-               in_transform_block_size,
-               wei_transform_block_size,
-               out_transform_thread_size,
-               out_thread_size);
-    }
-#endif
-
-    // set threadwise output transform tensor to 0
-    threadwise_4d_tensor_op_unary<TFloat, decltype(out_transform_thread_desc), decltype(f_set0)>(
-        out_transform_thread_desc, p_out_transform_thread, f_set0);
-
-    for(unsigned c_block_data_offset = 0; c_block_data_offset < in_global_desc.GetLength(I1);
-        c_block_data_offset += CPerBlock, __syncthreads())
-    {
-        // blockwise transform input
-        blockwise_winograd_transform_input<TFloat,
-                                           InTileSizeH,
-                                           InTileSizeW,
-                                           S,
-                                           R,
-                                           OutTileSizeH,
-                                           OutTileSizeW,
-                                           NPerBlock,
-                                           CPerBlock,
-                                           YPerBlock,
-                                           XPerBlock,
-                                           BlockSize>(
-            p_in_global + in_global_desc.Get1dIndex(n_block_data_offset,
-                                                    c_block_data_offset,
-                                                    hi_block_data_offset,
-                                                    wi_block_data_offset),
-            p_in_transform_block);
-
-        // blockwise transform weights
-        blockwise_winograd_transform_weight<TFloat,
-                                            InTileSizeH,
-                                            InTileSizeW,
-                                            S,
-                                            R,
-                                            OutTileSizeH,
-                                            OutTileSizeW,
-                                            KPerBlock,
-                                            CPerBlock,
-                                            BlockSize>(
-            p_wei_global +
-                wei_global_desc.Get1dIndex(k_block_data_offset, c_block_data_offset, 0, 0),
-            p_wei_transform_block);
-
-        for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
-        {
-            // threadwise point multiplication
-            threadwise_winograd_calculate_transformed_output<
-                TFloat,
-                decltype(in_transform_thread_block_desc),
-                decltype(wei_transform_thread_block_desc),
-                decltype(out_transform_thread_desc),
-                InTileSizeH,
-                InTileSizeW,
-                S,
-                R,
-                OutTileSizeH,
-                OutTileSizeW>(
-                in_transform_thread_block_desc,
-                p_in_transform_block +
-                    in_transform_block_desc.Get1dIndex(n_thread_data_offset,
-                                                       c_thread_data,
-                                                       y_thread_data_offset * InTileSizeH,
-                                                       x_thread_data_offset * InTileSizeW),
-                wei_transform_thread_block_desc,
-                p_wei_transform_block +
-                    wei_transform_block_desc.Get1dIndex(k_thread_data_offset, c_thread_data, 0, 0),
-                out_transform_thread_desc,
-                p_out_transform_thread);
-        }
-    };
-
-    // transform back
-    threadwise_winograd_reverse_transform_output<TFloat,
-                                                 decltype(out_transform_thread_desc),
-                                                 decltype(out_thread_desc),
-                                                 InTileSizeH,
-                                                 InTileSizeW,
-                                                 S,
-                                                 R,
-                                                 OutTileSizeH,
-                                                 OutTileSizeW>(
-        out_transform_thread_desc, p_out_transform_thread, out_thread_desc, p_out_thread);
-
-    // copy output tensor from register to global mem
-    threadwise_4d_tensor_op_binary<TFloat,
-                                   decltype(out_thread_desc),
-                                   decltype(out_thread_global_desc),
-                                   decltype(f_copy)>(
-        out_thread_desc,
-        p_out_thread,
-        out_thread_global_desc,
-        p_out_global +
-            out_global_desc.Get1dIndex(n_block_data_offset + n_thread_data_offset,
-                                       k_block_data_offset + k_thread_data_offset,
-                                       ho_block_data_offset + y_thread_data_offset * OutTileSizeH,
-                                       wo_block_data_offset + x_thread_data_offset * OutTileSizeW),
-        f_copy);
-}
\ No newline at end of file
diff --git a/src/include/threadwise_direct_convolution.cuh b/src/include/threadwise_direct_convolution.cuh
index 68e00e901a..3f9fb6be91 100644
--- a/src/include/threadwise_direct_convolution.cuh
+++ b/src/include/threadwise_direct_convolution.cuh
@@ -9,10 +9,10 @@ __device__ void threadwise_direct_convolution(InDesc,
                                               OutDesc,
                                               TFloat* __restrict__ p_out)
 {
-    constexpr auto I0 = Index<0>{};
-    constexpr auto I1 = Index<1>{};
-    constexpr auto I2 = Index<2>{};
-    constexpr auto I3 = Index<3>{};
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
 
     constexpr auto in_desc  = InDesc{};
     constexpr auto wei_desc = WeiDesc{};
diff --git a/src/include/threadwise_tensor_op.cuh b/src/include/threadwise_tensor_op.cuh
index 9440957e8c..6781ce57f3 100644
--- a/src/include/threadwise_tensor_op.cuh
+++ b/src/include/threadwise_tensor_op.cuh
@@ -1,50 +1,34 @@
 #pragma once
 #include "constant_tensor_descriptor.cuh"
 
-#define THREADWISE_TENSOR_OP_METHOD 0
-
-#if THREADWISE_TENSOR_OP_METHOD == 0
-template <class TFloat, class DstDesc, class F>
-__device__ void threadwise_4d_tensor_op_unary(DstDesc, TFloat* __restrict__ p_dst, F f)
+template <class TFloat, class Desc, class F>
+__device__ void threadwise_4d_tensor_pointwise_op_unary(Desc, TFloat* __restrict__ p_dst, F f)
 {
-    constexpr auto I0 = Index<0>{};
-    constexpr auto I1 = Index<1>{};
-    constexpr auto I2 = Index<2>{};
-    constexpr auto I3 = Index<3>{};
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
 
-    constexpr auto dst_desc = DstDesc{};
+    constexpr auto desc = Desc{};
 
 #if 0
     if(threadIdx.x == 0)
     {
-        print_ConstantTensorDescriptor(dst_desc, "threadwise_4d_tensor_op_unary: ");
+        print_ConstantTensorDescriptor(desc, "threadwise_4d_tensor_op_unary: ");
     }
 #endif
 
-    for(unsigned did0 = 0; did0 < dst_desc.GetLength(I0); ++did0)
+    for(unsigned did0 = 0; did0 < desc.GetLength(I0); ++did0)
     {
-        for(unsigned did1 = 0; did1 < dst_desc.GetLength(I1); ++did1)
+        for(unsigned did1 = 0; did1 < desc.GetLength(I1); ++did1)
         {
-            for(unsigned did2 = 0; did2 < dst_desc.GetLength(I2); ++did2)
+            for(unsigned did2 = 0; did2 < desc.GetLength(I2); ++did2)
             {
-                for(unsigned did3 = 0; did3 < dst_desc.GetLength(I3); ++did3)
+                for(unsigned did3 = 0; did3 < desc.GetLength(I3); ++did3)
                 {
-                    const unsigned dindex =
-                        dst_desc.GetStride(I0) * did0 + dst_desc.GetStride(I1) * did1 +
-                        dst_desc.GetStride(I2) * did2 + dst_desc.GetStride(I3) * did3;
+                    const unsigned dindex = desc.Get1dIndex(did0, did1, did2, did3);
 
                     f(p_dst[dindex]);
-
-#if 0
-                    if(threadIdx.x == 0)
-                    {
-                        printf("threadwise_4d_tensor_op_unary: thread id %u, \t"
-                               "dindex %u, p_dst[dindex] %f\n",
-                               threadIdx.x,
-                               dindex,
-                               p_dst[dindex]);
-                    }
-#endif
                 }
             }
         }
@@ -52,13 +36,13 @@ __device__ void threadwise_4d_tensor_op_unary(DstDesc, TFloat* __restrict__ p_ds
 }
 
 template <class TFloat, class SrcDesc, class DstDesc, class F>
-__device__ void threadwise_4d_tensor_op_binary(
+__device__ void threadwise_4d_tensor_pointwise_op_binary(
     SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f)
 {
-    constexpr auto I0 = Index<0>{};
-    constexpr auto I1 = Index<1>{};
-    constexpr auto I2 = Index<2>{};
-    constexpr auto I3 = Index<3>{};
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+    constexpr auto I3 = Number<3>{};
 
     constexpr auto src_desc = SrcDesc{};
     constexpr auto dst_desc = DstDesc{};
@@ -81,99 +65,34 @@ __device__ void threadwise_4d_tensor_op_binary(
             {
                 for(unsigned did3 = 0; did3 < src_desc.GetLength(I3); ++did3)
                 {
-                    const unsigned sindex =
-                        src_desc.GetStride(I0) * did0 + src_desc.GetStride(I1) * did1 +
-                        src_desc.GetStride(I2) * did2 + src_desc.GetStride(I3) * did3;
+                    const unsigned sindex = src_desc.Get1dIndex(did0, did1, did2, did3);
 
-                    const unsigned dindex =
-                        dst_desc.GetStride(I0) * did0 + dst_desc.GetStride(I1) * did1 +
-                        dst_desc.GetStride(I2) * did2 + dst_desc.GetStride(I3) * did3;
+                    const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3);
 
                     f(p_src[sindex], p_dst[dindex]);
-
-#if 0
-                    if(threadIdx.x == 0)
-                    {
-                        printf("threadwise_4d_tensor_op_binary: thread id %u, \t"
-                               "sindex %u, p_src[sindex] %f, \t"
-                               "dindex %u, p_dst[dindex] %f\n",
-                               threadIdx.x,
-                               sindex,
-                               p_src[sindex],
-                               dindex,
-                               p_dst[dindex]);
-                    }
-#endif
                 }
             }
         }
     }
 }
-#endif
 
-#if THREADWISE_TENSOR_OP_METHOD == 1
-template <class TFloat, class SrcDesc, class DstDesc, class F>
-__device__ void threadwise_4d_tensor_op(
-    SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f)
+template <class TFloat, class Desc>
+__device__ void threadwise_4d_tensor_set_zero(Desc, TFloat* __restrict__ p_dst)
 {
-    constexpr auto I0 = Index<0>{};
-    constexpr auto I1 = Index<1>{};
-    constexpr auto I2 = Index<2>{};
-    constexpr auto I3 = Index<3>{};
+    auto f_set_zero = [](TFloat& v) { v = TFloat(0); };
 
-    constexpr auto src_desc = SrcDesc{};
-    constexpr auto dst_desc = DstDesc{};
-
-    static_assert(is_same<decltype(src_desc.GetLengths()), decltype(dst_desc.GetLengths())>::value);
-
-#if 0
-    if(threadIdx.x == 0)
-    {
-        print_ConstantTensorDescriptor(src_desc, "threadwise_4d_tensor_op: src_desc: ");
-        print_ConstantTensorDescriptor(dst_desc, "threadwise_4d_tensor_op: dst_desc: ");
-    }
-#endif
-
-    unsigned sindex = 0;
-    unsigned dindex = 0;
-
-    for(unsigned did0 = 0; did0 < src_desc.GetLength(I0); ++did0)
-    {
-        for(unsigned did1 = 0; did1 < src_desc.GetLength(I1); ++did1)
-        {
-            for(unsigned did2 = 0; did2 < src_desc.GetLength(I2); ++did2)
-            {
-                for(unsigned did3 = 0; did3 < src_desc.GetLength(I3); ++did3)
-                {
-                    f(p_src[sindex], p_dst[dindex]);
-
-#if 0
-                    if(threadIdx.x == 0)
-                    {
-                        printf("threadwise_4d_tensor_op: 1: thread id %u, \t"
-                               "sindex %u, p_src[sindex] %f, \t"
-                               "dindex %u, p_dst[dindex] %f\n",
-                               threadIdx.x,
-                               sindex,
-                               p_src[sindex],
-                               dindex,
-                               p_dst[dindex]);
-                    }
-#endif
-                    sindex += src_desc.GetStride(I3);
-                    dindex += dst_desc.GetStride(I3);
-                }
-
-                sindex += src_desc.GetStride(I2) - src_desc.GetLength(I3) * src_desc.GetStride(I3);
-                dindex += dst_desc.GetStride(I2) - dst_desc.GetLength(I3) * dst_desc.GetStride(I3);
-            }
-
-            sindex += src_desc.GetStride(I1) - src_desc.GetLength(I2) * src_desc.GetStride(I2);
-            dindex += dst_desc.GetStride(I1) - dst_desc.GetLength(I2) * dst_desc.GetStride(I2);
-        }
-
-        sindex += src_desc.GetStride(I0) - src_desc.GetLength(I1) * src_desc.GetStride(I1);
-        dindex += dst_desc.GetStride(I0) - dst_desc.GetLength(I1) * dst_desc.GetStride(I1);
-    }
+    threadwise_4d_tensor_pointwise_op_unary<TFloat, Desc, decltype(f_set_zero)>(
+        Desc{}, p_dst, f_set_zero);
 }
-#endif
+
+template <class TFloat, class SrcDesc, class DstDesc>
+__device__ void threadwise_4d_tensor_copy(SrcDesc,
+                                          TFloat* const __restrict__ p_src,
+                                          DstDesc,
+                                          TFloat* __restrict__ p_dst)
+{
+    auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
+
+    threadwise_4d_tensor_pointwise_op_binary<TFloat, SrcDesc, DstDesc, decltype(f_copy)>(
+        SrcDesc{}, p_src, DstDesc{}, p_dst, f_copy);
+}
\ No newline at end of file
diff --git a/src/include/threadwise_winograd_transform.cuh b/src/include/threadwise_winograd_transform.cuh
deleted file mode 100644
index 8ef2af039b..0000000000
--- a/src/include/threadwise_winograd_transform.cuh
+++ /dev/null
@@ -1,138 +0,0 @@
-#pragma once
-#include "constant_tensor_descriptor.cuh"
-
-template <class TFloat,
-          class InTransThreadDesc,  //{NPerThread, CPerThread, InTileSizeH, InTileSizeW}
-          class WeiTransThreadDesc, //{KPerThread, CPerThread, InTileSizeH, InTileSizeW}
-          class OutTransThreadDesc, //{NPerThread, KPerThread, InTileSizeH, InTileSizeW}
-          unsigned InTileSizeH,
-          unsigned InTileSizeW,
-          unsigned S,
-          unsigned R,
-          unsigned OutTileSizeH,
-          unsigned OutTileSizeW>
-__device__ void
-threadwise_winograd_calculate_transformed_output(InTransThreadDesc,
-                                                 TFloat* const __restrict__ p_in_transform_thread,
-                                                 WeiTransThreadDesc,
-                                                 TFloat* const __restrict__ p_wei_transform_thread,
-                                                 OutTransThreadDesc,
-                                                 TFloat* __restrict__ p_out_transform_thread)
-{
-    constexpr auto I0 = Index<0>{};
-    constexpr auto I1 = Index<1>{};
-    constexpr auto I2 = Index<2>{};
-    constexpr auto I3 = Index<3>{};
-
-    constexpr auto in_transform_thread_desc  = InTransThreadDesc{};
-    constexpr auto wei_transform_thread_desc = WeiTransThreadDesc{};
-    constexpr auto out_transform_thread_desc = OutTransThreadDesc{};
-
-    for(unsigned n = 0; n < out_transform_thread_desc.GetLength(I0); ++n)
-    {
-        for(unsigned k = 0; k < out_transform_thread_desc.GetLength(I1); ++k)
-        {
-            for(unsigned h = 0; h < out_transform_thread_desc.GetLength(I2); ++h)
-            {
-                for(unsigned w = 0; w < out_transform_thread_desc.GetLength(I3); ++w)
-                {
-                    for(unsigned c = 0; c < wei_transform_thread_desc.GetLength(I1); ++c)
-                    {
-                        const unsigned in_index  = in_transform_thread_desc.Get1dIndex(n, c, h, w);
-                        const unsigned wei_index = wei_transform_thread_desc.Get1dIndex(k, c, h, w);
-                        const unsigned out_index = out_transform_thread_desc.Get1dIndex(n, k, h, w);
-
-                        p_out_transform_thread[out_index] +=
-                            p_wei_transform_thread[wei_index] * p_in_transform_thread[in_index];
-                    }
-                }
-            }
-        }
-    }
-}
-
-template <class TFloat,
-          class OutTransThreadDesc, //{NPerThread, KPerThread,  InTileSizeH,  InTileSizeW}
-          class OutThreadDesc,      //{NPerThread, CPerThread, OutTileSizeH, OutTileSizeW}
-          unsigned InTileSizeH,
-          unsigned InTileSizeW,
-          unsigned S,
-          unsigned R,
-          unsigned OutTileSizeH,
-          unsigned OutTileSizeW>
-__device__ void
-threadwise_winograd_reverse_transform_output(OutTransThreadDesc,
-                                             TFloat* const __restrict__ p_out_transform_thread,
-                                             OutThreadDesc,
-                                             TFloat* __restrict__ p_out_thread)
-{
-    static_assert(InTileSizeH == 4, "wrong");
-    static_assert(InTileSizeW == 4, "wrong");
-    static_assert(S == 3, "wrong");
-    static_assert(R == 3, "wrong");
-    static_assert(OutTileSizeH == 2, "wrong");
-    static_assert(OutTileSizeW == 2, "wrong");
-
-    constexpr auto I0 = Index<0>{};
-    constexpr auto I1 = Index<1>{};
-    constexpr auto I2 = Index<2>{};
-    constexpr auto I3 = Index<3>{};
-
-    constexpr auto out_transform_thread_desc = OutTransThreadDesc{};
-    constexpr auto out_thread_desc           = OutThreadDesc{};
-
-    static_assert(InTileSizeH == out_transform_thread_desc.GetLength(I2), "wrong");
-    static_assert(InTileSizeW == out_transform_thread_desc.GetLength(I3), "wrong");
-    static_assert(OutTileSizeH == out_thread_desc.GetLength(I2), "wrong");
-    static_assert(OutTileSizeW == out_thread_desc.GetLength(I3), "wrong");
-
-    for(unsigned n = 0; n < out_thread_desc.GetLength(I0); ++n)
-    {
-        for(unsigned k = 0; k < out_thread_desc.GetLength(I1); ++k)
-        {
-            p_out_thread[out_thread_desc.Get1dIndex(n, k, 0, 0)] =
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 0, 0)] +
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 0, 1)] +
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 0, 2)] +
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 0)] +
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 1)] +
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 2)] +
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 0)] +
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 1)] +
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 2)];
-
-            p_out_thread[out_thread_desc.Get1dIndex(n, k, 0, 1)] =
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 0, 1)] -
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 0, 2)] -
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 0, 3)] +
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 1)] -
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 2)] -
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 3)] +
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 1)] -
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 2)] -
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 3)];
-
-            p_out_thread[out_thread_desc.Get1dIndex(n, k, 1, 0)] =
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 0)] +
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 1)] +
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 2)] -
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 0)] -
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 1)] -
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 2)] -
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 3, 0)] -
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 3, 1)] -
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 3, 2)];
-
-            p_out_thread[out_thread_desc.Get1dIndex(n, k, 1, 1)] =
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 1)] -
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 2)] -
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 3)] -
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 1)] +
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 2)] +
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 3)] -
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 3, 1)] +
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 3, 2)] +
-                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 3, 3)];
-        }
-    }
-}
\ No newline at end of file