From 73480fee3635310aedbbec68b6084c94cfd2457d Mon Sep 17 00:00:00 2001
From: Chao Liu <lc.roy86@gmail.com>
Date: Thu, 15 Nov 2018 23:53:23 -0600
Subject: [PATCH] refactor

---
 driver/conv.cu                         |  91 +-------------------
 driver/device_direct_convolution_2.cuh |  91 ++++++++++++++++++++
 src/include/direct_convolution_2.cuh   |  32 +++----
 src/include/threadwise_tensor_op.cuh   | 110 ++-----------------------
 4 files changed, 115 insertions(+), 209 deletions(-)
 create mode 100644 driver/device_direct_convolution_2.cuh
diff --git a/driver/conv.cu b/driver/conv.cu
index 4a62be7fc4..2eb32528a4 100644
--- a/driver/conv.cu
+++ b/driver/conv.cu
@@ -5,7 +5,7 @@
 #include "nvToolsExt.h"
 #include "tensor.hpp"
 #include "constant_tensor_descriptor.cuh"
-#include "direct_convolution_2.cuh"
+#include "device_direct_convolution_2.cuh"
 
 template <class T>
 struct GeneratorConstant
@@ -109,95 +109,6 @@ void host_convolution(const Tensor<T>& in, const Tensor<T>& wei, Tensor<T>& out)
     f_par(std::thread::hardware_concurrency());
 }
 
-template <class T, class InDesc, class WeiDesc, class OutDesc>
-void device_convolution(
-    InDesc, const Tensor<T>& in, WeiDesc, const Tensor<T>& wei, OutDesc, Tensor<T>& out)
-{
-    std::size_t data_sz = sizeof(T);
-    DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace());
-    DeviceMem wei_device_buf(data_sz * wei.mDesc.GetElementSpace());
-    DeviceMem out_device_buf(data_sz * out.mDesc.GetElementSpace());
-
-    int num_thread = std::thread::hardware_concurrency();
-
-    in_device_buf.ToDevice(in.mData.data());
-    wei_device_buf.ToDevice(wei.mData.data());
-    out_device_buf.ToDevice(out.mData.data());
-
-    constexpr auto I0 = Index<0>{};
-    constexpr auto I1 = Index<1>{};
-    constexpr auto I2 = Index<2>{};
-    constexpr auto I3 = Index<3>{};
-
-    constexpr auto in_desc          = InDesc{};
-    constexpr auto wei_desc         = WeiDesc{};
-    constexpr auto out_desc         = OutDesc{};
-    constexpr unsigned OutTileSizeH = 2;
-    constexpr unsigned OutTileSizeW = 2;
-    constexpr unsigned NPerBlock    = 1;
-    constexpr unsigned KPerBlock    = 4;
-    constexpr unsigned CPerBlock    = 2;
-    constexpr unsigned YPerBlock    = 8;
-    constexpr unsigned XPerBlock    = 16;
-
-    constexpr unsigned NBlockOpLen0 = 1;
-    constexpr unsigned NBlockOpLen1 = 1;
-    constexpr unsigned NBlockOpLen2 = 4;
-    constexpr unsigned NBlockOpLen3 = 32;
-
-    constexpr unsigned BlockSize = 128;
-
-    constexpr unsigned GridSize = (out_desc.GetLength(I0) / NPerBlock) *
-                                  (out_desc.GetLength(I1) / KPerBlock) *
-                                  (out_desc.GetLength(I2) / (OutTileSizeH * YPerBlock)) *
-                                  (out_desc.GetLength(I3) / (OutTileSizeW * XPerBlock));
-
-    dim3 block_dim(BlockSize);
-    dim3 grid_dim(GridSize);
-
-    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
-
-    cudaEvent_t start, stop;
-    float elapsedTime;
-
-    cudaEventCreate(&start);
-    cudaEventRecord(start, 0);
-
-    gridwise_convolution<T,
-                         InDesc,
-                         WeiDesc,
-                         OutDesc,
-                         OutTileSizeH,
-                         OutTileSizeW,
-                         NPerBlock,
-                         KPerBlock,
-                         CPerBlock,
-                         YPerBlock,
-                         XPerBlock,
-                         NBlockOpLen0,
-                         NBlockOpLen1,
-                         NBlockOpLen2,
-                         NBlockOpLen3,
-                         BlockSize,
-                         GridSize>
-        <<<grid_dim, block_dim>>>(InDesc{},
-                                  static_cast<T*>(in_device_buf.GetDeviceBuffer()),
-                                  WeiDesc{},
-                                  static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
-                                  OutDesc{},
-                                  static_cast<T*>(out_device_buf.GetDeviceBuffer()));
-
-    cudaEventCreate(&stop);
-    cudaEventRecord(stop, 0);
-    cudaEventSynchronize(stop);
-
-    cudaEventElapsedTime(&elapsedTime, start, stop);
-    printf("Elapsed time : %f ms\n", elapsedTime);
-
-    checkCudaErrors(cudaGetLastError());
-    out_device_buf.FromDevice(out.mData.data());
-}
-
 int main()
 {
 #if 0
diff --git a/driver/device_direct_convolution_2.cuh b/driver/device_direct_convolution_2.cuh
new file mode 100644
index 0000000000..7fa644bdf5
--- /dev/null
+++ b/driver/device_direct_convolution_2.cuh
@@ -0,0 +1,91 @@
+#pragma once
+#include "direct_convolution_2.cuh"
+
+template <class T, class InDesc, class WeiDesc, class OutDesc>
+void device_convolution(
+    InDesc, const Tensor<T>& in, WeiDesc, const Tensor<T>& wei, OutDesc, Tensor<T>& out)
+{
+    std::size_t data_sz = sizeof(T);
+    DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace());
+    DeviceMem wei_device_buf(data_sz * wei.mDesc.GetElementSpace());
+    DeviceMem out_device_buf(data_sz * out.mDesc.GetElementSpace());
+
+    int num_thread = std::thread::hardware_concurrency();
+
+    in_device_buf.ToDevice(in.mData.data());
+    wei_device_buf.ToDevice(wei.mData.data());
+    out_device_buf.ToDevice(out.mData.data());
+
+    constexpr auto I0 = Index<0>{};
+    constexpr auto I1 = Index<1>{};
+    constexpr auto I2 = Index<2>{};
+    constexpr auto I3 = Index<3>{};
+
+    constexpr auto in_desc          = InDesc{};
+    constexpr auto wei_desc         = WeiDesc{};
+    constexpr auto out_desc         = OutDesc{};
+    constexpr unsigned OutTileSizeH = 2;
+    constexpr unsigned OutTileSizeW = 2;
+    constexpr unsigned NPerBlock    = 1;
+    constexpr unsigned KPerBlock    = 4;
+    constexpr unsigned CPerBlock    = 2;
+    constexpr unsigned YPerBlock    = 8;
+    constexpr unsigned XPerBlock    = 16;
+
+    constexpr unsigned NBlockOpLen0 = 1;
+    constexpr unsigned NBlockOpLen1 = 1;
+    constexpr unsigned NBlockOpLen2 = 4;
+    constexpr unsigned NBlockOpLen3 = 32;
+
+    constexpr unsigned BlockSize = 128;
+
+    constexpr unsigned GridSize = (out_desc.GetLength(I0) / NPerBlock) *
+                                  (out_desc.GetLength(I1) / KPerBlock) *
+                                  (out_desc.GetLength(I2) / (OutTileSizeH * YPerBlock)) *
+                                  (out_desc.GetLength(I3) / (OutTileSizeW * XPerBlock));
+
+    dim3 block_dim(BlockSize);
+    dim3 grid_dim(GridSize);
+
+    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
+
+    cudaEvent_t start, stop;
+    float elapsedTime;
+
+    cudaEventCreate(&start);
+    cudaEventRecord(start, 0);
+
+    gridwise_convolution<T,
+                         InDesc,
+                         WeiDesc,
+                         OutDesc,
+                         OutTileSizeH,
+                         OutTileSizeW,
+                         NPerBlock,
+                         KPerBlock,
+                         CPerBlock,
+                         YPerBlock,
+                         XPerBlock,
+                         NBlockOpLen0,
+                         NBlockOpLen1,
+                         NBlockOpLen2,
+                         NBlockOpLen3,
+                         BlockSize,
+                         GridSize>
+        <<<grid_dim, block_dim>>>(InDesc{},
+                                  static_cast<T*>(in_device_buf.GetDeviceBuffer()),
+                                  WeiDesc{},
+                                  static_cast<T*>(wei_device_buf.GetDeviceBuffer()),
+                                  OutDesc{},
+                                  static_cast<T*>(out_device_buf.GetDeviceBuffer()));
+
+    cudaEventCreate(&stop);
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);
+
+    cudaEventElapsedTime(&elapsedTime, start, stop);
+    printf("Elapsed time : %f ms\n", elapsedTime);
+
+    checkCudaErrors(cudaGetLastError());
+    out_device_buf.FromDevice(out.mData.data());
+}
diff --git a/src/include/direct_convolution_2.cuh b/src/include/direct_convolution_2.cuh
index 50e3b0e9dc..ff79871370 100644
--- a/src/include/direct_convolution_2.cuh
+++ b/src/include/direct_convolution_2.cuh
@@ -92,10 +92,10 @@ __device__ void blockwise_convolution(InDesc,
         auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
 
         // copy input tensor into register
-        threadwise_4d_tensor_op_in<TFloat,
-                                   decltype(in_thread_src_desc),
-                                   decltype(in_thread_dst_desc),
-                                   decltype(f_copy)>(
+        threadwise_4d_tensor_op_binary<TFloat,
+                                       decltype(in_thread_src_desc),
+                                       decltype(in_thread_dst_desc),
+                                       decltype(f_copy)>(
             in_thread_src_desc,
             p_in_lds + in_desc.Get1dIndex(
                            n_thread_work_begin, 0, hi_thread_work_begin, wi_thread_work_begin),
@@ -107,10 +107,10 @@ __device__ void blockwise_convolution(InDesc,
             ++k_thread_work_begin)
         {
             // copy weight tensor into register
-            threadwise_4d_tensor_op_wei<TFloat,
-                                        decltype(wei_thread_src_desc),
-                                        decltype(wei_thread_dst_desc),
-                                        decltype(f_copy)>(
+            threadwise_4d_tensor_op_binary<TFloat,
+                                           decltype(wei_thread_src_desc),
+                                           decltype(wei_thread_dst_desc),
+                                           decltype(f_copy)>(
                 wei_thread_src_desc,
                 p_wei_lds + wei_desc.Get1dIndex(k_thread_work_begin, 0, 0, 0),
                 wei_thread_dst_desc,
@@ -118,10 +118,10 @@ __device__ void blockwise_convolution(InDesc,
                 f_copy);
 
             // copy output tensor into register
-            threadwise_4d_tensor_op_out<TFloat,
-                                        decltype(out_thread_src_desc),
-                                        decltype(out_thread_dst_desc),
-                                        decltype(f_copy)>(
+            threadwise_4d_tensor_op_binary<TFloat,
+                                           decltype(out_thread_src_desc),
+                                           decltype(out_thread_dst_desc),
+                                           decltype(f_copy)>(
                 out_thread_src_desc,
                 p_out_lds + out_desc.Get1dIndex(n_thread_work_begin,
                                                 k_thread_work_begin,
@@ -143,10 +143,10 @@ __device__ void blockwise_convolution(InDesc,
                                                                          p_out_thread);
 
             // accumulate output tensor into LDS
-            threadwise_4d_tensor_op_out<TFloat,
-                                        decltype(out_thread_dst_desc),
-                                        decltype(out_thread_src_desc),
-                                        decltype(f_copy)>(
+            threadwise_4d_tensor_op_binary<TFloat,
+                                           decltype(out_thread_dst_desc),
+                                           decltype(out_thread_src_desc),
+                                           decltype(f_copy)>(
                 out_thread_dst_desc,
                 p_out_thread,
                 out_thread_src_desc,
diff --git a/src/include/threadwise_tensor_op.cuh b/src/include/threadwise_tensor_op.cuh
index 40509abd5e..78b626dfe1 100644
--- a/src/include/threadwise_tensor_op.cuh
+++ b/src/include/threadwise_tensor_op.cuh
@@ -5,58 +5,35 @@
 
 #if THREADWISE_TENSOR_OP_METHOD == 0
 template <class TFloat, class SrcDesc, class DstDesc, class F>
-__device__ void threadwise_4d_tensor_op_in(
-    SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f)
+__device__ void threadwise_4d_tensor_op_unary(DstDesc, TFloat* __restrict__ p_dst, F f)
 {
     constexpr auto I0 = Index<0>{};
     constexpr auto I1 = Index<1>{};
     constexpr auto I2 = Index<2>{};
     constexpr auto I3 = Index<3>{};
 
-    constexpr auto src_desc = SrcDesc{};
     constexpr auto dst_desc = DstDesc{};
 
-    static_assert(is_same<decltype(src_desc.GetLengths()), decltype(dst_desc.GetLengths())>::value);
-
 #if 0
     if(threadIdx.x == 0)
     {
-        print_ConstantTensorDescriptor(src_desc);
         print_ConstantTensorDescriptor(dst_desc);
     }
 #endif
 
-    for(unsigned did0 = 0; did0 < src_desc.GetLength(I0); ++did0)
+    for(unsigned did0 = 0; did0 < dst_desc.GetLength(I0); ++did0)
     {
-        for(unsigned did1 = 0; did1 < src_desc.GetLength(I1); ++did1)
+        for(unsigned did1 = 0; did1 < dst_desc.GetLength(I1); ++did1)
         {
-            for(unsigned did2 = 0; did2 < src_desc.GetLength(I2); ++did2)
+            for(unsigned did2 = 0; did2 < dst_desc.GetLength(I2); ++did2)
             {
-                for(unsigned did3 = 0; did3 < src_desc.GetLength(I3); ++did3)
+                for(unsigned did3 = 0; did3 < dst_desc.GetLength(I3); ++did3)
                 {
-                    const unsigned sindex =
-                        src_desc.GetStride(I0) * did0 + src_desc.GetStride(I1) * did1 +
-                        src_desc.GetStride(I2) * did2 + src_desc.GetStride(I3) * did3;
-
                     const unsigned dindex =
                         dst_desc.GetStride(I0) * did0 + dst_desc.GetStride(I1) * did1 +
                         dst_desc.GetStride(I2) * did2 + dst_desc.GetStride(I3) * did3;
 
-                    f(p_src[sindex], p_dst[dindex]);
-
-#if 0
-                    if(threadIdx.x == 0)
-                    {
-                        printf("threadwise_4d_tensor_op: 1: thread id %u, \t"
-                               "sindex %u, p_src[sindex] %f, \t"
-                               "dindex %u, p_dst[dindex] %f\n",
-                               threadIdx.x,
-                               sindex,
-                               p_src[sindex],
-                               dindex,
-                               p_dst[dindex]);
-                    }
-#endif
+                    f(p_dst[dindex]);
                 }
             }
         }
@@ -64,7 +41,7 @@ __device__ void threadwise_4d_tensor_op_in(
 }
 
 template <class TFloat, class SrcDesc, class DstDesc, class F>
-__device__ void threadwise_4d_tensor_op_wei(
+__device__ void threadwise_4d_tensor_op_binary(
     SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f)
 {
     constexpr auto I0 = Index<0>{};
@@ -102,79 +79,6 @@ __device__ void threadwise_4d_tensor_op_wei(
                         dst_desc.GetStride(I2) * did2 + dst_desc.GetStride(I3) * did3;
 
                     f(p_src[sindex], p_dst[dindex]);
-
-#if 0
-                    if(threadIdx.x == 0)
-                    {
-                        printf("threadwise_4d_tensor_op: 1: thread id %u, \t"
-                               "sindex %u, p_src[sindex] %f, \t"
-                               "dindex %u, p_dst[dindex] %f\n",
-                               threadIdx.x,
-                               sindex,
-                               p_src[sindex],
-                               dindex,
-                               p_dst[dindex]);
-                    }
-#endif
-                }
-            }
-        }
-    }
-}
-
-template <class TFloat, class SrcDesc, class DstDesc, class F>
-__device__ void threadwise_4d_tensor_op_out(
-    SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f)
-{
-    constexpr auto I0 = Index<0>{};
-    constexpr auto I1 = Index<1>{};
-    constexpr auto I2 = Index<2>{};
-    constexpr auto I3 = Index<3>{};
-
-    constexpr auto src_desc = SrcDesc{};
-    constexpr auto dst_desc = DstDesc{};
-
-    static_assert(is_same<decltype(src_desc.GetLengths()), decltype(dst_desc.GetLengths())>::value);
-
-#if 0
-    if(threadIdx.x == 0)
-    {
-        print_ConstantTensorDescriptor(src_desc);
-        print_ConstantTensorDescriptor(dst_desc);
-    }
-#endif
-
-    for(unsigned did0 = 0; did0 < src_desc.GetLength(I0); ++did0)
-    {
-        for(unsigned did1 = 0; did1 < src_desc.GetLength(I1); ++did1)
-        {
-            for(unsigned did2 = 0; did2 < src_desc.GetLength(I2); ++did2)
-            {
-                for(unsigned did3 = 0; did3 < src_desc.GetLength(I3); ++did3)
-                {
-                    const unsigned sindex =
-                        src_desc.GetStride(I0) * did0 + src_desc.GetStride(I1) * did1 +
-                        src_desc.GetStride(I2) * did2 + src_desc.GetStride(I3) * did3;
-
-                    const unsigned dindex =
-                        dst_desc.GetStride(I0) * did0 + dst_desc.GetStride(I1) * did1 +
-                        dst_desc.GetStride(I2) * did2 + dst_desc.GetStride(I3) * did3;
-
-                    f(p_src[sindex], p_dst[dindex]);
-
-#if 0
-                    if(threadIdx.x == 0)
-                    {
-                        printf("threadwise_4d_tensor_op: 1: thread id %u, \t"
-                               "sindex %u, p_src[sindex] %f, \t"
-                               "dindex %u, p_dst[dindex] %f\n",
-                               threadIdx.x,
-                               sindex,
-                               p_src[sindex],
-                               dindex,
-                               p_dst[dindex]);
-                    }
-#endif
                 }
             }
         }