From ac1f62be3fbdad4f37d98ad0928914fe591a6364 Mon Sep 17 00:00:00 2001
From: Chao Liu <lc.roy86@gmail.com>
Date: Mon, 7 Jan 2019 23:01:41 -0600
Subject: [PATCH] refactor

---
 src/include/blockwise_direct_convolution.cuh  |  6 +-
 src/include/blockwise_tensor_op.cuh           | 43 +++--------
 src/include/constant_tensor_descriptor.cuh    | 76 +++++++++++++------
 src/include/gridwise_direct_convolution_1.cuh | 29 ++-----
 src/include/gridwise_direct_convolution_2.cuh | 14 +---
 5 files changed, 79 insertions(+), 89 deletions(-)
diff --git a/src/include/blockwise_direct_convolution.cuh b/src/include/blockwise_direct_convolution.cuh
index dd849eb4ca..3078e77b9d 100644
--- a/src/include/blockwise_direct_convolution.cuh
+++ b/src/include/blockwise_direct_convolution.cuh
@@ -3,7 +3,8 @@
 #include "threadwise_tensor_op.cuh"
 #include "threadwise_direct_convolution.cuh"
 
-template <class TFloat,
+template <unsigned BlockSize,
+          class TFloat,
           class InBlockDesc,
           class WeiBlockDesc,
           class OutBlockDesc,
@@ -11,8 +12,7 @@ template <class TFloat,
           unsigned OutTileSizeW,
           unsigned NPerThread,
           unsigned KPerThread,
-          unsigned CPerThread,
-          unsigned BlockSize>
+          unsigned CPerThread>
 __device__ void blockwise_direct_convolution(InBlockDesc,
                                              TFloat* const __restrict__ p_in_block,
                                              WeiBlockDesc,
diff --git a/src/include/blockwise_tensor_op.cuh b/src/include/blockwise_tensor_op.cuh
index 3b39da4f1b..9de53046bd 100644
--- a/src/include/blockwise_tensor_op.cuh
+++ b/src/include/blockwise_tensor_op.cuh
@@ -1,7 +1,7 @@
 #pragma once
 #include "constant_tensor_descriptor.cuh"
 
-template <class TFloat, class DstDesc, class F, unsigned BlockSize>
+template <unsigned BlockSize, class TFloat, class DstDesc, class F>
 __device__ void
 blockwise_4d_tensor_pointwise_operation_unary(DstDesc, TFloat* __restrict__ p_dst, F f)
 {
@@ -78,13 +78,13 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, TFloat* __restrict__ p_ds
 
 // TODO: in order to optimize mem access for different mem type,
 // need to write specialized version
-template <class TFloat,
+template <unsigned BlockSize,
+          class TFloat,
           class SrcDesc,
           class DstDesc,
           class RefDesc,
           class Reorder,
-          class F,
-          unsigned BlockSize>
+          class F>
 __device__ void
 blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
                                                        TFloat* const __restrict__ p_src,
@@ -170,21 +170,20 @@ blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
     }
 }
 
-template <class TFloat, class DstDesc, unsigned BlockSize>
+template <unsigned BlockSize, class TFloat, class DstDesc>
 __device__ void blockwise_4d_tensor_set_zero(DstDesc, TFloat* __restrict__ p_dst)
 {
     auto f_set_zero = [](TFloat& v) { v = TFloat(0); };
 
-    blockwise_4d_tensor_pointwise_operation_unary<TFloat, DstDesc, decltype(f_set_zero), BlockSize>(
-        DstDesc{}, p_dst, f_set_zero);
+    blockwise_4d_tensor_pointwise_operation_unary<BlockSize>(DstDesc{}, p_dst, f_set_zero);
 }
 
-template <class TFloat,
+template <unsigned BlockSize,
+          class TFloat,
           class SrcDesc,
           class DstDesc,
           class RefDesc,
-          class Reorder,
-          unsigned BlockSize>
+          class Reorder>
 __device__ void blockwise_4d_tensor_copy_reorder(SrcDesc,
                                                  TFloat* const __restrict__ p_src,
                                                  DstDesc,
@@ -194,34 +193,16 @@ __device__ void blockwise_4d_tensor_copy_reorder(SrcDesc,
 {
     auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };
 
-    blockwise_4d_tensor_pointwise_operation_binary_reorder<TFloat,
-                                                           SrcDesc,
-                                                           DstDesc,
-                                                           RefDesc,
-                                                           Reorder,
-                                                           decltype(f_copy),
-                                                           BlockSize>(
+    blockwise_4d_tensor_pointwise_operation_binary_reorder<BlockSize>(
         SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, Reorder{}, f_copy);
 }
 
-template <class TFloat, class SrcDesc, class DstDesc, class RefDesc, unsigned BlockSize>
+template <unsigned BlockSize, class TFloat, class SrcDesc, class DstDesc, class RefDesc>
 __device__ void blockwise_4d_tensor_copy(
     SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, RefDesc)
 {
     constexpr auto reorder = Sequence<0, 1, 2, 3>{};
 
-    blockwise_4d_tensor_copy_reorder<TFloat,
-                                     SrcDesc,
-                                     DstDesc,
-                                     RefDesc,
-                                     decltype(reorder),
-                                     BlockSize>(
+    blockwise_4d_tensor_copy_reorder<BlockSize>(
         SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, reorder);
 }
-
-template <class TFloat, class ImDesc, class WDesc, class ColDesc, unsigned BlockSize>
-__device__ void blockwise_4d_tensor_im2col(
-    ImDesc, const __restrict__ TFloat* p_im, WDesc, ColDesc, __restrict__ TFloat* p_col)
-{
-    // do nothing
-}
diff --git a/src/include/constant_tensor_descriptor.cuh b/src/include/constant_tensor_descriptor.cuh
index de7542b1c3..b3b508b67f 100644
--- a/src/include/constant_tensor_descriptor.cuh
+++ b/src/include/constant_tensor_descriptor.cuh
@@ -24,12 +24,55 @@ struct Sequence
     }
 
     template <unsigned I>
-    __host__ __device__ constexpr auto GetNumber(Number<I>) const
+    __host__ __device__ constexpr auto GetConstant(Number<I>) const
     {
         constexpr unsigned N = Get(I);
 
         return Number<N>{};
     }
+
+    template <unsigned I0, unsigned I1>
+    __host__ __device__ constexpr auto Reorder(Number<I0>, Number<I1>) const
+    {
+        constexpr unsigned IR0 = Get(Number<I0>{});
+        constexpr unsigned IR1 = Get(Number<I1>{});
+
+        return Sequence<IR0, IR1>{};
+    }
+
+    template <unsigned I0, unsigned I1, unsigned I2>
+    __host__ __device__ constexpr auto Reorder(Number<I0>, Number<I1>, Number<I2>) const
+    {
+        constexpr unsigned IR0 = Get(Number<I0>{});
+        constexpr unsigned IR1 = Get(Number<I1>{});
+        constexpr unsigned IR2 = Get(Number<I2>{});
+
+        return Sequence<IR0, IR1, IR2>{};
+    }
+
+    template <unsigned I0, unsigned I1, unsigned I2, unsigned I3>
+    __host__ __device__ constexpr auto Reorder(Number<I0>, Number<I1>, Number<I2>, Number<I3>) const
+    {
+        constexpr unsigned IR0 = Get(Number<I0>{});
+        constexpr unsigned IR1 = Get(Number<I1>{});
+        constexpr unsigned IR2 = Get(Number<I2>{});
+        constexpr unsigned IR3 = Get(Number<I3>{});
+
+        return Sequence<IR0, IR1, IR2, IR3>{};
+    }
+
+    template <unsigned I0, unsigned I1, unsigned I2, unsigned I3, unsigned I4>
+    __host__ __device__ constexpr auto
+        Reorder(Number<I0>, Number<I1>, Number<I2>, Number<I3>, Number<I4>) const
+    {
+        constexpr unsigned IR0 = Get(Number<I0>{});
+        constexpr unsigned IR1 = Get(Number<I1>{});
+        constexpr unsigned IR2 = Get(Number<I2>{});
+        constexpr unsigned IR3 = Get(Number<I3>{});
+        constexpr unsigned IR4 = Get(Number<I4>{});
+
+        return Sequence<IR0, IR1, IR2, IR3, IR4>{};
+    }
 };
 
 template <class Lengths, class Strides>
@@ -99,6 +142,15 @@ struct ConstantTensorDescriptor
         static_assert(nDim == 4, "nDim is not 4");
         return n * GetStride(I0) + c * GetStride(I1) + h * GetStride(I2) + w * GetStride(I3);
     }
+
+    template <class... Is>
+    __host__ __device__ constexpr auto Reorder(Is... is) const
+    {
+        constexpr auto lengths = Lengths{}.Reorder(is...);
+        constexpr auto strides = Strides{}.Reorder(is...);
+
+        return ConstantTensorDescriptor<decltype(lengths), decltype(strides)>{};
+    }
 };
 
 // this is ugly, only for 4d
@@ -121,28 +173,6 @@ __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Stride
     return ConstantTensorDescriptor<Lengths, Strides>{};
 }
 
-// this is ugly, only for 4d
-template <class Desc, class Reorder>
-__host__ __device__ constexpr auto get_reordered_4d_tensor_descriptor(Desc, Reorder)
-{
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-
-    constexpr auto IT0 = Reorder{}.GetNumber(I0);
-    constexpr auto IT1 = Reorder{}.GetNumber(I1);
-    constexpr auto IT2 = Reorder{}.GetNumber(I2);
-    constexpr auto IT3 = Reorder{}.GetNumber(I3);
-
-    constexpr unsigned L0 = Desc{}.GetLength(IT0);
-    constexpr unsigned L1 = Desc{}.GetLength(IT1);
-    constexpr unsigned L2 = Desc{}.GetLength(IT2);
-    constexpr unsigned L3 = Desc{}.GetLength(IT3);
-
-    return make_ConstantTensorDescriptor(Sequence<L0, L1, L2, L3>{});
-}
-
 // this is ugly, only for 4d
 template <class InDesc, class WeiDesc>
 __host__ __device__ constexpr auto get_convolution_output_4d_tensor_descriptor(InDesc, WeiDesc)
diff --git a/src/include/gridwise_direct_convolution_1.cuh b/src/include/gridwise_direct_convolution_1.cuh
index 7dd36dd966..1ec2cd83a8 100644
--- a/src/include/gridwise_direct_convolution_1.cuh
+++ b/src/include/gridwise_direct_convolution_1.cuh
@@ -122,18 +122,13 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
 #endif
 
     // set output tensor in LDS to 0
-    blockwise_4d_tensor_set_zero<TFloat, decltype(out_block_desc), BlockSize>(out_block_desc,
-                                                                              p_out_block);
+    blockwise_4d_tensor_set_zero<BlockSize>(out_block_desc, p_out_block);
 
     for(unsigned c_block_work_begin = 0; c_block_work_begin < in_global_desc.GetLength(I1);
         c_block_work_begin += CPerBlock)
     {
         // copy input tensor to LDS
-        blockwise_4d_tensor_copy<TFloat,
-                                 decltype(in_block_global_desc),
-                                 decltype(in_block_desc),
-                                 decltype(in_block_desc),
-                                 BlockSize>(in_block_global_desc,
+        blockwise_4d_tensor_copy<BlockSize>(in_block_global_desc,
                                             p_in_global +
                                                 in_global_desc.Get1dIndex(n_block_work_begin,
                                                                           c_block_work_begin,
@@ -144,11 +139,7 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
                                             in_block_desc);
 
         // copy weight tensor to LDS
-        blockwise_4d_tensor_copy<TFloat,
-                                 decltype(wei_block_global_desc),
-                                 decltype(wei_block_desc),
-                                 decltype(wei_block_desc),
-                                 BlockSize>(
+        blockwise_4d_tensor_copy<BlockSize>(
             wei_block_global_desc,
             p_wei_global + wei_global_desc.Get1dIndex(k_block_work_begin, c_block_work_begin, 0, 0),
             wei_block_desc,
@@ -158,7 +149,8 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
         __syncthreads();
 
         // blockwise convolution
-        blockwise_direct_convolution<TFloat,
+        blockwise_direct_convolution<BlockSize,
+                                     TFloat,
                                      decltype(in_block_desc),
                                      decltype(wei_block_desc),
                                      decltype(out_block_desc),
@@ -166,19 +158,14 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
                                      OutTileSizeW,
                                      NPerThread,
                                      KPerThread,
-                                     CPerThread,
-                                     BlockSize>(
+                                     CPerThread>(
             in_block_desc, p_in_block, wei_block_desc, p_wei_block, out_block_desc, p_out_block);
 
         __syncthreads();
     }
 
     // copy output tensor from LDS to device mem
-    blockwise_4d_tensor_copy<TFloat,
-                             decltype(out_block_desc),
-                             decltype(out_block_global_desc),
-                             decltype(out_block_desc),
-                             BlockSize>(
+    blockwise_4d_tensor_copy<BlockSize>(
         out_block_desc,
         p_out_block,
         out_block_global_desc,
@@ -186,4 +173,4 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
             out_global_desc.Get1dIndex(
                 n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin),
         out_block_desc);
-}
\ No newline at end of file
+}
diff --git a/src/include/gridwise_direct_convolution_2.cuh b/src/include/gridwise_direct_convolution_2.cuh
index fe3d4ff55f..90d5a46a76 100644
--- a/src/include/gridwise_direct_convolution_2.cuh
+++ b/src/include/gridwise_direct_convolution_2.cuh
@@ -151,11 +151,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
         c_block_data_begin += CPerBlock, __syncthreads())
     {
         // copy input tensor to LDS
-        blockwise_4d_tensor_copy<TFloat,
-                                 decltype(in_global_desc),
-                                 decltype(in_block_desc),
-                                 decltype(in_block_desc),
-                                 BlockSize>(in_global_desc,
+        blockwise_4d_tensor_copy<BlockSize>(in_global_desc,
                                             p_in_global +
                                                 in_global_desc.Get1dIndex(n_block_data_begin,
                                                                           c_block_data_begin,
@@ -166,11 +162,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
                                             in_block_desc);
 
         // copy weight tensor to LDS
-        blockwise_4d_tensor_copy<TFloat,
-                                 decltype(wei_global_desc),
-                                 decltype(wei_block_desc),
-                                 decltype(wei_block_desc),
-                                 BlockSize>(
+        blockwise_4d_tensor_copy<BlockSize>(
             wei_global_desc,
             p_wei_global + wei_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
             wei_block_desc,
@@ -182,7 +174,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
         for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
         {
             // threadwise convolution
-#if 0
+#if 1
             threadwise_direct_convolution_2(
                 in_thread_block_desc,
                 p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,