refactor

2026-05-13 01:36:06 +00:00 · 2019-01-07 23:01:41 -06:00
parent 3dbd47252c
commit ac1f62be3f
5 changed files with 79 additions and 89 deletions
--- a/src/include/blockwise_direct_convolution.cuh
+++ b/src/include/blockwise_direct_convolution.cuh
@@ -3,7 +3,8 @@
 #include "threadwise_tensor_op.cuh"
 #include "threadwise_direct_convolution.cuh"

-template <class TFloat,
+template <unsigned BlockSize,
+          class TFloat,
          class InBlockDesc,
          class WeiBlockDesc,
          class OutBlockDesc,
@@ -11,8 +12,7 @@ template <class TFloat,
          unsigned OutTileSizeW,
          unsigned NPerThread,
          unsigned KPerThread,
-          unsigned CPerThread,
-          unsigned BlockSize>
+          unsigned CPerThread>
 __device__ void blockwise_direct_convolution(InBlockDesc,
                                             TFloat* const __restrict__ p_in_block,
                                             WeiBlockDesc,
--- a/src/include/blockwise_tensor_op.cuh
+++ b/src/include/blockwise_tensor_op.cuh
@@ -1,7 +1,7 @@
 #pragma once
 #include "constant_tensor_descriptor.cuh"

-template <class TFloat, class DstDesc, class F, unsigned BlockSize>
+template <unsigned BlockSize, class TFloat, class DstDesc, class F>
 __device__ void
 blockwise_4d_tensor_pointwise_operation_unary(DstDesc, TFloat* __restrict__ p_dst, F f)
 {
@@ -78,13 +78,13 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, TFloat* __restrict__ p_ds

 // TODO: in order to optimize mem access for different mem type,
 // need to write specialized version
-template <class TFloat,
+template <unsigned BlockSize,
+          class TFloat,
          class SrcDesc,
          class DstDesc,
          class RefDesc,
          class Reorder,
-          class F,
-          unsigned BlockSize>
+          class F>
 __device__ void
 blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
                                                       TFloat* const __restrict__ p_src,
@@ -170,21 +170,20 @@ blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc,
    }
 }

-template <class TFloat, class DstDesc, unsigned BlockSize>
+template <unsigned BlockSize, class TFloat, class DstDesc>
 __device__ void blockwise_4d_tensor_set_zero(DstDesc, TFloat* __restrict__ p_dst)
 {
    auto f_set_zero = [](TFloat& v) { v = TFloat(0); };

-    blockwise_4d_tensor_pointwise_operation_unary<TFloat, DstDesc, decltype(f_set_zero), BlockSize>(
-        DstDesc{}, p_dst, f_set_zero);
+    blockwise_4d_tensor_pointwise_operation_unary<BlockSize>(DstDesc{}, p_dst, f_set_zero);
 }

-template <class TFloat,
+template <unsigned BlockSize,
+          class TFloat,
          class SrcDesc,
          class DstDesc,
          class RefDesc,
-          class Reorder,
-          unsigned BlockSize>
+          class Reorder>
 __device__ void blockwise_4d_tensor_copy_reorder(SrcDesc,
                                                 TFloat* const __restrict__ p_src,
                                                 DstDesc,
@@ -194,34 +193,16 @@ __device__ void blockwise_4d_tensor_copy_reorder(SrcDesc,
 {
    auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; };

-    blockwise_4d_tensor_pointwise_operation_binary_reorder<TFloat,
-                                                           SrcDesc,
-                                                           DstDesc,
-                                                           RefDesc,
-                                                           Reorder,
-                                                           decltype(f_copy),
-                                                           BlockSize>(
+    blockwise_4d_tensor_pointwise_operation_binary_reorder<BlockSize>(
        SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, Reorder{}, f_copy);
 }

-template <class TFloat, class SrcDesc, class DstDesc, class RefDesc, unsigned BlockSize>
+template <unsigned BlockSize, class TFloat, class SrcDesc, class DstDesc, class RefDesc>
 __device__ void blockwise_4d_tensor_copy(
    SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, RefDesc)
 {
    constexpr auto reorder = Sequence<0, 1, 2, 3>{};

-    blockwise_4d_tensor_copy_reorder<TFloat,
-                                     SrcDesc,
-                                     DstDesc,
-                                     RefDesc,
-                                     decltype(reorder),
-                                     BlockSize>(
+    blockwise_4d_tensor_copy_reorder<BlockSize>(
        SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, reorder);
 }
-
-template <class TFloat, class ImDesc, class WDesc, class ColDesc, unsigned BlockSize>
-__device__ void blockwise_4d_tensor_im2col(
-    ImDesc, const __restrict__ TFloat* p_im, WDesc, ColDesc, __restrict__ TFloat* p_col)
-{
-    // do nothing
-}
--- a/src/include/constant_tensor_descriptor.cuh
+++ b/src/include/constant_tensor_descriptor.cuh
@@ -24,12 +24,55 @@ struct Sequence
    }

    template <unsigned I>
-    __host__ __device__ constexpr auto GetNumber(Number<I>) const
+    __host__ __device__ constexpr auto GetConstant(Number<I>) const
    {
        constexpr unsigned N = Get(I);

        return Number<N>{};
    }
+
+    template <unsigned I0, unsigned I1>
+    __host__ __device__ constexpr auto Reorder(Number<I0>, Number<I1>) const
+    {
+        constexpr unsigned IR0 = Get(Number<I0>{});
+        constexpr unsigned IR1 = Get(Number<I1>{});
+
+        return Sequence<IR0, IR1>{};
+    }
+
+    template <unsigned I0, unsigned I1, unsigned I2>
+    __host__ __device__ constexpr auto Reorder(Number<I0>, Number<I1>, Number<I2>) const
+    {
+        constexpr unsigned IR0 = Get(Number<I0>{});
+        constexpr unsigned IR1 = Get(Number<I1>{});
+        constexpr unsigned IR2 = Get(Number<I2>{});
+
+        return Sequence<IR0, IR1, IR2>{};
+    }
+
+    template <unsigned I0, unsigned I1, unsigned I2, unsigned I3>
+    __host__ __device__ constexpr auto Reorder(Number<I0>, Number<I1>, Number<I2>, Number<I3>) const
+    {
+        constexpr unsigned IR0 = Get(Number<I0>{});
+        constexpr unsigned IR1 = Get(Number<I1>{});
+        constexpr unsigned IR2 = Get(Number<I2>{});
+        constexpr unsigned IR3 = Get(Number<I3>{});
+
+        return Sequence<IR0, IR1, IR2, IR3>{};
+    }
+
+    template <unsigned I0, unsigned I1, unsigned I2, unsigned I3, unsigned I4>
+    __host__ __device__ constexpr auto
+        Reorder(Number<I0>, Number<I1>, Number<I2>, Number<I3>, Number<I4>) const
+    {
+        constexpr unsigned IR0 = Get(Number<I0>{});
+        constexpr unsigned IR1 = Get(Number<I1>{});
+        constexpr unsigned IR2 = Get(Number<I2>{});
+        constexpr unsigned IR3 = Get(Number<I3>{});
+        constexpr unsigned IR4 = Get(Number<I4>{});
+
+        return Sequence<IR0, IR1, IR2, IR3, IR4>{};
+    }
 };

 template <class Lengths, class Strides>
@@ -99,6 +142,15 @@ struct ConstantTensorDescriptor
        static_assert(nDim == 4, "nDim is not 4");
        return n * GetStride(I0) + c * GetStride(I1) + h * GetStride(I2) + w * GetStride(I3);
    }
+
+    template <class... Is>
+    __host__ __device__ constexpr auto Reorder(Is... is) const
+    {
+        constexpr auto lengths = Lengths{}.Reorder(is...);
+        constexpr auto strides = Strides{}.Reorder(is...);
+
+        return ConstantTensorDescriptor<decltype(lengths), decltype(strides)>{};
+    }
 };

 // this is ugly, only for 4d
@@ -121,28 +173,6 @@ __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Stride
    return ConstantTensorDescriptor<Lengths, Strides>{};
 }

-// this is ugly, only for 4d
-template <class Desc, class Reorder>
-__host__ __device__ constexpr auto get_reordered_4d_tensor_descriptor(Desc, Reorder)
-{
-    constexpr auto I0 = Number<0>{};
-    constexpr auto I1 = Number<1>{};
-    constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-
-    constexpr auto IT0 = Reorder{}.GetNumber(I0);
-    constexpr auto IT1 = Reorder{}.GetNumber(I1);
-    constexpr auto IT2 = Reorder{}.GetNumber(I2);
-    constexpr auto IT3 = Reorder{}.GetNumber(I3);
-
-    constexpr unsigned L0 = Desc{}.GetLength(IT0);
-    constexpr unsigned L1 = Desc{}.GetLength(IT1);
-    constexpr unsigned L2 = Desc{}.GetLength(IT2);
-    constexpr unsigned L3 = Desc{}.GetLength(IT3);
-
-    return make_ConstantTensorDescriptor(Sequence<L0, L1, L2, L3>{});
-}
-
 // this is ugly, only for 4d
 template <class InDesc, class WeiDesc>
 __host__ __device__ constexpr auto get_convolution_output_4d_tensor_descriptor(InDesc, WeiDesc)
--- a/src/include/gridwise_direct_convolution_1.cuh
+++ b/src/include/gridwise_direct_convolution_1.cuh
@@ -122,18 +122,13 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
 #endif

    // set output tensor in LDS to 0
-    blockwise_4d_tensor_set_zero<TFloat, decltype(out_block_desc), BlockSize>(out_block_desc,
-                                                                              p_out_block);
+    blockwise_4d_tensor_set_zero<BlockSize>(out_block_desc, p_out_block);

    for(unsigned c_block_work_begin = 0; c_block_work_begin < in_global_desc.GetLength(I1);
        c_block_work_begin += CPerBlock)
    {
        // copy input tensor to LDS
-        blockwise_4d_tensor_copy<TFloat,
-                                 decltype(in_block_global_desc),
-                                 decltype(in_block_desc),
-                                 decltype(in_block_desc),
-                                 BlockSize>(in_block_global_desc,
+        blockwise_4d_tensor_copy<BlockSize>(in_block_global_desc,
                                            p_in_global +
                                                in_global_desc.Get1dIndex(n_block_work_begin,
                                                                          c_block_work_begin,
@@ -144,11 +139,7 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
                                            in_block_desc);

        // copy weight tensor to LDS
-        blockwise_4d_tensor_copy<TFloat,
-                                 decltype(wei_block_global_desc),
-                                 decltype(wei_block_desc),
-                                 decltype(wei_block_desc),
-                                 BlockSize>(
+        blockwise_4d_tensor_copy<BlockSize>(
            wei_block_global_desc,
            p_wei_global + wei_global_desc.Get1dIndex(k_block_work_begin, c_block_work_begin, 0, 0),
            wei_block_desc,
@@ -158,7 +149,8 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
        __syncthreads();

        // blockwise convolution
-        blockwise_direct_convolution<TFloat,
+        blockwise_direct_convolution<BlockSize,
+                                     TFloat,
                                     decltype(in_block_desc),
                                     decltype(wei_block_desc),
                                     decltype(out_block_desc),
@@ -166,19 +158,14 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
                                     OutTileSizeW,
                                     NPerThread,
                                     KPerThread,
-                                     CPerThread,
-                                     BlockSize>(
+                                     CPerThread>(
            in_block_desc, p_in_block, wei_block_desc, p_wei_block, out_block_desc, p_out_block);

        __syncthreads();
    }

    // copy output tensor from LDS to device mem
-    blockwise_4d_tensor_copy<TFloat,
-                             decltype(out_block_desc),
-                             decltype(out_block_global_desc),
-                             decltype(out_block_desc),
-                             BlockSize>(
+    blockwise_4d_tensor_copy<BlockSize>(
        out_block_desc,
        p_out_block,
        out_block_global_desc,
@@ -186,4 +173,4 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
            out_global_desc.Get1dIndex(
                n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin),
        out_block_desc);
-}
+}
--- a/src/include/gridwise_direct_convolution_2.cuh
+++ b/src/include/gridwise_direct_convolution_2.cuh
@@ -151,11 +151,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
        c_block_data_begin += CPerBlock, __syncthreads())
    {
        // copy input tensor to LDS
-        blockwise_4d_tensor_copy<TFloat,
-                                 decltype(in_global_desc),
-                                 decltype(in_block_desc),
-                                 decltype(in_block_desc),
-                                 BlockSize>(in_global_desc,
+        blockwise_4d_tensor_copy<BlockSize>(in_global_desc,
                                            p_in_global +
                                                in_global_desc.Get1dIndex(n_block_data_begin,
                                                                          c_block_data_begin,
@@ -166,11 +162,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
                                            in_block_desc);

        // copy weight tensor to LDS
-        blockwise_4d_tensor_copy<TFloat,
-                                 decltype(wei_global_desc),
-                                 decltype(wei_block_desc),
-                                 decltype(wei_block_desc),
-                                 BlockSize>(
+        blockwise_4d_tensor_copy<BlockSize>(
            wei_global_desc,
            p_wei_global + wei_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
            wei_block_desc,
@@ -182,7 +174,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
        for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread)
        {
            // threadwise convolution
-#if 0
+#if 1
            threadwise_direct_convolution_2(
                in_thread_block_desc,
                p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,