From ac1f62be3fbdad4f37d98ad0928914fe591a6364 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 7 Jan 2019 23:01:41 -0600 Subject: [PATCH] refactor --- src/include/blockwise_direct_convolution.cuh | 6 +- src/include/blockwise_tensor_op.cuh | 43 +++-------- src/include/constant_tensor_descriptor.cuh | 76 +++++++++++++------ src/include/gridwise_direct_convolution_1.cuh | 29 ++----- src/include/gridwise_direct_convolution_2.cuh | 14 +--- 5 files changed, 79 insertions(+), 89 deletions(-) diff --git a/src/include/blockwise_direct_convolution.cuh b/src/include/blockwise_direct_convolution.cuh index dd849eb4ca..3078e77b9d 100644 --- a/src/include/blockwise_direct_convolution.cuh +++ b/src/include/blockwise_direct_convolution.cuh @@ -3,7 +3,8 @@ #include "threadwise_tensor_op.cuh" #include "threadwise_direct_convolution.cuh" -template + unsigned CPerThread> __device__ void blockwise_direct_convolution(InBlockDesc, TFloat* const __restrict__ p_in_block, WeiBlockDesc, diff --git a/src/include/blockwise_tensor_op.cuh b/src/include/blockwise_tensor_op.cuh index 3b39da4f1b..9de53046bd 100644 --- a/src/include/blockwise_tensor_op.cuh +++ b/src/include/blockwise_tensor_op.cuh @@ -1,7 +1,7 @@ #pragma once #include "constant_tensor_descriptor.cuh" -template +template __device__ void blockwise_4d_tensor_pointwise_operation_unary(DstDesc, TFloat* __restrict__ p_dst, F f) { @@ -78,13 +78,13 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, TFloat* __restrict__ p_ds // TODO: in order to optimize mem access for different mem type, // need to write specialized version -template + class F> __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc, TFloat* const __restrict__ p_src, @@ -170,21 +170,20 @@ blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc, } } -template +template __device__ void blockwise_4d_tensor_set_zero(DstDesc, TFloat* __restrict__ p_dst) { auto f_set_zero = [](TFloat& v) { v = TFloat(0); }; - blockwise_4d_tensor_pointwise_operation_unary( - DstDesc{}, p_dst, f_set_zero); + blockwise_4d_tensor_pointwise_operation_unary(DstDesc{}, p_dst, f_set_zero); } -template + class Reorder> __device__ void blockwise_4d_tensor_copy_reorder(SrcDesc, TFloat* const __restrict__ p_src, DstDesc, @@ -194,34 +193,16 @@ __device__ void blockwise_4d_tensor_copy_reorder(SrcDesc, { auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; }; - blockwise_4d_tensor_pointwise_operation_binary_reorder( + blockwise_4d_tensor_pointwise_operation_binary_reorder( SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, Reorder{}, f_copy); } -template +template __device__ void blockwise_4d_tensor_copy( SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, RefDesc) { constexpr auto reorder = Sequence<0, 1, 2, 3>{}; - blockwise_4d_tensor_copy_reorder( + blockwise_4d_tensor_copy_reorder( SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, reorder); } - -template -__device__ void blockwise_4d_tensor_im2col( - ImDesc, const __restrict__ TFloat* p_im, WDesc, ColDesc, __restrict__ TFloat* p_col) -{ - // do nothing -} diff --git a/src/include/constant_tensor_descriptor.cuh b/src/include/constant_tensor_descriptor.cuh index de7542b1c3..b3b508b67f 100644 --- a/src/include/constant_tensor_descriptor.cuh +++ b/src/include/constant_tensor_descriptor.cuh @@ -24,12 +24,55 @@ struct Sequence } template - __host__ __device__ constexpr auto GetNumber(Number) const + __host__ __device__ constexpr auto GetConstant(Number) const { constexpr unsigned N = Get(I); return Number{}; } + + template + __host__ __device__ constexpr auto Reorder(Number, Number) const + { + constexpr unsigned IR0 = Get(Number{}); + constexpr unsigned IR1 = Get(Number{}); + + return Sequence{}; + } + + template + __host__ __device__ constexpr auto Reorder(Number, Number, Number) const + { + constexpr unsigned IR0 = Get(Number{}); + constexpr unsigned IR1 = Get(Number{}); + constexpr unsigned IR2 = Get(Number{}); + + return Sequence{}; + } + + template + __host__ __device__ constexpr auto Reorder(Number, Number, Number, Number) const + { + constexpr unsigned IR0 = Get(Number{}); + constexpr unsigned IR1 = Get(Number{}); + constexpr unsigned IR2 = Get(Number{}); + constexpr unsigned IR3 = Get(Number{}); + + return Sequence{}; + } + + template + __host__ __device__ constexpr auto + Reorder(Number, Number, Number, Number, Number) const + { + constexpr unsigned IR0 = Get(Number{}); + constexpr unsigned IR1 = Get(Number{}); + constexpr unsigned IR2 = Get(Number{}); + constexpr unsigned IR3 = Get(Number{}); + constexpr unsigned IR4 = Get(Number{}); + + return Sequence{}; + } }; template @@ -99,6 +142,15 @@ struct ConstantTensorDescriptor static_assert(nDim == 4, "nDim is not 4"); return n * GetStride(I0) + c * GetStride(I1) + h * GetStride(I2) + w * GetStride(I3); } + + template + __host__ __device__ constexpr auto Reorder(Is... is) const + { + constexpr auto lengths = Lengths{}.Reorder(is...); + constexpr auto strides = Strides{}.Reorder(is...); + + return ConstantTensorDescriptor{}; + } }; // this is ugly, only for 4d @@ -121,28 +173,6 @@ __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Stride return ConstantTensorDescriptor{}; } -// this is ugly, only for 4d -template -__host__ __device__ constexpr auto get_reordered_4d_tensor_descriptor(Desc, Reorder) -{ - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - constexpr auto IT0 = Reorder{}.GetNumber(I0); - constexpr auto IT1 = Reorder{}.GetNumber(I1); - constexpr auto IT2 = Reorder{}.GetNumber(I2); - constexpr auto IT3 = Reorder{}.GetNumber(I3); - - constexpr unsigned L0 = Desc{}.GetLength(IT0); - constexpr unsigned L1 = Desc{}.GetLength(IT1); - constexpr unsigned L2 = Desc{}.GetLength(IT2); - constexpr unsigned L3 = Desc{}.GetLength(IT3); - - return make_ConstantTensorDescriptor(Sequence{}); -} - // this is ugly, only for 4d template __host__ __device__ constexpr auto get_convolution_output_4d_tensor_descriptor(InDesc, WeiDesc) diff --git a/src/include/gridwise_direct_convolution_1.cuh b/src/include/gridwise_direct_convolution_1.cuh index 7dd36dd966..1ec2cd83a8 100644 --- a/src/include/gridwise_direct_convolution_1.cuh +++ b/src/include/gridwise_direct_convolution_1.cuh @@ -122,18 +122,13 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc, #endif // set output tensor in LDS to 0 - blockwise_4d_tensor_set_zero(out_block_desc, - p_out_block); + blockwise_4d_tensor_set_zero(out_block_desc, p_out_block); for(unsigned c_block_work_begin = 0; c_block_work_begin < in_global_desc.GetLength(I1); c_block_work_begin += CPerBlock) { // copy input tensor to LDS - blockwise_4d_tensor_copy(in_block_global_desc, + blockwise_4d_tensor_copy(in_block_global_desc, p_in_global + in_global_desc.Get1dIndex(n_block_work_begin, c_block_work_begin, @@ -144,11 +139,7 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc, in_block_desc); // copy weight tensor to LDS - blockwise_4d_tensor_copy( + blockwise_4d_tensor_copy( wei_block_global_desc, p_wei_global + wei_global_desc.Get1dIndex(k_block_work_begin, c_block_work_begin, 0, 0), wei_block_desc, @@ -158,7 +149,8 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc, __syncthreads(); // blockwise convolution - blockwise_direct_convolution( + CPerThread>( in_block_desc, p_in_block, wei_block_desc, p_wei_block, out_block_desc, p_out_block); __syncthreads(); } // copy output tensor from LDS to device mem - blockwise_4d_tensor_copy( + blockwise_4d_tensor_copy( out_block_desc, p_out_block, out_block_global_desc, @@ -186,4 +173,4 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc, out_global_desc.Get1dIndex( n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin), out_block_desc); -} \ No newline at end of file +} diff --git a/src/include/gridwise_direct_convolution_2.cuh b/src/include/gridwise_direct_convolution_2.cuh index fe3d4ff55f..90d5a46a76 100644 --- a/src/include/gridwise_direct_convolution_2.cuh +++ b/src/include/gridwise_direct_convolution_2.cuh @@ -151,11 +151,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc, c_block_data_begin += CPerBlock, __syncthreads()) { // copy input tensor to LDS - blockwise_4d_tensor_copy(in_global_desc, + blockwise_4d_tensor_copy(in_global_desc, p_in_global + in_global_desc.Get1dIndex(n_block_data_begin, c_block_data_begin, @@ -166,11 +162,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc, in_block_desc); // copy weight tensor to LDS - blockwise_4d_tensor_copy( + blockwise_4d_tensor_copy( wei_global_desc, p_wei_global + wei_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0), wei_block_desc, @@ -182,7 +174,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc, for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread) { // threadwise convolution -#if 0 +#if 1 threadwise_direct_convolution_2( in_thread_block_desc, p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,