From adf4b173b30f463d56d111c42116e1d20e194cf4 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 15 Nov 2018 23:22:06 -0600 Subject: [PATCH] refactor --- driver/conv.cu | 20 +-- src/include/blockwise_tensor_op.cuh | 216 ++++++++++++++------------- src/include/direct_convolution_2.cuh | 112 +++++++------- 3 files changed, 176 insertions(+), 172 deletions(-) diff --git a/driver/conv.cu b/driver/conv.cu index 5b0f56efd6..4a62be7fc4 100644 --- a/driver/conv.cu +++ b/driver/conv.cu @@ -140,10 +140,10 @@ void device_convolution( constexpr unsigned YPerBlock = 8; constexpr unsigned XPerBlock = 16; - constexpr unsigned NBlockCopyLen0 = 1; - constexpr unsigned NBlockCopyLen1 = 1; - constexpr unsigned NBlockCopyLen2 = 4; - constexpr unsigned NBlockCopyLen3 = 32; + constexpr unsigned NBlockOpLen0 = 1; + constexpr unsigned NBlockOpLen1 = 1; + constexpr unsigned NBlockOpLen2 = 4; + constexpr unsigned NBlockOpLen3 = 32; constexpr unsigned BlockSize = 128; @@ -174,10 +174,10 @@ void device_convolution( CPerBlock, YPerBlock, XPerBlock, - NBlockCopyLen0, - NBlockCopyLen1, - NBlockCopyLen2, - NBlockCopyLen3, + NBlockOpLen0, + NBlockOpLen1, + NBlockOpLen2, + NBlockOpLen3, BlockSize, GridSize> <<>>(InDesc{}, @@ -248,7 +248,7 @@ int main() int num_thread = std::thread::hardware_concurrency(); -#if 0 +#if 1 in.GenerateTensorValue(GeneratorTensor{}, num_thread); wei.GenerateTensorValue(GeneratorTensor{}, num_thread); out_host.GenerateTensorValue(GeneratorConstant{0}, num_thread); @@ -258,7 +258,7 @@ int main() device_convolution(in_desc, in, wei_desc, wei, out_desc, out_device); -#if 0 +#if 1 host_convolution(in, wei, out_host); float error = 0; diff --git a/src/include/blockwise_tensor_op.cuh b/src/include/blockwise_tensor_op.cuh index 232f0de172..65a1e08c38 100644 --- a/src/include/blockwise_tensor_op.cuh +++ b/src/include/blockwise_tensor_op.cuh @@ -7,10 +7,10 @@ template __device__ void blockwise_4d_tensor_op_binary( @@ -67,10 +67,10 @@ __device__ void blockwise_4d_tensor_op_binary( template __device__ void blockwise_4d_tensor_op_binary( @@ -156,10 +156,10 @@ __device__ void blockwise_4d_tensor_op_binary( template __device__ void blockwise_4d_tensor_op_unary(DstDesc, TFloat* __restrict__ p_dst, F f) @@ -240,10 +240,10 @@ __device__ void blockwise_4d_tensor_op_unary(DstDesc, TFloat* __restrict__ p_dst template __device__ void blockwise_4d_tensor_op_binary( @@ -259,34 +259,34 @@ __device__ void blockwise_4d_tensor_op_binary( static_assert(is_same::value); - constexpr unsigned NWorkStride3 = 1; - constexpr unsigned NWorkStride2 = NWorkLen3 * NWorkStride3; - constexpr unsigned NWorkStride1 = NWorkLen2 * NWorkStride2; - constexpr unsigned NWorkStride0 = NWorkLen1 * NWorkStride1; + constexpr unsigned NBlockOpStride3 = 1; + constexpr unsigned NBlockOpStride2 = NBlockOpLen3 * NBlockOpStride3; + constexpr unsigned NBlockOpStride1 = NBlockOpLen2 * NBlockOpStride2; + constexpr unsigned NBlockOpStride0 = NBlockOpLen1 * NBlockOpStride1; unsigned itmp = threadIdx.x; - const unsigned did0_begin = itmp / NWorkStride0; + const unsigned did0_begin = itmp / NBlockOpStride0; - itmp -= did0_begin * NWorkStride0; + itmp -= did0_begin * NBlockOpStride0; - const unsigned did1_begin = itmp / NWorkStride1; + const unsigned did1_begin = itmp / NBlockOpStride1; - itmp -= did1_begin * NWorkStride1; + itmp -= did1_begin * NBlockOpStride1; - const unsigned did2_begin = itmp / NWorkStride2; + const unsigned did2_begin = itmp / NBlockOpStride2; - itmp -= did2_begin * NWorkStride2; + itmp -= did2_begin * NBlockOpStride2; - const unsigned did3_begin = itmp / NWorkStride3; + const unsigned did3_begin = itmp / NBlockOpStride3; - for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NWorkLen0) + for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NBlockOpLen0) { - for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NWorkLen1) + for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NBlockOpLen1) { - for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NWorkLen2) + for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NBlockOpLen2) { - for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NWorkLen3) + for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NBlockOpLen3) { const unsigned sindex = src_desc.Get1dIndex(did0, did1, did2, did3); @@ -304,10 +304,10 @@ __device__ void blockwise_4d_tensor_op_binary( template __device__ void blockwise_4d_tensor_op_binary( @@ -323,63 +323,63 @@ __device__ void blockwise_4d_tensor_op_binary( static_assert(is_same::value); - constexpr unsigned NWorkStride3 = 1; - constexpr unsigned NWorkStride2 = NWorkLen3 * NWorkStride3; - constexpr unsigned NWorkStride1 = NWorkLen2 * NWorkStride2; - constexpr unsigned NWorkStride0 = NWorkLen1 * NWorkStride1; + constexpr unsigned NBlockOpStride3 = 1; + constexpr unsigned NBlockOpStride2 = NBlockOpLen3 * NBlockOpStride3; + constexpr unsigned NBlockOpStride1 = NBlockOpLen2 * NBlockOpStride2; + constexpr unsigned NBlockOpStride0 = NBlockOpLen1 * NBlockOpStride1; unsigned itmp = threadIdx.x; - const unsigned did0_begin = itmp / NWorkStride0; + const unsigned did0_begin = itmp / NBlockOpStride0; - itmp -= did0_begin * NWorkStride0; + itmp -= did0_begin * NBlockOpStride0; - const unsigned did1_begin = itmp / NWorkStride1; + const unsigned did1_begin = itmp / NBlockOpStride1; - itmp -= did1_begin * NWorkStride1; + itmp -= did1_begin * NBlockOpStride1; - const unsigned did2_begin = itmp / NWorkStride2; + const unsigned did2_begin = itmp / NBlockOpStride2; - itmp -= did2_begin * NWorkStride2; + itmp -= did2_begin * NBlockOpStride2; - const unsigned did3_begin = itmp / NWorkStride3; + const unsigned did3_begin = itmp / NBlockOpStride3; unsigned sindex = src_desc.Get1dIndex(did0_begin, did1_begin, did2_begin, did3_begin); unsigned dindex = dst_desc.Get1dIndex(did0_begin, did1_begin, did2_begin, did3_begin); - for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NWorkLen0) + for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NBlockOpLen0) { const unsigned sindex_save0 = sindex; const unsigned dindex_save0 = dindex; - for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NWorkLen1) + for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NBlockOpLen1) { const unsigned sindex_save1 = sindex; const unsigned dindex_save1 = dindex; - for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NWorkLen2) + for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NBlockOpLen2) { const unsigned sindex_save2 = sindex; const unsigned dindex_save2 = dindex; - for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NWorkLen3) + for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NBlockOpLen3) { f(p_src[sindex], p_dst[dindex]); - sindex += NWorkLen3 * src_desc.GetStride(I3); - dindex += NWorkLen3 * dst_desc.GetStride(I3); + sindex += NBlockOpLen3 * src_desc.GetStride(I3); + dindex += NBlockOpLen3 * dst_desc.GetStride(I3); } - sindex = sindex_save2 + NWorkLen2 * src_desc.GetStride(I2); - dindex = dindex_save2 + NWorkLen2 * dst_desc.GetStride(I2); + sindex = sindex_save2 + NBlockOpLen2 * src_desc.GetStride(I2); + dindex = dindex_save2 + NBlockOpLen2 * dst_desc.GetStride(I2); } - sindex = sindex_save1 + NWorkLen1 * src_desc.GetStride(I1); - dindex = dindex_save1 + NWorkLen1 * dst_desc.GetStride(I1); + sindex = sindex_save1 + NBlockOpLen1 * src_desc.GetStride(I1); + dindex = dindex_save1 + NBlockOpLen1 * dst_desc.GetStride(I1); } - sindex = sindex_save0 + NWorkLen0 * src_desc.GetStride(I0); - dindex = dindex_save0 + NWorkLen0 * dst_desc.GetStride(I0); + sindex = sindex_save0 + NBlockOpLen0 * src_desc.GetStride(I0); + dindex = dindex_save0 + NBlockOpLen0 * dst_desc.GetStride(I0); } } #endif @@ -388,10 +388,10 @@ __device__ void blockwise_4d_tensor_op_binary( template __device__ void blockwise_4d_tensor_op_binary( @@ -407,65 +407,69 @@ __device__ void blockwise_4d_tensor_op_binary( static_assert(is_same::value); - constexpr unsigned NWorkStride3 = 1; - constexpr unsigned NWorkStride2 = NWorkLen3 * NWorkStride3; - constexpr unsigned NWorkStride1 = NWorkLen2 * NWorkStride2; - constexpr unsigned NWorkStride0 = NWorkLen1 * NWorkStride1; + constexpr unsigned NBlockOpStride3 = 1; + constexpr unsigned NBlockOpStride2 = NBlockOpLen3 * NBlockOpStride3; + constexpr unsigned NBlockOpStride1 = NBlockOpLen2 * NBlockOpStride2; + constexpr unsigned NBlockOpStride0 = NBlockOpLen1 * NBlockOpStride1; unsigned itmp = threadIdx.x; - const unsigned did0_begin = itmp / NWorkStride0; + const unsigned did0_begin = itmp / NBlockOpStride0; - itmp -= did0_begin * NWorkStride0; + itmp -= did0_begin * NBlockOpStride0; - const unsigned did1_begin = itmp / NWorkStride1; + const unsigned did1_begin = itmp / NBlockOpStride1; - itmp -= did1_begin * NWorkStride1; + itmp -= did1_begin * NBlockOpStride1; - const unsigned did2_begin = itmp / NWorkStride2; + const unsigned did2_begin = itmp / NBlockOpStride2; - itmp -= did2_begin * NWorkStride2; + itmp -= did2_begin * NBlockOpStride2; - const unsigned did3_begin = itmp / NWorkStride3; + const unsigned did3_begin = itmp / NBlockOpStride3; unsigned sindex = src_desc.Get1dIndex(did0_begin, did1_begin, did2_begin, did3_begin); unsigned dindex = dst_desc.Get1dIndex(did0_begin, did1_begin, did2_begin, did3_begin); - for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NWorkLen0) + for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NBlockOpLen0) { unsigned i1 = 0; - for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NWorkLen1) + for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NBlockOpLen1) { unsigned i2 = 0; - for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NWorkLen2) + for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NBlockOpLen2) { unsigned i3 = 0; - for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NWorkLen3) + for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NBlockOpLen3) { f(p_src[sindex], p_dst[dindex]); - sindex += NWorkLen3 * src_desc.GetStride(I3); - dindex += NWorkLen3 * dst_desc.GetStride(I3); + sindex += NBlockOpLen3 * src_desc.GetStride(I3); + dindex += NBlockOpLen3 * dst_desc.GetStride(I3); ++i3; } - sindex += - NWorkLen2 * src_desc.GetStride(I2) - i3 * NWorkLen3 * src_desc.GetStride(I3); - dindex += - NWorkLen2 * dst_desc.GetStride(I2) - i3 * NWorkLen3 * dst_desc.GetStride(I3); + sindex += NBlockOpLen2 * src_desc.GetStride(I2) - + i3 * NBlockOpLen3 * src_desc.GetStride(I3); + dindex += NBlockOpLen2 * dst_desc.GetStride(I2) - + i3 * NBlockOpLen3 * dst_desc.GetStride(I3); ++i2; } - sindex += NWorkLen1 * src_desc.GetStride(I1) - i2 * NWorkLen2 * src_desc.GetStride(I2); - dindex += NWorkLen1 * dst_desc.GetStride(I1) - i2 * NWorkLen2 * dst_desc.GetStride(I2); + sindex += + NBlockOpLen1 * src_desc.GetStride(I1) - i2 * NBlockOpLen2 * src_desc.GetStride(I2); + dindex += + NBlockOpLen1 * dst_desc.GetStride(I1) - i2 * NBlockOpLen2 * dst_desc.GetStride(I2); ++i1; } - sindex += NWorkLen0 * src_desc.GetStride(I0) - i1 * NWorkLen1 * src_desc.GetStride(I1); - dindex += NWorkLen0 * dst_desc.GetStride(I0) - i1 * NWorkLen1 * dst_desc.GetStride(I1); + sindex += + NBlockOpLen0 * src_desc.GetStride(I0) - i1 * NBlockOpLen1 * src_desc.GetStride(I1); + dindex += + NBlockOpLen0 * dst_desc.GetStride(I0) - i1 * NBlockOpLen1 * dst_desc.GetStride(I1); } } #endif @@ -474,10 +478,10 @@ __device__ void blockwise_4d_tensor_op_binary( template __device__ void blockwise_4d_tensor_op_binary( @@ -493,34 +497,34 @@ __device__ void blockwise_4d_tensor_op_binary( static_assert(is_same::value); - constexpr unsigned NWorkStride3 = 1; - constexpr unsigned NWorkStride2 = NWorkLen3 * NWorkStride3; - constexpr unsigned NWorkStride1 = NWorkLen2 * NWorkStride2; - constexpr unsigned NWorkStride0 = NWorkLen1 * NWorkStride1; + constexpr unsigned NBlockOpStride3 = 1; + constexpr unsigned NBlockOpStride2 = NBlockOpLen3 * NBlockOpStride3; + constexpr unsigned NBlockOpStride1 = NBlockOpLen2 * NBlockOpStride2; + constexpr unsigned NBlockOpStride0 = NBlockOpLen1 * NBlockOpStride1; unsigned itmp = threadIdx.x; - const unsigned did0_begin = itmp / NWorkStride0; + const unsigned did0_begin = itmp / NBlockOpStride0; - itmp -= did0_begin * NWorkStride0; + itmp -= did0_begin * NBlockOpStride0; - const unsigned did1_begin = itmp / NWorkStride1; + const unsigned did1_begin = itmp / NBlockOpStride1; - itmp -= did1_begin * NWorkStride1; + itmp -= did1_begin * NBlockOpStride1; - const unsigned did2_begin = itmp / NWorkStride2; + const unsigned did2_begin = itmp / NBlockOpStride2; - itmp -= did2_begin * NWorkStride2; + itmp -= did2_begin * NBlockOpStride2; - const unsigned did3_begin = itmp / NWorkStride3; + const unsigned did3_begin = itmp / NBlockOpStride3; - for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NWorkLen0) + for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NBlockOpLen0) { - for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NWorkLen1) + for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NBlockOpLen1) { - for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NWorkLen2) + for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NBlockOpLen2) { - for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NWorkLen3) + for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NBlockOpLen3) { const unsigned sindex = src_desc.Get1dIndex(did0, did1, did2, did3); diff --git a/src/include/direct_convolution_2.cuh b/src/include/direct_convolution_2.cuh index 35bec64255..50e3b0e9dc 100644 --- a/src/include/direct_convolution_2.cuh +++ b/src/include/direct_convolution_2.cuh @@ -12,11 +12,11 @@ template __device__ void blockwise_convolution(InDesc, - TFloat* const __restrict__ p_in, + TFloat* const __restrict__ p_in_lds, WeiDesc, - TFloat* const __restrict__ p_wei, + TFloat* const __restrict__ p_wei_lds, OutDesc, - TFloat* __restrict__ p_out) + TFloat* __restrict__ p_out_lds) { constexpr auto I0 = Index<0>{}; constexpr auto I1 = Index<1>{}; @@ -97,8 +97,8 @@ __device__ void blockwise_convolution(InDesc, decltype(in_thread_dst_desc), decltype(f_copy)>( in_thread_src_desc, - p_in + in_desc.Get1dIndex( - n_thread_work_begin, 0, hi_thread_work_begin, wi_thread_work_begin), + p_in_lds + in_desc.Get1dIndex( + n_thread_work_begin, 0, hi_thread_work_begin, wi_thread_work_begin), in_thread_dst_desc, p_in_thread, f_copy); @@ -112,7 +112,7 @@ __device__ void blockwise_convolution(InDesc, decltype(wei_thread_dst_desc), decltype(f_copy)>( wei_thread_src_desc, - p_wei + wei_desc.Get1dIndex(k_thread_work_begin, 0, 0, 0), + p_wei_lds + wei_desc.Get1dIndex(k_thread_work_begin, 0, 0, 0), wei_thread_dst_desc, p_wei_thread, f_copy); @@ -123,10 +123,10 @@ __device__ void blockwise_convolution(InDesc, decltype(out_thread_dst_desc), decltype(f_copy)>( out_thread_src_desc, - p_out + out_desc.Get1dIndex(n_thread_work_begin, - k_thread_work_begin, - ho_thread_work_begin, - wo_thread_work_begin), + p_out_lds + out_desc.Get1dIndex(n_thread_work_begin, + k_thread_work_begin, + ho_thread_work_begin, + wo_thread_work_begin), out_thread_dst_desc, p_out_thread, f_copy); @@ -150,10 +150,10 @@ __device__ void blockwise_convolution(InDesc, out_thread_dst_desc, p_out_thread, out_thread_src_desc, - p_out + out_desc.Get1dIndex(n_thread_work_begin, - k_thread_work_begin, - ho_thread_work_begin, - wo_thread_work_begin), + p_out_lds + out_desc.Get1dIndex(n_thread_work_begin, + k_thread_work_begin, + ho_thread_work_begin, + wo_thread_work_begin), f_copy); } } @@ -170,18 +170,18 @@ template __global__ void gridwise_convolution(InDesc, - TFloat* const __restrict__ p_in, + TFloat* const __restrict__ p_in_glb, WeiDesc, - TFloat* const __restrict__ p_wei, + TFloat* const __restrict__ p_wei_glb, OutDesc, - TFloat* __restrict__ p_out) + TFloat* __restrict__ p_out_glb) { constexpr auto I0 = Index<0>{}; constexpr auto I1 = Index<1>{}; @@ -222,13 +222,13 @@ __global__ void gridwise_convolution(InDesc, constexpr auto out_block_lds_desc = make_ConstantTensorDescriptor(out_block_glb_desc.GetLengths()); - constexpr unsigned in_block_size = in_block_lds_desc.GetElementSize(); - constexpr unsigned wei_block_size = wei_block_lds_desc.GetElementSize(); - constexpr unsigned out_block_size = out_block_lds_desc.GetElementSize(); + constexpr unsigned in_block_size = in_block_lds_desc.GetElementSpace(); + constexpr unsigned wei_block_size = wei_block_lds_desc.GetElementSpace(); + constexpr unsigned out_block_size = out_block_lds_desc.GetElementSpace(); - __shared__ TFloat p_in_block[in_block_size]; - __shared__ TFloat p_wei_block[wei_block_size]; - __shared__ TFloat p_out_block[out_block_size]; + __shared__ TFloat p_in_block_lds[in_block_size]; + __shared__ TFloat p_wei_block_lds[wei_block_size]; + __shared__ TFloat p_out_block_lds[out_block_size]; const unsigned block_id = blockIdx.x; @@ -286,12 +286,12 @@ __global__ void gridwise_convolution(InDesc, // set output tensor in LDS to 0 blockwise_4d_tensor_op_unary(out_block_lds_desc, p_out_block, f_set0); + BlockSize>(out_block_lds_desc, p_out_block_lds, f_set0); for(unsigned c_block_work_begin = 0; c_block_work_begin < in_desc.GetLength(I1); c_block_work_begin += CPerBlock) @@ -301,35 +301,35 @@ __global__ void gridwise_convolution(InDesc, blockwise_4d_tensor_op_binary( in_block_glb_desc, - p_in + in_block_glb_desc.Get1dIndex(n_block_work_begin, - c_block_work_begin, - hi_block_work_begin, - wi_block_work_begin), + p_in_glb + in_block_glb_desc.Get1dIndex(n_block_work_begin, + c_block_work_begin, + hi_block_work_begin, + wi_block_work_begin), in_block_lds_desc, - p_in_block, + p_in_block_lds, f_copy); // copy weight tensor to LDS blockwise_4d_tensor_op_binary( wei_block_glb_desc, - p_wei + wei_block_glb_desc.Get1dIndex(k_block_work_begin, c_block_work_begin, 0, 0), + p_wei_glb + wei_block_glb_desc.Get1dIndex(k_block_work_begin, c_block_work_begin, 0, 0), wei_block_lds_desc, - p_wei_block, + p_wei_block_lds, f_copy); #if 1 @@ -344,11 +344,11 @@ __global__ void gridwise_convolution(InDesc, OutTileSizeH, OutTileSizeW, BlockSize>(in_block_lds_desc, - p_in_block, + p_in_block_lds, wei_block_lds_desc, - p_wei_block, + p_wei_block_lds, out_block_lds_desc, - p_out_block); + p_out_block_lds); #if 1 __syncthreads(); @@ -359,16 +359,16 @@ __global__ void gridwise_convolution(InDesc, blockwise_4d_tensor_op_binary( out_block_lds_desc, - p_out_block, + p_out_block_lds, out_block_glb_desc, - p_out + + p_out_glb + out_block_glb_desc.Get1dIndex( n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin), f_copy);