From 39775d484c4d15a5b895edfc9d2323f05ab2d3d4 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 18 Dec 2018 03:22:12 -0600 Subject: [PATCH] another version of direct conv --- driver/conv.cu | 13 +- driver/device_direct_convolution_1.cuh | 2 +- driver/device_direct_convolution_2.cuh | 6 +- driver/device_direct_convolution_3.cuh | 120 ++++++++++ src/include/blockwise_direct_convolution.cuh | 24 +- src/include/blockwise_tensor_op.cuh | 93 ++++---- src/include/constant_tensor_descriptor.cuh | 2 +- src/include/gridwise_direct_convolution_1.cuh | 12 +- src/include/gridwise_direct_convolution_2.cuh | 78 +++---- src/include/gridwise_direct_convolution_3.cuh | 208 ++++++++++++++++++ src/include/threadwise_direct_convolution.cuh | 127 ++++++++++- src/include/threadwise_tensor_op.cuh | 68 +++--- 12 files changed, 596 insertions(+), 157 deletions(-) create mode 100644 driver/device_direct_convolution_3.cuh create mode 100644 src/include/gridwise_direct_convolution_3.cuh diff --git a/driver/conv.cu b/driver/conv.cu index f486d9ad51..80e5057aaf 100644 --- a/driver/conv.cu +++ b/driver/conv.cu @@ -7,6 +7,7 @@ #include "constant_tensor_descriptor.cuh" #include "device_direct_convolution_1.cuh" #include "device_direct_convolution_2.cuh" +#include "device_direct_convolution_3.cuh" //#include "device_winograd_convolution.cuh" struct GeneratorTensor_1 @@ -338,7 +339,7 @@ int main() constexpr unsigned K = 1; constexpr unsigned S = 3; constexpr unsigned R = 3; -#elif 0 +#elif 1 constexpr unsigned N = 64; constexpr unsigned C = 256; constexpr unsigned HI = 34; @@ -346,7 +347,7 @@ int main() constexpr unsigned K = 64; constexpr unsigned S = 3; constexpr unsigned R = 3; -#elif 1 +#elif 0 constexpr unsigned N = 64; constexpr unsigned C = 64; constexpr unsigned HI = 56; @@ -387,11 +388,15 @@ int main() wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); #endif - for(int i = 0; i < 20; ++i) + for(int i = 0; i < 40; ++i) { #if 1 device_direct_convolution_1(in_desc, in, wei_desc, wei, out_desc, out_device); -#else +#elif 0 + device_direct_convolution_2(in_desc, in, wei_desc, wei, out_desc, out_device); +#elif 0 + device_direct_convolution_3(in_desc, in, wei_desc, wei, out_desc, out_device); +#elif 0 device_winograd_convolution(in_desc, in, wei_desc, wei, out_desc, out_device); #endif } diff --git a/driver/device_direct_convolution_1.cuh b/driver/device_direct_convolution_1.cuh index fbd1b98d41..68dc000173 100644 --- a/driver/device_direct_convolution_1.cuh +++ b/driver/device_direct_convolution_1.cuh @@ -25,7 +25,7 @@ void device_direct_convolution_1( constexpr auto wei_desc = WeiDesc{}; constexpr auto out_desc = OutDesc{}; -#if 0 +#if 1 constexpr unsigned OutTileSizeH = 2; constexpr unsigned OutTileSizeW = 2; constexpr unsigned NPerBlock = 2; diff --git a/driver/device_direct_convolution_2.cuh b/driver/device_direct_convolution_2.cuh index 5c08870a26..9e3d8b2d92 100644 --- a/driver/device_direct_convolution_2.cuh +++ b/driver/device_direct_convolution_2.cuh @@ -25,7 +25,7 @@ void device_direct_convolution_2( constexpr auto wei_desc = WeiDesc{}; constexpr auto out_desc = OutDesc{}; -#if 0 +#if 1 constexpr unsigned OutTileSizeH = 2; constexpr unsigned OutTileSizeW = 2; constexpr unsigned NPerBlock = 2; @@ -39,7 +39,7 @@ void device_direct_convolution_2( constexpr unsigned CPerThread = 2; constexpr unsigned BlockSize = 128; -#elif 1 +#elif 0 constexpr unsigned OutTileSizeH = 2; constexpr unsigned OutTileSizeW = 2; constexpr unsigned NPerBlock = 2; @@ -53,7 +53,7 @@ void device_direct_convolution_2( constexpr unsigned CPerThread = 2; constexpr unsigned BlockSize = 216; -#elif 1 +#elif 0 constexpr unsigned OutTileSizeH = 2; constexpr unsigned OutTileSizeW = 2; constexpr unsigned NPerBlock = 2; diff --git a/driver/device_direct_convolution_3.cuh b/driver/device_direct_convolution_3.cuh new file mode 100644 index 0000000000..884b3bf38e --- /dev/null +++ b/driver/device_direct_convolution_3.cuh @@ -0,0 +1,120 @@ +#pragma once +#include "gridwise_direct_convolution_3.cuh" + +template +void device_direct_convolution_3( + InDesc, const Tensor& in, WeiDesc, const Tensor& wei, OutDesc, Tensor& out) +{ + std::size_t data_sz = sizeof(T); + DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace()); + DeviceMem wei_device_buf(data_sz * wei.mDesc.GetElementSpace()); + DeviceMem out_device_buf(data_sz * out.mDesc.GetElementSpace()); + + int num_thread = std::thread::hardware_concurrency(); + + in_device_buf.ToDevice(in.mData.data()); + wei_device_buf.ToDevice(wei.mData.data()); + out_device_buf.ToDevice(out.mData.data()); + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto in_desc = InDesc{}; + constexpr auto wei_desc = WeiDesc{}; + constexpr auto out_desc = OutDesc{}; + +#if 1 + constexpr unsigned OutTileSizeH = 2; + constexpr unsigned OutTileSizeW = 2; + constexpr unsigned NPerBlock = 2; + constexpr unsigned KPerBlock = 32; + constexpr unsigned CPerBlock = 4; + constexpr unsigned YPerBlock = 1; + constexpr unsigned XPerBlock = 16; + + constexpr unsigned NPerThread = 2; + constexpr unsigned KPerThread = 4; + constexpr unsigned CPerThread = 2; + + constexpr unsigned BlockSize = 128; +#elif 0 + constexpr unsigned OutTileSizeH = 2; + constexpr unsigned OutTileSizeW = 2; + constexpr unsigned NPerBlock = 2; + constexpr unsigned KPerBlock = 32; + constexpr unsigned CPerBlock = 4; + constexpr unsigned YPerBlock = 1; + constexpr unsigned XPerBlock = 27; + + constexpr unsigned NPerThread = 2; + constexpr unsigned KPerThread = 4; + constexpr unsigned CPerThread = 2; + + constexpr unsigned BlockSize = 216; +#elif 0 + constexpr unsigned OutTileSizeH = 2; + constexpr unsigned OutTileSizeW = 2; + constexpr unsigned NPerBlock = 2; + constexpr unsigned KPerBlock = 32; + constexpr unsigned CPerBlock = 4; + constexpr unsigned YPerBlock = 1; + constexpr unsigned XPerBlock = 32; + + constexpr unsigned NPerThread = 2; + constexpr unsigned KPerThread = 4; + constexpr unsigned CPerThread = 2; + + constexpr unsigned BlockSize = 256; +#endif + + constexpr unsigned GridSize = (out_desc.GetLength(I0) / NPerBlock) * + (out_desc.GetLength(I1) / KPerBlock) * + (out_desc.GetLength(I2) / (OutTileSizeH * YPerBlock)) * + (out_desc.GetLength(I3) / (OutTileSizeW * XPerBlock)); + + dim3 block_dim(BlockSize); + dim3 grid_dim(GridSize); + + printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); + + cudaEvent_t start, stop; + float elapsedTime; + + cudaEventCreate(&start); + cudaEventRecord(start, 0); + + gridwise_direct_convolution_3 + <<>>(InDesc{}, + static_cast(in_device_buf.GetDeviceBuffer()), + WeiDesc{}, + static_cast(wei_device_buf.GetDeviceBuffer()), + OutDesc{}, + static_cast(out_device_buf.GetDeviceBuffer())); + + cudaEventCreate(&stop); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + + cudaEventElapsedTime(&elapsedTime, start, stop); + printf("Elapsed time : %f ms\n", elapsedTime); + + checkCudaErrors(cudaGetLastError()); + out_device_buf.FromDevice(out.mData.data()); +} diff --git a/src/include/blockwise_direct_convolution.cuh b/src/include/blockwise_direct_convolution.cuh index 879e682cc4..95005af765 100644 --- a/src/include/blockwise_direct_convolution.cuh +++ b/src/include/blockwise_direct_convolution.cuh @@ -102,7 +102,8 @@ __device__ void blockwise_direct_convolution(InBlockDesc, ho_thread_data_begin, wo_thread_data_begin), out_thread_desc, - p_out_thread); + p_out_thread, + out_thread_desc); for(unsigned c_thread_data_begin = 0; c_thread_data_begin < in_block_desc.GetLength(I1); c_thread_data_begin += CPerThread) @@ -114,7 +115,8 @@ __device__ void blockwise_direct_convolution(InBlockDesc, hi_thread_data_begin, wi_thread_data_begin), in_thread_desc, - p_in_thread); + p_in_thread, + in_thread_desc); // copy weight into register threadwise_4d_tensor_copy( @@ -122,15 +124,16 @@ __device__ void blockwise_direct_convolution(InBlockDesc, p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data_begin, 0, 0), wei_thread_desc, - p_wei_thread); + p_wei_thread, + wei_thread_desc); // threadwise convolution - threadwise_direct_convolution(in_thread_desc, - p_in_thread, - wei_thread_desc, - p_wei_thread, - out_thread_desc, - p_out_thread); + threadwise_direct_convolution_2(in_thread_desc, + p_in_thread, + wei_thread_desc, + p_wei_thread, + out_thread_desc, + p_out_thread); } // copy output into LDS @@ -140,6 +143,7 @@ __device__ void blockwise_direct_convolution(InBlockDesc, p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin, k_thread_data_begin, ho_thread_data_begin, - wo_thread_data_begin)); + wo_thread_data_begin), + out_thread_desc); } } diff --git a/src/include/blockwise_tensor_op.cuh b/src/include/blockwise_tensor_op.cuh index 3b4bc58fdc..f404c1d2dc 100644 --- a/src/include/blockwise_tensor_op.cuh +++ b/src/include/blockwise_tensor_op.cuh @@ -75,84 +75,82 @@ __device__ void blockwise_4d_tensor_pointwise_op_unary(DstDesc, TFloat* __restri } } -template +template __device__ void blockwise_4d_tensor_pointwise_op_binary( - SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f) + DescA, TFloat* const __restrict__ p_a, DescB, TFloat* __restrict__ p_b, DescRef, F f) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto src_desc = SrcDesc{}; - constexpr auto dst_desc = DstDesc{}; - - static_assert(is_same::value); - - constexpr auto desc = make_ConstantTensorDescriptor(src_desc.GetLengths()); + constexpr auto desc_a = DescA{}; + constexpr auto desc_b = DescB{}; + constexpr auto desc_ref = DescRef{}; #if 0 if(threadIdx.x == 0) { - print_ConstantTensorDescriptor(src_desc, "blockwise_4d_tensor_op_binary: src_desc: "); - print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op_binary: dst_desc: "); + print_ConstantTensorDescriptor(desc_a, "blockwise_4d_tensor_op_binary: desc_a: "); + print_ConstantTensorDescriptor(desc_b, "blockwise_4d_tensor_op_binary: desc_b: "); + print_ConstantTensorDescriptor(desc_ref, "blockwise_4d_tensor_op_binary: desc_ref: "); } #endif - constexpr unsigned NLoop = desc.GetElementSize() / BlockSize; + constexpr unsigned NLoop = desc_ref.GetElementSize() / BlockSize; for(unsigned iloop = 0; iloop < NLoop; ++iloop) { unsigned is = threadIdx.x + iloop * BlockSize; - const unsigned did0 = is / desc.GetStride(I0); + const unsigned did0 = is / desc_ref.GetStride(I0); - is -= did0 * desc.GetStride(I0); + is -= did0 * desc_ref.GetStride(I0); - const unsigned did1 = is / desc.GetStride(I1); + const unsigned did1 = is / desc_ref.GetStride(I1); - is -= did1 * desc.GetStride(I1); + is -= did1 * desc_ref.GetStride(I1); - const unsigned did2 = is / desc.GetStride(I2); + const unsigned did2 = is / desc_ref.GetStride(I2); - is -= did2 * desc.GetStride(I2); + is -= did2 * desc_ref.GetStride(I2); - const unsigned did3 = is / desc.GetStride(I3); + const unsigned did3 = is / desc_ref.GetStride(I3); - const unsigned sindex = src_desc.Get1dIndex(did0, did1, did2, did3); + const unsigned aindex = desc_a.Get1dIndex(did0, did1, did2, did3); - const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3); + const unsigned bindex = desc_b.Get1dIndex(did0, did1, did2, did3); - f(p_src[sindex], p_dst[dindex]); + f(p_a[aindex], p_b[bindex]); } - constexpr bool has_tail = (desc.GetElementSize() > NLoop * BlockSize); + constexpr bool has_tail = (desc_ref.GetElementSize() > NLoop * BlockSize); if(has_tail) { unsigned is = threadIdx.x + NLoop * BlockSize; - if(is < desc.GetElementSize()) + if(is < desc_ref.GetElementSize()) { - const unsigned did0 = is / desc.GetStride(I0); + const unsigned did0 = is / desc_ref.GetStride(I0); - is -= did0 * desc.GetStride(I0); + is -= did0 * desc_ref.GetStride(I0); - const unsigned did1 = is / desc.GetStride(I1); + const unsigned did1 = is / desc_ref.GetStride(I1); - is -= did1 * desc.GetStride(I1); + is -= did1 * desc_ref.GetStride(I1); - const unsigned did2 = is / desc.GetStride(I2); + const unsigned did2 = is / desc_ref.GetStride(I2); - is -= did2 * desc.GetStride(I2); + is -= did2 * desc_ref.GetStride(I2); - const unsigned did3 = is / desc.GetStride(I3); + const unsigned did3 = is / desc_ref.GetStride(I3); - const unsigned sindex = src_desc.Get1dIndex(did0, did1, did2, did3); + const unsigned aindex = desc_a.Get1dIndex(did0, did1, did2, did3); - const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3); + const unsigned bindex = desc_b.Get1dIndex(did0, did1, did2, did3); - f(p_src[sindex], p_dst[dindex]); + f(p_a[aindex], p_b[bindex]); } } } @@ -166,26 +164,17 @@ __device__ void blockwise_4d_tensor_set_zero(DstDesc, TFloat* __restrict__ p_dst DstDesc{}, p_dst, f_set_zero); } -template -__device__ void blockwise_4d_tensor_copy(SrcDesc, - TFloat* const __restrict__ p_src, - DstDesc, - TFloat* __restrict__ p_dst) +template +__device__ void blockwise_4d_tensor_copy( + SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, RefDesc) { auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; }; - blockwise_4d_tensor_pointwise_op_binary( - SrcDesc{}, p_src, DstDesc{}, p_dst, f_copy); + blockwise_4d_tensor_pointwise_op_binary( + SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, f_copy); } - -template -__device__ void blockwise_4d_tensor_accumulate(SrcDesc, - TFloat* const __restrict__ p_src, - DstDesc, - TFloat* __restrict__ p_dst) -{ - auto f_accum = [](const TFloat& src, TFloat& dst) { dst += src; }; - - blockwise_4d_tensor_pointwise_op_binary( - SrcDesc{}, p_src, DstDesc{}, p_dst, f_accum); -} \ No newline at end of file diff --git a/src/include/constant_tensor_descriptor.cuh b/src/include/constant_tensor_descriptor.cuh index 11c5c30e54..5ce3f0deaa 100644 --- a/src/include/constant_tensor_descriptor.cuh +++ b/src/include/constant_tensor_descriptor.cuh @@ -4,7 +4,7 @@ template struct Constant { - const T mValue = N; + static const T mValue = N; }; template diff --git a/src/include/gridwise_direct_convolution_1.cuh b/src/include/gridwise_direct_convolution_1.cuh index 5c77f0c9d2..8a339b14e4 100644 --- a/src/include/gridwise_direct_convolution_1.cuh +++ b/src/include/gridwise_direct_convolution_1.cuh @@ -130,6 +130,7 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc, blockwise_4d_tensor_copy(in_block_src_desc, p_in_global + in_global_desc.Get1dIndex(n_block_work_begin, @@ -137,17 +138,20 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc, hi_block_work_begin, wi_block_work_begin), in_block_desc, - p_in_block); + p_in_block, + in_block_desc); // copy weight tensor to LDS blockwise_4d_tensor_copy( wei_block_src_desc, p_wei_global + wei_global_desc.Get1dIndex(k_block_work_begin, c_block_work_begin, 0, 0), wei_block_desc, - p_wei_block); + p_wei_block, + wei_block_desc); __syncthreads(); @@ -171,11 +175,13 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc, blockwise_4d_tensor_copy( out_block_desc, p_out_block, out_block_src_desc, p_out_global + out_global_desc.Get1dIndex( - n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin)); + n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin), + out_block_desc); } \ No newline at end of file diff --git a/src/include/gridwise_direct_convolution_2.cuh b/src/include/gridwise_direct_convolution_2.cuh index 008d5be695..d2040f1f90 100644 --- a/src/include/gridwise_direct_convolution_2.cuh +++ b/src/include/gridwise_direct_convolution_2.cuh @@ -46,15 +46,11 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc, constexpr unsigned HiPerBlock = YPerBlock * OutTileSizeH + S - 1; constexpr unsigned WiPerBlock = XPerBlock * OutTileSizeW + R - 1; - constexpr auto in_block_global_desc = make_ConstantTensorDescriptor( - Sequence{}, in_global_desc.GetStrides()); + constexpr auto in_block_desc = + make_ConstantTensorDescriptor(Sequence{}); - constexpr auto wei_block_global_desc = make_ConstantTensorDescriptor( - Sequence{}, wei_global_desc.GetStrides()); - - constexpr auto in_block_desc = make_ConstantTensorDescriptor(in_block_global_desc.GetLengths()); constexpr auto wei_block_desc = - make_ConstantTensorDescriptor(wei_block_global_desc.GetLengths()); + make_ConstantTensorDescriptor(Sequence{}); // shared mem constexpr unsigned in_block_size = in_block_desc.GetElementSpace(); @@ -67,30 +63,19 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc, constexpr unsigned InTileSizeH = OutTileSizeH + S - 1; constexpr unsigned InTileSizeW = OutTileSizeW + R - 1; - constexpr auto in_thread_block_desc = make_ConstantTensorDescriptor( - Sequence{}, in_block_desc.GetStrides()); - - constexpr auto wei_thread_block_desc = make_ConstantTensorDescriptor( - Sequence{}, wei_block_desc.GetStrides()); - constexpr auto in_thread_desc = - make_ConstantTensorDescriptor(in_thread_block_desc.GetLengths()); + make_ConstantTensorDescriptor(Sequence{}); + constexpr auto wei_thread_desc = - make_ConstantTensorDescriptor(wei_thread_block_desc.GetLengths()); + make_ConstantTensorDescriptor(Sequence{}); + constexpr auto out_thread_desc = get_output_4d_tensor_descriptor(in_thread_desc, wei_thread_desc); - constexpr auto out_thread_global_desc = - make_ConstantTensorDescriptor(out_thread_desc.GetLengths(), out_global_desc.GetStrides()); - // register - constexpr unsigned in_thread_size = in_thread_desc.GetElementSpace(); - constexpr unsigned wei_thread_size = wei_thread_desc.GetElementSpace(); - constexpr unsigned out_thread_size = out_thread_desc.GetElementSpace(); - - TFloat p_in_thread[in_thread_size]; - TFloat p_wei_thread[wei_thread_size]; - TFloat p_out_thread[out_thread_size]; + TFloat p_in_thread[in_thread_desc.GetElementSpace()]; + TFloat p_wei_thread[wei_thread_desc.GetElementSpace()]; + TFloat p_out_thread[out_thread_desc.GetElementSpace()]; // divide block work constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock; @@ -169,54 +154,60 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc, { // copy input tensor to LDS blockwise_4d_tensor_copy(in_block_global_desc, + decltype(in_block_desc), + BlockSize>(in_global_desc, p_in_global + in_global_desc.Get1dIndex(n_block_data_begin, c_block_data_begin, hi_block_data_begin, wi_block_data_begin), in_block_desc, - p_in_block); + p_in_block, + in_block_desc); // copy weight tensor to LDS blockwise_4d_tensor_copy( - wei_block_global_desc, + wei_global_desc, p_wei_global + wei_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0), wei_block_desc, - p_wei_block); + p_wei_block, + wei_block_desc); __syncthreads(); for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread) { // copy input tensor into register - threadwise_4d_tensor_copy(in_thread_block_desc, + threadwise_4d_tensor_copy(in_block_desc, p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, c_thread_data, hi_thread_data_begin, wi_thread_data_begin), in_thread_desc, - p_in_thread); + p_in_thread, + in_thread_desc); // copy weight tensor into register threadwise_4d_tensor_copy( - wei_thread_block_desc, + wei_block_desc, p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), wei_thread_desc, - p_wei_thread); + p_wei_thread, + wei_thread_desc); // threadwise convolution - threadwise_direct_convolution(in_thread_desc, - p_in_thread, - wei_thread_desc, - p_wei_thread, - out_thread_desc, - p_out_thread); + threadwise_direct_convolution_1(in_thread_desc, + p_in_thread, + wei_thread_desc, + p_wei_thread, + out_thread_desc, + p_out_thread); } } @@ -224,9 +215,10 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc, threadwise_4d_tensor_copy( out_thread_desc, p_out_thread, - out_thread_global_desc, + out_global_desc, p_out_global + out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, k_block_data_begin + k_thread_data_begin, ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin)); + wo_block_data_begin + wo_thread_data_begin), + out_thread_desc); } diff --git a/src/include/gridwise_direct_convolution_3.cuh b/src/include/gridwise_direct_convolution_3.cuh new file mode 100644 index 0000000000..f8bf42a011 --- /dev/null +++ b/src/include/gridwise_direct_convolution_3.cuh @@ -0,0 +1,208 @@ +#pragma once +#include "constant_tensor_descriptor.cuh" +#include "blockwise_tensor_op.cuh" +#include "blockwise_direct_convolution.cuh" +#include "threadwise_tensor_op.cuh" +#include "threadwise_direct_convolution.cuh" + +template +__global__ void gridwise_direct_convolution_3(InGlobalDesc, + TFloat* const __restrict__ p_in_global, + WeiGlobalDesc, + TFloat* const __restrict__ p_wei_global, + OutGlobalDesc, + TFloat* __restrict__ p_out_global) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto in_global_desc = InGlobalDesc{}; + constexpr auto wei_global_desc = WeiGlobalDesc{}; + constexpr auto out_global_desc = OutGlobalDesc{}; + + constexpr unsigned S = wei_global_desc.GetLength(I2); + constexpr unsigned R = wei_global_desc.GetLength(I3); + + constexpr unsigned HoPerBlock = OutTileSizeH * YPerBlock; + constexpr unsigned WoPerBlock = OutTileSizeW * XPerBlock; + + constexpr unsigned HiPerBlock = YPerBlock * OutTileSizeH + S - 1; + constexpr unsigned WiPerBlock = XPerBlock * OutTileSizeW + R - 1; + + constexpr auto in_block_desc = + make_ConstantTensorDescriptor(Sequence{}); + + constexpr auto wei_block_desc = + make_ConstantTensorDescriptor(Sequence{}); + + // shared mem + constexpr unsigned in_block_size = in_block_desc.GetElementSpace(); + constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace(); + + __shared__ TFloat p_in_block[in_block_size]; + __shared__ TFloat p_wei_block[wei_block_size]; + + // threadwise tensors + constexpr unsigned InTileSizeH = OutTileSizeH + S - 1; + constexpr unsigned InTileSizeW = OutTileSizeW + R - 1; + + constexpr auto in_thread_block_desc = make_ConstantTensorDescriptor( + Sequence{}, in_block_desc.GetStrides()); + + constexpr auto wei_thread_block_desc = make_ConstantTensorDescriptor( + Sequence{}, wei_block_desc.GetStrides()); + + constexpr auto out_thread_desc = + get_output_4d_tensor_descriptor(in_thread_block_desc, wei_thread_block_desc); + + // register + TFloat p_out_thread[out_thread_desc.GetElementSpace()]; + + // divide block work + constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock; + constexpr unsigned KBlockWork = (out_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock; + constexpr unsigned YBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock; + constexpr unsigned XBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock; + + const unsigned block_id = blockIdx.x; + + unsigned itmp = block_id; + const unsigned n_block_work_id = itmp / (KBlockWork * YBlockWork * XBlockWork); + itmp -= n_block_work_id * (KBlockWork * YBlockWork * XBlockWork); + const unsigned k_block_work_id = itmp / (YBlockWork * XBlockWork); + itmp -= k_block_work_id * (YBlockWork * XBlockWork); + const unsigned y_block_work_id = itmp / XBlockWork; + const unsigned x_block_work_id = itmp - y_block_work_id * XBlockWork; + + const unsigned n_block_data_begin = n_block_work_id * NPerBlock; + const unsigned k_block_data_begin = k_block_work_id * KPerBlock; + const unsigned y_block_data_begin = y_block_work_id * YPerBlock; + const unsigned x_block_data_begin = x_block_work_id * XPerBlock; + + const unsigned ho_block_data_begin = y_block_data_begin * OutTileSizeH; + const unsigned wo_block_data_begin = x_block_data_begin * OutTileSizeW; + + const unsigned hi_block_data_begin = ho_block_data_begin; // minus padding + const unsigned wi_block_data_begin = wo_block_data_begin; // minus padding + + // divide thread work + constexpr unsigned NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread; + constexpr unsigned KThreadWork = (KPerBlock + KPerThread - 1) / KPerThread; + constexpr unsigned YThreadWork = YPerBlock; + constexpr unsigned XThreadWork = XPerBlock; + + const unsigned thread_id = threadIdx.x; + + itmp = thread_id; + const unsigned n_thread_work_id = itmp / (KThreadWork * YThreadWork * XThreadWork); + itmp -= n_thread_work_id * (KThreadWork * YThreadWork * XThreadWork); + const unsigned k_thread_work_id = itmp / (YThreadWork * XThreadWork); + itmp -= k_thread_work_id * (YThreadWork * XThreadWork); + const unsigned y_thread_work_id = itmp / XThreadWork; + const unsigned x_thread_work_id = itmp - y_thread_work_id * XThreadWork; + + const unsigned n_thread_data_begin = n_thread_work_id * NPerThread; + const unsigned k_thread_data_begin = k_thread_work_id * KPerThread; + const unsigned ho_thread_data_begin = y_thread_work_id * OutTileSizeH; + const unsigned wo_thread_data_begin = x_thread_work_id * OutTileSizeW; + + const unsigned hi_thread_data_begin = ho_thread_data_begin; + const unsigned wi_thread_data_begin = wo_thread_data_begin; + +#if 0 + if(threadIdx.x == 0) + { + print_ConstantTensorDescriptor(in_global_desc, "gridwise_convolution: in_global_desc: "); + print_ConstantTensorDescriptor(wei_global_desc, "gridwise_convolution: wei_global_desc: "); + print_ConstantTensorDescriptor(out_global_desc, "gridwise_convolution: out_global_desc: "); + } + + printf("threadIdx.x %u \t" + "n_thread_data_begin %u, k_thread_data_begin %u, ho_thread_data_begin %u, " + "wo_thread_data_begin %u\n", + threadIdx.x, + n_thread_data_begin, + k_thread_data_begin, + ho_thread_data_begin, + wo_thread_data_begin); +#endif + + // set threadwise output tensor to 0 + threadwise_4d_tensor_set_zero(out_thread_desc, p_out_thread); + + for(unsigned c_block_data_begin = 0; c_block_data_begin < in_global_desc.GetLength(I1); + c_block_data_begin += CPerBlock, __syncthreads()) + { + // copy input tensor to LDS + blockwise_4d_tensor_copy(in_global_desc, + p_in_global + + in_global_desc.Get1dIndex(n_block_data_begin, + c_block_data_begin, + hi_block_data_begin, + wi_block_data_begin), + in_block_desc, + p_in_block, + in_block_desc); + + // copy weight tensor to LDS + blockwise_4d_tensor_copy( + wei_global_desc, + p_wei_global + wei_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0), + wei_block_desc, + p_wei_block, + wei_block_desc); + + __syncthreads(); + + for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread) + { + // threadwise convolution + threadwise_direct_convolution_2( + in_thread_block_desc, + p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), + wei_thread_block_desc, + p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), + out_thread_desc, + p_out_thread); + } + } + + // copy output tensor from register to global mem + threadwise_4d_tensor_copy( + out_thread_desc, + p_out_thread, + out_global_desc, + p_out_global + out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), + out_thread_desc); +} diff --git a/src/include/threadwise_direct_convolution.cuh b/src/include/threadwise_direct_convolution.cuh index 1a00dfde51..ffd4f47849 100644 --- a/src/include/threadwise_direct_convolution.cuh +++ b/src/include/threadwise_direct_convolution.cuh @@ -1,13 +1,14 @@ #pragma once #include "constant_tensor_descriptor.cuh" +// optimized for scenario if p_in, p_wei, p_out are in register template -__device__ void threadwise_direct_convolution(InDesc, - TFloat* const __restrict__ p_in, - WeiDesc, - TFloat* const __restrict__ p_wei, - OutDesc, - TFloat* __restrict__ p_out) +__device__ void threadwise_direct_convolution_1(InDesc, + TFloat* const __restrict__ p_in, + WeiDesc, + TFloat* const __restrict__ p_wei, + OutDesc, + TFloat* __restrict__ p_out) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -77,3 +78,117 @@ __device__ void threadwise_direct_convolution(InDesc, } } } + +// optimized for scenario where p_in and p_wei are in LDS, p_out is in register +// break down a non-1x1 convolution into a sequence of 1x1 convolutions, +// load 1x1 weight into register, and do 1x1 convolution in register. +template +__device__ void threadwise_direct_convolution_2(InDesc, + TFloat* const __restrict__ p_in, + WeiDesc, + TFloat* const __restrict__ p_wei, + OutDesc, + TFloat* __restrict__ p_out) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto in_desc_lds = InDesc{}; + constexpr auto wei_desc_lds = WeiDesc{}; + constexpr auto out_desc_reg = OutDesc{}; + + constexpr auto in_desc_reg = + make_ConstantTensorDescriptor(Sequence{}); + + constexpr auto wei_desc_reg = make_ConstantTensorDescriptor( + Sequence{}); + + TFloat p_in_reg[in_desc_reg.GetElementSpace()]; + TFloat p_wei_reg[wei_desc_reg.GetElementSpace()]; + + constexpr unsigned in_w_new_read = 1; + + constexpr auto in_desc_reg_new_read = + make_ConstantTensorDescriptor(Sequence{}); + + // loop over vertical direction + for(unsigned s = 0; s < wei_desc_lds.GetLength(I2); ++s) + { +#if 1 + // read first input + threadwise_4d_tensor_copy(in_desc_lds, + p_in + in_desc_lds.Get1dIndex(0, 0, s, 0), + in_desc_reg, + p_in_reg, + in_desc_reg); + + // read first 1x1 weight + threadwise_4d_tensor_copy(wei_desc_lds, + p_wei + wei_desc_lds.Get1dIndex(0, 0, s, 0), + wei_desc_reg, + p_wei_reg, + wei_desc_reg); + + // do first 1x1 conv + threadwise_direct_convolution_1( + in_desc_reg, p_in_reg, wei_desc_reg, p_wei_reg, out_desc_reg, p_out); + + // loop over horizontal direction + for(unsigned r = 1; r < wei_desc_lds.GetLength(I3); ++r) + { + // read new weight + threadwise_4d_tensor_copy(wei_desc_lds, + p_wei + wei_desc_lds.Get1dIndex(0, 0, s, r), + wei_desc_reg, + p_wei_reg, + wei_desc_reg); + + // shift old input to the left + threadwise_4d_tensor_shift_down(in_desc_reg, p_in_reg, I3, Number{}); + + // read new input + threadwise_4d_tensor_copy( + in_desc_lds, + p_in + in_desc_lds.Get1dIndex(0, 0, s, in_desc_reg.GetLength(I3) + r - 1), + in_desc_reg, + p_in_reg + + in_desc_reg.Get1dIndex(0, 0, 0, in_desc_reg.GetLength(I3) - in_w_new_read), + in_desc_reg_new_read); + + // do 1x1 conv + threadwise_direct_convolution_1( + in_desc_reg, p_in_reg, wei_desc_reg, p_wei_reg, out_desc_reg, p_out); + } +#elif 1 + // loop over horizontal direction + for(unsigned r = 0; r < wei_desc_lds.GetLength(I3); ++r) + { + // read new weight + threadwise_4d_tensor_copy(wei_desc_lds, + p_wei + wei_desc_lds.Get1dIndex(0, 0, s, r), + wei_desc_reg, + p_wei_reg, + wei_desc_reg); + + // read new input + threadwise_4d_tensor_copy(in_desc_lds, + p_in + in_desc_lds.Get1dIndex(0, 0, s, r), + in_desc_reg, + p_in_reg, + in_desc_reg); + + // do 1x1 conv + threadwise_direct_convolution_1( + in_desc_reg, p_in_reg, wei_desc_reg, p_wei_reg, out_desc_reg, p_out); + } +#endif + } +} diff --git a/src/include/threadwise_tensor_op.cuh b/src/include/threadwise_tensor_op.cuh index 99701d795f..17cf533dec 100644 --- a/src/include/threadwise_tensor_op.cuh +++ b/src/include/threadwise_tensor_op.cuh @@ -35,41 +35,41 @@ __device__ void threadwise_4d_tensor_pointwise_op_unary(Desc, TFloat* __restrict } } -template +template __device__ void threadwise_4d_tensor_pointwise_op_binary( - SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f) + DescA, TFloat* const __restrict__ p_a, DescB, TFloat* __restrict__ p_b, DescRef, F f) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto src_desc = SrcDesc{}; - constexpr auto dst_desc = DstDesc{}; - - static_assert(is_same::value); + constexpr auto desc_a = DescA{}; + constexpr auto desc_b = DescB{}; + constexpr auto desc_ref = DescRef{}; #if 0 if(threadIdx.x == 0) { - print_ConstantTensorDescriptor(src_desc, "threadwise_4d_tensor_op_binary: src_desc: "); - print_ConstantTensorDescriptor(dst_desc, "threadwise_4d_tensor_op_binary: dst_desc: "); + print_ConstantTensorDescriptor(desc_a, "threadwise_4d_tensor_op_binary: desc_a: "); + print_ConstantTensorDescriptor(desc_b, "threadwise_4d_tensor_op_binary: desc_b: "); + print_ConstantTensorDescriptor(desc_ref, "threadwise_4d_tensor_op_binary: desc_ref: "); } #endif - for(unsigned did0 = 0; did0 < src_desc.GetLength(I0); ++did0) + for(unsigned did0 = 0; did0 < desc_ref.GetLength(I0); ++did0) { - for(unsigned did1 = 0; did1 < src_desc.GetLength(I1); ++did1) + for(unsigned did1 = 0; did1 < desc_ref.GetLength(I1); ++did1) { - for(unsigned did2 = 0; did2 < src_desc.GetLength(I2); ++did2) + for(unsigned did2 = 0; did2 < desc_ref.GetLength(I2); ++did2) { - for(unsigned did3 = 0; did3 < src_desc.GetLength(I3); ++did3) + for(unsigned did3 = 0; did3 < desc_ref.GetLength(I3); ++did3) { - const unsigned sindex = src_desc.Get1dIndex(did0, did1, did2, did3); + const unsigned aindex = desc_a.Get1dIndex(did0, did1, did2, did3); - const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3); + const unsigned bindex = desc_b.Get1dIndex(did0, did1, did2, did3); - f(p_src[sindex], p_dst[dindex]); + f(p_a[aindex], p_b[bindex]); } } } @@ -85,20 +85,18 @@ __device__ void threadwise_4d_tensor_set_zero(Desc, TFloat* __restrict__ p) Desc{}, p, f_set_zero); } -template -__device__ void threadwise_4d_tensor_copy(SrcDesc, - TFloat* const __restrict__ p_src, - DstDesc, - TFloat* __restrict__ p_dst) +template +__device__ void threadwise_4d_tensor_copy( + SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, RefDesc) { auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; }; - threadwise_4d_tensor_pointwise_op_binary( - SrcDesc{}, p_src, DstDesc{}, p_dst, f_copy); + threadwise_4d_tensor_pointwise_op_binary( + SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, f_copy); } -template -__device__ void threadwise_4d_tensor_shift_down(Desc, TFloat* __restrict__ p, IDim, unsigned shift) +template +__device__ void threadwise_4d_tensor_shift_down(Desc, TFloat* __restrict__ p, IDim, NShift) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -114,17 +112,19 @@ __device__ void threadwise_4d_tensor_shift_down(Desc, TFloat* __restrict__ p, ID } #endif - const unsigned did0_end = - is_same::value ? desc.GetLength(I0) - shift : desc.GetLength(I0); + constexpr unsigned nshift = NShift::mValue; - const unsigned did1_end = - is_same::value ? desc.GetLength(I1) - shift : desc.GetLength(I1); + constexpr unsigned did0_end = + is_same::value ? desc.GetLength(I0) - nshift : desc.GetLength(I0); - const unsigned did2_end = - is_same::value ? desc.GetLength(I2) - shift : desc.GetLength(I2); + constexpr unsigned did1_end = + is_same::value ? desc.GetLength(I1) - nshift : desc.GetLength(I1); - const unsigned did3_end = - is_same::value ? desc.GetLength(I3) - shift : desc.GetLength(I3); + constexpr unsigned did2_end = + is_same::value ? desc.GetLength(I2) - nshift : desc.GetLength(I2); + + constexpr unsigned did3_end = + is_same::value ? desc.GetLength(I3) - nshift : desc.GetLength(I3); for(unsigned did0 = 0; did0 < did0_end; ++did0) { @@ -136,11 +136,11 @@ __device__ void threadwise_4d_tensor_shift_down(Desc, TFloat* __restrict__ p, ID { const unsigned dindex = desc.Get1dIndex(did0, did1, did2, did3); - const unsigned sindex = dindex + shift * desc.GetStride(IDim{}); + const unsigned sindex = dindex + nshift * desc.GetStride(IDim{}); p[dindex] = p[sindex]; } } } } -} \ No newline at end of file +}