diff --git a/driver/conv.cu b/driver/conv.cu index ca8684464a..d2df8ae023 100644 --- a/driver/conv.cu +++ b/driver/conv.cu @@ -7,7 +7,6 @@ #include "constant_tensor_descriptor.cuh" #include "device_direct_convolution_1.cuh" #include "device_direct_convolution_2.cuh" -#include "device_winograd_convolution.cuh" struct GeneratorConstant { @@ -61,10 +60,10 @@ void ostream_ConstantTensorDescriptor(TConstTensorDesc, std::ostream& os = std:: { static_assert(TConstTensorDesc::nDim == 4, "nDim is not 4"); - constexpr auto I0 = Index<0>{}; - constexpr auto I1 = Index<1>{}; - constexpr auto I2 = Index<2>{}; - constexpr auto I3 = Index<3>{}; + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; constexpr auto desc = TConstTensorDesc{}; os << "Lengths: {" << desc.GetLength(I0) << ", " << desc.GetLength(I1) << ", " @@ -79,10 +78,10 @@ auto make_TensorDescriptor(TConstTensorDesc) { static_assert(TConstTensorDesc::nDim == 4, "nDim is not 4"); - constexpr auto I0 = Index<0>{}; - constexpr auto I1 = Index<1>{}; - constexpr auto I2 = Index<2>{}; - constexpr auto I3 = Index<3>{}; + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; constexpr auto desc = TConstTensorDesc{}; std::initializer_list lengths = { @@ -396,7 +395,7 @@ int main() Tensor out_host(make_TensorDescriptor(out_desc)); Tensor out_device(make_TensorDescriptor(out_desc)); -#if 0 +#if 1 std::size_t num_thread = std::thread::hardware_concurrency(); in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); @@ -405,17 +404,17 @@ int main() for(int i = 0; i < 20; ++i) { #if 1 - device_direct_convolution_2(in_desc, in, wei_desc, wei, out_desc, out_device); + device_direct_convolution_1(in_desc, in, wei_desc, wei, out_desc, out_device); #else device_winograd_convolution(in_desc, in, wei_desc, wei, out_desc, out_device); #endif } -#if 0 - host_direct_convolution(in, wei, out_host); +#if 1 + host_winograd_3x3_convolution(in, wei, out_host); check_error(out_host, out_device); #elif 0 - host_winograd_3x3_convolution(in, wei, out_host); + host_direct_convolution(in, wei, out_host); check_error(out_host, out_device); #endif diff --git a/driver/device_direct_convolution_1.cuh b/driver/device_direct_convolution_1.cuh index 5527dd3946..070a76dabc 100644 --- a/driver/device_direct_convolution_1.cuh +++ b/driver/device_direct_convolution_1.cuh @@ -16,10 +16,10 @@ void device_direct_convolution_1( wei_device_buf.ToDevice(wei.mData.data()); out_device_buf.ToDevice(out.mData.data()); - constexpr auto I0 = Index<0>{}; - constexpr auto I1 = Index<1>{}; - constexpr auto I2 = Index<2>{}; - constexpr auto I3 = Index<3>{}; + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; constexpr auto in_desc = InDesc{}; constexpr auto wei_desc = WeiDesc{}; diff --git a/driver/device_direct_convolution_2.cuh b/driver/device_direct_convolution_2.cuh index 53384ee2af..f37af98086 100644 --- a/driver/device_direct_convolution_2.cuh +++ b/driver/device_direct_convolution_2.cuh @@ -16,10 +16,10 @@ void device_direct_convolution_2( wei_device_buf.ToDevice(wei.mData.data()); out_device_buf.ToDevice(out.mData.data()); - constexpr auto I0 = Index<0>{}; - constexpr auto I1 = Index<1>{}; - constexpr auto I2 = Index<2>{}; - constexpr auto I3 = Index<3>{}; + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; constexpr auto in_desc = InDesc{}; constexpr auto wei_desc = WeiDesc{}; @@ -36,11 +36,6 @@ void device_direct_convolution_2( constexpr unsigned KPerThread = 4; constexpr unsigned CPerThread = 2; - constexpr unsigned NBlockOpLen0 = 1; - constexpr unsigned NBlockOpLen1 = 1; - constexpr unsigned NBlockOpLen2 = 4; - constexpr unsigned NBlockOpLen3 = 32; - constexpr unsigned BlockSize = 128; constexpr unsigned GridSize = (out_desc.GetLength(I0) / NPerBlock) * @@ -73,10 +68,6 @@ void device_direct_convolution_2( NPerThread, KPerThread, CPerThread, - NBlockOpLen0, - NBlockOpLen1, - NBlockOpLen2, - NBlockOpLen3, BlockSize, GridSize> <<>>(InDesc{}, diff --git a/driver/device_winograd_convolution.cuh b/driver/device_winograd_convolution.cuh deleted file mode 100644 index f233c39613..0000000000 --- a/driver/device_winograd_convolution.cuh +++ /dev/null @@ -1,89 +0,0 @@ -#pragma once -#include "gridwise_winograd_convolution.cuh" - -template -void device_winograd_convolution( - InDesc, const Tensor& in, WeiDesc, const Tensor& wei, OutDesc, Tensor& out) -{ - std::size_t data_sz = sizeof(T); - DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace()); - DeviceMem wei_device_buf(data_sz * wei.mDesc.GetElementSpace()); - DeviceMem out_device_buf(data_sz * out.mDesc.GetElementSpace()); - - int num_thread = std::thread::hardware_concurrency(); - - in_device_buf.ToDevice(in.mData.data()); - wei_device_buf.ToDevice(wei.mData.data()); - out_device_buf.ToDevice(out.mData.data()); - - constexpr auto I0 = Index<0>{}; - constexpr auto I1 = Index<1>{}; - constexpr auto I2 = Index<2>{}; - constexpr auto I3 = Index<3>{}; - - constexpr auto in_desc = InDesc{}; - constexpr auto wei_desc = WeiDesc{}; - constexpr auto out_desc = OutDesc{}; - constexpr unsigned OutTileSizeH = 2; - constexpr unsigned OutTileSizeW = 2; - constexpr unsigned NPerBlock = 2; - constexpr unsigned KPerBlock = 16; - constexpr unsigned CPerBlock = 4; - constexpr unsigned YPerBlock = 1; - constexpr unsigned XPerBlock = 16; - - constexpr unsigned NPerThread = 2; - constexpr unsigned KPerThread = 2; - constexpr unsigned CPerThread = 2; - - constexpr unsigned BlockSize = 128; - - constexpr unsigned GridSize = (out_desc.GetLength(I0) / NPerBlock) * - (out_desc.GetLength(I1) / KPerBlock) * - (out_desc.GetLength(I2) / (OutTileSizeH * YPerBlock)) * - (out_desc.GetLength(I3) / (OutTileSizeW * XPerBlock)); - - dim3 block_dim(BlockSize); - dim3 grid_dim(GridSize); - - printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); - - cudaEvent_t start, stop; - float elapsedTime; - - cudaEventCreate(&start); - cudaEventRecord(start, 0); - - gridwise_winograd_convolution - <<>>(InDesc{}, - static_cast(in_device_buf.GetDeviceBuffer()), - WeiDesc{}, - static_cast(wei_device_buf.GetDeviceBuffer()), - OutDesc{}, - static_cast(out_device_buf.GetDeviceBuffer())); - - cudaEventCreate(&stop); - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); - - cudaEventElapsedTime(&elapsedTime, start, stop); - printf("Elapsed time : %f ms\n", elapsedTime); - - checkCudaErrors(cudaGetLastError()); - out_device_buf.FromDevice(out.mData.data()); -} \ No newline at end of file diff --git a/src/include/blockwise_direct_convolution.cuh b/src/include/blockwise_direct_convolution.cuh index 40a6be6aac..d70c45254b 100644 --- a/src/include/blockwise_direct_convolution.cuh +++ b/src/include/blockwise_direct_convolution.cuh @@ -17,10 +17,10 @@ __device__ void blockwise_convolution(InBlockDesc, OutBlockDesc, TFloat* __restrict__ p_out_block) { - constexpr auto I0 = Index<0>{}; - constexpr auto I1 = Index<1>{}; - constexpr auto I2 = Index<2>{}; - constexpr auto I3 = Index<3>{}; + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; constexpr auto in_block_desc = InBlockDesc{}; constexpr auto wei_block_desc = WeiBlockDesc{}; @@ -88,72 +88,50 @@ __device__ void blockwise_convolution(InBlockDesc, TFloat p_wei_thread[wei_thread_src_desc.GetElementSpace()]; TFloat p_out_thread[out_thread_src_desc.GetElementSpace()]; - auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; }; - // copy input tensor into register - threadwise_4d_tensor_op_binary( + threadwise_4d_tensor_copy( in_thread_src_desc, p_in_block + in_block_desc.Get1dIndex( n_thread_work_begin, 0, hi_thread_work_begin, wi_thread_work_begin), in_thread_dst_desc, - p_in_thread, - f_copy); + p_in_thread); for(unsigned k_thread_work_begin = 0; k_thread_work_begin < KPerBlock; ++k_thread_work_begin) { // copy weight tensor into register - threadwise_4d_tensor_op_binary( - wei_thread_src_desc, - p_wei_block + wei_block_desc.Get1dIndex(k_thread_work_begin, 0, 0, 0), - wei_thread_dst_desc, - p_wei_thread, - f_copy); + threadwise_4d_tensor_copy(wei_thread_src_desc, + p_wei_block + + wei_block_desc.Get1dIndex(k_thread_work_begin, 0, 0, 0), + wei_thread_dst_desc, + p_wei_thread); // copy output tensor into register - threadwise_4d_tensor_op_binary( - out_thread_src_desc, - p_out_block + out_block_desc.Get1dIndex(n_thread_work_begin, - k_thread_work_begin, - ho_thread_work_begin, - wo_thread_work_begin), - out_thread_dst_desc, - p_out_thread, - f_copy); + threadwise_4d_tensor_copy(out_thread_src_desc, + p_out_block + out_block_desc.Get1dIndex(n_thread_work_begin, + k_thread_work_begin, + ho_thread_work_begin, + wo_thread_work_begin), + out_thread_dst_desc, + p_out_thread); // threadwise convolution - threadwise_direct_convolution(in_thread_dst_desc, - p_in_thread, - wei_thread_dst_desc, - p_wei_thread, - out_thread_dst_desc, - p_out_thread); + threadwise_direct_convolution(in_thread_dst_desc, + p_in_thread, + wei_thread_dst_desc, + p_wei_thread, + out_thread_dst_desc, + p_out_thread); // accumulate output tensor into LDS - threadwise_4d_tensor_op_binary( - out_thread_dst_desc, - p_out_thread, - out_thread_src_desc, - p_out_block + out_block_desc.Get1dIndex(n_thread_work_begin, - k_thread_work_begin, - ho_thread_work_begin, - wo_thread_work_begin), - f_copy); + threadwise_4d_tensor_copy(out_thread_dst_desc, + p_out_thread, + out_thread_src_desc, + p_out_block + + out_block_desc.Get1dIndex(n_thread_work_begin, + k_thread_work_begin, + ho_thread_work_begin, + wo_thread_work_begin)); } } } diff --git a/src/include/blockwise_tensor_op.cuh b/src/include/blockwise_tensor_op.cuh index 65a1e08c38..3b4bc58fdc 100644 --- a/src/include/blockwise_tensor_op.cuh +++ b/src/include/blockwise_tensor_op.cuh @@ -1,44 +1,31 @@ #pragma once #include "constant_tensor_descriptor.cuh" -#define BLOCKWISE_TENSOR_OP_METHOD 12 - -#if BLOCKWISE_TENSOR_OP_METHOD == 11 -template -__device__ void blockwise_4d_tensor_op_binary( - SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f) +template +__device__ void blockwise_4d_tensor_pointwise_op_unary(DstDesc, TFloat* __restrict__ p_dst, F f) { - constexpr auto I0 = Index<0>{}; - constexpr auto I1 = Index<1>{}; - constexpr auto I2 = Index<2>{}; - constexpr auto I3 = Index<3>{}; + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; - constexpr auto src_desc = SrcDesc{}; constexpr auto dst_desc = DstDesc{}; - static_assert(is_same::value); - - constexpr auto desc = make_ConstantTensorDescriptor(src_desc.GetLengths()); + constexpr auto desc = make_ConstantTensorDescriptor(dst_desc.GetLengths()); #if 0 if(threadIdx.x == 0) { - print_ConstantTensorDescriptor(src_desc, "blockwise_4d_tensor_op_binary: src_desc: "); - print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op_binary: dst_desc: "); + print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op_unary: dst_desc: "); + print_ConstantTensorDescriptor(desc, "blockwise_4d_tensor_op_unary: desc: "); } #endif - for(unsigned i = threadIdx.x; i < desc.GetElementSize(); i += BlockSize) + constexpr unsigned NLoop = desc.GetElementSize() / BlockSize; + + for(unsigned iloop = 0; iloop < NLoop; ++iloop) { - unsigned is = i; + unsigned is = threadIdx.x + iloop * BlockSize; const unsigned did0 = is / desc.GetStride(I0); @@ -54,32 +41,48 @@ __device__ void blockwise_4d_tensor_op_binary( const unsigned did3 = is / desc.GetStride(I3); - const unsigned sindex = src_desc.Get1dIndex(did0, did1, did2, did3); - const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3); - f(p_src[sindex], p_dst[dindex]); + f(p_dst[dindex]); + } + + constexpr bool has_tail = (desc.GetElementSize() > NLoop * BlockSize); + + if(has_tail) + { + unsigned is = threadIdx.x + NLoop * BlockSize; + + if(is < desc.GetElementSize()) + { + const unsigned did0 = is / desc.GetStride(I0); + + is -= did0 * desc.GetStride(I0); + + const unsigned did1 = is / desc.GetStride(I1); + + is -= did1 * desc.GetStride(I1); + + const unsigned did2 = is / desc.GetStride(I2); + + is -= did2 * desc.GetStride(I2); + + const unsigned did3 = is / desc.GetStride(I3); + + const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3); + + f(p_dst[dindex]); + } } } -#endif -#if BLOCKWISE_TENSOR_OP_METHOD == 12 -template -__device__ void blockwise_4d_tensor_op_binary( +template +__device__ void blockwise_4d_tensor_pointwise_op_binary( SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f) { - constexpr auto I0 = Index<0>{}; - constexpr auto I1 = Index<1>{}; - constexpr auto I2 = Index<2>{}; - constexpr auto I3 = Index<3>{}; + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; constexpr auto src_desc = SrcDesc{}; constexpr auto dst_desc = DstDesc{}; @@ -154,386 +157,35 @@ __device__ void blockwise_4d_tensor_op_binary( } } -template -__device__ void blockwise_4d_tensor_op_unary(DstDesc, TFloat* __restrict__ p_dst, F f) +template +__device__ void blockwise_4d_tensor_set_zero(DstDesc, TFloat* __restrict__ p_dst) { - constexpr auto I0 = Index<0>{}; - constexpr auto I1 = Index<1>{}; - constexpr auto I2 = Index<2>{}; - constexpr auto I3 = Index<3>{}; + auto f_set_zero = [](TFloat& v) { v = TFloat(0); }; - constexpr auto dst_desc = DstDesc{}; - - constexpr auto desc = make_ConstantTensorDescriptor(dst_desc.GetLengths()); - -#if 0 - if(threadIdx.x == 0) - { - print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op_unary: dst_desc: "); - print_ConstantTensorDescriptor(desc, "blockwise_4d_tensor_op_unary: desc: "); - } -#endif - - constexpr unsigned NLoop = desc.GetElementSize() / BlockSize; - - for(unsigned iloop = 0; iloop < NLoop; ++iloop) - { - unsigned is = threadIdx.x + iloop * BlockSize; - - const unsigned did0 = is / desc.GetStride(I0); - - is -= did0 * desc.GetStride(I0); - - const unsigned did1 = is / desc.GetStride(I1); - - is -= did1 * desc.GetStride(I1); - - const unsigned did2 = is / desc.GetStride(I2); - - is -= did2 * desc.GetStride(I2); - - const unsigned did3 = is / desc.GetStride(I3); - - const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3); - - f(p_dst[dindex]); - } - - constexpr bool has_tail = (desc.GetElementSize() > NLoop * BlockSize); - - if(has_tail) - { - unsigned is = threadIdx.x + NLoop * BlockSize; - - if(is < desc.GetElementSize()) - { - const unsigned did0 = is / desc.GetStride(I0); - - is -= did0 * desc.GetStride(I0); - - const unsigned did1 = is / desc.GetStride(I1); - - is -= did1 * desc.GetStride(I1); - - const unsigned did2 = is / desc.GetStride(I2); - - is -= did2 * desc.GetStride(I2); - - const unsigned did3 = is / desc.GetStride(I3); - - const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3); - - f(p_dst[dindex]); - } - } + blockwise_4d_tensor_pointwise_op_unary( + DstDesc{}, p_dst, f_set_zero); } -#endif -#if BLOCKWISE_TENSOR_OP_METHOD == 21 -template -__device__ void blockwise_4d_tensor_op_binary( - SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f) +template +__device__ void blockwise_4d_tensor_copy(SrcDesc, + TFloat* const __restrict__ p_src, + DstDesc, + TFloat* __restrict__ p_dst) { - constexpr auto I0 = Index<0>{}; - constexpr auto I1 = Index<1>{}; - constexpr auto I2 = Index<2>{}; - constexpr auto I3 = Index<3>{}; + auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; }; - constexpr auto src_desc = SrcDesc{}; - constexpr auto dst_desc = DstDesc{}; - - static_assert(is_same::value); - - constexpr unsigned NBlockOpStride3 = 1; - constexpr unsigned NBlockOpStride2 = NBlockOpLen3 * NBlockOpStride3; - constexpr unsigned NBlockOpStride1 = NBlockOpLen2 * NBlockOpStride2; - constexpr unsigned NBlockOpStride0 = NBlockOpLen1 * NBlockOpStride1; - - unsigned itmp = threadIdx.x; - - const unsigned did0_begin = itmp / NBlockOpStride0; - - itmp -= did0_begin * NBlockOpStride0; - - const unsigned did1_begin = itmp / NBlockOpStride1; - - itmp -= did1_begin * NBlockOpStride1; - - const unsigned did2_begin = itmp / NBlockOpStride2; - - itmp -= did2_begin * NBlockOpStride2; - - const unsigned did3_begin = itmp / NBlockOpStride3; - - for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NBlockOpLen0) - { - for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NBlockOpLen1) - { - for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NBlockOpLen2) - { - for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NBlockOpLen3) - { - const unsigned sindex = src_desc.Get1dIndex(did0, did1, did2, did3); - - const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3); - - f(p_src[sindex], p_dst[dindex]); - } - } - } - } + blockwise_4d_tensor_pointwise_op_binary( + SrcDesc{}, p_src, DstDesc{}, p_dst, f_copy); } -#endif -#if BLOCKWISE_TENSOR_OP_METHOD == 22 -template -__device__ void blockwise_4d_tensor_op_binary( - SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f) +template +__device__ void blockwise_4d_tensor_accumulate(SrcDesc, + TFloat* const __restrict__ p_src, + DstDesc, + TFloat* __restrict__ p_dst) { - constexpr auto I0 = Index<0>{}; - constexpr auto I1 = Index<1>{}; - constexpr auto I2 = Index<2>{}; - constexpr auto I3 = Index<3>{}; + auto f_accum = [](const TFloat& src, TFloat& dst) { dst += src; }; - constexpr auto src_desc = SrcDesc{}; - constexpr auto dst_desc = DstDesc{}; - - static_assert(is_same::value); - - constexpr unsigned NBlockOpStride3 = 1; - constexpr unsigned NBlockOpStride2 = NBlockOpLen3 * NBlockOpStride3; - constexpr unsigned NBlockOpStride1 = NBlockOpLen2 * NBlockOpStride2; - constexpr unsigned NBlockOpStride0 = NBlockOpLen1 * NBlockOpStride1; - - unsigned itmp = threadIdx.x; - - const unsigned did0_begin = itmp / NBlockOpStride0; - - itmp -= did0_begin * NBlockOpStride0; - - const unsigned did1_begin = itmp / NBlockOpStride1; - - itmp -= did1_begin * NBlockOpStride1; - - const unsigned did2_begin = itmp / NBlockOpStride2; - - itmp -= did2_begin * NBlockOpStride2; - - const unsigned did3_begin = itmp / NBlockOpStride3; - - unsigned sindex = src_desc.Get1dIndex(did0_begin, did1_begin, did2_begin, did3_begin); - unsigned dindex = dst_desc.Get1dIndex(did0_begin, did1_begin, did2_begin, did3_begin); - - for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NBlockOpLen0) - { - const unsigned sindex_save0 = sindex; - const unsigned dindex_save0 = dindex; - - for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NBlockOpLen1) - { - const unsigned sindex_save1 = sindex; - const unsigned dindex_save1 = dindex; - - for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NBlockOpLen2) - { - const unsigned sindex_save2 = sindex; - const unsigned dindex_save2 = dindex; - - for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NBlockOpLen3) - { - f(p_src[sindex], p_dst[dindex]); - - sindex += NBlockOpLen3 * src_desc.GetStride(I3); - dindex += NBlockOpLen3 * dst_desc.GetStride(I3); - } - - sindex = sindex_save2 + NBlockOpLen2 * src_desc.GetStride(I2); - dindex = dindex_save2 + NBlockOpLen2 * dst_desc.GetStride(I2); - } - - sindex = sindex_save1 + NBlockOpLen1 * src_desc.GetStride(I1); - dindex = dindex_save1 + NBlockOpLen1 * dst_desc.GetStride(I1); - } - - sindex = sindex_save0 + NBlockOpLen0 * src_desc.GetStride(I0); - dindex = dindex_save0 + NBlockOpLen0 * dst_desc.GetStride(I0); - } -} -#endif - -#if BLOCKWISE_TENSOR_OP_METHOD == 23 -template -__device__ void blockwise_4d_tensor_op_binary( - SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f) -{ - constexpr auto I0 = Index<0>{}; - constexpr auto I1 = Index<1>{}; - constexpr auto I2 = Index<2>{}; - constexpr auto I3 = Index<3>{}; - - constexpr auto src_desc = SrcDesc{}; - constexpr auto dst_desc = DstDesc{}; - - static_assert(is_same::value); - - constexpr unsigned NBlockOpStride3 = 1; - constexpr unsigned NBlockOpStride2 = NBlockOpLen3 * NBlockOpStride3; - constexpr unsigned NBlockOpStride1 = NBlockOpLen2 * NBlockOpStride2; - constexpr unsigned NBlockOpStride0 = NBlockOpLen1 * NBlockOpStride1; - - unsigned itmp = threadIdx.x; - - const unsigned did0_begin = itmp / NBlockOpStride0; - - itmp -= did0_begin * NBlockOpStride0; - - const unsigned did1_begin = itmp / NBlockOpStride1; - - itmp -= did1_begin * NBlockOpStride1; - - const unsigned did2_begin = itmp / NBlockOpStride2; - - itmp -= did2_begin * NBlockOpStride2; - - const unsigned did3_begin = itmp / NBlockOpStride3; - - unsigned sindex = src_desc.Get1dIndex(did0_begin, did1_begin, did2_begin, did3_begin); - unsigned dindex = dst_desc.Get1dIndex(did0_begin, did1_begin, did2_begin, did3_begin); - - for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NBlockOpLen0) - { - unsigned i1 = 0; - for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NBlockOpLen1) - { - unsigned i2 = 0; - for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NBlockOpLen2) - { - unsigned i3 = 0; - for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NBlockOpLen3) - { - f(p_src[sindex], p_dst[dindex]); - - sindex += NBlockOpLen3 * src_desc.GetStride(I3); - dindex += NBlockOpLen3 * dst_desc.GetStride(I3); - - ++i3; - } - - sindex += NBlockOpLen2 * src_desc.GetStride(I2) - - i3 * NBlockOpLen3 * src_desc.GetStride(I3); - dindex += NBlockOpLen2 * dst_desc.GetStride(I2) - - i3 * NBlockOpLen3 * dst_desc.GetStride(I3); - - ++i2; - } - - sindex += - NBlockOpLen1 * src_desc.GetStride(I1) - i2 * NBlockOpLen2 * src_desc.GetStride(I2); - dindex += - NBlockOpLen1 * dst_desc.GetStride(I1) - i2 * NBlockOpLen2 * dst_desc.GetStride(I2); - - ++i1; - } - - sindex += - NBlockOpLen0 * src_desc.GetStride(I0) - i1 * NBlockOpLen1 * src_desc.GetStride(I1); - dindex += - NBlockOpLen0 * dst_desc.GetStride(I0) - i1 * NBlockOpLen1 * dst_desc.GetStride(I1); - } -} -#endif - -#if BLOCKWISE_TENSOR_OP_METHOD == 31 -template -__device__ void blockwise_4d_tensor_op_binary( - SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f) -{ - constexpr auto I0 = Index<0>{}; - constexpr auto I1 = Index<1>{}; - constexpr auto I2 = Index<2>{}; - constexpr auto I3 = Index<3>{}; - - constexpr auto src_desc = SrcDesc{}; - constexpr auto dst_desc = DstDesc{}; - - static_assert(is_same::value); - - constexpr unsigned NBlockOpStride3 = 1; - constexpr unsigned NBlockOpStride2 = NBlockOpLen3 * NBlockOpStride3; - constexpr unsigned NBlockOpStride1 = NBlockOpLen2 * NBlockOpStride2; - constexpr unsigned NBlockOpStride0 = NBlockOpLen1 * NBlockOpStride1; - - unsigned itmp = threadIdx.x; - - const unsigned did0_begin = itmp / NBlockOpStride0; - - itmp -= did0_begin * NBlockOpStride0; - - const unsigned did1_begin = itmp / NBlockOpStride1; - - itmp -= did1_begin * NBlockOpStride1; - - const unsigned did2_begin = itmp / NBlockOpStride2; - - itmp -= did2_begin * NBlockOpStride2; - - const unsigned did3_begin = itmp / NBlockOpStride3; - - for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NBlockOpLen0) - { - for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NBlockOpLen1) - { - for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NBlockOpLen2) - { - for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NBlockOpLen3) - { - const unsigned sindex = src_desc.Get1dIndex(did0, did1, did2, did3); - - const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3); - - f(p_src[sindex], p_dst[dindex]); - } - } - } - } -} -#endif + blockwise_4d_tensor_pointwise_op_binary( + SrcDesc{}, p_src, DstDesc{}, p_dst, f_accum); +} \ No newline at end of file diff --git a/src/include/blockwise_winograd_transform.cuh b/src/include/blockwise_winograd_transform.cuh deleted file mode 100644 index 347d403d83..0000000000 --- a/src/include/blockwise_winograd_transform.cuh +++ /dev/null @@ -1,36 +0,0 @@ -#pragma once -#include "constant_tensor_descriptor.cuh" - -template -__device__ void blockwise_winograd_transform_input(TFloat* const __restrict__ p_in, - TFloat* __restrict__ p_in_transform) -{ - p_in_transform[0] = 1; -} - -template -__device__ void blockwise_winograd_transform_weight(TFloat* const __restrict__ p_wei, - TFloat* __restrict__ p_wei_transform) -{ - p_wei_transform[0] = 1; -} \ No newline at end of file diff --git a/src/include/constant_tensor_descriptor.cuh b/src/include/constant_tensor_descriptor.cuh index 8593d517b6..11c5c30e54 100644 --- a/src/include/constant_tensor_descriptor.cuh +++ b/src/include/constant_tensor_descriptor.cuh @@ -7,8 +7,8 @@ struct Constant const T mValue = N; }; -template -using Index = Constant; +template +using Number = Constant; template struct Sequence @@ -18,7 +18,7 @@ struct Sequence const unsigned mData[nDim] = {Is...}; template - __host__ __device__ constexpr unsigned Get(Index) const + __host__ __device__ constexpr unsigned Get(Number) const { return mData[I]; } @@ -28,7 +28,7 @@ template struct ConstantTensorDescriptor { static constexpr unsigned nDim = Lengths::nDim; - using NDimConstant = Index; + using NDimConstant = Number; __host__ __device__ constexpr ConstantTensorDescriptor() { @@ -42,15 +42,15 @@ struct ConstantTensorDescriptor __host__ __device__ constexpr Strides GetStrides() const { return Strides{}; } template - __host__ __device__ constexpr unsigned GetLength(Index) const + __host__ __device__ constexpr unsigned GetLength(Number) const { - return Lengths{}.Get(Index{}); + return Lengths{}.Get(Number{}); } template - __host__ __device__ constexpr unsigned GetStride(Index) const + __host__ __device__ constexpr unsigned GetStride(Number) const { - return Strides{}.Get(Index{}); + return Strides{}.Get(Number{}); } // this is ugly, only for 4d @@ -58,10 +58,10 @@ struct ConstantTensorDescriptor { static_assert(nDim == 4, "nDim is not 4"); - constexpr auto I0 = Index<0>{}; - constexpr auto I1 = Index<1>{}; - constexpr auto I2 = Index<2>{}; - constexpr auto I3 = Index<3>{}; + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3); } @@ -71,10 +71,10 @@ struct ConstantTensorDescriptor { static_assert(nDim == 4, "nDim is not 4"); - constexpr auto I0 = Index<0>{}; - constexpr auto I1 = Index<1>{}; - constexpr auto I2 = Index<2>{}; - constexpr auto I3 = Index<3>{}; + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) + (GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) + 1; @@ -83,10 +83,10 @@ struct ConstantTensorDescriptor // this is ugly, only for 4d __host__ __device__ unsigned Get1dIndex(unsigned n, unsigned c, unsigned h, unsigned w) const { - constexpr auto I0 = Index<0>{}; - constexpr auto I1 = Index<1>{}; - constexpr auto I2 = Index<2>{}; - constexpr auto I3 = Index<3>{}; + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; static_assert(nDim == 4, "nDim is not 4"); return n * GetStride(I0) + c * GetStride(I1) + h * GetStride(I2) + w * GetStride(I3); @@ -120,10 +120,10 @@ __host__ __device__ constexpr auto get_output_4d_tensor_descriptor(InDesc, WeiDe constexpr auto in_desc = InDesc{}; constexpr auto wei_desc = WeiDesc{}; - constexpr auto I0 = Index<0>{}; - constexpr auto I1 = Index<1>{}; - constexpr auto I2 = Index<2>{}; - constexpr auto I3 = Index<3>{}; + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; static_assert(in_desc.GetDimension() == 4, "input nDim is not 4"); static_assert(wei_desc.GetDimension() == 4, "weight nDim is not 4"); @@ -150,10 +150,10 @@ __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s) { constexpr auto desc = TDesc{}; - constexpr auto I0 = Index<0>{}; - constexpr auto I1 = Index<1>{}; - constexpr auto I2 = Index<2>{}; - constexpr auto I3 = Index<3>{}; + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; static_assert(desc.GetDimension() == 4, "dim is not 4"); diff --git a/src/include/gridwise_direct_convolution_1.cuh b/src/include/gridwise_direct_convolution_1.cuh index 52949b49c1..77675ee67e 100644 --- a/src/include/gridwise_direct_convolution_1.cuh +++ b/src/include/gridwise_direct_convolution_1.cuh @@ -27,10 +27,10 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc, OutGlobalDesc, TFloat* __restrict__ p_out_global) { - constexpr auto I0 = Index<0>{}; - constexpr auto I1 = Index<1>{}; - constexpr auto I2 = Index<2>{}; - constexpr auto I3 = Index<3>{}; + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; constexpr auto in_global_desc = InGlobalDesc{}; constexpr auto wei_global_desc = WeiGlobalDesc{}; @@ -120,62 +120,38 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc, } #endif - auto f_set0 = [](TFloat& v) { v = TFloat(0); }; - auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; }; - auto f_accu = [](const TFloat& src, TFloat& dst) { dst += src; }; - // set output tensor in LDS to 0 - blockwise_4d_tensor_op_unary(out_block_desc, p_out_block, f_set0); + blockwise_4d_tensor_set_zero(out_block_desc, + p_out_block); for(unsigned c_block_work_begin = 0; c_block_work_begin < in_global_desc.GetLength(I1); - c_block_work_begin += CPerBlock) + c_block_work_begin += CPerBlock, __syncthreads()) { // copy input tensor to LDS - blockwise_4d_tensor_op_binary(in_block_src_desc, - p_in_global + - in_global_desc.Get1dIndex(n_block_work_begin, - c_block_work_begin, - hi_block_work_begin, - wi_block_work_begin), - in_block_desc, - p_in_block, - f_copy); + blockwise_4d_tensor_copy(in_block_src_desc, + p_in_global + + in_global_desc.Get1dIndex(n_block_work_begin, + c_block_work_begin, + hi_block_work_begin, + wi_block_work_begin), + in_block_desc, + p_in_block); // copy weight tensor to LDS - blockwise_4d_tensor_op_binary( + blockwise_4d_tensor_copy( wei_block_src_desc, p_wei_global + wei_global_desc.Get1dIndex(k_block_work_begin, c_block_work_begin, 0, 0), wei_block_desc, - p_wei_block, - f_copy); + p_wei_block); -#if 1 __syncthreads(); -#endif // blockwise convolution blockwise_convolution( in_block_desc, p_in_block, wei_block_desc, p_wei_block, out_block_desc, p_out_block); - -#if 1 - __syncthreads(); -#endif } // copy output tensor from LDS to device mem - blockwise_4d_tensor_op_binary( + blockwise_4d_tensor_copy( out_block_desc, p_out_block, out_block_src_desc, p_out_global + out_global_desc.Get1dIndex( - n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin), - f_copy); -} + n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin)); +} \ No newline at end of file diff --git a/src/include/gridwise_direct_convolution_2.cuh b/src/include/gridwise_direct_convolution_2.cuh index 64a62466d3..f53577c981 100644 --- a/src/include/gridwise_direct_convolution_2.cuh +++ b/src/include/gridwise_direct_convolution_2.cuh @@ -19,10 +19,6 @@ template __global__ void gridwise_direct_convolution_2(InGlobalDesc, @@ -32,10 +28,10 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc, OutGlobalDesc, TFloat* __restrict__ p_out_global) { - constexpr auto I0 = Index<0>{}; - constexpr auto I1 = Index<1>{}; - constexpr auto I2 = Index<2>{}; - constexpr auto I3 = Index<3>{}; + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; constexpr auto in_global_desc = InGlobalDesc{}; constexpr auto wei_global_desc = WeiGlobalDesc{}; @@ -147,10 +143,6 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc, const unsigned hi_thread_data_offset = ho_thread_data_offset; const unsigned wi_thread_data_offset = wo_thread_data_offset; - // op - auto f_set0 = [](TFloat& v) { v = TFloat(0); }; - auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; }; - #if 0 if(threadIdx.x == 0) { @@ -170,76 +162,54 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc, #endif // set threadwise output tensor to 0 - threadwise_4d_tensor_op_unary( - out_thread_desc, p_out_thread, f_set0); + threadwise_4d_tensor_set_zero(out_thread_desc, p_out_thread); for(unsigned c_block_data_offset = 0; c_block_data_offset < in_global_desc.GetLength(I1); c_block_data_offset += CPerBlock, __syncthreads()) { // copy input tensor to LDS - blockwise_4d_tensor_op_binary( - in_block_global_desc, - p_in_global + in_global_desc.Get1dIndex(n_block_data_offset, - c_block_data_offset, - hi_block_data_offset, - wi_block_data_offset), - in_block_desc, - p_in_block, - f_copy); + blockwise_4d_tensor_copy(in_block_global_desc, + p_in_global + + in_global_desc.Get1dIndex(n_block_data_offset, + c_block_data_offset, + hi_block_data_offset, + wi_block_data_offset), + in_block_desc, + p_in_block); // copy weight tensor to LDS - blockwise_4d_tensor_op_binary( + blockwise_4d_tensor_copy( wei_block_global_desc, p_wei_global + wei_global_desc.Get1dIndex(k_block_data_offset, c_block_data_offset, 0, 0), wei_block_desc, - p_wei_block, - f_copy); + p_wei_block); __syncthreads(); for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread) { // copy input tensor into register - threadwise_4d_tensor_op_binary( - in_thread_block_desc, - p_in_block + in_block_desc.Get1dIndex(n_thread_data_offset, - c_thread_data, - hi_thread_data_offset, - wi_thread_data_offset), - in_thread_desc, - p_in_thread, - f_copy); + threadwise_4d_tensor_copy(in_thread_block_desc, + p_in_block + in_block_desc.Get1dIndex(n_thread_data_offset, + c_thread_data, + hi_thread_data_offset, + wi_thread_data_offset), + in_thread_desc, + p_in_thread); // copy weight tensor into register - threadwise_4d_tensor_op_binary( + threadwise_4d_tensor_copy( wei_thread_block_desc, p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_offset, c_thread_data, 0, 0), wei_thread_desc, - p_wei_thread, - f_copy); + p_wei_thread); // threadwise convolution threadwise_direct_convolution( + threadwise_4d_tensor_copy( out_thread_desc, p_out_thread, out_thread_global_desc, p_out_global + out_global_desc.Get1dIndex(n_block_data_offset + n_thread_data_offset, k_block_data_offset + k_thread_data_offset, ho_block_data_offset + ho_thread_data_offset, - wo_block_data_offset + wo_thread_data_offset), - f_copy); + wo_block_data_offset + wo_thread_data_offset)); } diff --git a/src/include/gridwise_winograd_convolution.cuh b/src/include/gridwise_winograd_convolution.cuh deleted file mode 100644 index 52e5b5059a..0000000000 --- a/src/include/gridwise_winograd_convolution.cuh +++ /dev/null @@ -1,246 +0,0 @@ -#pragma once -#include "constant_tensor_descriptor.cuh" -#include "blockwise_winograd_transform.cuh" -#include "threadwise_winograd_transform.cuh" - -template -__global__ void gridwise_winograd_convolution(InGlobalDesc, - TFloat* const __restrict__ p_in_global, - WeiGlobalDesc, - TFloat* const __restrict__ p_wei_global, - OutGlobalDesc, - TFloat* __restrict__ p_out_global) -{ - constexpr auto I0 = Index<0>{}; - constexpr auto I1 = Index<1>{}; - constexpr auto I2 = Index<2>{}; - constexpr auto I3 = Index<3>{}; - - constexpr auto in_global_desc = InGlobalDesc{}; - constexpr auto wei_global_desc = WeiGlobalDesc{}; - constexpr auto out_global_desc = OutGlobalDesc{}; - - constexpr unsigned S = wei_global_desc.GetLength(I2); - constexpr unsigned R = wei_global_desc.GetLength(I3); - - constexpr unsigned HoPerBlock = OutTileSizeH * YPerBlock; - constexpr unsigned WoPerBlock = OutTileSizeW * XPerBlock; - - constexpr unsigned HiPerBlock = YPerBlock * OutTileSizeH + S - 1; - constexpr unsigned WiPerBlock = XPerBlock * OutTileSizeW + R - 1; - - constexpr unsigned InTileSizeH = OutTileSizeH + S - 1; - constexpr unsigned InTileSizeW = OutTileSizeW + R - 1; - - // divide block work - constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock; - constexpr unsigned KBlockWork = (out_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock; - constexpr unsigned YBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock; - constexpr unsigned XBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock; - - const unsigned block_id = blockIdx.x; - - unsigned itmp = block_id; - const unsigned n_block_work_id = itmp / (KBlockWork * YBlockWork * XBlockWork); - itmp -= n_block_work_id * (KBlockWork * YBlockWork * XBlockWork); - const unsigned k_block_work_id = itmp / (YBlockWork * XBlockWork); - itmp -= k_block_work_id * (YBlockWork * XBlockWork); - const unsigned y_block_work_id = itmp / XBlockWork; - const unsigned x_block_work_id = itmp - y_block_work_id * XBlockWork; - - const unsigned n_block_data_offset = n_block_work_id * NPerBlock; - const unsigned k_block_data_offset = k_block_work_id * KPerBlock; - const unsigned y_block_data_offset = y_block_work_id * YPerBlock; - const unsigned x_block_data_offset = x_block_work_id * XPerBlock; - - const unsigned ho_block_data_offset = y_block_data_offset * OutTileSizeH; - const unsigned wo_block_data_offset = x_block_data_offset * OutTileSizeW; - - const unsigned hi_block_data_offset = ho_block_data_offset; // minus padding - const unsigned wi_block_data_offset = wo_block_data_offset; // minus padding - - // divide thread work - constexpr unsigned NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread; - constexpr unsigned KThreadWork = (KPerBlock + KPerThread - 1) / KPerThread; - constexpr unsigned YThreadWork = YPerBlock; - constexpr unsigned XThreadWork = XPerBlock; - - const unsigned thread_id = threadIdx.x; - - itmp = thread_id; - const unsigned n_thread_work_id = itmp / (KThreadWork * YThreadWork * XThreadWork); - itmp -= n_thread_work_id * (KThreadWork * YThreadWork * XThreadWork); - const unsigned k_thread_work_id = itmp / (YThreadWork * XThreadWork); - itmp -= k_thread_work_id * (YThreadWork * XThreadWork); - const unsigned y_thread_work_id = itmp / XThreadWork; - const unsigned x_thread_work_id = itmp - y_thread_work_id * XThreadWork; - - const unsigned n_thread_data_offset = n_thread_work_id * NPerThread; - const unsigned k_thread_data_offset = k_thread_work_id * KPerThread; - const unsigned y_thread_data_offset = y_thread_work_id; - const unsigned x_thread_data_offset = x_thread_work_id; - - // op - auto f_set0 = [](TFloat& v) { v = TFloat(0); }; - auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; }; - - // block data - constexpr auto in_transform_block_desc = make_ConstantTensorDescriptor( - Sequence{}); - - constexpr auto wei_transform_block_desc = - make_ConstantTensorDescriptor(Sequence{}); - - constexpr unsigned in_transform_block_size = in_transform_block_desc.GetElementSpace(); - constexpr unsigned wei_transform_block_size = wei_transform_block_desc.GetElementSpace(); - - __shared__ TFloat p_in_transform_block[in_transform_block_size]; - __shared__ TFloat p_wei_transform_block[wei_transform_block_size]; - - // thread data - constexpr auto in_transform_thread_block_desc = - make_ConstantTensorDescriptor(Sequence{}, - in_transform_block_desc.GetStrides()); - - constexpr auto wei_transform_thread_block_desc = - make_ConstantTensorDescriptor(Sequence{}, - wei_transform_block_desc.GetStrides()); - - constexpr auto out_transform_thread_desc = - make_ConstantTensorDescriptor(Sequence{}); - - constexpr auto out_thread_desc = make_ConstantTensorDescriptor( - Sequence{}); - - constexpr auto out_thread_global_desc = - make_ConstantTensorDescriptor(out_thread_desc.GetLengths(), out_global_desc.GetStrides()); - - constexpr unsigned out_transform_thread_size = out_transform_thread_desc.GetElementSpace(); - constexpr unsigned out_thread_size = out_thread_desc.GetElementSpace(); - - TFloat p_out_transform_thread[out_transform_thread_size]; - TFloat p_out_thread[out_thread_size]; - -#if 0 - if(blockIdx.x == 0 && threadIdx.x == 0) - { - printf("in_transform_block_size %u, wei_transform_block_size %u, out_transform_thread_size " - "%u, out_thread_size %u \n", - in_transform_block_size, - wei_transform_block_size, - out_transform_thread_size, - out_thread_size); - } -#endif - - // set threadwise output transform tensor to 0 - threadwise_4d_tensor_op_unary( - out_transform_thread_desc, p_out_transform_thread, f_set0); - - for(unsigned c_block_data_offset = 0; c_block_data_offset < in_global_desc.GetLength(I1); - c_block_data_offset += CPerBlock, __syncthreads()) - { - // blockwise transform input - blockwise_winograd_transform_input( - p_in_global + in_global_desc.Get1dIndex(n_block_data_offset, - c_block_data_offset, - hi_block_data_offset, - wi_block_data_offset), - p_in_transform_block); - - // blockwise transform weights - blockwise_winograd_transform_weight( - p_wei_global + - wei_global_desc.Get1dIndex(k_block_data_offset, c_block_data_offset, 0, 0), - p_wei_transform_block); - - for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread) - { - // threadwise point multiplication - threadwise_winograd_calculate_transformed_output< - TFloat, - decltype(in_transform_thread_block_desc), - decltype(wei_transform_thread_block_desc), - decltype(out_transform_thread_desc), - InTileSizeH, - InTileSizeW, - S, - R, - OutTileSizeH, - OutTileSizeW>( - in_transform_thread_block_desc, - p_in_transform_block + - in_transform_block_desc.Get1dIndex(n_thread_data_offset, - c_thread_data, - y_thread_data_offset * InTileSizeH, - x_thread_data_offset * InTileSizeW), - wei_transform_thread_block_desc, - p_wei_transform_block + - wei_transform_block_desc.Get1dIndex(k_thread_data_offset, c_thread_data, 0, 0), - out_transform_thread_desc, - p_out_transform_thread); - } - }; - - // transform back - threadwise_winograd_reverse_transform_output( - out_transform_thread_desc, p_out_transform_thread, out_thread_desc, p_out_thread); - - // copy output tensor from register to global mem - threadwise_4d_tensor_op_binary( - out_thread_desc, - p_out_thread, - out_thread_global_desc, - p_out_global + - out_global_desc.Get1dIndex(n_block_data_offset + n_thread_data_offset, - k_block_data_offset + k_thread_data_offset, - ho_block_data_offset + y_thread_data_offset * OutTileSizeH, - wo_block_data_offset + x_thread_data_offset * OutTileSizeW), - f_copy); -} \ No newline at end of file diff --git a/src/include/threadwise_direct_convolution.cuh b/src/include/threadwise_direct_convolution.cuh index 68e00e901a..3f9fb6be91 100644 --- a/src/include/threadwise_direct_convolution.cuh +++ b/src/include/threadwise_direct_convolution.cuh @@ -9,10 +9,10 @@ __device__ void threadwise_direct_convolution(InDesc, OutDesc, TFloat* __restrict__ p_out) { - constexpr auto I0 = Index<0>{}; - constexpr auto I1 = Index<1>{}; - constexpr auto I2 = Index<2>{}; - constexpr auto I3 = Index<3>{}; + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; constexpr auto in_desc = InDesc{}; constexpr auto wei_desc = WeiDesc{}; diff --git a/src/include/threadwise_tensor_op.cuh b/src/include/threadwise_tensor_op.cuh index 9440957e8c..6781ce57f3 100644 --- a/src/include/threadwise_tensor_op.cuh +++ b/src/include/threadwise_tensor_op.cuh @@ -1,50 +1,34 @@ #pragma once #include "constant_tensor_descriptor.cuh" -#define THREADWISE_TENSOR_OP_METHOD 0 - -#if THREADWISE_TENSOR_OP_METHOD == 0 -template -__device__ void threadwise_4d_tensor_op_unary(DstDesc, TFloat* __restrict__ p_dst, F f) +template +__device__ void threadwise_4d_tensor_pointwise_op_unary(Desc, TFloat* __restrict__ p_dst, F f) { - constexpr auto I0 = Index<0>{}; - constexpr auto I1 = Index<1>{}; - constexpr auto I2 = Index<2>{}; - constexpr auto I3 = Index<3>{}; + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; - constexpr auto dst_desc = DstDesc{}; + constexpr auto desc = Desc{}; #if 0 if(threadIdx.x == 0) { - print_ConstantTensorDescriptor(dst_desc, "threadwise_4d_tensor_op_unary: "); + print_ConstantTensorDescriptor(desc, "threadwise_4d_tensor_op_unary: "); } #endif - for(unsigned did0 = 0; did0 < dst_desc.GetLength(I0); ++did0) + for(unsigned did0 = 0; did0 < desc.GetLength(I0); ++did0) { - for(unsigned did1 = 0; did1 < dst_desc.GetLength(I1); ++did1) + for(unsigned did1 = 0; did1 < desc.GetLength(I1); ++did1) { - for(unsigned did2 = 0; did2 < dst_desc.GetLength(I2); ++did2) + for(unsigned did2 = 0; did2 < desc.GetLength(I2); ++did2) { - for(unsigned did3 = 0; did3 < dst_desc.GetLength(I3); ++did3) + for(unsigned did3 = 0; did3 < desc.GetLength(I3); ++did3) { - const unsigned dindex = - dst_desc.GetStride(I0) * did0 + dst_desc.GetStride(I1) * did1 + - dst_desc.GetStride(I2) * did2 + dst_desc.GetStride(I3) * did3; + const unsigned dindex = desc.Get1dIndex(did0, did1, did2, did3); f(p_dst[dindex]); - -#if 0 - if(threadIdx.x == 0) - { - printf("threadwise_4d_tensor_op_unary: thread id %u, \t" - "dindex %u, p_dst[dindex] %f\n", - threadIdx.x, - dindex, - p_dst[dindex]); - } -#endif } } } @@ -52,13 +36,13 @@ __device__ void threadwise_4d_tensor_op_unary(DstDesc, TFloat* __restrict__ p_ds } template -__device__ void threadwise_4d_tensor_op_binary( +__device__ void threadwise_4d_tensor_pointwise_op_binary( SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f) { - constexpr auto I0 = Index<0>{}; - constexpr auto I1 = Index<1>{}; - constexpr auto I2 = Index<2>{}; - constexpr auto I3 = Index<3>{}; + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; constexpr auto src_desc = SrcDesc{}; constexpr auto dst_desc = DstDesc{}; @@ -81,99 +65,34 @@ __device__ void threadwise_4d_tensor_op_binary( { for(unsigned did3 = 0; did3 < src_desc.GetLength(I3); ++did3) { - const unsigned sindex = - src_desc.GetStride(I0) * did0 + src_desc.GetStride(I1) * did1 + - src_desc.GetStride(I2) * did2 + src_desc.GetStride(I3) * did3; + const unsigned sindex = src_desc.Get1dIndex(did0, did1, did2, did3); - const unsigned dindex = - dst_desc.GetStride(I0) * did0 + dst_desc.GetStride(I1) * did1 + - dst_desc.GetStride(I2) * did2 + dst_desc.GetStride(I3) * did3; + const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3); f(p_src[sindex], p_dst[dindex]); - -#if 0 - if(threadIdx.x == 0) - { - printf("threadwise_4d_tensor_op_binary: thread id %u, \t" - "sindex %u, p_src[sindex] %f, \t" - "dindex %u, p_dst[dindex] %f\n", - threadIdx.x, - sindex, - p_src[sindex], - dindex, - p_dst[dindex]); - } -#endif } } } } } -#endif -#if THREADWISE_TENSOR_OP_METHOD == 1 -template -__device__ void threadwise_4d_tensor_op( - SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f) +template +__device__ void threadwise_4d_tensor_set_zero(Desc, TFloat* __restrict__ p_dst) { - constexpr auto I0 = Index<0>{}; - constexpr auto I1 = Index<1>{}; - constexpr auto I2 = Index<2>{}; - constexpr auto I3 = Index<3>{}; + auto f_set_zero = [](TFloat& v) { v = TFloat(0); }; - constexpr auto src_desc = SrcDesc{}; - constexpr auto dst_desc = DstDesc{}; - - static_assert(is_same::value); - -#if 0 - if(threadIdx.x == 0) - { - print_ConstantTensorDescriptor(src_desc, "threadwise_4d_tensor_op: src_desc: "); - print_ConstantTensorDescriptor(dst_desc, "threadwise_4d_tensor_op: dst_desc: "); - } -#endif - - unsigned sindex = 0; - unsigned dindex = 0; - - for(unsigned did0 = 0; did0 < src_desc.GetLength(I0); ++did0) - { - for(unsigned did1 = 0; did1 < src_desc.GetLength(I1); ++did1) - { - for(unsigned did2 = 0; did2 < src_desc.GetLength(I2); ++did2) - { - for(unsigned did3 = 0; did3 < src_desc.GetLength(I3); ++did3) - { - f(p_src[sindex], p_dst[dindex]); - -#if 0 - if(threadIdx.x == 0) - { - printf("threadwise_4d_tensor_op: 1: thread id %u, \t" - "sindex %u, p_src[sindex] %f, \t" - "dindex %u, p_dst[dindex] %f\n", - threadIdx.x, - sindex, - p_src[sindex], - dindex, - p_dst[dindex]); - } -#endif - sindex += src_desc.GetStride(I3); - dindex += dst_desc.GetStride(I3); - } - - sindex += src_desc.GetStride(I2) - src_desc.GetLength(I3) * src_desc.GetStride(I3); - dindex += dst_desc.GetStride(I2) - dst_desc.GetLength(I3) * dst_desc.GetStride(I3); - } - - sindex += src_desc.GetStride(I1) - src_desc.GetLength(I2) * src_desc.GetStride(I2); - dindex += dst_desc.GetStride(I1) - dst_desc.GetLength(I2) * dst_desc.GetStride(I2); - } - - sindex += src_desc.GetStride(I0) - src_desc.GetLength(I1) * src_desc.GetStride(I1); - dindex += dst_desc.GetStride(I0) - dst_desc.GetLength(I1) * dst_desc.GetStride(I1); - } + threadwise_4d_tensor_pointwise_op_unary( + Desc{}, p_dst, f_set_zero); } -#endif + +template +__device__ void threadwise_4d_tensor_copy(SrcDesc, + TFloat* const __restrict__ p_src, + DstDesc, + TFloat* __restrict__ p_dst) +{ + auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; }; + + threadwise_4d_tensor_pointwise_op_binary( + SrcDesc{}, p_src, DstDesc{}, p_dst, f_copy); +} \ No newline at end of file diff --git a/src/include/threadwise_winograd_transform.cuh b/src/include/threadwise_winograd_transform.cuh deleted file mode 100644 index 8ef2af039b..0000000000 --- a/src/include/threadwise_winograd_transform.cuh +++ /dev/null @@ -1,138 +0,0 @@ -#pragma once -#include "constant_tensor_descriptor.cuh" - -template -__device__ void -threadwise_winograd_calculate_transformed_output(InTransThreadDesc, - TFloat* const __restrict__ p_in_transform_thread, - WeiTransThreadDesc, - TFloat* const __restrict__ p_wei_transform_thread, - OutTransThreadDesc, - TFloat* __restrict__ p_out_transform_thread) -{ - constexpr auto I0 = Index<0>{}; - constexpr auto I1 = Index<1>{}; - constexpr auto I2 = Index<2>{}; - constexpr auto I3 = Index<3>{}; - - constexpr auto in_transform_thread_desc = InTransThreadDesc{}; - constexpr auto wei_transform_thread_desc = WeiTransThreadDesc{}; - constexpr auto out_transform_thread_desc = OutTransThreadDesc{}; - - for(unsigned n = 0; n < out_transform_thread_desc.GetLength(I0); ++n) - { - for(unsigned k = 0; k < out_transform_thread_desc.GetLength(I1); ++k) - { - for(unsigned h = 0; h < out_transform_thread_desc.GetLength(I2); ++h) - { - for(unsigned w = 0; w < out_transform_thread_desc.GetLength(I3); ++w) - { - for(unsigned c = 0; c < wei_transform_thread_desc.GetLength(I1); ++c) - { - const unsigned in_index = in_transform_thread_desc.Get1dIndex(n, c, h, w); - const unsigned wei_index = wei_transform_thread_desc.Get1dIndex(k, c, h, w); - const unsigned out_index = out_transform_thread_desc.Get1dIndex(n, k, h, w); - - p_out_transform_thread[out_index] += - p_wei_transform_thread[wei_index] * p_in_transform_thread[in_index]; - } - } - } - } - } -} - -template -__device__ void -threadwise_winograd_reverse_transform_output(OutTransThreadDesc, - TFloat* const __restrict__ p_out_transform_thread, - OutThreadDesc, - TFloat* __restrict__ p_out_thread) -{ - static_assert(InTileSizeH == 4, "wrong"); - static_assert(InTileSizeW == 4, "wrong"); - static_assert(S == 3, "wrong"); - static_assert(R == 3, "wrong"); - static_assert(OutTileSizeH == 2, "wrong"); - static_assert(OutTileSizeW == 2, "wrong"); - - constexpr auto I0 = Index<0>{}; - constexpr auto I1 = Index<1>{}; - constexpr auto I2 = Index<2>{}; - constexpr auto I3 = Index<3>{}; - - constexpr auto out_transform_thread_desc = OutTransThreadDesc{}; - constexpr auto out_thread_desc = OutThreadDesc{}; - - static_assert(InTileSizeH == out_transform_thread_desc.GetLength(I2), "wrong"); - static_assert(InTileSizeW == out_transform_thread_desc.GetLength(I3), "wrong"); - static_assert(OutTileSizeH == out_thread_desc.GetLength(I2), "wrong"); - static_assert(OutTileSizeW == out_thread_desc.GetLength(I3), "wrong"); - - for(unsigned n = 0; n < out_thread_desc.GetLength(I0); ++n) - { - for(unsigned k = 0; k < out_thread_desc.GetLength(I1); ++k) - { - p_out_thread[out_thread_desc.Get1dIndex(n, k, 0, 0)] = - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 0, 0)] + - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 0, 1)] + - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 0, 2)] + - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 0)] + - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 1)] + - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 2)] + - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 0)] + - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 1)] + - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 2)]; - - p_out_thread[out_thread_desc.Get1dIndex(n, k, 0, 1)] = - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 0, 1)] - - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 0, 2)] - - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 0, 3)] + - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 1)] - - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 2)] - - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 3)] + - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 1)] - - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 2)] - - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 3)]; - - p_out_thread[out_thread_desc.Get1dIndex(n, k, 1, 0)] = - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 0)] + - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 1)] + - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 2)] - - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 0)] - - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 1)] - - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 2)] - - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 3, 0)] - - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 3, 1)] - - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 3, 2)]; - - p_out_thread[out_thread_desc.Get1dIndex(n, k, 1, 1)] = - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 1)] - - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 2)] - - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 3)] - - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 1)] + - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 2)] + - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 3)] - - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 3, 1)] + - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 3, 2)] + - p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 3, 3)]; - } - } -} \ No newline at end of file