From 1812666a473f00b333dca8aae5dafa64ccabaf7b Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Wed, 14 Nov 2018 08:55:45 -0600 Subject: [PATCH] improved blockwise_tensor_op --- driver/conv.cu | 6 +- src/include/blockwise_tensor_op.cuh | 485 ++++++++++++++++++----- src/include/device_tensor_descriptor.cuh | 60 --- src/include/direct_convolution.cuh | 4 + src/include/threadwise_tensor_op.cuh | 71 ++++ 5 files changed, 472 insertions(+), 154 deletions(-) delete mode 100644 src/include/device_tensor_descriptor.cuh diff --git a/driver/conv.cu b/driver/conv.cu index 3d00b7bb5e..b3c8716a40 100644 --- a/driver/conv.cu +++ b/driver/conv.cu @@ -26,7 +26,7 @@ struct GeneratorTensor T operator()(Is... is) { #if 1 - return std::rand() / RAND_MAX; + return T(std::rand()) / T(RAND_MAX); #elif 0 std::initializer_list ls = {static_cast(is)...}; @@ -142,8 +142,8 @@ void device_convolution( constexpr unsigned NBlockCopyLen0 = 1; constexpr unsigned NBlockCopyLen1 = 1; - constexpr unsigned NBlockCopyLen2 = 2; - constexpr unsigned NBlockCopyLen3 = 16; + constexpr unsigned NBlockCopyLen2 = 4; + constexpr unsigned NBlockCopyLen3 = 32; constexpr unsigned BlockSize = 128; diff --git a/src/include/blockwise_tensor_op.cuh b/src/include/blockwise_tensor_op.cuh index 768fc0ee23..b311253eaa 100644 --- a/src/include/blockwise_tensor_op.cuh +++ b/src/include/blockwise_tensor_op.cuh @@ -1,94 +1,9 @@ #pragma once #include "constant_tensor_descriptor.cuh" -#if 0 -template -__device__ void blockwise_4d_tensor_op( - SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f) -{ - constexpr auto I0 = Index<0>{}; - constexpr auto I1 = Index<1>{}; - constexpr auto I2 = Index<2>{}; - constexpr auto I3 = Index<3>{}; - - constexpr auto src_desc = SrcDesc{}; - constexpr auto dst_desc = DstDesc{}; - - static_assert(is_same::value); - -#if 0 - if(threadIdx.x == 0) - { - print_ConstantTensorDescriptor(src_desc, "blockwise_4d_tensor_op: src_desc: "); - print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op: dst_desc: "); - } -#endif - - constexpr unsigned NWorkStride3 = 1; - constexpr unsigned NWorkStride2 = NWorkLen3 * NWorkStride3; - constexpr unsigned NWorkStride1 = NWorkLen2 * NWorkStride2; - constexpr unsigned NWorkStride0 = NWorkLen1 * NWorkStride1; - - unsigned itmp = - threadIdx.x; - - const unsigned did0_begin = itmp / NWorkStride0; - - itmp -= did0_begin * NWorkStride0; - - const unsigned did1_begin = itmp / NWorkStride1; - - itmp -= did1_begin * NWorkStride1; - - const unsigned did2_begin = itmp / NWorkStride2; - - itmp -= did2_begin * NWorkStride2; - - const unsigned did3_begin = itmp / NWorkStride3; - - for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NWorkLen0) - { - for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NWorkLen1) - { - for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NWorkLen2) - { - for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NWorkLen3) - { - const unsigned sindex = src_desc.Get1dIndex(did0, did1, did2, did3); - - const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3); - - f(p_src[dindex], p_dst[sindex]); - -#if 0 - // if(threadIdx.x == 0) - { - printf("blockwise_4d_tensor_op: 1: thread id %u, \t" - "sindex %u, p_src[sindex] %f, \t" - "dindex %u, p_dst[dindex] %f\n", - threadIdx.x, - sindex, - p_src[sindex], - dindex, - p_dst[dindex]); - } -#endif - } - } - } - } -} - -#elif 1 +#define BLOCKWISE_TENSOR_OP_METHOD 12 +#if BLOCKWISE_TENSOR_OP_METHOD == 11 template +__device__ void blockwise_4d_tensor_op( + SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f) +{ + constexpr auto I0 = Index<0>{}; + constexpr auto I1 = Index<1>{}; + constexpr auto I2 = Index<2>{}; + constexpr auto I3 = Index<3>{}; + + constexpr auto src_desc = SrcDesc{}; + constexpr auto dst_desc = DstDesc{}; + + static_assert(is_same::value); + + constexpr auto desc = make_ConstantTensorDescriptor(src_desc.GetLengths()); + +#if 0 + if(threadIdx.x == 0) + { + print_ConstantTensorDescriptor(src_desc, "blockwise_4d_tensor_op: src_desc: "); + print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op: dst_desc: "); + } +#endif + + constexpr unsigned NLoop = desc.GetElementSize() / BlockSize; + + for(unsigned iloop = 0; iloop + 1 < NLoop; ++iloop) + { + unsigned is = threadIdx.x + iloop * BlockSize; + + const unsigned did0 = is / desc.GetStride(I0); + + is -= did0 * desc.GetStride(I0); + + const unsigned did1 = is / desc.GetStride(I1); + + is -= did1 * desc.GetStride(I1); + + const unsigned did2 = is / desc.GetStride(I2); + + is -= did2 * desc.GetStride(I2); + + const unsigned did3 = is / desc.GetStride(I3); + + const unsigned sindex = src_desc.Get1dIndex(did0, did1, did2, did3); + + const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3); + + f(p_src[sindex], p_dst[dindex]); + } + + constexpr bool has_tail = (desc.GetElementSize() > NLoop * BlockSize); + + if(has_tail) + { + unsigned is = threadIdx.x + NLoop * BlockSize; + + if(is < desc.GetElementSize()) + { + const unsigned did0 = is / desc.GetStride(I0); + + is -= did0 * desc.GetStride(I0); + + const unsigned did1 = is / desc.GetStride(I1); + + is -= did1 * desc.GetStride(I1); + + const unsigned did2 = is / desc.GetStride(I2); + + is -= did2 * desc.GetStride(I2); + + const unsigned did3 = is / desc.GetStride(I3); + + const unsigned sindex = src_desc.Get1dIndex(did0, did1, did2, did3); + + const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3); + + f(p_src[sindex], p_dst[dindex]); + } + } +} +#endif + +#if BLOCKWISE_TENSOR_OP_METHOD == 21 +template +__device__ void blockwise_4d_tensor_op( + SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f) +{ + constexpr auto I0 = Index<0>{}; + constexpr auto I1 = Index<1>{}; + constexpr auto I2 = Index<2>{}; + constexpr auto I3 = Index<3>{}; + + constexpr auto src_desc = SrcDesc{}; + constexpr auto dst_desc = DstDesc{}; + + static_assert(is_same::value); + + constexpr unsigned NWorkStride3 = 1; + constexpr unsigned NWorkStride2 = NWorkLen3 * NWorkStride3; + constexpr unsigned NWorkStride1 = NWorkLen2 * NWorkStride2; + constexpr unsigned NWorkStride0 = NWorkLen1 * NWorkStride1; + + unsigned itmp = threadIdx.x; + + const unsigned did0_begin = itmp / NWorkStride0; + + itmp -= did0_begin * NWorkStride0; + + const unsigned did1_begin = itmp / NWorkStride1; + + itmp -= did1_begin * NWorkStride1; + + const unsigned did2_begin = itmp / NWorkStride2; + + itmp -= did2_begin * NWorkStride2; + + const unsigned did3_begin = itmp / NWorkStride3; + + for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NWorkLen0) + { + for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NWorkLen1) + { + for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NWorkLen2) + { + for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NWorkLen3) + { + const unsigned sindex = src_desc.Get1dIndex(did0, did1, did2, did3); + + const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3); + + f(p_src[sindex], p_dst[dindex]); + } + } + } + } +} +#endif + +#if BLOCKWISE_TENSOR_OP_METHOD == 22 +template +__device__ void blockwise_4d_tensor_op( + SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f) +{ + constexpr auto I0 = Index<0>{}; + constexpr auto I1 = Index<1>{}; + constexpr auto I2 = Index<2>{}; + constexpr auto I3 = Index<3>{}; + + constexpr auto src_desc = SrcDesc{}; + constexpr auto dst_desc = DstDesc{}; + + static_assert(is_same::value); + + constexpr unsigned NWorkStride3 = 1; + constexpr unsigned NWorkStride2 = NWorkLen3 * NWorkStride3; + constexpr unsigned NWorkStride1 = NWorkLen2 * NWorkStride2; + constexpr unsigned NWorkStride0 = NWorkLen1 * NWorkStride1; + + unsigned itmp = threadIdx.x; + + const unsigned did0_begin = itmp / NWorkStride0; + + itmp -= did0_begin * NWorkStride0; + + const unsigned did1_begin = itmp / NWorkStride1; + + itmp -= did1_begin * NWorkStride1; + + const unsigned did2_begin = itmp / NWorkStride2; + + itmp -= did2_begin * NWorkStride2; + + const unsigned did3_begin = itmp / NWorkStride3; + + unsigned sindex = src_desc.Get1dIndex(did0_begin, did1_begin, did2_begin, did3_begin); + unsigned dindex = dst_desc.Get1dIndex(did0_begin, did1_begin, did2_begin, did3_begin); + + for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NWorkLen0) + { + const unsigned sindex_save0 = sindex; + const unsigned dindex_save0 = dindex; + + for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NWorkLen1) + { + const unsigned sindex_save1 = sindex; + const unsigned dindex_save1 = dindex; + + for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NWorkLen2) + { + const unsigned sindex_save2 = sindex; + const unsigned dindex_save2 = dindex; + + for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NWorkLen3) + { + f(p_src[sindex], p_dst[dindex]); + + sindex += NWorkLen3 * src_desc.GetStride(I3); + dindex += NWorkLen3 * dst_desc.GetStride(I3); + } + + sindex = sindex_save2 + NWorkLen2 * src_desc.GetStride(I2); + dindex = dindex_save2 + NWorkLen2 * dst_desc.GetStride(I2); + } + + sindex = sindex_save1 + NWorkLen1 * src_desc.GetStride(I1); + dindex = dindex_save1 + NWorkLen1 * dst_desc.GetStride(I1); + } + + sindex = sindex_save0 + NWorkLen0 * src_desc.GetStride(I0); + dindex = dindex_save0 + NWorkLen0 * dst_desc.GetStride(I0); + } +} +#endif + +#if BLOCKWISE_TENSOR_OP_METHOD == 23 +template +__device__ void blockwise_4d_tensor_op( + SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f) +{ + constexpr auto I0 = Index<0>{}; + constexpr auto I1 = Index<1>{}; + constexpr auto I2 = Index<2>{}; + constexpr auto I3 = Index<3>{}; + + constexpr auto src_desc = SrcDesc{}; + constexpr auto dst_desc = DstDesc{}; + + static_assert(is_same::value); + + constexpr unsigned NWorkStride3 = 1; + constexpr unsigned NWorkStride2 = NWorkLen3 * NWorkStride3; + constexpr unsigned NWorkStride1 = NWorkLen2 * NWorkStride2; + constexpr unsigned NWorkStride0 = NWorkLen1 * NWorkStride1; + + unsigned itmp = threadIdx.x; + + const unsigned did0_begin = itmp / NWorkStride0; + + itmp -= did0_begin * NWorkStride0; + + const unsigned did1_begin = itmp / NWorkStride1; + + itmp -= did1_begin * NWorkStride1; + + const unsigned did2_begin = itmp / NWorkStride2; + + itmp -= did2_begin * NWorkStride2; + + const unsigned did3_begin = itmp / NWorkStride3; + + unsigned sindex = src_desc.Get1dIndex(did0_begin, did1_begin, did2_begin, did3_begin); + unsigned dindex = dst_desc.Get1dIndex(did0_begin, did1_begin, did2_begin, did3_begin); + + for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NWorkLen0) + { + unsigned i1 = 0; + for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NWorkLen1) + { + unsigned i2 = 0; + for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NWorkLen2) + { + unsigned i3 = 0; + for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NWorkLen3) + { + f(p_src[sindex], p_dst[dindex]); + + sindex += NWorkLen3 * src_desc.GetStride(I3); + dindex += NWorkLen3 * dst_desc.GetStride(I3); + + ++i3; + } + + sindex += + NWorkLen2 * src_desc.GetStride(I2) - i3 * NWorkLen3 * src_desc.GetStride(I3); + dindex += + NWorkLen2 * dst_desc.GetStride(I2) - i3 * NWorkLen3 * dst_desc.GetStride(I3); + + ++i2; + } + + sindex += NWorkLen1 * src_desc.GetStride(I1) - i2 * NWorkLen2 * src_desc.GetStride(I2); + dindex += NWorkLen1 * dst_desc.GetStride(I1) - i2 * NWorkLen2 * dst_desc.GetStride(I2); + + ++i1; + } + + sindex += NWorkLen0 * src_desc.GetStride(I0) - i1 * NWorkLen1 * src_desc.GetStride(I1); + dindex += NWorkLen0 * dst_desc.GetStride(I0) - i1 * NWorkLen1 * dst_desc.GetStride(I1); + } +} +#endif + +#if BLOCKWISE_TENSOR_OP_METHOD == 31 +template +__device__ void blockwise_4d_tensor_op( + SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f) +{ + constexpr auto I0 = Index<0>{}; + constexpr auto I1 = Index<1>{}; + constexpr auto I2 = Index<2>{}; + constexpr auto I3 = Index<3>{}; + + constexpr auto src_desc = SrcDesc{}; + constexpr auto dst_desc = DstDesc{}; + + static_assert(is_same::value); + + constexpr unsigned NWorkStride3 = 1; + constexpr unsigned NWorkStride2 = NWorkLen3 * NWorkStride3; + constexpr unsigned NWorkStride1 = NWorkLen2 * NWorkStride2; + constexpr unsigned NWorkStride0 = NWorkLen1 * NWorkStride1; + + unsigned itmp = threadIdx.x; + + const unsigned did0_begin = itmp / NWorkStride0; + + itmp -= did0_begin * NWorkStride0; + + const unsigned did1_begin = itmp / NWorkStride1; + + itmp -= did1_begin * NWorkStride1; + + const unsigned did2_begin = itmp / NWorkStride2; + + itmp -= did2_begin * NWorkStride2; + + const unsigned did3_begin = itmp / NWorkStride3; + + for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NWorkLen0) + { + for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NWorkLen1) + { + for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NWorkLen2) + { + for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NWorkLen3) + { + const unsigned sindex = src_desc.Get1dIndex(did0, did1, did2, did3); + + const unsigned dindex = dst_desc.Get1dIndex(did0, did1, did2, did3); + + f(p_src[sindex], p_dst[dindex]); + } + } + } + } +} +#endif diff --git a/src/include/device_tensor_descriptor.cuh b/src/include/device_tensor_descriptor.cuh deleted file mode 100644 index 403e547a33..0000000000 --- a/src/include/device_tensor_descriptor.cuh +++ /dev/null @@ -1,60 +0,0 @@ -#pragma once -#include -#include "constant_tensor_descriptor.cuh" -#include "helper_cuda.h" -#include "tensor.hpp" - -template -struct DeviceTensorDescriptor -{ - __host__ __device__ DeviceTensorDescriptor() = default; - - __host__ DeviceTensorDescriptor(const TensorDescriptor& host_desc) - { - assert(NDim == host_desc.GetDimension()); - std::copy(host_desc.GetLengths().begin(), host_desc.GetLengths().end(), mpLengths); - std::copy(host_desc.GetStrides().begin(), host_desc.GetStrides().end(), mpStrides); - } - - __host__ __device__ unsigned GetLength(unsigned i) const { return mpLengths[i]; } - - __host__ __device__ unsigned GetStride(unsigned i) const { return mpStrides[i]; } - - // this is ugly, only for 4d - __host__ __device__ unsigned Get1dIndex(unsigned n, unsigned c, unsigned h, unsigned w) const - { - return n * mpStrides[0] + c * mpStrides[1] + h * mpStrides[2] + w * mpStrides[3]; - } - - unsigned mpLengths[NDim]; - unsigned mpStrides[NDim]; -}; - -// this is ugly, only for 4d -template -__host__ __device__ auto make_DeviceTensorDescriptor(TConstTensorDesc) -{ - static_assert(TConstTensorDesc::nDim == 4, "nDim is not 4"); - - constexpr auto I0 = Index<0>{}; - constexpr auto I1 = Index<1>{}; - constexpr auto I2 = Index<2>{}; - constexpr auto I3 = Index<3>{}; - constexpr auto const_desc = TConstTensorDesc{}; - - constexpr auto ndim = const_desc.GetDimension(); - - auto desc = DeviceTensorDescriptor{}; - - desc.mpLengths[0] = const_desc.GetLength(I0); - desc.mpLengths[1] = const_desc.GetLength(I1); - desc.mpLengths[2] = const_desc.GetLength(I2); - desc.mpLengths[3] = const_desc.GetLength(I3); - - desc.mpStrides[0] = const_desc.GetStride(I0); - desc.mpStrides[1] = const_desc.GetStride(I1); - desc.mpStrides[2] = const_desc.GetStride(I2); - desc.mpStrides[3] = const_desc.GetStride(I3); - - return desc; -} diff --git a/src/include/direct_convolution.cuh b/src/include/direct_convolution.cuh index 461842723a..19aafece97 100644 --- a/src/include/direct_convolution.cuh +++ b/src/include/direct_convolution.cuh @@ -283,6 +283,7 @@ __global__ void gridwise_convolution(InDesc, { auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; }; +#if 1 // copy input tensor to LDS blockwise_4d_tensor_op __device__ void threadwise_4d_tensor_op( SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f) @@ -59,3 +62,71 @@ __device__ void threadwise_4d_tensor_op( } } } +#endif + +#if THREADWISE_TENSOR_OP_METHOD == 1 +template +__device__ void threadwise_4d_tensor_op( + SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f) +{ + constexpr auto I0 = Index<0>{}; + constexpr auto I1 = Index<1>{}; + constexpr auto I2 = Index<2>{}; + constexpr auto I3 = Index<3>{}; + + constexpr auto src_desc = SrcDesc{}; + constexpr auto dst_desc = DstDesc{}; + + static_assert(is_same::value); + +#if 0 + if(threadIdx.x == 0) + { + print_ConstantTensorDescriptor(src_desc); + print_ConstantTensorDescriptor(dst_desc); + } +#endif + + unsigned sindex = 0; + unsigned dindex = 0; + + for(unsigned did0 = 0; did0 < src_desc.GetLength(I0); ++did0) + { + for(unsigned did1 = 0; did1 < src_desc.GetLength(I1); ++did1) + { + for(unsigned did2 = 0; did2 < src_desc.GetLength(I2); ++did2) + { + for(unsigned did3 = 0; did3 < src_desc.GetLength(I3); ++did3) + { + f(p_src[sindex], p_dst[dindex]); + +#if 0 + if(threadIdx.x == 0) + { + printf("threadwise_4d_tensor_op: 1: thread id %u, \t" + "sindex %u, p_src[sindex] %f, \t" + "dindex %u, p_dst[dindex] %f\n", + threadIdx.x, + sindex, + p_src[sindex], + dindex, + p_dst[dindex]); + } +#endif + sindex += src_desc.GetStride(I3); + dindex += dst_desc.GetStride(I3); + } + + sindex += src_desc.GetStride(I2) - src_desc.GetLength(I3) * src_desc.GetStride(I3); + dindex += dst_desc.GetStride(I2) - dst_desc.GetLength(I3) * dst_desc.GetStride(I3); + } + + sindex += src_desc.GetStride(I1) - src_desc.GetLength(I2) * src_desc.GetStride(I2); + dindex += dst_desc.GetStride(I1) - dst_desc.GetLength(I2) * dst_desc.GetStride(I2); + } + + sindex += src_desc.GetStride(I0) - src_desc.GetLength(I1) * src_desc.GetStride(I1); + dindex += dst_desc.GetStride(I0) - dst_desc.GetLength(I1) * dst_desc.GetStride(I1); + } +} +#endif