From 21c918162e81211d2b7ec3a555accef1235957eb Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Fri, 4 Jan 2019 14:48:57 -0600 Subject: [PATCH] added blockwise tensor reorder operation --- driver/conv.cu | 5 +- src/include/blockwise_direct_convolution.cuh | 2 +- src/include/blockwise_tensor_op.cuh | 139 ++++++++++++------ src/include/constant_tensor_descriptor.cuh | 32 +++- src/include/gridwise_direct_convolution_2.cuh | 2 +- 5 files changed, 129 insertions(+), 51 deletions(-) diff --git a/driver/conv.cu b/driver/conv.cu index 17ef79f405..0dc8a03b48 100644 --- a/driver/conv.cu +++ b/driver/conv.cu @@ -7,6 +7,7 @@ #include "constant_tensor_descriptor.cuh" #include "device_direct_convolution_1.cuh" #include "device_direct_convolution_2.cuh" +//#include "device_implicit_gemm_convolution.cuh" //#include "device_winograd_convolution.cuh" struct GeneratorTensor_1 @@ -366,7 +367,7 @@ int main() auto in_desc = make_ConstantTensorDescriptor(Sequence{}); auto wei_desc = make_ConstantTensorDescriptor(Sequence{}); - auto out_desc = get_output_4d_tensor_descriptor(in_desc, wei_desc); + auto out_desc = get_convolution_output_4d_tensor_descriptor(in_desc, wei_desc); ostream_ConstantTensorDescriptor(in_desc, std::cout << "in_desc: "); ostream_ConstantTensorDescriptor(wei_desc, std::cout << "wei_desc: "); @@ -393,6 +394,8 @@ int main() device_direct_convolution_1(in_desc, in, wei_desc, wei, out_desc, out_device); #elif 1 device_direct_convolution_2(in_desc, in, wei_desc, wei, out_desc, out_device); +#elif 0 + device_implicit_gemm_convolution(in_desc, in, wei_desc, wei, out_desc, out_device); #elif 0 device_winograd_convolution(in_desc, in, wei_desc, wei, out_desc, out_device); #endif diff --git a/src/include/blockwise_direct_convolution.cuh b/src/include/blockwise_direct_convolution.cuh index fec85eb09c..dd849eb4ca 100644 --- a/src/include/blockwise_direct_convolution.cuh +++ b/src/include/blockwise_direct_convolution.cuh @@ -59,7 +59,7 @@ __device__ void blockwise_direct_convolution(InBlockDesc, make_ConstantTensorDescriptor(Sequence{}); constexpr auto out_thread_desc = - get_output_4d_tensor_descriptor(in_thread_desc, wei_thread_desc); + get_convolution_output_4d_tensor_descriptor(in_thread_desc, wei_thread_desc); constexpr auto in_thread_block_desc = make_ConstantTensorDescriptor(in_thread_desc.GetLengths(), in_block_desc.GetStrides()); diff --git a/src/include/blockwise_tensor_op.cuh b/src/include/blockwise_tensor_op.cuh index f404c1d2dc..040727ebb9 100644 --- a/src/include/blockwise_tensor_op.cuh +++ b/src/include/blockwise_tensor_op.cuh @@ -2,7 +2,8 @@ #include "constant_tensor_descriptor.cuh" template -__device__ void blockwise_4d_tensor_pointwise_op_unary(DstDesc, TFloat* __restrict__ p_dst, F f) +__device__ void +blockwise_4d_tensor_pointwise_operation_unary(DstDesc, TFloat* __restrict__ p_dst, F f) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -75,82 +76,94 @@ __device__ void blockwise_4d_tensor_pointwise_op_unary(DstDesc, TFloat* __restri } } -template -__device__ void blockwise_4d_tensor_pointwise_op_binary( - DescA, TFloat* const __restrict__ p_a, DescB, TFloat* __restrict__ p_b, DescRef, F f) +template +__device__ void +blockwise_4d_tensor_pointwise_operation_binary_reorder(SrcDesc, + TFloat* const __restrict__ p_src, + DstDesc, + TFloat* __restrict__ p_dst, + RefDesc, + Reorder, + F f) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto desc_a = DescA{}; - constexpr auto desc_b = DescB{}; - constexpr auto desc_ref = DescRef{}; + constexpr unsigned IT0 = Reorder{}.Get(I0); + constexpr unsigned IT1 = Reorder{}.Get(I1); + constexpr unsigned IT2 = Reorder{}.Get(I2); + constexpr unsigned IT3 = Reorder{}.Get(I3); -#if 0 - if(threadIdx.x == 0) - { - print_ConstantTensorDescriptor(desc_a, "blockwise_4d_tensor_op_binary: desc_a: "); - print_ConstantTensorDescriptor(desc_b, "blockwise_4d_tensor_op_binary: desc_b: "); - print_ConstantTensorDescriptor(desc_ref, "blockwise_4d_tensor_op_binary: desc_ref: "); - } -#endif + constexpr auto src_desc = SrcDesc{}; + constexpr auto dst_desc = DstDesc{}; + constexpr auto ref_desc = RefDesc{}; - constexpr unsigned NLoop = desc_ref.GetElementSize() / BlockSize; + constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize; for(unsigned iloop = 0; iloop < NLoop; ++iloop) { unsigned is = threadIdx.x + iloop * BlockSize; - const unsigned did0 = is / desc_ref.GetStride(I0); + unsigned did[4]; - is -= did0 * desc_ref.GetStride(I0); + did[0] = is / ref_desc.GetStride(I0); - const unsigned did1 = is / desc_ref.GetStride(I1); + is -= did[0] * ref_desc.GetStride(I0); - is -= did1 * desc_ref.GetStride(I1); + did[1] = is / ref_desc.GetStride(I1); - const unsigned did2 = is / desc_ref.GetStride(I2); + is -= did[1] * ref_desc.GetStride(I1); - is -= did2 * desc_ref.GetStride(I2); + did[2] = is / ref_desc.GetStride(I2); - const unsigned did3 = is / desc_ref.GetStride(I3); + is -= did[2] * ref_desc.GetStride(I2); - const unsigned aindex = desc_a.Get1dIndex(did0, did1, did2, did3); + did[3] = is / ref_desc.GetStride(I3); - const unsigned bindex = desc_b.Get1dIndex(did0, did1, did2, did3); + const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]); - f(p_a[aindex], p_b[bindex]); + const unsigned bindex = dst_desc.Get1dIndex(did[IT0], did[IT1], did[IT2], did[IT3]); + + f(p_src[aindex], p_dst[bindex]); } - constexpr bool has_tail = (desc_ref.GetElementSize() > NLoop * BlockSize); + constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize); if(has_tail) { unsigned is = threadIdx.x + NLoop * BlockSize; - if(is < desc_ref.GetElementSize()) + if(is < ref_desc.GetElementSize()) { - const unsigned did0 = is / desc_ref.GetStride(I0); + unsigned did[4]; - is -= did0 * desc_ref.GetStride(I0); + did[0] = is / ref_desc.GetStride(I0); - const unsigned did1 = is / desc_ref.GetStride(I1); + is -= did[0] * ref_desc.GetStride(I0); - is -= did1 * desc_ref.GetStride(I1); + did[1] = is / ref_desc.GetStride(I1); - const unsigned did2 = is / desc_ref.GetStride(I2); + is -= did[1] * ref_desc.GetStride(I1); - is -= did2 * desc_ref.GetStride(I2); + did[2] = is / ref_desc.GetStride(I2); - const unsigned did3 = is / desc_ref.GetStride(I3); + is -= did[2] * ref_desc.GetStride(I2); - const unsigned aindex = desc_a.Get1dIndex(did0, did1, did2, did3); + did[3] = is / ref_desc.GetStride(I3); - const unsigned bindex = desc_b.Get1dIndex(did0, did1, did2, did3); + const unsigned aindex = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]); - f(p_a[aindex], p_b[bindex]); + const unsigned bindex = dst_desc.Get1dIndex(did[IT0], did[IT1], did[IT2], did[IT3]); + + f(p_src[aindex], p_dst[bindex]); } } } @@ -160,21 +173,53 @@ __device__ void blockwise_4d_tensor_set_zero(DstDesc, TFloat* __restrict__ p_dst { auto f_set_zero = [](TFloat& v) { v = TFloat(0); }; - blockwise_4d_tensor_pointwise_op_unary( + blockwise_4d_tensor_pointwise_operation_unary( DstDesc{}, p_dst, f_set_zero); } +template +__device__ void blockwise_4d_tensor_copy_reorder(SrcDesc, + TFloat* const __restrict__ p_src, + DstDesc, + TFloat* __restrict__ p_dst, + RefDesc, + Reorder) +{ + auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; }; + + blockwise_4d_tensor_pointwise_operation_binary_reorder( + SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, Reorder{}, f_copy); +} + template __device__ void blockwise_4d_tensor_copy( SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, RefDesc) { - auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; }; + constexpr auto reorder = Sequence<0, 1, 2, 3>{}; - blockwise_4d_tensor_pointwise_op_binary( - SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, f_copy); + blockwise_4d_tensor_copy_reorder( + SrcDesc{}, p_src, DstDesc{}, p_dst, RefDesc{}, reorder); +} + +template +__device__ void blockwise_4d_tensor_im2col( + ImDesc, const __restrict__ TFloat* p_im, WDesc, ColDesc, __restrict__ TFloat* p_col) +{ + // do nothing } diff --git a/src/include/constant_tensor_descriptor.cuh b/src/include/constant_tensor_descriptor.cuh index 5ce3f0deaa..de7542b1c3 100644 --- a/src/include/constant_tensor_descriptor.cuh +++ b/src/include/constant_tensor_descriptor.cuh @@ -22,6 +22,14 @@ struct Sequence { return mData[I]; } + + template + __host__ __device__ constexpr auto GetNumber(Number) const + { + constexpr unsigned N = Get(I); + + return Number{}; + } }; template @@ -113,9 +121,31 @@ __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Stride return ConstantTensorDescriptor{}; } +// this is ugly, only for 4d +template +__host__ __device__ constexpr auto get_reordered_4d_tensor_descriptor(Desc, Reorder) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto IT0 = Reorder{}.GetNumber(I0); + constexpr auto IT1 = Reorder{}.GetNumber(I1); + constexpr auto IT2 = Reorder{}.GetNumber(I2); + constexpr auto IT3 = Reorder{}.GetNumber(I3); + + constexpr unsigned L0 = Desc{}.GetLength(IT0); + constexpr unsigned L1 = Desc{}.GetLength(IT1); + constexpr unsigned L2 = Desc{}.GetLength(IT2); + constexpr unsigned L3 = Desc{}.GetLength(IT3); + + return make_ConstantTensorDescriptor(Sequence{}); +} + // this is ugly, only for 4d template -__host__ __device__ constexpr auto get_output_4d_tensor_descriptor(InDesc, WeiDesc) +__host__ __device__ constexpr auto get_convolution_output_4d_tensor_descriptor(InDesc, WeiDesc) { constexpr auto in_desc = InDesc{}; constexpr auto wei_desc = WeiDesc{}; diff --git a/src/include/gridwise_direct_convolution_2.cuh b/src/include/gridwise_direct_convolution_2.cuh index 3f8d1e50ff..1917fba013 100644 --- a/src/include/gridwise_direct_convolution_2.cuh +++ b/src/include/gridwise_direct_convolution_2.cuh @@ -70,7 +70,7 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc, Sequence{}, wei_block_desc.GetStrides()); constexpr auto out_thread_desc = - get_output_4d_tensor_descriptor(in_thread_block_desc, wei_thread_block_desc); + get_convolution_output_4d_tensor_descriptor(in_thread_block_desc, wei_thread_block_desc); // register TFloat p_out_thread[out_thread_desc.GetElementSpace()];