From 29496c95d3d04eafae5eb9d0de2b3e4673df3a73 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 15 Nov 2018 20:16:58 -0600 Subject: [PATCH] clean up --- src/include/direct_convolution_2.cuh | 72 +++++++-------- src/include/threadwise_tensor_op.cuh | 127 +++++++++++++++++++++++++-- 2 files changed, 152 insertions(+), 47 deletions(-) diff --git a/src/include/direct_convolution_2.cuh b/src/include/direct_convolution_2.cuh index 9c89bcc7ec..6e8b8580c5 100644 --- a/src/include/direct_convolution_2.cuh +++ b/src/include/direct_convolution_2.cuh @@ -92,47 +92,44 @@ __device__ void blockwise_convolution(InDesc, auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; }; // copy input tensor into register - threadwise_4d_tensor_op( + threadwise_4d_tensor_op_in( in_thread_src_desc, p_in + in_desc.Get1dIndex( n_thread_work_begin, 0, hi_thread_work_begin, wi_thread_work_begin), in_thread_dst_desc, p_in_thread, - f_copy, - false); + f_copy); for(unsigned k_thread_work_begin = 0; k_thread_work_begin < KPerBlock; ++k_thread_work_begin) { // copy weight tensor into register - threadwise_4d_tensor_op( + threadwise_4d_tensor_op_wei( wei_thread_src_desc, p_wei + wei_desc.Get1dIndex(k_thread_work_begin, 0, 0, 0), wei_thread_dst_desc, p_wei_thread, - f_copy, - false); + f_copy); // copy output tensor into register - threadwise_4d_tensor_op(out_thread_src_desc, - p_out + - out_desc.Get1dIndex(n_thread_work_begin, - k_thread_work_begin, - ho_thread_work_begin, - wo_thread_work_begin), - out_thread_dst_desc, - p_out_thread, - f_copy, - false); + threadwise_4d_tensor_op_out( + out_thread_src_desc, + p_out + out_desc.Get1dIndex(n_thread_work_begin, + k_thread_work_begin, + ho_thread_work_begin, + wo_thread_work_begin), + out_thread_dst_desc, + p_out_thread, + f_copy); // threadwise convolution threadwise_direct_convolution(out_thread_dst_desc, - p_out_thread, - out_thread_src_desc, - p_out + - out_desc.Get1dIndex(n_thread_work_begin, - k_thread_work_begin, - ho_thread_work_begin, - wo_thread_work_begin), - f_copy, - false); + threadwise_4d_tensor_op_out( + out_thread_dst_desc, + p_out_thread, + out_thread_src_desc, + p_out + out_desc.Get1dIndex(n_thread_work_begin, + k_thread_work_begin, + ho_thread_work_begin, + wo_thread_work_begin), + f_copy); } } } diff --git a/src/include/threadwise_tensor_op.cuh b/src/include/threadwise_tensor_op.cuh index 521a20ba91..40509abd5e 100644 --- a/src/include/threadwise_tensor_op.cuh +++ b/src/include/threadwise_tensor_op.cuh @@ -5,12 +5,8 @@ #if THREADWISE_TENSOR_OP_METHOD == 0 template -__device__ void threadwise_4d_tensor_op(SrcDesc, - TFloat* const __restrict__ p_src, - DstDesc, - TFloat* __restrict__ p_dst, - F f, - bool flag = false) +__device__ void threadwise_4d_tensor_op_in( + SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f) { constexpr auto I0 = Index<0>{}; constexpr auto I1 = Index<1>{}; @@ -30,9 +26,122 @@ __device__ void threadwise_4d_tensor_op(SrcDesc, } #endif -#if 1 - if(flag && threadIdx.x != 0) - return; + for(unsigned did0 = 0; did0 < src_desc.GetLength(I0); ++did0) + { + for(unsigned did1 = 0; did1 < src_desc.GetLength(I1); ++did1) + { + for(unsigned did2 = 0; did2 < src_desc.GetLength(I2); ++did2) + { + for(unsigned did3 = 0; did3 < src_desc.GetLength(I3); ++did3) + { + const unsigned sindex = + src_desc.GetStride(I0) * did0 + src_desc.GetStride(I1) * did1 + + src_desc.GetStride(I2) * did2 + src_desc.GetStride(I3) * did3; + + const unsigned dindex = + dst_desc.GetStride(I0) * did0 + dst_desc.GetStride(I1) * did1 + + dst_desc.GetStride(I2) * did2 + dst_desc.GetStride(I3) * did3; + + f(p_src[sindex], p_dst[dindex]); + +#if 0 + if(threadIdx.x == 0) + { + printf("threadwise_4d_tensor_op: 1: thread id %u, \t" + "sindex %u, p_src[sindex] %f, \t" + "dindex %u, p_dst[dindex] %f\n", + threadIdx.x, + sindex, + p_src[sindex], + dindex, + p_dst[dindex]); + } +#endif + } + } + } + } +} + +template +__device__ void threadwise_4d_tensor_op_wei( + SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f) +{ + constexpr auto I0 = Index<0>{}; + constexpr auto I1 = Index<1>{}; + constexpr auto I2 = Index<2>{}; + constexpr auto I3 = Index<3>{}; + + constexpr auto src_desc = SrcDesc{}; + constexpr auto dst_desc = DstDesc{}; + + static_assert(is_same::value); + +#if 0 + if(threadIdx.x == 0) + { + print_ConstantTensorDescriptor(src_desc); + print_ConstantTensorDescriptor(dst_desc); + } +#endif + + for(unsigned did0 = 0; did0 < src_desc.GetLength(I0); ++did0) + { + for(unsigned did1 = 0; did1 < src_desc.GetLength(I1); ++did1) + { + for(unsigned did2 = 0; did2 < src_desc.GetLength(I2); ++did2) + { + for(unsigned did3 = 0; did3 < src_desc.GetLength(I3); ++did3) + { + const unsigned sindex = + src_desc.GetStride(I0) * did0 + src_desc.GetStride(I1) * did1 + + src_desc.GetStride(I2) * did2 + src_desc.GetStride(I3) * did3; + + const unsigned dindex = + dst_desc.GetStride(I0) * did0 + dst_desc.GetStride(I1) * did1 + + dst_desc.GetStride(I2) * did2 + dst_desc.GetStride(I3) * did3; + + f(p_src[sindex], p_dst[dindex]); + +#if 0 + if(threadIdx.x == 0) + { + printf("threadwise_4d_tensor_op: 1: thread id %u, \t" + "sindex %u, p_src[sindex] %f, \t" + "dindex %u, p_dst[dindex] %f\n", + threadIdx.x, + sindex, + p_src[sindex], + dindex, + p_dst[dindex]); + } +#endif + } + } + } + } +} + +template +__device__ void threadwise_4d_tensor_op_out( + SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f) +{ + constexpr auto I0 = Index<0>{}; + constexpr auto I1 = Index<1>{}; + constexpr auto I2 = Index<2>{}; + constexpr auto I3 = Index<3>{}; + + constexpr auto src_desc = SrcDesc{}; + constexpr auto dst_desc = DstDesc{}; + + static_assert(is_same::value); + +#if 0 + if(threadIdx.x == 0) + { + print_ConstantTensorDescriptor(src_desc); + print_ConstantTensorDescriptor(dst_desc); + } #endif for(unsigned did0 = 0; did0 < src_desc.GetLength(I0); ++did0)