From 73480fee3635310aedbbec68b6084c94cfd2457d Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 15 Nov 2018 23:53:23 -0600 Subject: [PATCH] refactor --- driver/conv.cu | 91 +------------------- driver/device_direct_convolution_2.cuh | 91 ++++++++++++++++++++ src/include/direct_convolution_2.cuh | 32 +++---- src/include/threadwise_tensor_op.cuh | 110 ++----------------------- 4 files changed, 115 insertions(+), 209 deletions(-) create mode 100644 driver/device_direct_convolution_2.cuh diff --git a/driver/conv.cu b/driver/conv.cu index 4a62be7fc4..2eb32528a4 100644 --- a/driver/conv.cu +++ b/driver/conv.cu @@ -5,7 +5,7 @@ #include "nvToolsExt.h" #include "tensor.hpp" #include "constant_tensor_descriptor.cuh" -#include "direct_convolution_2.cuh" +#include "device_direct_convolution_2.cuh" template struct GeneratorConstant @@ -109,95 +109,6 @@ void host_convolution(const Tensor& in, const Tensor& wei, Tensor& out) f_par(std::thread::hardware_concurrency()); } -template -void device_convolution( - InDesc, const Tensor& in, WeiDesc, const Tensor& wei, OutDesc, Tensor& out) -{ - std::size_t data_sz = sizeof(T); - DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace()); - DeviceMem wei_device_buf(data_sz * wei.mDesc.GetElementSpace()); - DeviceMem out_device_buf(data_sz * out.mDesc.GetElementSpace()); - - int num_thread = std::thread::hardware_concurrency(); - - in_device_buf.ToDevice(in.mData.data()); - wei_device_buf.ToDevice(wei.mData.data()); - out_device_buf.ToDevice(out.mData.data()); - - constexpr auto I0 = Index<0>{}; - constexpr auto I1 = Index<1>{}; - constexpr auto I2 = Index<2>{}; - constexpr auto I3 = Index<3>{}; - - constexpr auto in_desc = InDesc{}; - constexpr auto wei_desc = WeiDesc{}; - constexpr auto out_desc = OutDesc{}; - constexpr unsigned OutTileSizeH = 2; - constexpr unsigned OutTileSizeW = 2; - constexpr unsigned NPerBlock = 1; - constexpr unsigned KPerBlock = 4; - constexpr unsigned CPerBlock = 2; - constexpr unsigned YPerBlock = 8; - constexpr unsigned XPerBlock = 16; - - constexpr unsigned NBlockOpLen0 = 1; - constexpr unsigned NBlockOpLen1 = 1; - constexpr unsigned NBlockOpLen2 = 4; - constexpr unsigned NBlockOpLen3 = 32; - - constexpr unsigned BlockSize = 128; - - constexpr unsigned GridSize = (out_desc.GetLength(I0) / NPerBlock) * - (out_desc.GetLength(I1) / KPerBlock) * - (out_desc.GetLength(I2) / (OutTileSizeH * YPerBlock)) * - (out_desc.GetLength(I3) / (OutTileSizeW * XPerBlock)); - - dim3 block_dim(BlockSize); - dim3 grid_dim(GridSize); - - printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); - - cudaEvent_t start, stop; - float elapsedTime; - - cudaEventCreate(&start); - cudaEventRecord(start, 0); - - gridwise_convolution - <<>>(InDesc{}, - static_cast(in_device_buf.GetDeviceBuffer()), - WeiDesc{}, - static_cast(wei_device_buf.GetDeviceBuffer()), - OutDesc{}, - static_cast(out_device_buf.GetDeviceBuffer())); - - cudaEventCreate(&stop); - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); - - cudaEventElapsedTime(&elapsedTime, start, stop); - printf("Elapsed time : %f ms\n", elapsedTime); - - checkCudaErrors(cudaGetLastError()); - out_device_buf.FromDevice(out.mData.data()); -} - int main() { #if 0 diff --git a/driver/device_direct_convolution_2.cuh b/driver/device_direct_convolution_2.cuh new file mode 100644 index 0000000000..7fa644bdf5 --- /dev/null +++ b/driver/device_direct_convolution_2.cuh @@ -0,0 +1,91 @@ +#pragma once +#include "direct_convolution_2.cuh" + +template +void device_convolution( + InDesc, const Tensor& in, WeiDesc, const Tensor& wei, OutDesc, Tensor& out) +{ + std::size_t data_sz = sizeof(T); + DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace()); + DeviceMem wei_device_buf(data_sz * wei.mDesc.GetElementSpace()); + DeviceMem out_device_buf(data_sz * out.mDesc.GetElementSpace()); + + int num_thread = std::thread::hardware_concurrency(); + + in_device_buf.ToDevice(in.mData.data()); + wei_device_buf.ToDevice(wei.mData.data()); + out_device_buf.ToDevice(out.mData.data()); + + constexpr auto I0 = Index<0>{}; + constexpr auto I1 = Index<1>{}; + constexpr auto I2 = Index<2>{}; + constexpr auto I3 = Index<3>{}; + + constexpr auto in_desc = InDesc{}; + constexpr auto wei_desc = WeiDesc{}; + constexpr auto out_desc = OutDesc{}; + constexpr unsigned OutTileSizeH = 2; + constexpr unsigned OutTileSizeW = 2; + constexpr unsigned NPerBlock = 1; + constexpr unsigned KPerBlock = 4; + constexpr unsigned CPerBlock = 2; + constexpr unsigned YPerBlock = 8; + constexpr unsigned XPerBlock = 16; + + constexpr unsigned NBlockOpLen0 = 1; + constexpr unsigned NBlockOpLen1 = 1; + constexpr unsigned NBlockOpLen2 = 4; + constexpr unsigned NBlockOpLen3 = 32; + + constexpr unsigned BlockSize = 128; + + constexpr unsigned GridSize = (out_desc.GetLength(I0) / NPerBlock) * + (out_desc.GetLength(I1) / KPerBlock) * + (out_desc.GetLength(I2) / (OutTileSizeH * YPerBlock)) * + (out_desc.GetLength(I3) / (OutTileSizeW * XPerBlock)); + + dim3 block_dim(BlockSize); + dim3 grid_dim(GridSize); + + printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); + + cudaEvent_t start, stop; + float elapsedTime; + + cudaEventCreate(&start); + cudaEventRecord(start, 0); + + gridwise_convolution + <<>>(InDesc{}, + static_cast(in_device_buf.GetDeviceBuffer()), + WeiDesc{}, + static_cast(wei_device_buf.GetDeviceBuffer()), + OutDesc{}, + static_cast(out_device_buf.GetDeviceBuffer())); + + cudaEventCreate(&stop); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + + cudaEventElapsedTime(&elapsedTime, start, stop); + printf("Elapsed time : %f ms\n", elapsedTime); + + checkCudaErrors(cudaGetLastError()); + out_device_buf.FromDevice(out.mData.data()); +} diff --git a/src/include/direct_convolution_2.cuh b/src/include/direct_convolution_2.cuh index 50e3b0e9dc..ff79871370 100644 --- a/src/include/direct_convolution_2.cuh +++ b/src/include/direct_convolution_2.cuh @@ -92,10 +92,10 @@ __device__ void blockwise_convolution(InDesc, auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; }; // copy input tensor into register - threadwise_4d_tensor_op_in( + threadwise_4d_tensor_op_binary( in_thread_src_desc, p_in_lds + in_desc.Get1dIndex( n_thread_work_begin, 0, hi_thread_work_begin, wi_thread_work_begin), @@ -107,10 +107,10 @@ __device__ void blockwise_convolution(InDesc, ++k_thread_work_begin) { // copy weight tensor into register - threadwise_4d_tensor_op_wei( + threadwise_4d_tensor_op_binary( wei_thread_src_desc, p_wei_lds + wei_desc.Get1dIndex(k_thread_work_begin, 0, 0, 0), wei_thread_dst_desc, @@ -118,10 +118,10 @@ __device__ void blockwise_convolution(InDesc, f_copy); // copy output tensor into register - threadwise_4d_tensor_op_out( + threadwise_4d_tensor_op_binary( out_thread_src_desc, p_out_lds + out_desc.Get1dIndex(n_thread_work_begin, k_thread_work_begin, @@ -143,10 +143,10 @@ __device__ void blockwise_convolution(InDesc, p_out_thread); // accumulate output tensor into LDS - threadwise_4d_tensor_op_out( + threadwise_4d_tensor_op_binary( out_thread_dst_desc, p_out_thread, out_thread_src_desc, diff --git a/src/include/threadwise_tensor_op.cuh b/src/include/threadwise_tensor_op.cuh index 40509abd5e..78b626dfe1 100644 --- a/src/include/threadwise_tensor_op.cuh +++ b/src/include/threadwise_tensor_op.cuh @@ -5,58 +5,35 @@ #if THREADWISE_TENSOR_OP_METHOD == 0 template -__device__ void threadwise_4d_tensor_op_in( - SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f) +__device__ void threadwise_4d_tensor_op_unary(DstDesc, TFloat* __restrict__ p_dst, F f) { constexpr auto I0 = Index<0>{}; constexpr auto I1 = Index<1>{}; constexpr auto I2 = Index<2>{}; constexpr auto I3 = Index<3>{}; - constexpr auto src_desc = SrcDesc{}; constexpr auto dst_desc = DstDesc{}; - static_assert(is_same::value); - #if 0 if(threadIdx.x == 0) { - print_ConstantTensorDescriptor(src_desc); print_ConstantTensorDescriptor(dst_desc); } #endif - for(unsigned did0 = 0; did0 < src_desc.GetLength(I0); ++did0) + for(unsigned did0 = 0; did0 < dst_desc.GetLength(I0); ++did0) { - for(unsigned did1 = 0; did1 < src_desc.GetLength(I1); ++did1) + for(unsigned did1 = 0; did1 < dst_desc.GetLength(I1); ++did1) { - for(unsigned did2 = 0; did2 < src_desc.GetLength(I2); ++did2) + for(unsigned did2 = 0; did2 < dst_desc.GetLength(I2); ++did2) { - for(unsigned did3 = 0; did3 < src_desc.GetLength(I3); ++did3) + for(unsigned did3 = 0; did3 < dst_desc.GetLength(I3); ++did3) { - const unsigned sindex = - src_desc.GetStride(I0) * did0 + src_desc.GetStride(I1) * did1 + - src_desc.GetStride(I2) * did2 + src_desc.GetStride(I3) * did3; - const unsigned dindex = dst_desc.GetStride(I0) * did0 + dst_desc.GetStride(I1) * did1 + dst_desc.GetStride(I2) * did2 + dst_desc.GetStride(I3) * did3; - f(p_src[sindex], p_dst[dindex]); - -#if 0 - if(threadIdx.x == 0) - { - printf("threadwise_4d_tensor_op: 1: thread id %u, \t" - "sindex %u, p_src[sindex] %f, \t" - "dindex %u, p_dst[dindex] %f\n", - threadIdx.x, - sindex, - p_src[sindex], - dindex, - p_dst[dindex]); - } -#endif + f(p_dst[dindex]); } } } @@ -64,7 +41,7 @@ __device__ void threadwise_4d_tensor_op_in( } template -__device__ void threadwise_4d_tensor_op_wei( +__device__ void threadwise_4d_tensor_op_binary( SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f) { constexpr auto I0 = Index<0>{}; @@ -102,79 +79,6 @@ __device__ void threadwise_4d_tensor_op_wei( dst_desc.GetStride(I2) * did2 + dst_desc.GetStride(I3) * did3; f(p_src[sindex], p_dst[dindex]); - -#if 0 - if(threadIdx.x == 0) - { - printf("threadwise_4d_tensor_op: 1: thread id %u, \t" - "sindex %u, p_src[sindex] %f, \t" - "dindex %u, p_dst[dindex] %f\n", - threadIdx.x, - sindex, - p_src[sindex], - dindex, - p_dst[dindex]); - } -#endif - } - } - } - } -} - -template -__device__ void threadwise_4d_tensor_op_out( - SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f) -{ - constexpr auto I0 = Index<0>{}; - constexpr auto I1 = Index<1>{}; - constexpr auto I2 = Index<2>{}; - constexpr auto I3 = Index<3>{}; - - constexpr auto src_desc = SrcDesc{}; - constexpr auto dst_desc = DstDesc{}; - - static_assert(is_same::value); - -#if 0 - if(threadIdx.x == 0) - { - print_ConstantTensorDescriptor(src_desc); - print_ConstantTensorDescriptor(dst_desc); - } -#endif - - for(unsigned did0 = 0; did0 < src_desc.GetLength(I0); ++did0) - { - for(unsigned did1 = 0; did1 < src_desc.GetLength(I1); ++did1) - { - for(unsigned did2 = 0; did2 < src_desc.GetLength(I2); ++did2) - { - for(unsigned did3 = 0; did3 < src_desc.GetLength(I3); ++did3) - { - const unsigned sindex = - src_desc.GetStride(I0) * did0 + src_desc.GetStride(I1) * did1 + - src_desc.GetStride(I2) * did2 + src_desc.GetStride(I3) * did3; - - const unsigned dindex = - dst_desc.GetStride(I0) * did0 + dst_desc.GetStride(I1) * did1 + - dst_desc.GetStride(I2) * did2 + dst_desc.GetStride(I3) * did3; - - f(p_src[sindex], p_dst[dindex]); - -#if 0 - if(threadIdx.x == 0) - { - printf("threadwise_4d_tensor_op: 1: thread id %u, \t" - "sindex %u, p_src[sindex] %f, \t" - "dindex %u, p_dst[dindex] %f\n", - threadIdx.x, - sindex, - p_src[sindex], - dindex, - p_dst[dindex]); - } -#endif } } }