diff --git a/driver/conv.cu b/driver/conv.cu index 80e5057aaf..17ef79f405 100644 --- a/driver/conv.cu +++ b/driver/conv.cu @@ -7,7 +7,6 @@ #include "constant_tensor_descriptor.cuh" #include "device_direct_convolution_1.cuh" #include "device_direct_convolution_2.cuh" -#include "device_direct_convolution_3.cuh" //#include "device_winograd_convolution.cuh" struct GeneratorTensor_1 @@ -390,12 +389,10 @@ int main() for(int i = 0; i < 40; ++i) { -#if 1 +#if 0 device_direct_convolution_1(in_desc, in, wei_desc, wei, out_desc, out_device); -#elif 0 +#elif 1 device_direct_convolution_2(in_desc, in, wei_desc, wei, out_desc, out_device); -#elif 0 - device_direct_convolution_3(in_desc, in, wei_desc, wei, out_desc, out_device); #elif 0 device_winograd_convolution(in_desc, in, wei_desc, wei, out_desc, out_device); #endif diff --git a/driver/device_direct_convolution_3.cuh b/driver/device_direct_convolution_3.cuh deleted file mode 100644 index 884b3bf38e..0000000000 --- a/driver/device_direct_convolution_3.cuh +++ /dev/null @@ -1,120 +0,0 @@ -#pragma once -#include "gridwise_direct_convolution_3.cuh" - -template -void device_direct_convolution_3( - InDesc, const Tensor& in, WeiDesc, const Tensor& wei, OutDesc, Tensor& out) -{ - std::size_t data_sz = sizeof(T); - DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace()); - DeviceMem wei_device_buf(data_sz * wei.mDesc.GetElementSpace()); - DeviceMem out_device_buf(data_sz * out.mDesc.GetElementSpace()); - - int num_thread = std::thread::hardware_concurrency(); - - in_device_buf.ToDevice(in.mData.data()); - wei_device_buf.ToDevice(wei.mData.data()); - out_device_buf.ToDevice(out.mData.data()); - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - constexpr auto in_desc = InDesc{}; - constexpr auto wei_desc = WeiDesc{}; - constexpr auto out_desc = OutDesc{}; - -#if 1 - constexpr unsigned OutTileSizeH = 2; - constexpr unsigned OutTileSizeW = 2; - constexpr unsigned NPerBlock = 2; - constexpr unsigned KPerBlock = 32; - constexpr unsigned CPerBlock = 4; - constexpr unsigned YPerBlock = 1; - constexpr unsigned XPerBlock = 16; - - constexpr unsigned NPerThread = 2; - constexpr unsigned KPerThread = 4; - constexpr unsigned CPerThread = 2; - - constexpr unsigned BlockSize = 128; -#elif 0 - constexpr unsigned OutTileSizeH = 2; - constexpr unsigned OutTileSizeW = 2; - constexpr unsigned NPerBlock = 2; - constexpr unsigned KPerBlock = 32; - constexpr unsigned CPerBlock = 4; - constexpr unsigned YPerBlock = 1; - constexpr unsigned XPerBlock = 27; - - constexpr unsigned NPerThread = 2; - constexpr unsigned KPerThread = 4; - constexpr unsigned CPerThread = 2; - - constexpr unsigned BlockSize = 216; -#elif 0 - constexpr unsigned OutTileSizeH = 2; - constexpr unsigned OutTileSizeW = 2; - constexpr unsigned NPerBlock = 2; - constexpr unsigned KPerBlock = 32; - constexpr unsigned CPerBlock = 4; - constexpr unsigned YPerBlock = 1; - constexpr unsigned XPerBlock = 32; - - constexpr unsigned NPerThread = 2; - constexpr unsigned KPerThread = 4; - constexpr unsigned CPerThread = 2; - - constexpr unsigned BlockSize = 256; -#endif - - constexpr unsigned GridSize = (out_desc.GetLength(I0) / NPerBlock) * - (out_desc.GetLength(I1) / KPerBlock) * - (out_desc.GetLength(I2) / (OutTileSizeH * YPerBlock)) * - (out_desc.GetLength(I3) / (OutTileSizeW * XPerBlock)); - - dim3 block_dim(BlockSize); - dim3 grid_dim(GridSize); - - printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); - - cudaEvent_t start, stop; - float elapsedTime; - - cudaEventCreate(&start); - cudaEventRecord(start, 0); - - gridwise_direct_convolution_3 - <<>>(InDesc{}, - static_cast(in_device_buf.GetDeviceBuffer()), - WeiDesc{}, - static_cast(wei_device_buf.GetDeviceBuffer()), - OutDesc{}, - static_cast(out_device_buf.GetDeviceBuffer())); - - cudaEventCreate(&stop); - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); - - cudaEventElapsedTime(&elapsedTime, start, stop); - printf("Elapsed time : %f ms\n", elapsedTime); - - checkCudaErrors(cudaGetLastError()); - out_device_buf.FromDevice(out.mData.data()); -} diff --git a/src/include/blockwise_direct_convolution.cuh b/src/include/blockwise_direct_convolution.cuh index 95005af765..fec85eb09c 100644 --- a/src/include/blockwise_direct_convolution.cuh +++ b/src/include/blockwise_direct_convolution.cuh @@ -58,8 +58,8 @@ __device__ void blockwise_direct_convolution(InBlockDesc, constexpr auto wei_thread_desc = make_ConstantTensorDescriptor(Sequence{}); - constexpr auto out_thread_desc = make_ConstantTensorDescriptor( - Sequence{}); + constexpr auto out_thread_desc = + get_output_4d_tensor_descriptor(in_thread_desc, wei_thread_desc); constexpr auto in_thread_block_desc = make_ConstantTensorDescriptor(in_thread_desc.GetLengths(), in_block_desc.GetStrides()); @@ -92,11 +92,9 @@ __device__ void blockwise_direct_convolution(InBlockDesc, unsigned hi_thread_data_begin = ho_thread_data_begin; // minus padding unsigned wi_thread_data_begin = wo_thread_data_begin; // minus padding - TFloat p_in_thread[in_thread_desc.GetElementSpace()]; - TFloat p_wei_thread[wei_thread_desc.GetElementSpace()]; TFloat p_out_thread[out_thread_desc.GetElementSpace()]; - threadwise_4d_tensor_copy(out_thread_block_desc, + threadwise_4d_tensor_copy(out_block_desc, p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin, k_thread_data_begin, ho_thread_data_begin, @@ -108,38 +106,24 @@ __device__ void blockwise_direct_convolution(InBlockDesc, for(unsigned c_thread_data_begin = 0; c_thread_data_begin < in_block_desc.GetLength(I1); c_thread_data_begin += CPerThread) { - // copy input into register - threadwise_4d_tensor_copy(in_thread_block_desc, - p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data_begin, - hi_thread_data_begin, - wi_thread_data_begin), - in_thread_desc, - p_in_thread, - in_thread_desc); - - // copy weight into register - threadwise_4d_tensor_copy( + // threadwise convolution + threadwise_direct_convolution_2( + in_thread_block_desc, + p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data_begin, + hi_thread_data_begin, + wi_thread_data_begin), wei_thread_block_desc, p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data_begin, 0, 0), - wei_thread_desc, - p_wei_thread, - wei_thread_desc); - - // threadwise convolution - threadwise_direct_convolution_2(in_thread_desc, - p_in_thread, - wei_thread_desc, - p_wei_thread, - out_thread_desc, - p_out_thread); + out_thread_desc, + p_out_thread); } // copy output into LDS threadwise_4d_tensor_copy(out_thread_desc, p_out_thread, - out_thread_block_desc, + out_block_desc, p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin, k_thread_data_begin, ho_thread_data_begin, diff --git a/src/include/gridwise_direct_convolution_1.cuh b/src/include/gridwise_direct_convolution_1.cuh index 8a339b14e4..7dd36dd966 100644 --- a/src/include/gridwise_direct_convolution_1.cuh +++ b/src/include/gridwise_direct_convolution_1.cuh @@ -49,18 +49,20 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc, constexpr unsigned YBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock; constexpr unsigned XBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock; - constexpr auto in_block_src_desc = make_ConstantTensorDescriptor( + constexpr auto in_block_global_desc = make_ConstantTensorDescriptor( Sequence{}, in_global_desc.GetStrides()); - constexpr auto wei_block_src_desc = make_ConstantTensorDescriptor( + constexpr auto wei_block_global_desc = make_ConstantTensorDescriptor( Sequence{}, wei_global_desc.GetStrides()); - constexpr auto out_block_src_desc = make_ConstantTensorDescriptor( + constexpr auto out_block_global_desc = make_ConstantTensorDescriptor( Sequence{}, out_global_desc.GetStrides()); - constexpr auto in_block_desc = make_ConstantTensorDescriptor(in_block_src_desc.GetLengths()); - constexpr auto wei_block_desc = make_ConstantTensorDescriptor(wei_block_src_desc.GetLengths()); - constexpr auto out_block_desc = make_ConstantTensorDescriptor(out_block_src_desc.GetLengths()); + constexpr auto in_block_desc = make_ConstantTensorDescriptor(in_block_global_desc.GetLengths()); + constexpr auto wei_block_desc = + make_ConstantTensorDescriptor(wei_block_global_desc.GetLengths()); + constexpr auto out_block_desc = + make_ConstantTensorDescriptor(out_block_global_desc.GetLengths()); constexpr unsigned in_block_size = in_block_desc.GetElementSpace(); constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace(); @@ -97,9 +99,9 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc, print_ConstantTensorDescriptor( in_global_desc, "gridwise_convolution: in_global_desc: "); print_ConstantTensorDescriptor(wei_global_desc, "gridwise_convolution: wei_global_desc: "); print_ConstantTensorDescriptor(out_global_desc, "gridwise_convolution: out_global_desc: "); - print_ConstantTensorDescriptor( in_block_src_desc, "gridwise_convolution: in_block_src_desc: "); - print_ConstantTensorDescriptor(wei_block_src_desc, "gridwise_convolution: wei_block_src_desc: "); - print_ConstantTensorDescriptor(out_block_src_desc, "gridwise_convolution: out_block_src_desc: "); + print_ConstantTensorDescriptor( in_block_global_desc, "gridwise_convolution: in_block_global_desc: "); + print_ConstantTensorDescriptor(wei_block_global_desc, "gridwise_convolution: wei_block_global_desc: "); + print_ConstantTensorDescriptor(out_block_global_desc, "gridwise_convolution: out_block_global_desc: "); print_ConstantTensorDescriptor( in_block_desc, "gridwise_convolution: in_block_desc: "); print_ConstantTensorDescriptor(wei_block_desc, "gridwise_convolution: wei_block_desc: "); print_ConstantTensorDescriptor(out_block_desc, "gridwise_convolution: out_block_desc: "); @@ -128,10 +130,10 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc, { // copy input tensor to LDS blockwise_4d_tensor_copy(in_block_src_desc, + BlockSize>(in_block_global_desc, p_in_global + in_global_desc.Get1dIndex(n_block_work_begin, c_block_work_begin, @@ -143,11 +145,11 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc, // copy weight tensor to LDS blockwise_4d_tensor_copy( - wei_block_src_desc, + wei_block_global_desc, p_wei_global + wei_global_desc.Get1dIndex(k_block_work_begin, c_block_work_begin, 0, 0), wei_block_desc, p_wei_block, @@ -174,12 +176,12 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc, // copy output tensor from LDS to device mem blockwise_4d_tensor_copy( out_block_desc, p_out_block, - out_block_src_desc, + out_block_global_desc, p_out_global + out_global_desc.Get1dIndex( n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin), diff --git a/src/include/gridwise_direct_convolution_2.cuh b/src/include/gridwise_direct_convolution_2.cuh index d2040f1f90..3f8d1e50ff 100644 --- a/src/include/gridwise_direct_convolution_2.cuh +++ b/src/include/gridwise_direct_convolution_2.cuh @@ -63,18 +63,16 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc, constexpr unsigned InTileSizeH = OutTileSizeH + S - 1; constexpr unsigned InTileSizeW = OutTileSizeW + R - 1; - constexpr auto in_thread_desc = - make_ConstantTensorDescriptor(Sequence{}); + constexpr auto in_thread_block_desc = make_ConstantTensorDescriptor( + Sequence{}, in_block_desc.GetStrides()); - constexpr auto wei_thread_desc = - make_ConstantTensorDescriptor(Sequence{}); + constexpr auto wei_thread_block_desc = make_ConstantTensorDescriptor( + Sequence{}, wei_block_desc.GetStrides()); constexpr auto out_thread_desc = - get_output_4d_tensor_descriptor(in_thread_desc, wei_thread_desc); + get_output_4d_tensor_descriptor(in_thread_block_desc, wei_thread_block_desc); // register - TFloat p_in_thread[in_thread_desc.GetElementSpace()]; - TFloat p_wei_thread[wei_thread_desc.GetElementSpace()]; TFloat p_out_thread[out_thread_desc.GetElementSpace()]; // divide block work @@ -183,31 +181,30 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc, for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread) { - // copy input tensor into register - threadwise_4d_tensor_copy(in_block_desc, - p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), - in_thread_desc, - p_in_thread, - in_thread_desc); - - // copy weight tensor into register - threadwise_4d_tensor_copy( - wei_block_desc, - p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), - wei_thread_desc, - p_wei_thread, - wei_thread_desc); - // threadwise convolution - threadwise_direct_convolution_1(in_thread_desc, - p_in_thread, - wei_thread_desc, - p_wei_thread, - out_thread_desc, - p_out_thread); +#if 1 + threadwise_direct_convolution_2( + in_thread_block_desc, + p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), + wei_thread_block_desc, + p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), + out_thread_desc, + p_out_thread); +#elif 1 + threadwise_direct_convolution_3( + in_thread_block_desc, + p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), + wei_thread_block_desc, + p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), + out_thread_desc, + p_out_thread); +#endif } } diff --git a/src/include/gridwise_direct_convolution_3.cuh b/src/include/gridwise_direct_convolution_3.cuh deleted file mode 100644 index f8bf42a011..0000000000 --- a/src/include/gridwise_direct_convolution_3.cuh +++ /dev/null @@ -1,208 +0,0 @@ -#pragma once -#include "constant_tensor_descriptor.cuh" -#include "blockwise_tensor_op.cuh" -#include "blockwise_direct_convolution.cuh" -#include "threadwise_tensor_op.cuh" -#include "threadwise_direct_convolution.cuh" - -template -__global__ void gridwise_direct_convolution_3(InGlobalDesc, - TFloat* const __restrict__ p_in_global, - WeiGlobalDesc, - TFloat* const __restrict__ p_wei_global, - OutGlobalDesc, - TFloat* __restrict__ p_out_global) -{ - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - constexpr auto in_global_desc = InGlobalDesc{}; - constexpr auto wei_global_desc = WeiGlobalDesc{}; - constexpr auto out_global_desc = OutGlobalDesc{}; - - constexpr unsigned S = wei_global_desc.GetLength(I2); - constexpr unsigned R = wei_global_desc.GetLength(I3); - - constexpr unsigned HoPerBlock = OutTileSizeH * YPerBlock; - constexpr unsigned WoPerBlock = OutTileSizeW * XPerBlock; - - constexpr unsigned HiPerBlock = YPerBlock * OutTileSizeH + S - 1; - constexpr unsigned WiPerBlock = XPerBlock * OutTileSizeW + R - 1; - - constexpr auto in_block_desc = - make_ConstantTensorDescriptor(Sequence{}); - - constexpr auto wei_block_desc = - make_ConstantTensorDescriptor(Sequence{}); - - // shared mem - constexpr unsigned in_block_size = in_block_desc.GetElementSpace(); - constexpr unsigned wei_block_size = wei_block_desc.GetElementSpace(); - - __shared__ TFloat p_in_block[in_block_size]; - __shared__ TFloat p_wei_block[wei_block_size]; - - // threadwise tensors - constexpr unsigned InTileSizeH = OutTileSizeH + S - 1; - constexpr unsigned InTileSizeW = OutTileSizeW + R - 1; - - constexpr auto in_thread_block_desc = make_ConstantTensorDescriptor( - Sequence{}, in_block_desc.GetStrides()); - - constexpr auto wei_thread_block_desc = make_ConstantTensorDescriptor( - Sequence{}, wei_block_desc.GetStrides()); - - constexpr auto out_thread_desc = - get_output_4d_tensor_descriptor(in_thread_block_desc, wei_thread_block_desc); - - // register - TFloat p_out_thread[out_thread_desc.GetElementSpace()]; - - // divide block work - constexpr unsigned NBlockWork = (out_global_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock; - constexpr unsigned KBlockWork = (out_global_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock; - constexpr unsigned YBlockWork = (out_global_desc.GetLength(I2) + HoPerBlock - 1) / HoPerBlock; - constexpr unsigned XBlockWork = (out_global_desc.GetLength(I3) + WoPerBlock - 1) / WoPerBlock; - - const unsigned block_id = blockIdx.x; - - unsigned itmp = block_id; - const unsigned n_block_work_id = itmp / (KBlockWork * YBlockWork * XBlockWork); - itmp -= n_block_work_id * (KBlockWork * YBlockWork * XBlockWork); - const unsigned k_block_work_id = itmp / (YBlockWork * XBlockWork); - itmp -= k_block_work_id * (YBlockWork * XBlockWork); - const unsigned y_block_work_id = itmp / XBlockWork; - const unsigned x_block_work_id = itmp - y_block_work_id * XBlockWork; - - const unsigned n_block_data_begin = n_block_work_id * NPerBlock; - const unsigned k_block_data_begin = k_block_work_id * KPerBlock; - const unsigned y_block_data_begin = y_block_work_id * YPerBlock; - const unsigned x_block_data_begin = x_block_work_id * XPerBlock; - - const unsigned ho_block_data_begin = y_block_data_begin * OutTileSizeH; - const unsigned wo_block_data_begin = x_block_data_begin * OutTileSizeW; - - const unsigned hi_block_data_begin = ho_block_data_begin; // minus padding - const unsigned wi_block_data_begin = wo_block_data_begin; // minus padding - - // divide thread work - constexpr unsigned NThreadWork = (NPerBlock + NPerThread - 1) / NPerThread; - constexpr unsigned KThreadWork = (KPerBlock + KPerThread - 1) / KPerThread; - constexpr unsigned YThreadWork = YPerBlock; - constexpr unsigned XThreadWork = XPerBlock; - - const unsigned thread_id = threadIdx.x; - - itmp = thread_id; - const unsigned n_thread_work_id = itmp / (KThreadWork * YThreadWork * XThreadWork); - itmp -= n_thread_work_id * (KThreadWork * YThreadWork * XThreadWork); - const unsigned k_thread_work_id = itmp / (YThreadWork * XThreadWork); - itmp -= k_thread_work_id * (YThreadWork * XThreadWork); - const unsigned y_thread_work_id = itmp / XThreadWork; - const unsigned x_thread_work_id = itmp - y_thread_work_id * XThreadWork; - - const unsigned n_thread_data_begin = n_thread_work_id * NPerThread; - const unsigned k_thread_data_begin = k_thread_work_id * KPerThread; - const unsigned ho_thread_data_begin = y_thread_work_id * OutTileSizeH; - const unsigned wo_thread_data_begin = x_thread_work_id * OutTileSizeW; - - const unsigned hi_thread_data_begin = ho_thread_data_begin; - const unsigned wi_thread_data_begin = wo_thread_data_begin; - -#if 0 - if(threadIdx.x == 0) - { - print_ConstantTensorDescriptor(in_global_desc, "gridwise_convolution: in_global_desc: "); - print_ConstantTensorDescriptor(wei_global_desc, "gridwise_convolution: wei_global_desc: "); - print_ConstantTensorDescriptor(out_global_desc, "gridwise_convolution: out_global_desc: "); - } - - printf("threadIdx.x %u \t" - "n_thread_data_begin %u, k_thread_data_begin %u, ho_thread_data_begin %u, " - "wo_thread_data_begin %u\n", - threadIdx.x, - n_thread_data_begin, - k_thread_data_begin, - ho_thread_data_begin, - wo_thread_data_begin); -#endif - - // set threadwise output tensor to 0 - threadwise_4d_tensor_set_zero(out_thread_desc, p_out_thread); - - for(unsigned c_block_data_begin = 0; c_block_data_begin < in_global_desc.GetLength(I1); - c_block_data_begin += CPerBlock, __syncthreads()) - { - // copy input tensor to LDS - blockwise_4d_tensor_copy(in_global_desc, - p_in_global + - in_global_desc.Get1dIndex(n_block_data_begin, - c_block_data_begin, - hi_block_data_begin, - wi_block_data_begin), - in_block_desc, - p_in_block, - in_block_desc); - - // copy weight tensor to LDS - blockwise_4d_tensor_copy( - wei_global_desc, - p_wei_global + wei_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0), - wei_block_desc, - p_wei_block, - wei_block_desc); - - __syncthreads(); - - for(unsigned c_thread_data = 0; c_thread_data < CPerBlock; c_thread_data += CPerThread) - { - // threadwise convolution - threadwise_direct_convolution_2( - in_thread_block_desc, - p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), - wei_thread_block_desc, - p_wei_block + wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), - out_thread_desc, - p_out_thread); - } - } - - // copy output tensor from register to global mem - threadwise_4d_tensor_copy( - out_thread_desc, - p_out_thread, - out_global_desc, - p_out_global + out_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), - out_thread_desc); -} diff --git a/src/include/threadwise_direct_convolution.cuh b/src/include/threadwise_direct_convolution.cuh index ffd4f47849..a728ad68b2 100644 --- a/src/include/threadwise_direct_convolution.cuh +++ b/src/include/threadwise_direct_convolution.cuh @@ -79,11 +79,43 @@ __device__ void threadwise_direct_convolution_1(InDesc, } } +// Optimized for scenario if p_in and p_wei are in LDS, p_out are in register +// Copy in and wei into register before doing convolution +template +__device__ void threadwise_direct_convolution_2(InDesc, + TFloat* const __restrict__ p_in, + WeiDesc, + TFloat* const __restrict__ p_wei, + OutDesc, + TFloat* __restrict__ p_out) +{ + constexpr auto in_desc = InDesc{}; + constexpr auto wei_desc = WeiDesc{}; + constexpr auto out_desc = OutDesc{}; + + constexpr auto in_reg_desc = make_ConstantTensorDescriptor(in_desc.GetLengths()); + constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(wei_desc.GetLengths()); + + // register + TFloat p_in_reg[in_reg_desc.GetElementSpace()]; + TFloat p_wei_reg[wei_reg_desc.GetElementSpace()]; + + // copy input tensor into register + threadwise_4d_tensor_copy(in_desc, p_in, in_reg_desc, p_in_reg, in_reg_desc); + + // copy input tensor into register + threadwise_4d_tensor_copy(wei_desc, p_wei, wei_reg_desc, p_wei_reg, wei_reg_desc); + + // do convolution + threadwise_direct_convolution_1( + in_reg_desc, p_in_reg, wei_reg_desc, p_wei_reg, out_desc, p_out); +} + // optimized for scenario where p_in and p_wei are in LDS, p_out is in register // break down a non-1x1 convolution into a sequence of 1x1 convolutions, // load 1x1 weight into register, and do 1x1 convolution in register. template -__device__ void threadwise_direct_convolution_2(InDesc, +__device__ void threadwise_direct_convolution_3(InDesc, TFloat* const __restrict__ p_in, WeiDesc, TFloat* const __restrict__ p_wei, @@ -95,100 +127,100 @@ __device__ void threadwise_direct_convolution_2(InDesc, constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto in_desc_lds = InDesc{}; - constexpr auto wei_desc_lds = WeiDesc{}; - constexpr auto out_desc_reg = OutDesc{}; + constexpr auto in_desc = InDesc{}; + constexpr auto wei_desc = WeiDesc{}; + constexpr auto out_desc = OutDesc{}; - constexpr auto in_desc_reg = - make_ConstantTensorDescriptor(Sequence{}); + constexpr auto in_reg_desc = make_ConstantTensorDescriptor(Sequence{}); - constexpr auto wei_desc_reg = make_ConstantTensorDescriptor( - Sequence{}); + constexpr auto wei_reg_desc = make_ConstantTensorDescriptor( + Sequence{}); - TFloat p_in_reg[in_desc_reg.GetElementSpace()]; - TFloat p_wei_reg[wei_desc_reg.GetElementSpace()]; + TFloat p_in_reg[in_reg_desc.GetElementSpace()]; + TFloat p_wei_reg[wei_reg_desc.GetElementSpace()]; constexpr unsigned in_w_new_read = 1; constexpr auto in_desc_reg_new_read = - make_ConstantTensorDescriptor(Sequence{}); +#if 0 // loop over vertical direction - for(unsigned s = 0; s < wei_desc_lds.GetLength(I2); ++s) + for(unsigned s = 0; s < wei_desc.GetLength(I2); ++s) { -#if 1 // read first input - threadwise_4d_tensor_copy(in_desc_lds, - p_in + in_desc_lds.Get1dIndex(0, 0, s, 0), - in_desc_reg, + threadwise_4d_tensor_copy(in_desc, + p_in + in_desc.Get1dIndex(0, 0, s, 0), + in_reg_desc, p_in_reg, - in_desc_reg); + in_reg_desc); // read first 1x1 weight - threadwise_4d_tensor_copy(wei_desc_lds, - p_wei + wei_desc_lds.Get1dIndex(0, 0, s, 0), - wei_desc_reg, + threadwise_4d_tensor_copy(wei_desc, + p_wei + wei_desc.Get1dIndex(0, 0, s, 0), + wei_reg_desc, p_wei_reg, - wei_desc_reg); + wei_reg_desc); // do first 1x1 conv threadwise_direct_convolution_1( - in_desc_reg, p_in_reg, wei_desc_reg, p_wei_reg, out_desc_reg, p_out); + in_reg_desc, p_in_reg, wei_reg_desc, p_wei_reg, out_desc, p_out); // loop over horizontal direction - for(unsigned r = 1; r < wei_desc_lds.GetLength(I3); ++r) + for(unsigned r = 1; r < wei_desc.GetLength(I3); ++r) { // read new weight - threadwise_4d_tensor_copy(wei_desc_lds, - p_wei + wei_desc_lds.Get1dIndex(0, 0, s, r), - wei_desc_reg, + threadwise_4d_tensor_copy(wei_desc, + p_wei + wei_desc.Get1dIndex(0, 0, s, r), + wei_reg_desc, p_wei_reg, - wei_desc_reg); + wei_reg_desc); // shift old input to the left - threadwise_4d_tensor_shift_down(in_desc_reg, p_in_reg, I3, Number{}); + threadwise_4d_tensor_shift_down(in_reg_desc, p_in_reg, I3, Number{}); // read new input threadwise_4d_tensor_copy( - in_desc_lds, - p_in + in_desc_lds.Get1dIndex(0, 0, s, in_desc_reg.GetLength(I3) + r - 1), - in_desc_reg, + in_desc, + p_in + in_desc.Get1dIndex(0, 0, s, r + in_reg_desc.GetLength(I3) - 1), + in_reg_desc, p_in_reg + - in_desc_reg.Get1dIndex(0, 0, 0, in_desc_reg.GetLength(I3) - in_w_new_read), + in_reg_desc.Get1dIndex(0, 0, 0, in_reg_desc.GetLength(I3) - in_w_new_read), in_desc_reg_new_read); // do 1x1 conv threadwise_direct_convolution_1( - in_desc_reg, p_in_reg, wei_desc_reg, p_wei_reg, out_desc_reg, p_out); + in_reg_desc, p_in_reg, wei_reg_desc, p_wei_reg, out_desc, p_out); } + } #elif 1 + // loop over vertical direction + for(unsigned s = 0; s < wei_desc.GetLength(I2); ++s) + { // loop over horizontal direction - for(unsigned r = 0; r < wei_desc_lds.GetLength(I3); ++r) + for(unsigned r = 0; r < wei_desc.GetLength(I3); ++r) { // read new weight - threadwise_4d_tensor_copy(wei_desc_lds, - p_wei + wei_desc_lds.Get1dIndex(0, 0, s, r), - wei_desc_reg, + threadwise_4d_tensor_copy(wei_desc, + p_wei + wei_desc.Get1dIndex(0, 0, s, r), + wei_reg_desc, p_wei_reg, - wei_desc_reg); + wei_reg_desc); // read new input - threadwise_4d_tensor_copy(in_desc_lds, - p_in + in_desc_lds.Get1dIndex(0, 0, s, r), - in_desc_reg, - p_in_reg, - in_desc_reg); + threadwise_4d_tensor_copy( + in_desc, p_in + in_desc.Get1dIndex(0, 0, s, r), in_reg_desc, p_in_reg, in_reg_desc); // do 1x1 conv threadwise_direct_convolution_1( - in_desc_reg, p_in_reg, wei_desc_reg, p_wei_reg, out_desc_reg, p_out); + in_reg_desc, p_in_reg, wei_reg_desc, p_wei_reg, out_desc, p_out); } -#endif } -} +#endif +} \ No newline at end of file