diff --git a/driver/conv.cu b/driver/conv.cu index 5ac73c874e..e5c8549ec7 100644 --- a/driver/conv.cu +++ b/driver/conv.cu @@ -111,9 +111,10 @@ void host_convolution(const Tensor& in, const Tensor& wei, Tensor& out) f_par(std::thread::hardware_concurrency()); } -template +#if 0 +template void device_convolution( - InDesc, const Tensor& in, WeiDesc, const Tensor& wei, OutDesc, Tensor& out) + const Tensor& in, const Tensor& wei, Tensor& out) { DeviceTensorDescriptor<4> in_desc_device(in.mDesc); DeviceTensorDescriptor<4> wei_desc_device(wei.mDesc); @@ -144,7 +145,7 @@ void device_convolution( dim3 block_dim(64, 1, 1); dim3 grid_dim(1, 1, 1); -#if 0 + gridwise_convolution <<>>(in_desc_device, static_cast(in_device_buf.GetDeviceBuffer()), @@ -152,19 +153,78 @@ void device_convolution( static_cast(wei_device_buf.GetDeviceBuffer()), out_desc_device, static_cast(out_device_buf.GetDeviceBuffer())); + + checkCudaErrors(cudaGetLastError()); + out_device_buf.FromDevice(out.mData.data()); +} #else - gridwise_convolution +template +void const_device_convolution( + InDesc, const Tensor& in, WeiDesc, const Tensor& wei, OutDesc, Tensor& out) +{ + std::size_t data_sz = sizeof(T); + DeviceMem in_device_buf(data_sz * in.mDesc.GetElementSpace()); + DeviceMem wei_device_buf(data_sz * wei.mDesc.GetElementSpace()); + DeviceMem out_device_buf(data_sz * out.mDesc.GetElementSpace()); + + int num_thread = std::thread::hardware_concurrency(); + + out.GenerateTensorValue(GeneratorConstant{0}, num_thread); + + in_device_buf.ToDevice(in.mData.data()); + wei_device_buf.ToDevice(wei.mData.data()); + out_device_buf.ToDevice(out.mData.data()); + + dim3 block_dim(64, 1, 1); + dim3 grid_dim(1, 1, 1); + + constexpr auto I0 = Index<0>{}; + constexpr auto I1 = Index<1>{}; + constexpr auto I2 = Index<2>{}; + constexpr auto I3 = Index<3>{}; + + constexpr auto in_desc = InDesc{}; + constexpr auto wei_desc = WeiDesc{}; + constexpr auto out_desc = OutDesc{}; + constexpr unsigned NPerBlock = 1; + constexpr unsigned KPerBlock = 1; + constexpr unsigned CPerBlockLoop = 1; + constexpr unsigned OutTileSizeH = 2; + constexpr unsigned OutTileSizeW = 2; + constexpr unsigned YPerBlock = (out_desc.GetLength(I2) + OutTileSizeH - 1) / OutTileSizeH; + constexpr unsigned XPerBlock = (out_desc.GetLength(I3) + OutTileSizeW - 1) / OutTileSizeW; + + constexpr unsigned NBlockCopyLen0 = 1; + constexpr unsigned NBlockCopyLen1 = 1; + constexpr unsigned NBlockCopyLen2 = 1; + constexpr unsigned NBlockCopyLen3 = 64; + + gridwise_convolution <<>>(InDesc{}, static_cast(in_device_buf.GetDeviceBuffer()), WeiDesc{}, static_cast(wei_device_buf.GetDeviceBuffer()), OutDesc{}, static_cast(out_device_buf.GetDeviceBuffer())); -#endif checkCudaErrors(cudaGetLastError()); out_device_buf.FromDevice(out.mData.data()); } +#endif int main() { @@ -176,14 +236,22 @@ int main() constexpr unsigned K = 1; constexpr unsigned S = 3; constexpr unsigned R = 3; -#elif 0 +#elif 1 constexpr unsigned N = 1; constexpr unsigned C = 1; - constexpr unsigned HI = 130; - constexpr unsigned WI = 130; + constexpr unsigned HI = 36; + constexpr unsigned WI = 36; constexpr unsigned K = 1; constexpr unsigned S = 3; constexpr unsigned R = 3; +#elif 0 + constexpr unsigned N = 1; + constexpr unsigned C = 1; + constexpr unsigned HI = 130; + constexpr unsigned WI = 130; + constexpr unsigned K = 1; + constexpr unsigned S = 3; + constexpr unsigned R = 3; #elif 0 constexpr unsigned N = 3; constexpr unsigned C = 16; @@ -214,7 +282,12 @@ int main() wei.GenerateTensorValue(GeneratorTensor{}, num_thread); host_convolution(in, wei, out_host); - device_convolution(in_desc, in, wei_desc, wei, out_desc, out_device); + +#if 0 + device_convolution(in, wei, out_device); +#else + const_device_convolution(in_desc, in, wei_desc, wei, out_desc, out_device); +#endif std::cout << __func__ << ": done" << std::endl; diff --git a/src/include/constant_direct_convolution.cuh b/src/include/constant_direct_convolution.cuh index ebc1adfdbd..19b432b45e 100644 --- a/src/include/constant_direct_convolution.cuh +++ b/src/include/constant_direct_convolution.cuh @@ -231,17 +231,8 @@ template + unsigned OutTileSizeW> __device__ void blockwise_convolution(InDesc, TFloat* const __restrict__ p_in, WeiDesc, @@ -258,6 +249,19 @@ __device__ void blockwise_convolution(InDesc, constexpr auto wei_desc = WeiDesc{}; constexpr auto out_desc = OutDesc{}; + constexpr unsigned S = wei_desc.GetLength(I2); + constexpr unsigned R = wei_desc.GetLength(I3); + + constexpr unsigned NPerBlock = out_desc.GetLength(I0); + constexpr unsigned KPerBlock = out_desc.GetLength(I1); + constexpr unsigned YPerBlock = (out_desc.GetLength(I2) + OutTileSizeH - 1) / OutTileSizeH; + constexpr unsigned XPerBlock = (out_desc.GetLength(I3) + OutTileSizeW - 1) / OutTileSizeW; + + constexpr unsigned CPerBlockLoop = in_desc.GetLength(I1); + + constexpr unsigned InTileSizeH = OutTileSizeH + S - 1; + constexpr unsigned InTileSizeW = OutTileSizeW + R - 1; + #if 1 if(threadIdx.x == 0) { @@ -383,15 +387,17 @@ template + unsigned NBlockCopyLen0, + unsigned NBlockCopyLen1, + unsigned NBlockCopyLen2, + unsigned NBlockCopyLen3> __global__ void gridwise_convolution(InDesc, TFloat* const __restrict__ p_in, WeiDesc, @@ -420,11 +426,10 @@ __global__ void gridwise_convolution(InDesc, } #endif - constexpr unsigned NBlockWork = (in_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock; - constexpr unsigned YBlockWork = (in_desc.GetLength(I2) + YPerBlock - 1) / YPerBlock; - constexpr unsigned XBlockWork = (in_desc.GetLength(I3) + XPerBlock - 1) / XPerBlock; - - constexpr unsigned KBlockWork = (wei_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock; + constexpr unsigned NBlockWork = (out_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock; + constexpr unsigned KBlockWork = (out_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock; + constexpr unsigned YBlockWork = (out_desc.GetLength(I2) + YPerBlock - 1) / YPerBlock; + constexpr unsigned XBlockWork = (out_desc.GetLength(I3) + XPerBlock - 1) / XPerBlock; const unsigned block_id = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * (gridDim.y * gridDim.x); @@ -434,6 +439,7 @@ __global__ void gridwise_convolution(InDesc, CPerBlockLoop, YPerBlock * OutTileSizeH + S - 1, XPerBlock * OutTileSizeW + R - 1>{}); + constexpr auto wei_block_desc = make_ConstantTensorDescriptor(Sequence{}); @@ -474,10 +480,10 @@ __global__ void gridwise_convolution(InDesc, blockwise_4d_tensor_op(in_desc, p_in + in_desc.Get1dIndex(n_block_work_begin, c_block_work_begin, @@ -491,10 +497,10 @@ __global__ void gridwise_convolution(InDesc, blockwise_4d_tensor_op( wei_desc, p_wei + wei_desc.Get1dIndex(k_block_work_begin, c_block_work_begin, 0, 0), @@ -506,10 +512,10 @@ __global__ void gridwise_convolution(InDesc, blockwise_4d_tensor_op(out_desc, p_out + out_desc.Get1dIndex(n_block_work_begin, k_block_work_begin, @@ -526,17 +532,8 @@ __global__ void gridwise_convolution(InDesc, decltype(in_block_desc), decltype(wei_block_desc), decltype(out_block_desc), - S, - R, - InTileSizeH, - InTileSizeW, OutTileSizeH, - OutTileSizeW, - NPerBlock, - KPerBlock, - YPerBlock, - XPerBlock, - CPerBlockLoop>( + OutTileSizeW>( in_block_desc, p_in_block, wei_block_desc, p_wei_block, out_block_desc, p_out_block); __syncthreads(); @@ -545,10 +542,10 @@ __global__ void gridwise_convolution(InDesc, blockwise_4d_tensor_op(out_block_desc, p_out_block, out_desc,