diff --git a/driver/conv.cu b/driver/conv.cu index 2eb32528a4..92f516b4e3 100644 --- a/driver/conv.cu +++ b/driver/conv.cu @@ -27,7 +27,7 @@ struct GeneratorTensor { #if 1 return T(std::rand()) / T(RAND_MAX); -#elif 0 +#elif std::initializer_list ls = {static_cast(is)...}; return std::accumulate(ls.begin(), ls.end(), std::size_t(0)); diff --git a/src/include/blockwise_convolution.cuh b/src/include/blockwise_convolution.cuh new file mode 100644 index 0000000000..cf8e02867c --- /dev/null +++ b/src/include/blockwise_convolution.cuh @@ -0,0 +1,159 @@ +#pragma once +#include "constant_tensor_descriptor.cuh" +#include "threadwise_tensor_op.cuh" +#include "threadwise_convolution.cuh" + +template +__device__ void blockwise_convolution(InBlockDesc, + TFloat* const __restrict__ p_in_block, + WeiBlockDesc, + TFloat* const __restrict__ p_wei_block, + OutBlockDesc, + TFloat* __restrict__ p_out_block) +{ + constexpr auto I0 = Index<0>{}; + constexpr auto I1 = Index<1>{}; + constexpr auto I2 = Index<2>{}; + constexpr auto I3 = Index<3>{}; + + constexpr auto in_block_desc = InBlockDesc{}; + constexpr auto wei_block_desc = WeiBlockDesc{}; + constexpr auto out_block_desc = OutBlockDesc{}; + + constexpr unsigned S = wei_block_desc.GetLength(I2); + constexpr unsigned R = wei_block_desc.GetLength(I3); + + constexpr unsigned NPerBlock = out_block_desc.GetLength(I0); + constexpr unsigned KPerBlock = out_block_desc.GetLength(I1); + constexpr unsigned YPerBlock = (out_block_desc.GetLength(I2) + OutTileSizeH - 1) / OutTileSizeH; + constexpr unsigned XPerBlock = (out_block_desc.GetLength(I3) + OutTileSizeW - 1) / OutTileSizeW; + + constexpr unsigned CPerBlock = in_block_desc.GetLength(I1); + + constexpr unsigned InTileSizeH = OutTileSizeH + S - 1; + constexpr unsigned InTileSizeW = OutTileSizeW + R - 1; + +#if 0 + if(threadIdx.x == 0) + { + print_ConstantTensorDescriptor(in_block_desc); + print_ConstantTensorDescriptor(wei_block_desc); + print_ConstantTensorDescriptor(out_block_desc); + } +#endif + + constexpr auto in_thread_src_desc = make_ConstantTensorDescriptor( + Sequence<1, CPerBlock, InTileSizeH, InTileSizeW>{}, in_block_desc.GetStrides()); + + constexpr auto wei_thread_src_desc = + make_ConstantTensorDescriptor(Sequence<1, CPerBlock, S, R>{}, wei_block_desc.GetStrides()); + + constexpr auto out_thread_src_desc = make_ConstantTensorDescriptor( + Sequence<1, 1, OutTileSizeH, OutTileSizeW>{}, out_block_desc.GetStrides()); + + constexpr auto in_thread_dst_desc = + make_ConstantTensorDescriptor(in_thread_src_desc.GetLengths()); + + constexpr auto wei_thread_dst_desc = + make_ConstantTensorDescriptor(wei_thread_src_desc.GetLengths()); + + constexpr auto out_thread_dst_desc = + make_ConstantTensorDescriptor(out_thread_src_desc.GetLengths()); + + const unsigned thread_id = threadIdx.x; + + for(unsigned thread_work_id = thread_id; thread_work_id < NPerBlock * YPerBlock * XPerBlock; + thread_work_id += BlockSize) + { + unsigned itmp = thread_work_id; + unsigned n_thread_work_id = itmp / (YPerBlock * XPerBlock); + itmp -= n_thread_work_id * (YPerBlock * XPerBlock); + unsigned y_thread_work_id = itmp / XPerBlock; + unsigned x_thread_work_id = itmp - y_thread_work_id * XPerBlock; + + unsigned n_thread_work_begin = n_thread_work_id * 1; + unsigned ho_thread_work_begin = y_thread_work_id * OutTileSizeH; + unsigned wo_thread_work_begin = x_thread_work_id * OutTileSizeW; + + unsigned hi_thread_work_begin = ho_thread_work_begin; // minus padding + unsigned wi_thread_work_begin = wo_thread_work_begin; // minus padding + + TFloat p_in_thread[in_thread_src_desc.GetElementSpace()]; + TFloat p_wei_thread[wei_thread_src_desc.GetElementSpace()]; + TFloat p_out_thread[out_thread_src_desc.GetElementSpace()]; + + auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; }; + + // copy input tensor into register + threadwise_4d_tensor_op_binary( + in_thread_src_desc, + p_in_block + in_block_desc.Get1dIndex( + n_thread_work_begin, 0, hi_thread_work_begin, wi_thread_work_begin), + in_thread_dst_desc, + p_in_thread, + f_copy); + + for(unsigned k_thread_work_begin = 0; k_thread_work_begin < KPerBlock; + ++k_thread_work_begin) + { + // copy weight tensor into register + threadwise_4d_tensor_op_binary( + wei_thread_src_desc, + p_wei_block + wei_block_desc.Get1dIndex(k_thread_work_begin, 0, 0, 0), + wei_thread_dst_desc, + p_wei_thread, + f_copy); + + // copy output tensor into register + threadwise_4d_tensor_op_binary( + out_thread_src_desc, + p_out_block + out_block_desc.Get1dIndex(n_thread_work_begin, + k_thread_work_begin, + ho_thread_work_begin, + wo_thread_work_begin), + out_thread_dst_desc, + p_out_thread, + f_copy); + + // threadwise convolution + threadwise_direct_convolution(in_thread_dst_desc, + p_in_thread, + wei_thread_dst_desc, + p_wei_thread, + out_thread_dst_desc, + p_out_thread); + + // accumulate output tensor into LDS + threadwise_4d_tensor_op_binary( + out_thread_dst_desc, + p_out_thread, + out_thread_src_desc, + p_out_block + out_block_desc.Get1dIndex(n_thread_work_begin, + k_thread_work_begin, + ho_thread_work_begin, + wo_thread_work_begin), + f_copy); + } + } +}