From 1b648f2f42bf5b82421289cff350ac7af6ec46ea Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sun, 4 Nov 2018 04:08:51 -0600 Subject: [PATCH] use constant tensor descriptor --- driver/conv.cu | 128 ++++- src/include/constant_direct_convolution.cuh | 561 ++++++++++++++++++++ src/include/constant_tensor_descriptor.cuh | 169 ++++++ src/include/device_tensor.cuh | 30 -- src/include/device_tensor_descriptor.cuh | 60 +++ src/include/direct_convolution.cuh | 20 +- src/include/tensor.hpp | 25 +- src/tensor.cpp | 11 +- 8 files changed, 913 insertions(+), 91 deletions(-) create mode 100644 src/include/constant_direct_convolution.cuh create mode 100644 src/include/constant_tensor_descriptor.cuh delete mode 100644 src/include/device_tensor.cuh create mode 100644 src/include/device_tensor_descriptor.cuh diff --git a/driver/conv.cu b/driver/conv.cu index c3d42b65cf..5ac73c874e 100644 --- a/driver/conv.cu +++ b/driver/conv.cu @@ -3,8 +3,14 @@ #include #include "nvToolsExt.h" #include "tensor.hpp" -#include "device_tensor.cuh" +#include "constant_tensor_descriptor.cuh" +#include "device_tensor_descriptor.cuh" + +#if 0 #include "direct_convolution.cuh" +#else +#include "constant_direct_convolution.cuh" +#endif template struct GeneratorConstant @@ -38,11 +44,46 @@ struct GeneratorTensor } }; -template -void host_convolution(const Tensor& in, - const Tensor& wei, - Tensor& out, - std::size_t num_thread) +// this is ugly, only for 4d +template +void ostream_ConstantTensorDescriptor(TConstTensorDesc, std::ostream& os = std::cout) +{ + static_assert(TConstTensorDesc::nDim == 4, "nDim is not 4"); + + constexpr auto I0 = Index<0>{}; + constexpr auto I1 = Index<1>{}; + constexpr auto I2 = Index<2>{}; + constexpr auto I3 = Index<3>{}; + constexpr auto desc = TConstTensorDesc{}; + + os << "Lengths: {" << desc.GetLength(I0) << ", " << desc.GetLength(I1) << ", " + << desc.GetLength(I2) << ", " << desc.GetLength(I3) << "}, " + << "Strides: {" << desc.GetStride(I0) << ", " << desc.GetStride(I1) << ", " + << desc.GetStride(I2) << ", " << desc.GetStride(I3) << "}" << std::endl; +} + +// this is ugly, only for 4d +template +auto make_TensorDescriptor(TConstTensorDesc) +{ + static_assert(TConstTensorDesc::nDim == 4, "nDim is not 4"); + + constexpr auto I0 = Index<0>{}; + constexpr auto I1 = Index<1>{}; + constexpr auto I2 = Index<2>{}; + constexpr auto I3 = Index<3>{}; + constexpr auto desc = TConstTensorDesc{}; + + std::initializer_list lengths = { + desc.GetLength(I0), desc.GetLength(I1), desc.GetLength(I2), desc.GetLength(I3)}; + std::initializer_list strides = { + desc.GetStride(I0), desc.GetStride(I1), desc.GetStride(I2), desc.GetStride(I3)}; + + return TensorDescriptor(lengths, strides); +} + +template +void host_convolution(const Tensor& in, const Tensor& wei, Tensor& out) { auto f = [&](auto n, auto k, auto ho, auto wo) { double v = 0; @@ -67,12 +108,12 @@ void host_convolution(const Tensor& in, out.mDesc.GetLengths()[2], out.mDesc.GetLengths()[3]); - f_par(num_thread); + f_par(std::thread::hardware_concurrency()); } -template -void device_convolution(const Tensor& in, const Tensor& wei, Tensor& out) - +template +void device_convolution( + InDesc, const Tensor& in, WeiDesc, const Tensor& wei, OutDesc, Tensor& out) { DeviceTensorDescriptor<4> in_desc_device(in.mDesc); DeviceTensorDescriptor<4> wei_desc_device(wei.mDesc); @@ -103,6 +144,7 @@ void device_convolution(const Tensor& in, const Tensor& wei, Tensor& ou dim3 block_dim(64, 1, 1); dim3 grid_dim(1, 1, 1); +#if 0 gridwise_convolution <<>>(in_desc_device, static_cast(in_device_buf.GetDeviceBuffer()), @@ -110,6 +152,15 @@ void device_convolution(const Tensor& in, const Tensor& wei, Tensor& ou static_cast(wei_device_buf.GetDeviceBuffer()), out_desc_device, static_cast(out_device_buf.GetDeviceBuffer())); +#else + gridwise_convolution + <<>>(InDesc{}, + static_cast(in_device_buf.GetDeviceBuffer()), + WeiDesc{}, + static_cast(wei_device_buf.GetDeviceBuffer()), + OutDesc{}, + static_cast(out_device_buf.GetDeviceBuffer())); +#endif checkCudaErrors(cudaGetLastError()); out_device_buf.FromDevice(out.mData.data()); @@ -117,34 +168,53 @@ void device_convolution(const Tensor& in, const Tensor& wei, Tensor& ou int main() { -#if 0 - Tensor in({3, 16, 130, 130}); - Tensor wei({4, 16, 3, 3}); - Tensor out_host({3, 4, 128, 128}); +#if 1 + constexpr unsigned N = 1; + constexpr unsigned C = 1; + constexpr unsigned HI = 18; + constexpr unsigned WI = 18; + constexpr unsigned K = 1; + constexpr unsigned S = 3; + constexpr unsigned R = 3; #elif 0 - Tensor in({1, 1, 130, 130}); - Tensor wei({1, 1, 3, 3}); - Tensor out_host({1, 1, 128, 128}); -#elif 1 - Tensor in({1, 1, 18, 18}); - Tensor wei({1, 1, 3, 3}); - Tensor out_host({1, 1, 16, 16}); -#else - Tensor in({1, 1, 4, 4}); - Tensor wei({1, 1, 3, 3}); - Tensor out_host({1, 1, 2, 2}); + constexpr unsigned N = 1; + constexpr unsigned C = 1; + constexpr unsigned HI = 130; + constexpr unsigned WI = 130; + constexpr unsigned K = 1; + constexpr unsigned S = 3; + constexpr unsigned R = 3; +#elif 0 + constexpr unsigned N = 3; + constexpr unsigned C = 16; + constexpr unsigned HI = 130; + constexpr unsigned WI = 130; + constexpr unsigned K = 4; + constexpr unsigned S = 3; + constexpr unsigned R = 3; #endif + + auto in_desc = make_ConstantTensorDescriptor(Sequence{}); + auto wei_desc = make_ConstantTensorDescriptor(Sequence{}); + auto out_desc = get_output_4d_tensor_descriptor(in_desc, wei_desc); + + ostream_ConstantTensorDescriptor(in_desc, std::cout << "in_desc: "); + ostream_ConstantTensorDescriptor(wei_desc, std::cout << "wei_desc: "); + ostream_ConstantTensorDescriptor(out_desc, std::cout << "out_desc: "); + + Tensor in(make_TensorDescriptor(in_desc)); + Tensor wei(make_TensorDescriptor(wei_desc)); + Tensor out_host(make_TensorDescriptor(out_desc)); + Tensor out_device = out_host; int num_thread = std::thread::hardware_concurrency(); - std::cout << __func__ << ": num_thread " << num_thread << std::endl; - in.GenerateTensorValue(GeneratorTensor{}, num_thread); wei.GenerateTensorValue(GeneratorTensor{}, num_thread); - host_convolution(in, wei, out_host, num_thread); - device_convolution(in, wei, out_device); + host_convolution(in, wei, out_host); + device_convolution(in_desc, in, wei_desc, wei, out_desc, out_device); std::cout << __func__ << ": done" << std::endl; diff --git a/src/include/constant_direct_convolution.cuh b/src/include/constant_direct_convolution.cuh new file mode 100644 index 0000000000..ebc1adfdbd --- /dev/null +++ b/src/include/constant_direct_convolution.cuh @@ -0,0 +1,561 @@ +#pragma once +#include "constant_tensor_descriptor.cuh" + +template +__device__ void blockwise_4d_tensor_op( + SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f) +{ + constexpr auto I0 = Index<0>{}; + constexpr auto I1 = Index<1>{}; + constexpr auto I2 = Index<2>{}; + constexpr auto I3 = Index<3>{}; + + constexpr auto src_desc = SrcDesc{}; + constexpr auto dst_desc = DstDesc{}; + +#if 1 + if(threadIdx.x == 0) + { + print_ConstantTensorDescriptor(src_desc); + print_ConstantTensorDescriptor(dst_desc); + } +#endif + + constexpr unsigned NWorkStride3 = 1; + constexpr unsigned NWorkStride2 = NWorkLen3 * NWorkStride3; + constexpr unsigned NWorkStride1 = NWorkLen2 * NWorkStride2; + constexpr unsigned NWorkStride0 = NWorkLen1 * NWorkStride1; + + unsigned itmp = + threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.y * blockDim.x); + + const unsigned did0_begin = itmp / NWorkStride0; + + itmp -= did0_begin * NWorkStride0; + + const unsigned did1_begin = itmp / NWorkStride1; + + itmp -= did1_begin * NWorkStride1; + + const unsigned did2_begin = itmp / NWorkStride2; + + itmp -= did2_begin * NWorkStride2; + + const unsigned did3_begin = itmp / NWorkStride3; + + for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NWorkLen0) + { + for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NWorkLen1) + { + for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NWorkLen2) + { + for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NWorkLen3) + { + const unsigned sindex = + src_desc.GetStride(I0) * did0 + src_desc.GetStride(I1) * did1 + + src_desc.GetStride(I2) * did2 + src_desc.GetStride(I3) * did3; + + const unsigned dindex = + dst_desc.GetStride(I0) * did0 + dst_desc.GetStride(I1) * did1 + + dst_desc.GetStride(I2) * did2 + dst_desc.GetStride(I3) * did3; + + f(p_src[dindex], p_dst[sindex]); + +#if 0 + // if(threadIdx.x == 0) + { + printf("blockwise_4d_tensor_op: 1: thread id %u, \t" + "sindex %u, p_src[sindex] %f, \t" + "dindex %u, p_dst[dindex] %f\n", + threadIdx.x, + sindex, + p_src[sindex], + dindex, + p_dst[dindex]); + } +#endif + } + } + } + } +} + +template +__device__ void threadwise_4d_tensor_op( + SrcDesc, TFloat* const __restrict__ p_src, DstDesc, TFloat* __restrict__ p_dst, F f) +{ + constexpr auto I0 = Index<0>{}; + constexpr auto I1 = Index<1>{}; + constexpr auto I2 = Index<2>{}; + constexpr auto I3 = Index<3>{}; + + constexpr auto src_desc = SrcDesc{}; + constexpr auto dst_desc = DstDesc{}; + +#if 1 + if(threadIdx.x == 0) + { + print_ConstantTensorDescriptor(src_desc); + print_ConstantTensorDescriptor(dst_desc); + } +#endif + + for(unsigned did0 = 0; did0 < src_desc.GetLength(I0); ++did0) + { + for(unsigned did1 = 0; did1 < src_desc.GetLength(I1); ++did1) + { + for(unsigned did2 = 0; did2 < src_desc.GetLength(I2); ++did2) + { + for(unsigned did3 = 0; did3 < src_desc.GetLength(I3); ++did3) + { + const unsigned sindex = + src_desc.GetStride(I0) * did0 + src_desc.GetStride(I1) * did1 + + src_desc.GetStride(I2) * did2 + src_desc.GetStride(I3) * did3; + + const unsigned dindex = + dst_desc.GetStride(I0) * did0 + dst_desc.GetStride(I1) * did1 + + dst_desc.GetStride(I2) * did2 + dst_desc.GetStride(I3) * did3; + + f(p_src[sindex], p_dst[dindex]); + +#if 0 + if(threadIdx.x == 0) + { + printf("threadwise_4d_tensor_op: 1: thread id %u, \t" + "sindex %u, p_src[sindex] %f, \t" + "dindex %u, p_dst[dindex] %f\n", + threadIdx.x, + sindex, + p_src[sindex], + dindex, + p_dst[dindex]); + } +#endif + } + } + } + } +} + +template +__device__ void threadwise_direct_convolution(InDesc, + TFloat* const __restrict__ p_in, + WeiDesc, + TFloat* const __restrict__ p_wei, + OutDesc, + TFloat* __restrict__ p_out) +{ + constexpr auto I0 = Index<0>{}; + constexpr auto I1 = Index<1>{}; + constexpr auto I2 = Index<2>{}; + constexpr auto I3 = Index<3>{}; + + constexpr auto in_desc = InDesc{}; + constexpr auto wei_desc = WeiDesc{}; + constexpr auto out_desc = OutDesc{}; + +#if 1 + if(threadIdx.x == 0) + { + print_ConstantTensorDescriptor(in_desc); + print_ConstantTensorDescriptor(wei_desc); + print_ConstantTensorDescriptor(out_desc); + } +#endif + + for(unsigned n = 0; n < out_desc.GetLength(I0); ++n) + { + for(unsigned k = 0; k < out_desc.GetLength(I1); ++k) + { + for(unsigned ho = 0; ho < out_desc.GetLength(I2); ++ho) + { + for(unsigned wo = 0; wo < out_desc.GetLength(I3); ++wo) + { + for(unsigned c = 0; c < wei_desc.GetLength(I1); ++c) + { + for(unsigned s = 0; s < wei_desc.GetLength(I2); ++s) + { + for(unsigned r = 0; r < wei_desc.GetLength(I3); ++r) + { + const unsigned hi = ho + s; + const unsigned wi = wo + r; + + const unsigned in_index = + in_desc.GetStride(I0) * n + in_desc.GetStride(I1) * c + + in_desc.GetStride(I2) * hi + in_desc.GetStride(I3) * wi; + + const unsigned wei_index = + wei_desc.GetStride(I0) * k + wei_desc.GetStride(I1) * c + + wei_desc.GetStride(I2) * s + in_desc.GetStride(I3) * r; + + const unsigned out_index = + out_desc.GetStride(I0) * n + out_desc.GetStride(I1) * k + + out_desc.GetStride(I2) * ho + out_desc.GetStride(I3) * wo; + + p_out[out_index] += p_wei[wei_index] * p_in[in_index]; + +#if 0 + if(threadIdx.x == 0) + { + printf("threadwise_direct_convolution: 1: \t" + "threadIdx.x %u\t" + "out_index %u, p_out[out_index] %f, \t" + "wei_index %u, p_wei[wei_index] %f, \t" + "in_index %u, p_in[in_index] %f\n", + threadIdx.x, + out_index, + p_out[out_index], + wei_index, + p_wei[wei_index], + in_index, + p_in[in_index]); + } +#endif + } + } + } + } + } + } + } +} + +template +__device__ void blockwise_convolution(InDesc, + TFloat* const __restrict__ p_in, + WeiDesc, + TFloat* const __restrict__ p_wei, + OutDesc, + TFloat* __restrict__ p_out) +{ + constexpr auto I0 = Index<0>{}; + constexpr auto I1 = Index<1>{}; + constexpr auto I2 = Index<2>{}; + constexpr auto I3 = Index<3>{}; + + constexpr auto in_desc = InDesc{}; + constexpr auto wei_desc = WeiDesc{}; + constexpr auto out_desc = OutDesc{}; + +#if 1 + if(threadIdx.x == 0) + { + print_ConstantTensorDescriptor(in_desc); + print_ConstantTensorDescriptor(wei_desc); + print_ConstantTensorDescriptor(out_desc); + } +#endif + + constexpr auto in_thread_src_desc = make_ConstantTensorDescriptor( + Sequence<1, CPerBlockLoop, OutTileSizeH + S - 1, OutTileSizeW + R - 1>{}, + in_desc.GetStrides()); + + constexpr auto wei_thread_src_desc = + make_ConstantTensorDescriptor(Sequence<1, CPerBlockLoop, S, R>{}, wei_desc.GetStrides()); + + constexpr auto out_thread_src_desc = make_ConstantTensorDescriptor( + Sequence<1, 1, OutTileSizeH, OutTileSizeW>{}, out_desc.GetStrides()); + + constexpr auto in_thread_dst_desc = + make_ConstantTensorDescriptor(in_thread_src_desc.GetLengths()); + + constexpr auto wei_thread_dst_desc = + make_ConstantTensorDescriptor(wei_thread_src_desc.GetLengths()); + + constexpr auto out_thread_dst_desc = + make_ConstantTensorDescriptor(out_thread_src_desc.GetLengths()); + + const unsigned thread_sz = blockDim.x * blockDim.y * blockDim.z; + + const unsigned thread_id = + threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * (blockDim.y * blockDim.x); + + for(unsigned thread_work_id = thread_id; + thread_work_id < NPerBlock * KPerBlock * YPerBlock * XPerBlock; + thread_work_id += thread_sz) + { + unsigned itmp = thread_work_id; + unsigned n_thread_work_id = itmp / (KPerBlock * YPerBlock * XPerBlock); + itmp -= n_thread_work_id * (KPerBlock * YPerBlock * XPerBlock); + unsigned k_thread_work_id = itmp / (YPerBlock * XPerBlock); + itmp -= k_thread_work_id * (YPerBlock * XPerBlock); + unsigned y_thread_work_id = itmp / XPerBlock; + unsigned x_thread_work_id = itmp - y_thread_work_id * XPerBlock; + + unsigned n_thread_work_begin = n_thread_work_id * 1; + unsigned k_thread_work_begin = k_thread_work_id * 1; + unsigned ho_thread_work_begin = y_thread_work_id * OutTileSizeH; + unsigned wo_thread_work_begin = x_thread_work_id * OutTileSizeW; + + unsigned hi_thread_work_begin = ho_thread_work_begin; // minus padding + unsigned wi_thread_work_begin = wo_thread_work_begin; // minus padding + + TFloat p_in_thread[1 * CPerBlockLoop * InTileSizeH * InTileSizeW]; + TFloat p_wei_thread[1 * CPerBlockLoop * S * R]; + TFloat p_out_thread[1 * 1 * OutTileSizeH * OutTileSizeW]; + + auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; }; + + // copy input tensor into register + threadwise_4d_tensor_op( + in_thread_src_desc, + p_in + in_desc.Get1dIndex( + n_thread_work_begin, 0, hi_thread_work_begin, wi_thread_work_begin), + in_thread_dst_desc, + p_in_thread, + f_copy); + + // copy weight tensor into register + threadwise_4d_tensor_op( + wei_thread_src_desc, + p_wei + wei_desc.Get1dIndex(k_thread_work_begin, 0, 0, 0), + wei_thread_dst_desc, + p_wei_thread, + f_copy); + + // copy output tensor into register + threadwise_4d_tensor_op(out_thread_src_desc, + p_out + out_desc.Get1dIndex(n_thread_work_begin, + k_thread_work_begin, + ho_thread_work_begin, + wo_thread_work_begin), + out_thread_dst_desc, + p_out_thread, + f_copy); + + // threadwise convolution + threadwise_direct_convolution(in_thread_dst_desc, + p_in_thread, + wei_thread_dst_desc, + p_wei_thread, + out_thread_dst_desc, + p_out_thread); + + // accumulate output tensor into device mem + threadwise_4d_tensor_op(out_thread_dst_desc, + p_out_thread, + out_thread_src_desc, + p_out + out_desc.Get1dIndex(n_thread_work_begin, + k_thread_work_begin, + ho_thread_work_begin, + wo_thread_work_begin), + f_copy); + } +} + +template +__global__ void gridwise_convolution(InDesc, + TFloat* const __restrict__ p_in, + WeiDesc, + TFloat* const __restrict__ p_wei, + OutDesc, + TFloat* __restrict__ p_out) +{ + constexpr auto I0 = Index<0>{}; + constexpr auto I1 = Index<1>{}; + constexpr auto I2 = Index<2>{}; + constexpr auto I3 = Index<3>{}; + + constexpr auto in_desc = InDesc{}; + constexpr auto wei_desc = WeiDesc{}; + constexpr auto out_desc = OutDesc{}; + + constexpr unsigned S = wei_desc.GetLength(I2); + constexpr unsigned R = wei_desc.GetLength(I3); + +#if 1 + if(threadIdx.x == 0) + { + print_ConstantTensorDescriptor(in_desc); + print_ConstantTensorDescriptor(wei_desc); + print_ConstantTensorDescriptor(out_desc); + } +#endif + + constexpr unsigned NBlockWork = (in_desc.GetLength(I0) + NPerBlock - 1) / NPerBlock; + constexpr unsigned YBlockWork = (in_desc.GetLength(I2) + YPerBlock - 1) / YPerBlock; + constexpr unsigned XBlockWork = (in_desc.GetLength(I3) + XPerBlock - 1) / XPerBlock; + + constexpr unsigned KBlockWork = (wei_desc.GetLength(I1) + KPerBlock - 1) / KPerBlock; + + const unsigned block_id = + blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * (gridDim.y * gridDim.x); + + constexpr auto in_block_desc = + make_ConstantTensorDescriptor(Sequence{}); + constexpr auto wei_block_desc = + make_ConstantTensorDescriptor(Sequence{}); + + constexpr auto out_block_desc = make_ConstantTensorDescriptor( + Sequence{}); + + __shared__ TFloat p_in_block[NPerBlock * CPerBlockLoop * (YPerBlock * OutTileSizeH + S - 1) * + (XPerBlock * OutTileSizeW + R - 1)]; + __shared__ TFloat p_wei_block[KPerBlock * CPerBlockLoop * S * R]; + __shared__ TFloat p_out_block[NPerBlock * KPerBlock * (YPerBlock * OutTileSizeH) * + (XPerBlock * OutTileSizeW)]; + + unsigned itmp = block_id; + unsigned n_block_work_id = itmp / (KBlockWork * YBlockWork * XBlockWork); + itmp -= n_block_work_id * (KBlockWork * YBlockWork * XBlockWork); + unsigned k_block_work_id = itmp / (YBlockWork * XBlockWork); + itmp -= k_block_work_id * (YBlockWork * XBlockWork); + unsigned y_block_work_id = itmp / XBlockWork; + unsigned x_block_work_id = itmp - y_block_work_id * XBlockWork; + + unsigned n_block_work_begin = n_block_work_id * NPerBlock; + unsigned k_block_work_begin = k_block_work_id * KPerBlock; + unsigned y_block_work_begin = y_block_work_id * YPerBlock; + unsigned x_block_work_begin = x_block_work_id * XPerBlock; + + unsigned ho_block_work_begin = y_block_work_begin * OutTileSizeH; + unsigned wo_block_work_begin = x_block_work_begin * OutTileSizeW; + + unsigned hi_block_work_begin = ho_block_work_begin; // minus padding + unsigned wi_block_work_begin = wo_block_work_begin; // minus padding + + for(unsigned c_block_work_begin = 0; c_block_work_begin < in_desc.GetLength(I1); + c_block_work_begin += CPerBlockLoop) + { + auto f_copy = [](const TFloat& src, TFloat& dst) { dst = src; }; + + // copy input tensor to LDS + blockwise_4d_tensor_op(in_desc, + p_in + in_desc.Get1dIndex(n_block_work_begin, + c_block_work_begin, + hi_block_work_begin, + wi_block_work_begin), + in_block_desc, + p_in_block, + f_copy); + + // copy weight tensor to LDS + blockwise_4d_tensor_op( + wei_desc, + p_wei + wei_desc.Get1dIndex(k_block_work_begin, c_block_work_begin, 0, 0), + wei_block_desc, + p_wei_block, + f_copy); + + // copy output tensor to LDS + blockwise_4d_tensor_op(out_desc, + p_out + out_desc.Get1dIndex(n_block_work_begin, + k_block_work_begin, + ho_block_work_begin, + wo_block_work_begin), + out_block_desc, + p_out_block, + f_copy); + + __syncthreads(); + + // blockwise convolution + blockwise_convolution( + in_block_desc, p_in_block, wei_block_desc, p_wei_block, out_block_desc, p_out_block); + + __syncthreads(); + + // accum output tensor from LDS to device mem + blockwise_4d_tensor_op(out_block_desc, + p_out_block, + out_desc, + p_out + out_desc.Get1dIndex(n_block_work_begin, + k_block_work_begin, + ho_block_work_begin, + wo_block_work_begin), + f_copy); + } +} diff --git a/src/include/constant_tensor_descriptor.cuh b/src/include/constant_tensor_descriptor.cuh new file mode 100644 index 0000000000..37e98a82c1 --- /dev/null +++ b/src/include/constant_tensor_descriptor.cuh @@ -0,0 +1,169 @@ +#pragma once +#include "helper_cuda.h" + +template +struct Constant +{ + const T mValue = N; +}; + +template +using Index = Constant; + +template +struct Sequence +{ + static constexpr unsigned nDim = sizeof...(Is); + + const unsigned mData[nDim] = {Is...}; + + template + __host__ __device__ constexpr unsigned Get(Index) const + { + return mData[I]; + } +}; + +#if 0 +template +void for_each(F f, std::integer_sequence) +{ + f(Is)...; +} + +template +void for_n_time(F f, Constant) +{ + for_each(f, std::make_integer_sequence{}); +} +#endif + +template +struct ConstantTensorDescriptor +{ + static constexpr unsigned nDim = Lengths::nDim; + using NDimConstant = Index; + + __host__ __device__ constexpr ConstantTensorDescriptor() + { + static_assert(Lengths::nDim == Strides::nDim, "nDim not consistent"); + } + + __host__ __device__ constexpr unsigned GetDimension() const { return nDim; } + + __host__ __device__ constexpr Lengths GetLengths() const { return Lengths{}; } + + __host__ __device__ constexpr Strides GetStrides() const { return Strides{}; } + + template + __host__ __device__ constexpr unsigned GetLength(Index) const + { + return Lengths{}.Get(Index{}); + } + + template + __host__ __device__ constexpr unsigned GetStride(Index) const + { + return Strides{}.Get(Index{}); + } + +#if 0 + template + __host__ __device__ unsigned Get1dIndex(Is... is) const + { + static_assert(nDim == sizeof...(Is), "nDim not consistent"); + const unsigned iss[nDim] = {static_cast(is)...}; + unsigned idx = 0; + for_n_time([&](auto iDim) { idx += iss[iDim] * GetStride(); }, NDimConstant{}); + return idx; + } +#elif 1 + // this is ugly, only for 4d + __host__ __device__ unsigned Get1dIndex(unsigned n, unsigned c, unsigned h, unsigned w) const + { + constexpr auto I0 = Index<0>{}; + constexpr auto I1 = Index<1>{}; + constexpr auto I2 = Index<2>{}; + constexpr auto I3 = Index<3>{}; + + static_assert(nDim == 4, "nDim not consistent"); + return n * GetStride(I0) + c * GetStride(I1) + h * GetStride(I2) + w * GetStride(I3); + } +#endif +}; + +// this is ugly, only for 4d +template +__host__ __device__ constexpr auto calculate_default_strides(Sequence) +{ + return Sequence{}; +} + +template +__host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths) +{ + using Strides = decltype(calculate_default_strides(Lengths{})); + return ConstantTensorDescriptor{}; +} + +template +__host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Strides) +{ + return ConstantTensorDescriptor{}; +} + +// this is ugly, only for 4d +template +__host__ __device__ constexpr auto get_output_4d_tensor_descriptor(InDesc, WeiDesc) +{ + constexpr auto in_desc = InDesc{}; + constexpr auto wei_desc = WeiDesc{}; + + constexpr auto I0 = Index<0>{}; + constexpr auto I1 = Index<1>{}; + constexpr auto I2 = Index<2>{}; + constexpr auto I3 = Index<3>{}; + + static_assert(in_desc.GetDimension() == 4, "input nDim is not 4"); + static_assert(wei_desc.GetDimension() == 4, "weight nDim is not 4"); + static_assert(in_desc.GetLength(I1) == wei_desc.GetLength(I1), + "input & weight dimension not consistent"); + + constexpr auto N = in_desc.GetLength(I0); + constexpr auto HI = in_desc.GetLength(I2); + constexpr auto WI = in_desc.GetLength(I3); + + constexpr auto K = wei_desc.GetLength(I0); + constexpr auto S = wei_desc.GetLength(I2); + constexpr auto R = wei_desc.GetLength(I3); + + constexpr auto HO = HI - S + 1; + constexpr auto WO = WI - R + 1; + + return make_ConstantTensorDescriptor(Sequence{}); +} + +// this is ugly, only for 4d +template +__host__ __device__ void print_ConstantTensorDescriptor(TDesc) +{ + constexpr auto desc = TDesc{}; + + constexpr auto I0 = Index<0>{}; + constexpr auto I1 = Index<1>{}; + constexpr auto I2 = Index<2>{}; + constexpr auto I3 = Index<3>{}; + + static_assert(desc.GetDimension() == 4, "dim is not 4"); + + printf("dim %u, lengths {%u %u %u %u}, strides {%u %u %u %u}\n", + desc.GetDimension(), + desc.GetLength(I0), + desc.GetLength(I1), + desc.GetLength(I2), + desc.GetLength(I3), + desc.GetStride(I0), + desc.GetStride(I1), + desc.GetStride(I2), + desc.GetStride(I3)); +} diff --git a/src/include/device_tensor.cuh b/src/include/device_tensor.cuh deleted file mode 100644 index f715dc2ff7..0000000000 --- a/src/include/device_tensor.cuh +++ /dev/null @@ -1,30 +0,0 @@ -#pragma once -#include -#include "helper_cuda.h" -#include "tensor.hpp" - -template -struct DeviceTensorDescriptor -{ - __host__ __device__ DeviceTensorDescriptor() = default; - - __host__ DeviceTensorDescriptor(const TensorDescriptor& host_desc) - { - assert(NDim == host_desc.GetDimension()); - std::copy(host_desc.GetLengths().begin(), host_desc.GetLengths().end(), mpLengths); - std::copy(host_desc.GetStrides().begin(), host_desc.GetStrides().end(), mpStrides); - } - - __host__ __device__ unsigned GetLength(unsigned i) const { return mpLengths[i]; } - - __host__ __device__ unsigned GetStride(unsigned i) const { return mpStrides[i]; } - - // this is ugly - __host__ __device__ unsigned Get1dIndex(unsigned n, unsigned c, unsigned h, unsigned w) const - { - return n * mpStrides[0] + c * mpStrides[1] + h * mpStrides[2] + w * mpStrides[3]; - } - - unsigned mpLengths[NDim]; - unsigned mpStrides[NDim]; -}; diff --git a/src/include/device_tensor_descriptor.cuh b/src/include/device_tensor_descriptor.cuh new file mode 100644 index 0000000000..403e547a33 --- /dev/null +++ b/src/include/device_tensor_descriptor.cuh @@ -0,0 +1,60 @@ +#pragma once +#include +#include "constant_tensor_descriptor.cuh" +#include "helper_cuda.h" +#include "tensor.hpp" + +template +struct DeviceTensorDescriptor +{ + __host__ __device__ DeviceTensorDescriptor() = default; + + __host__ DeviceTensorDescriptor(const TensorDescriptor& host_desc) + { + assert(NDim == host_desc.GetDimension()); + std::copy(host_desc.GetLengths().begin(), host_desc.GetLengths().end(), mpLengths); + std::copy(host_desc.GetStrides().begin(), host_desc.GetStrides().end(), mpStrides); + } + + __host__ __device__ unsigned GetLength(unsigned i) const { return mpLengths[i]; } + + __host__ __device__ unsigned GetStride(unsigned i) const { return mpStrides[i]; } + + // this is ugly, only for 4d + __host__ __device__ unsigned Get1dIndex(unsigned n, unsigned c, unsigned h, unsigned w) const + { + return n * mpStrides[0] + c * mpStrides[1] + h * mpStrides[2] + w * mpStrides[3]; + } + + unsigned mpLengths[NDim]; + unsigned mpStrides[NDim]; +}; + +// this is ugly, only for 4d +template +__host__ __device__ auto make_DeviceTensorDescriptor(TConstTensorDesc) +{ + static_assert(TConstTensorDesc::nDim == 4, "nDim is not 4"); + + constexpr auto I0 = Index<0>{}; + constexpr auto I1 = Index<1>{}; + constexpr auto I2 = Index<2>{}; + constexpr auto I3 = Index<3>{}; + constexpr auto const_desc = TConstTensorDesc{}; + + constexpr auto ndim = const_desc.GetDimension(); + + auto desc = DeviceTensorDescriptor{}; + + desc.mpLengths[0] = const_desc.GetLength(I0); + desc.mpLengths[1] = const_desc.GetLength(I1); + desc.mpLengths[2] = const_desc.GetLength(I2); + desc.mpLengths[3] = const_desc.GetLength(I3); + + desc.mpStrides[0] = const_desc.GetStride(I0); + desc.mpStrides[1] = const_desc.GetStride(I1); + desc.mpStrides[2] = const_desc.GetStride(I2); + desc.mpStrides[3] = const_desc.GetStride(I3); + + return desc; +} diff --git a/src/include/direct_convolution.cuh b/src/include/direct_convolution.cuh index b71985c3c1..55ac787061 100644 --- a/src/include/direct_convolution.cuh +++ b/src/include/direct_convolution.cuh @@ -1,5 +1,5 @@ #pragma once -#include "device_tensor.cuh" +#include "device_tensor_descriptor.cuh" template & src_desc TFloat* __restrict__ p_dst, F f) { -#if 1 +#if 0 if(threadIdx.x == 0) { printf("blockwise_4d_tensor_op: 0: \t" @@ -80,7 +80,7 @@ __device__ void blockwise_4d_tensor_op(const DeviceTensorDescriptor<4>& src_desc f(p_src[dindex], p_dst[sindex]); -#if 1 +#if 0 // if(threadIdx.x == 0) { printf("blockwise_4d_tensor_op: 1: thread id %u, \t" @@ -106,7 +106,7 @@ __device__ void threadwise_4d_tensor_op(const DeviceTensorDescriptor<4>& src_des TFloat* __restrict__ p_dst, F f) { -#if 1 +#if 0 if(threadIdx.x == 0) { printf("threadwise_4d_tensor_op: 0: \t" @@ -151,7 +151,7 @@ __device__ void threadwise_4d_tensor_op(const DeviceTensorDescriptor<4>& src_des f(p_src[sindex], p_dst[dindex]); -#if 1 +#if 0 if(threadIdx.x == 0) { printf("threadwise_4d_tensor_op: 1: thread id %u, \t" @@ -178,7 +178,7 @@ __device__ void threadwise_direct_convolution(const DeviceTensorDescriptor<4>& i const DeviceTensorDescriptor<4>& out_desc, TFloat* __restrict__ p_out) { -#if 1 +#if 0 if(threadIdx.x == 0) { printf("threadwise_direct_convolution: 0: \t" @@ -212,7 +212,7 @@ __device__ void threadwise_direct_convolution(const DeviceTensorDescriptor<4>& i out_desc.GetStride(2), out_desc.GetStride(3)); } -#elif 1 +#elif 0 { printf("threadwise_direct_convolution: 0: \t" "threadIdx.x %u \t" @@ -275,7 +275,7 @@ __device__ void threadwise_direct_convolution(const DeviceTensorDescriptor<4>& i p_out[out_index] += p_wei[wei_index] * p_in[in_index]; -#if 1 +#if 0 if(threadIdx.x == 0) { printf("threadwise_direct_convolution: 1: \t" @@ -320,7 +320,7 @@ __device__ void blockwise_convolution(const DeviceTensorDescriptor<4>& in_desc, const DeviceTensorDescriptor<4>& out_desc, TFloat* __restrict__ p_out) { -#if 1 +#if 0 if(threadIdx.x == 0) { printf("blockwise_convolution: 0: \t" @@ -501,7 +501,7 @@ __global__ void gridwise_convolution(const DeviceTensorDescriptor<4> in_desc, const DeviceTensorDescriptor<4> out_desc, TFloat* __restrict__ p_out) { -#if 1 +#if 0 if(threadIdx.x == 0) { printf("gridwise_convolution: 0: \t" diff --git a/src/include/tensor.hpp b/src/include/tensor.hpp index 4bcd81b122..39e8949a21 100644 --- a/src/include/tensor.hpp +++ b/src/include/tensor.hpp @@ -69,28 +69,25 @@ auto construct_f_unpack_args(F, T args) struct TensorDescriptor { TensorDescriptor() = delete; - TensorDescriptor(DataType_t t, std::initializer_list lens); - TensorDescriptor(DataType_t t, - std::initializer_list lens, + TensorDescriptor(std::initializer_list lens); + TensorDescriptor(std::initializer_list lens, std::initializer_list strides); - TensorDescriptor(DataType_t t, std::vector lens, std::vector strides); + TensorDescriptor(std::vector lens, std::vector strides); void CalculateStrides(); template - TensorDescriptor(DataType_t t, const Range& lens) - : mLens(lens.begin(), lens.end()), mDataType(t) + TensorDescriptor(const Range& lens) : mLens(lens.begin(), lens.end()) { this->CalculateStrides(); } template - TensorDescriptor(DataType_t t, const Range1& lens, const Range2& strides) - : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end()), mDataType(t) + TensorDescriptor(const Range1& lens, const Range2& strides) + : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end()) { } - DataType_t GetDataType() const; std::size_t GetDimension() const; std::size_t GetElementSize() const; std::size_t GetElementSpace() const; @@ -107,7 +104,6 @@ struct TensorDescriptor } private: - DataType_t mDataType; std::vector mLens; std::vector mStrides; }; @@ -220,22 +216,23 @@ template struct Tensor { template - Tensor(std::initializer_list lens) - : mDesc(DataType{}, lens), mData(mDesc.GetElementSpace()) + Tensor(std::initializer_list lens) : mDesc(lens), mData(mDesc.GetElementSpace()) { } template - Tensor(std::vector lens) : mDesc(DataType{}, lens), mData(mDesc.GetElementSpace()) + Tensor(std::vector lens) : mDesc(lens), mData(mDesc.GetElementSpace()) { } template Tensor(std::vector lens, std::vector strides) - : mDesc(DataType{}, lens, strides), mData(mDesc.GetElementSpace()) + : mDesc(lens, strides), mData(mDesc.GetElementSpace()) { } + Tensor(const TensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {} + template void GenerateTensorValue(G g, std::size_t num_thread = 1) { diff --git a/src/tensor.cpp b/src/tensor.cpp index 852d0b8046..ee3d39837d 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -3,16 +3,13 @@ #include "tensor.hpp" -TensorDescriptor::TensorDescriptor(DataType_t t, std::initializer_list lens) - : mLens(lens), mDataType(t) +TensorDescriptor::TensorDescriptor(std::initializer_list lens) : mLens(lens) { this->CalculateStrides(); } -TensorDescriptor::TensorDescriptor(DataType_t t, - std::vector lens, - std::vector strides) - : mLens(lens), mStrides(strides), mDataType(t) +TensorDescriptor::TensorDescriptor(std::vector lens, std::vector strides) + : mLens(lens), mStrides(strides) { } @@ -28,8 +25,6 @@ void TensorDescriptor::CalculateStrides() mLens.rbegin(), mLens.rend() - 1, mStrides.rbegin() + 1, std::multiplies()); } -DataType_t TensorDescriptor::GetDataType() const { return mDataType; } - std::size_t TensorDescriptor::GetDimension() const { return mLens.size(); } std::size_t TensorDescriptor::GetElementSize() const