diff --git a/driver/conv.cu b/driver/conv.cu index 0dc8a03b48..545fa4419a 100644 --- a/driver/conv.cu +++ b/driver/conv.cu @@ -5,6 +5,7 @@ #include "nvToolsExt.h" #include "tensor.hpp" #include "constant_tensor_descriptor.cuh" +#include "conv_common.cuh" #include "device_direct_convolution_1.cuh" #include "device_direct_convolution_2.cuh" //#include "device_implicit_gemm_convolution.cuh" @@ -367,7 +368,7 @@ int main() auto in_desc = make_ConstantTensorDescriptor(Sequence{}); auto wei_desc = make_ConstantTensorDescriptor(Sequence{}); - auto out_desc = get_convolution_output_4d_tensor_descriptor(in_desc, wei_desc); + auto out_desc = get_convolution_output_default_4d_tensor_descriptor(in_desc, wei_desc); ostream_ConstantTensorDescriptor(in_desc, std::cout << "in_desc: "); ostream_ConstantTensorDescriptor(wei_desc, std::cout << "wei_desc: "); diff --git a/src/include/blockwise_direct_convolution.cuh b/src/include/blockwise_direct_convolution.cuh index 0d6c749648..85bf1bddae 100644 --- a/src/include/blockwise_direct_convolution.cuh +++ b/src/include/blockwise_direct_convolution.cuh @@ -59,7 +59,7 @@ __device__ void blockwise_direct_convolution(InBlockDesc, make_ConstantTensorDescriptor(Sequence{}); constexpr auto out_thread_desc = - get_convolution_output_4d_tensor_descriptor(in_thread_desc, wei_thread_desc); + get_convolution_output_default_4d_tensor_descriptor(in_thread_desc, wei_thread_desc); constexpr auto in_thread_block_desc = make_ConstantTensorDescriptor(in_thread_desc.GetLengths(), in_block_desc.GetStrides()); diff --git a/src/include/constant_tensor_descriptor.cuh b/src/include/constant_tensor_descriptor.cuh index b3b508b67f..a04ba63dd3 100644 --- a/src/include/constant_tensor_descriptor.cuh +++ b/src/include/constant_tensor_descriptor.cuh @@ -23,14 +23,6 @@ struct Sequence return mData[I]; } - template - __host__ __device__ constexpr auto GetConstant(Number) const - { - constexpr unsigned N = Get(I); - - return Number{}; - } - template __host__ __device__ constexpr auto Reorder(Number, Number) const { @@ -61,17 +53,15 @@ struct Sequence return Sequence{}; } - template - __host__ __device__ constexpr auto - Reorder(Number, Number, Number, Number, Number) const + template + __host__ __device__ constexpr auto Reorder(Sequence) const { constexpr unsigned IR0 = Get(Number{}); constexpr unsigned IR1 = Get(Number{}); constexpr unsigned IR2 = Get(Number{}); constexpr unsigned IR3 = Get(Number{}); - constexpr unsigned IR4 = Get(Number{}); - return Sequence{}; + return Sequence{}; } }; @@ -132,7 +122,8 @@ struct ConstantTensorDescriptor } // this is ugly, only for 4d - __host__ __device__ unsigned Get1dIndex(unsigned n, unsigned c, unsigned h, unsigned w) const + __host__ __device__ unsigned + Get1dIndex(unsigned i0, unsigned i1, unsigned i2, unsigned i3) const { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -140,24 +131,24 @@ struct ConstantTensorDescriptor constexpr auto I3 = Number<3>{}; static_assert(nDim == 4, "nDim is not 4"); - return n * GetStride(I0) + c * GetStride(I1) + h * GetStride(I2) + w * GetStride(I3); - } - - template - __host__ __device__ constexpr auto Reorder(Is... is) const - { - constexpr auto lengths = Lengths{}.Reorder(is...); - constexpr auto strides = Strides{}.Reorder(is...); - - return ConstantTensorDescriptor{}; + return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2) + i3 * GetStride(I3); } }; // this is ugly, only for 4d -template -__host__ __device__ constexpr auto calculate_default_strides(Sequence) +template +__host__ __device__ constexpr auto calculate_default_strides(Sequence) { - return Sequence{}; + return Sequence{}; +} + +// this is ugly, only for 4d +template +__host__ __device__ constexpr auto calculate_full_lengths(Sequence) +{ + static_assert((S0 % S1 == 0) && (S1 % S2 == 0) && (S2 % S3 == 0), "cannot be evenly divided!"); + + return Sequence<1, S0 / S1, S1 / S2, S2 / S3>{}; } template @@ -173,37 +164,6 @@ __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Stride return ConstantTensorDescriptor{}; } -// this is ugly, only for 4d -template -__host__ __device__ constexpr auto get_convolution_output_4d_tensor_descriptor(InDesc, WeiDesc) -{ - constexpr auto in_desc = InDesc{}; - constexpr auto wei_desc = WeiDesc{}; - - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - static_assert(in_desc.GetDimension() == 4, "input nDim is not 4"); - static_assert(wei_desc.GetDimension() == 4, "weight nDim is not 4"); - static_assert(in_desc.GetLength(I1) == wei_desc.GetLength(I1), - "input & weight dimension not consistent"); - - constexpr auto N = in_desc.GetLength(I0); - constexpr auto HI = in_desc.GetLength(I2); - constexpr auto WI = in_desc.GetLength(I3); - - constexpr auto K = wei_desc.GetLength(I0); - constexpr auto S = wei_desc.GetLength(I2); - constexpr auto R = wei_desc.GetLength(I3); - - constexpr auto HO = HI - S + 1; - constexpr auto WO = WI - R + 1; - - return make_ConstantTensorDescriptor(Sequence{}); -} - // this is ugly, only for 4d template __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s) diff --git a/src/include/conv_common.cuh b/src/include/conv_common.cuh new file mode 100644 index 0000000000..81f0b167af --- /dev/null +++ b/src/include/conv_common.cuh @@ -0,0 +1,34 @@ +#pragma once +#include "constant_tensor_descriptor.cuh" + +// this is ugly, only for 4d +template +__host__ __device__ constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc, + WeiDesc) +{ + constexpr auto in_desc = InDesc{}; + constexpr auto wei_desc = WeiDesc{}; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + static_assert(in_desc.GetDimension() == 4, "input nDim is not 4"); + static_assert(wei_desc.GetDimension() == 4, "weight nDim is not 4"); + static_assert(in_desc.GetLength(I1) == wei_desc.GetLength(I1), + "input & weight dimension not consistent"); + + constexpr auto N = in_desc.GetLength(I0); + constexpr auto HI = in_desc.GetLength(I2); + constexpr auto WI = in_desc.GetLength(I3); + + constexpr auto K = wei_desc.GetLength(I0); + constexpr auto S = wei_desc.GetLength(I2); + constexpr auto R = wei_desc.GetLength(I3); + + constexpr auto HO = HI - S + 1; + constexpr auto WO = WI - R + 1; + + return make_ConstantTensorDescriptor(Sequence{}); +} diff --git a/src/include/gridwise_direct_convolution_2.cuh b/src/include/gridwise_direct_convolution_2.cuh index 909cdf33f6..e0ef90c0aa 100644 --- a/src/include/gridwise_direct_convolution_2.cuh +++ b/src/include/gridwise_direct_convolution_2.cuh @@ -69,8 +69,8 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc, constexpr auto wei_thread_block_desc = make_ConstantTensorDescriptor( Sequence{}, wei_block_desc.GetStrides()); - constexpr auto out_thread_desc = - get_convolution_output_4d_tensor_descriptor(in_thread_block_desc, wei_thread_block_desc); + constexpr auto out_thread_desc = get_convolution_output_default_4d_tensor_descriptor( + in_thread_block_desc, wei_thread_block_desc); // register Float p_out_thread[out_thread_desc.GetElementSpace()];