diff --git a/driver/conv.cu b/driver/conv.cu index 12ff9a0879..0265730949 100644 --- a/driver/conv.cu +++ b/driver/conv.cu @@ -391,7 +391,7 @@ int main() constexpr unsigned HPad = 0; constexpr unsigned WPad = 0; -#elif 0 +#elif 1 // 3x3, 34x34 constexpr unsigned N = 64; constexpr unsigned C = 256; @@ -484,7 +484,7 @@ int main() constexpr unsigned HPad = 1; constexpr unsigned WPad = 1; -#elif 1 +#elif 0 // 1x1 filter, 28x28 image constexpr unsigned N = 16; constexpr unsigned C = 256; @@ -608,7 +608,7 @@ int main() nrepeat); #endif -#if 0 +#if 1 if(S == 3 && R == 3) { host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads); diff --git a/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh b/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh index 650ef70e06..c765f8aa58 100644 --- a/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh +++ b/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh @@ -67,7 +67,7 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc, Tensor out_knhw(make_TensorDescriptor(out_knhw_desc)); -#if 0 +#if 1 // 3x3, 34x34 constexpr unsigned BPerBlock = 128; constexpr unsigned KPerBlock = 64; @@ -86,11 +86,11 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc, constexpr unsigned WeiBlockCopyThreadPerDim0 = 4; constexpr unsigned WeiBlockCopyThreadPerDim1 = 16; - constexpr unsigned InBlockCopyDataPerRead = 2; + constexpr unsigned InBlockCopyDataPerRead = 4; constexpr unsigned WeiBlockCopyDataPerRead = 4; constexpr unsigned BlockSize = 128; -#elif 1 +#elif 0 // 1x1, 28x28 constexpr unsigned BPerBlock = 64; constexpr unsigned KPerBlock = 64; @@ -112,6 +112,29 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc, constexpr unsigned InBlockCopyDataPerRead = 4; constexpr unsigned WeiBlockCopyDataPerRead = 4; + constexpr unsigned BlockSize = 64; +#elif 1 + // 1x1, 28x28 try + constexpr unsigned BPerBlock = 64; + constexpr unsigned KPerBlock = 64; + constexpr unsigned CPerBlock = 8; + + constexpr unsigned BPerThread = 4; + constexpr unsigned KPerThread = 16; + constexpr unsigned CPerThread = 1; + + constexpr unsigned GemmThreadPerColumnPerCluster = 4; + constexpr unsigned GemmThreadPerRowPerCluster = 8; + + constexpr unsigned InBlockCopyThreadPerDim0 = 4; + constexpr unsigned InBlockCopyThreadPerDim1 = 16; + + constexpr unsigned WeiBlockCopyThreadPerDim0 = 4; + constexpr unsigned WeiBlockCopyThreadPerDim1 = 16; + + constexpr unsigned InBlockCopyDataPerRead = 4; + constexpr unsigned WeiBlockCopyDataPerRead = 4; + constexpr unsigned BlockSize = 64; #endif diff --git a/src/include/ConstantTensorDescriptor.cuh b/src/include/ConstantTensorDescriptor.cuh index 9b62832355..90bf761a94 100644 --- a/src/include/ConstantTensorDescriptor.cuh +++ b/src/include/ConstantTensorDescriptor.cuh @@ -33,15 +33,6 @@ __host__ __device__ constexpr auto calculate_default_strides_aligned(Sequence{}; } -// this is ugly, only for 4d -template -__host__ __device__ constexpr auto calculate_full_lengths(Sequence) -{ - static_assert((S0 % S1 == 0) && (S1 % S2 == 0) && (S2 % S3 == 0), "cannot be evenly divided!"); - - return Sequence<1, S0 / S1, S1 / S2, S2 / S3>{}; -} - template struct ConstantTensorDescriptor { @@ -71,7 +62,6 @@ struct ConstantTensorDescriptor return Strides{}.Get(Number{}); } - // this is ugly, only for 4d __host__ __device__ constexpr unsigned GetElementSize() const { static_assert(nDim >= 2 && nDim <= 4, "nDim"); @@ -102,16 +92,20 @@ struct ConstantTensorDescriptor } } - __host__ __device__ constexpr unsigned GetElementSpace() const + template > + __host__ __device__ constexpr unsigned GetElementSpace(Align align = Align{}) const { static_assert(nDim >= 2 && nDim <= 4, "nDim"); + constexpr unsigned align_size = align.Get(); + if(nDim == 2) { constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; - return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) + 1; + return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) + + align_size; } else if(nDim == 3) { @@ -120,7 +114,7 @@ struct ConstantTensorDescriptor constexpr auto I2 = Number<2>{}; return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) + - (GetLength(I2) - 1) * GetStride(I2) + 1; + (GetLength(I2) - 1) * GetStride(I2) + align_size; } else if(nDim == 4) { @@ -130,7 +124,8 @@ struct ConstantTensorDescriptor constexpr auto I3 = Number<3>{}; return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) + - (GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) + 1; + (GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) + + align_size; } } diff --git a/src/include/blockwise_2d_tensor_op.cuh b/src/include/blockwise_2d_tensor_op.cuh index 0f7204b4cb..593b761e43 100644 --- a/src/include/blockwise_2d_tensor_op.cuh +++ b/src/include/blockwise_2d_tensor_op.cuh @@ -399,11 +399,21 @@ struct Blockwise2dTensorCopy3 // we allow out-of-bound read from src in D1 dimension, // but we need to make sure dst stride is big enough, - // so that the out-of-bound write won't overwrite next line + // so that the out-of-bound write won't contaminate next line in dst static_assert(thread_per_d1 * DataPerRead <= DstDesc{}.GetStride(I0), - "wrong! out-of-bound write will overwrite next line!\n"); + "wrong! out-of-bound write will contaminate next line!\n"); - static_assert(thread_per_d0 >= 1, "wrong! not enough threads to cover L1 dimension\n"); + static_assert(thread_per_d0 >= 1, "wrong! not enough threads to cover one line\n"); + + constexpr unsigned num_active_thread = thread_per_d0 * thread_per_d1; + + if(BlockSize > num_active_thread) + { + if(get_thread_local_1d_id() >= num_active_thread) + { + return; + } + } const unsigned thread_id_d0 = get_thread_local_1d_id() / thread_per_d1; const unsigned thread_id_d1 = get_thread_local_1d_id() - thread_id_d0 * thread_per_d1; diff --git a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh index d33142028b..877c595ba5 100644 --- a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh +++ b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh @@ -75,27 +75,14 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc, constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence{}); // tensor view of blockwise input and weight -#if 0 - constexpr auto in_cb_block_desc = - make_ConstantTensorDescriptor(Sequence{}); -#else constexpr auto in_cb_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); -#endif -#if 0 - constexpr auto wei_ek_block_desc = - make_ConstantTensorDescriptor(Sequence{}); - - constexpr auto wei_csrk_block_desc = - make_ConstantTensorDescriptor(Sequence{}); -#else constexpr auto wei_ek_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); constexpr auto wei_csrk_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); -#endif // tensor view of threadwise output in register constexpr auto out_kb_thread_desc = @@ -203,12 +190,19 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc, GemmThreadPerRowPerCluster, true>{}; - // LDS - constexpr unsigned in_block_size = in_cb_block_desc.GetElementSpace(); - constexpr unsigned wei_block_size = wei_csrk_block_desc.GetElementSpace(); + // LDS: be careful of alignment + constexpr unsigned in_block_size = + in_cb_block_desc.GetElementSpace(Number{}); - __shared__ Float p_in_block[in_block_size]; - __shared__ Float p_wei_block[wei_block_size]; + constexpr unsigned wei_block_size = + wei_csrk_block_desc.GetElementSpace(Number{}); + + constexpr unsigned max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead + ? InBlockCopyDataPerRead + : WeiBlockCopyDataPerRead; + + __shared__ Float p_in_block[max_align * ((in_block_size + max_align - 1) / max_align)]; + __shared__ Float p_wei_block[max_align * ((wei_block_size + max_align - 1) / max_align)]; // register Float p_out_thread[out_kb_thread_desc.GetElementSpace()];