From 079d63a788f7d39381ce1513b7c9090d3ac3f439 Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Tue, 5 Feb 2019 23:19:57 -0600
Subject: [PATCH] bug fixes

Fix correctness issues and retune parameters:
- blockwise_2d_tensor_op: use float4/float2 copies only when src/dst strides
  are aligned, fix the active-thread check (> changed to >=), round
  thread_per_d1 up from CopyLengths, and assert strides are multiples of
  DataPerRead so out-of-bound writes cannot overwrite the next line.
- gridwise cnhw_csrk_knhw (and lds_pipeline): step the weight global offset by
  GetStride(I0) instead of GetStride(I2), index wei_csrk_block_desc as
  (0, s, r, 0), and use aligned LDS descriptors.
- ConstantTensorDescriptor: add calculate_default_strides_aligned and
  make_ConstantTensorDescriptor_aligned.
- driver: disable padding in the 3x3 config, drop the winograd branch, and
  retune block/thread parameters.

---
 driver/conv.cu                                |  6 +-
 ...icit_gemm_convolution_2_cnhw_csrk_knhw.cuh | 27 +++++++-
 ...icit_gemm_convolution_2_cnhw_srck_knhw.cuh | 49 ++++++--------
 src/include/ConstantTensorDescriptor.cuh      | 25 +++++++
 src/include/blockwise_2d_tensor_op.cuh        | 66 ++++++++++++-------
 ...mm_convolution_1_chwn_csrk_khwn_padded.cuh |  2 +-
 ...icit_gemm_convolution_2_cnhw_csrk_knhw.cuh | 59 +++++++++--------
 ...volution_2_cnhw_csrk_knhw_lds_pipeline.cuh |  2 +-
 ...icit_gemm_convolution_2_cnhw_srck_knhw.cuh |  2 +-
 9 files changed, 151 insertions(+), 87 deletions(-)

diff --git a/driver/conv.cu b/driver/conv.cu
index b43779d974..12ff9a0879 100644
--- a/driver/conv.cu
+++ b/driver/conv.cu
@@ -389,8 +389,8 @@ int main()
     constexpr unsigned S = 3;
     constexpr unsigned R = 3;

-    constexpr unsigned HPad = 1;
-    constexpr unsigned WPad = 1;
+    constexpr unsigned HPad = 0;
+    constexpr unsigned WPad = 0;
 #elif 0
     // 3x3, 34x34
     constexpr unsigned N = 64;
@@ -593,8 +593,6 @@ int main()
     device_implicit_gemm_convolution_2_cnhw_srck_knhw
 #elif 1
     device_implicit_gemm_convolution_2_cnhw_csrk_knhw
-#elif 0
-    device_winograd_convolution
 #endif
     (in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device, nrepeat);
diff --git a/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh b/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
index 61d1d9cd6c..650ef70e06 100644
--- a/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
+++ b/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
@@ -67,7 +67,30 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
     Tensor out_knhw(make_TensorDescriptor(out_knhw_desc));

-#if 1
+#if 0
+    // 3x3, 34x34
+    constexpr unsigned BPerBlock = 128;
+    constexpr unsigned KPerBlock = 64;
+    constexpr unsigned CPerBlock = 4;
+
+    constexpr unsigned BPerThread = 4;
+    constexpr unsigned KPerThread = 16;
+    constexpr unsigned CPerThread = 1;
+
+    constexpr unsigned GemmThreadPerColumnPerCluster = 4;
+    constexpr unsigned GemmThreadPerRowPerCluster = 8;
+
+    constexpr unsigned InBlockCopyThreadPerDim0 = 4;
+    constexpr unsigned InBlockCopyThreadPerDim1 = 16;
+
+    constexpr unsigned WeiBlockCopyThreadPerDim0 = 4;
+    constexpr unsigned WeiBlockCopyThreadPerDim1 = 16;
+
+    constexpr unsigned InBlockCopyDataPerRead = 2;
+    constexpr unsigned WeiBlockCopyDataPerRead = 4;
+
+    constexpr unsigned BlockSize = 128;
+#elif 1
     // 1x1, 28x28
     constexpr unsigned BPerBlock = 64;
     constexpr unsigned KPerBlock = 64;
@@ -120,7 +143,7 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
 #if 1
     gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw
-#elif 0
+#elif 1
     gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline
 #endif
diff --git a/driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh b/driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
     out_knhw(make_TensorDescriptor(out_knhw_desc));

 #if 0
-    constexpr unsigned BPerBlock = 256;
+    constexpr unsigned BPerBlock = 128;
     constexpr unsigned KPerBlock = 1;
     constexpr unsigned CPerBlock = 1;

-    constexpr unsigned BPerThread = 8;
+    constexpr unsigned BPerThread = 4;
     constexpr unsigned KPerThread = 1;
     constexpr unsigned CPerThread = 1;

-    constexpr unsigned GemmThreadPerColumnPerCluster = 1;
-    constexpr unsigned GemmThreadPerRowPerCluster = 4;
+    constexpr unsigned GemmThreadPerColumnPerCluster = 1;
+    constexpr unsigned GemmThreadPerRowPerCluster = 1;
+
+    constexpr unsigned InBlockCopyThreadPerDim0 = 4;
+    constexpr unsigned InBlockCopyThreadPerDim1 = 16;

     constexpr unsigned BlockSize = 32;
-#elif 0
+#elif 1
+    // 3x3, 34x34
     constexpr unsigned BPerBlock = 128;
     constexpr unsigned KPerBlock = 64;
-    constexpr unsigned CPerBlock = 2;
+    constexpr unsigned CPerBlock = 4;

-    constexpr unsigned BPerThread = 8;
-    constexpr unsigned KPerThread = 8;
+    constexpr unsigned BPerThread = 4;
+    constexpr unsigned KPerThread = 16;
     constexpr unsigned CPerThread = 1;

     constexpr unsigned GemmThreadPerColumnPerCluster = 4;
-    constexpr unsigned GemmThreadPerRowPerCluster = 4;
+    constexpr unsigned GemmThreadPerRowPerCluster = 8;

-    constexpr unsigned BlockSize = 128;
-#elif 0
-    constexpr unsigned BPerBlock = 128;
-    constexpr unsigned KPerBlock = 64;
-    constexpr unsigned CPerBlock = 2;
-
-    constexpr unsigned BPerThread = 8;
-    constexpr unsigned KPerThread = 8;
-    constexpr unsigned CPerThread = 1;
-
-    constexpr unsigned GemmThreadPerColumnPerCluster = 4;
-    constexpr unsigned GemmThreadPerRowPerCluster = 4;
-
-    constexpr unsigned InBlockCopyThreadPerDim0 = 2;
-    constexpr unsigned InBlockCopyThreadPerDim1 = 64;
+    constexpr unsigned InBlockCopyThreadPerDim0 = 4;
+    constexpr unsigned InBlockCopyThreadPerDim1 = 16;

     constexpr unsigned BlockSize = 128;
 #elif 1
     // 1x1, 28x28
     constexpr unsigned BPerBlock = 64;
-    constexpr unsigned KPerBlock = 128;
+    constexpr unsigned KPerBlock = 64;
     constexpr unsigned CPerBlock = 8;

     constexpr unsigned BPerThread = 4;
     constexpr unsigned KPerThread = 16;
-    constexpr unsigned CPerThread = 2;
+    constexpr unsigned CPerThread = 1;

-    constexpr unsigned GemmThreadPerColumnPerCluster = 8;
+    constexpr unsigned GemmThreadPerColumnPerCluster = 4;
     constexpr unsigned GemmThreadPerRowPerCluster = 8;

-    constexpr unsigned InBlockCopyThreadPerDim0 = 8;
+    constexpr unsigned InBlockCopyThreadPerDim0 = 4;
     constexpr unsigned InBlockCopyThreadPerDim1 = 16;

-    constexpr unsigned BlockSize = 128;
+    constexpr unsigned BlockSize = 64;
 #endif

     constexpr unsigned GridSize =
diff --git a/src/include/ConstantTensorDescriptor.cuh b/src/include/ConstantTensorDescriptor.cuh
index 1f82f06b2b..9b62832355 100644
--- a/src/include/ConstantTensorDescriptor.cuh
+++ b/src/include/ConstantTensorDescriptor.cuh
@@ -15,6 +15,24 @@ __host__ __device__ constexpr auto calculate_default_strides(Sequence
 {};
 }

+// this is ugly, only for 2d
+template
+__host__ __device__ constexpr auto calculate_default_strides_aligned(Sequence,
+                                                                     Number)
+{
+    constexpr unsigned L1_align = Align * ((L1 + Align - 1) / Align);
+    return Sequence{};
+}
+
+// this is ugly, only for 4d
+template
+__host__ __device__ constexpr auto calculate_default_strides_aligned(Sequence,
+                                                                     Number)
+{
+    constexpr unsigned L3_align = Align * ((L3 + Align - 1) / Align);
+    return Sequence{};
+}
+
 // this is ugly, only for 4d
 template
 __host__ __device__ constexpr auto calculate_full_lengths(Sequence)
@@ -170,6 +188,13 @@ __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Stride
     return ConstantTensorDescriptor{};
 }

+template
+__host__ __device__ constexpr auto make_ConstantTensorDescriptor_aligned(Lengths, Number)
+{
+    using Strides = decltype(calculate_default_strides_aligned(Lengths{}, Number{}));
+    return ConstantTensorDescriptor{};
+}
+
 template
 __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s)
 {
diff --git a/src/include/blockwise_2d_tensor_op.cuh b/src/include/blockwise_2d_tensor_op.cuh
index 71b916f27f..0f7204b4cb 100644
--- a/src/include/blockwise_2d_tensor_op.cuh
+++ b/src/include/blockwise_2d_tensor_op.cuh
@@ -203,6 +203,9 @@ struct Blockwise2dTensorCopy2
     {
         static_assert(is_same::value, "wrong! only support float!\n");
+        using Float4 = float4;
+        using Float2 = float2;
+
         if(get_thread_local_1d_id() >= ThreadPerDim0 * ThreadPerDim1)
             return;
@@ -212,18 +215,28 @@ struct Blockwise2dTensorCopy2
         constexpr auto src_desc = SrcDesc{};
         constexpr auto dst_desc = DstDesc{};

+        // check alignment
+        constexpr bool align_v4 =
+            src_desc.GetStride(I0) % 4 == 0 && dst_desc.GetStride(I0) % 4 == 0;
+
+        constexpr bool align_v2 =
+            src_desc.GetStride(I0) % 2 == 0 && dst_desc.GetStride(I0) % 2 == 0;
+
         constexpr unsigned L0 = SrcOpLengths{}.Get(I0);
         constexpr unsigned L1 = SrcOpLengths{}.Get(I1);

         constexpr unsigned Dim0Loop = L0 / ThreadPerDim0;
         constexpr bool d0_has_tail = (L0 > ThreadPerDim0 * Dim0Loop);

-        constexpr unsigned Dim1V4Loop = L1 / (ThreadPerDim1 * 4);
+        constexpr unsigned Dim1V4Loop = align_v4 ? L1 / (ThreadPerDim1 * 4) : 0;
+
         constexpr unsigned Dim1V2Loop =
-            (L1 - Dim1V4Loop * (ThreadPerDim1 * 4)) / (ThreadPerDim1 * 2);
+            align_v2 ? (L1 - Dim1V4Loop * (ThreadPerDim1 * 4)) / (ThreadPerDim1 * 2) : 0;
+
         constexpr unsigned Dim1V1Loop =
             (L1 - Dim1V4Loop * (ThreadPerDim1 * 4) - Dim1V2Loop * (ThreadPerDim1 * 2)) / ThreadPerDim1;
+
         constexpr bool d1_has_tail =
             (L1 > ThreadPerDim1 * (4 * Dim1V4Loop + 2 * Dim1V2Loop + Dim1V1Loop));
@@ -239,8 +252,8 @@ struct Blockwise2dTensorCopy2
                 const unsigned sindex = src_desc.Get1dIndex(did0, did1);
                 const unsigned dindex = dst_desc.Get1dIndex(did0, did1);

-                *(reinterpret_cast(p_dst + dindex)) =
-                    *(reinterpret_cast(p_src + sindex));
+                *(reinterpret_cast(p_dst + dindex)) =
+                    *(reinterpret_cast(p_src + sindex));
             }

             // v2
@@ -252,8 +265,8 @@ struct Blockwise2dTensorCopy2
                 const unsigned sindex = src_desc.Get1dIndex(did0, did1);
                 const unsigned dindex = dst_desc.Get1dIndex(did0, did1);

-                *(reinterpret_cast(p_dst + dindex)) =
-                    *(reinterpret_cast(p_src + sindex));
+                *(reinterpret_cast(p_dst + dindex)) =
+                    *(reinterpret_cast(p_src + sindex));
             }

             // v1
@@ -300,8 +313,8 @@ struct Blockwise2dTensorCopy2
                 const unsigned sindex = src_desc.Get1dIndex(did0, did1);
                 const unsigned dindex = dst_desc.Get1dIndex(did0, did1);

-                *(reinterpret_cast(p_dst + dindex)) =
-                    *(reinterpret_cast(p_src + sindex));
+                *(reinterpret_cast(p_dst + dindex)) =
+                    *(reinterpret_cast(p_src + sindex));
             }

             // v2
@@ -313,8 +326,8 @@ struct Blockwise2dTensorCopy2
                 const unsigned sindex = src_desc.Get1dIndex(did0, did1);
                 const unsigned dindex = dst_desc.Get1dIndex(did0, did1);

-                *(reinterpret_cast(p_dst + dindex)) =
-                    *(reinterpret_cast(p_src + sindex));
+                *(reinterpret_cast(p_dst + dindex)) =
+                    *(reinterpret_cast(p_src + sindex));
             }

             // v1
@@ -356,7 +369,7 @@ template
 struct Blockwise2dTensorCopy3
 {
@@ -374,16 +387,23 @@ struct Blockwise2dTensorCopy3
         static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4,
                       "wrong! only support DataPerRead == 1, 2 or 4!\n");

-        constexpr unsigned L0 = SrcOpLengths{}.Get(I0);
-        constexpr unsigned L1 = SrcOpLengths{}.Get(I1);
+        static_assert(SrcDesc{}.GetStride(I0) % DataPerRead == 0 &&
+                          DstDesc{}.GetStride(I0) % DataPerRead == 0,
+                      "src and dst stride should be multiple of DataPerRead to keep alignment");

-        static_assert(L1 % DataPerRead == 0, "wrong! only support mod(L1, DataPerRead) == 0\n");
+        constexpr unsigned L0 = CopyLengths{}.Get(I0);
+        constexpr unsigned L1 = CopyLengths{}.Get(I1);

-        constexpr unsigned thread_per_d1 = L1 / DataPerRead;
+        constexpr unsigned thread_per_d1 = (L1 + DataPerRead - 1) / DataPerRead;
         constexpr unsigned thread_per_d0 = BlockSize / thread_per_d1;

-        static_assert(thread_per_d1 <= BlockSize,
-                      "wrong! not enough threads to cover L1 dimension\n");
+        // we allow out-of-bound read from src in D1 dimension,
+        // but we need to make sure dst stride is big enough,
+        // so that the out-of-bound write won't overwrite next line
+        static_assert(thread_per_d1 * DataPerRead <= DstDesc{}.GetStride(I0),
+                      "wrong! out-of-bound write will overwrite next line!\n");
+
+        static_assert(thread_per_d0 >= 1, "wrong! not enough threads to cover L1 dimension\n");

         const unsigned thread_id_d0 = get_thread_local_1d_id() / thread_per_d1;
         const unsigned thread_id_d1 = get_thread_local_1d_id() - thread_id_d0 * thread_per_d1;
@@ -402,17 +422,17 @@ struct Blockwise2dTensorCopy3
         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};

-        constexpr unsigned L0 = SrcOpLengths{}.Get(I0);
-        constexpr unsigned L1 = SrcOpLengths{}.Get(I1);
+        constexpr unsigned L0 = CopyLengths{}.Get(I0);
+        constexpr unsigned L1 = CopyLengths{}.Get(I1);

-        constexpr unsigned thread_per_d1 = L1 / DataPerRead;
+        constexpr unsigned thread_per_d1 = (L1 + DataPerRead - 1) / DataPerRead;
         constexpr unsigned thread_per_d0 = BlockSize / thread_per_d1;

         constexpr unsigned num_active_thread = thread_per_d0 * thread_per_d1;

         if(BlockSize > num_active_thread)
         {
-            if(get_thread_local_1d_id() > num_active_thread)
+            if(get_thread_local_1d_id() >= num_active_thread)
             {
                 return;
             }
@@ -420,8 +440,6 @@ struct Blockwise2dTensorCopy3

         constexpr unsigned nloop_d0 = L0 / thread_per_d0;

-        constexpr bool has_tail_d0 = (L0 > nloop_d0 * thread_per_d0);
-
         constexpr unsigned src_loop_stride = SrcDesc{}.GetStride(I0) * thread_per_d0;
         constexpr unsigned dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;
@@ -450,6 +468,8 @@ struct Blockwise2dTensorCopy3
             }
         }

+        constexpr bool has_tail_d0 = (L0 > nloop_d0 * thread_per_d0);
+
         if(has_tail_d0)
         {
             constexpr unsigned tail_d0 = L0 - nloop_d0 * thread_per_d0;
diff --git a/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh b/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh
index e5fcdb9aaf..0c58bcc4de 100644
--- a/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh
+++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh
@@ -173,7 +173,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri
     // a series of blockwise batched GEMM
     // C_matrix += transpose(A_matrix) * B_matrix
     // A_matrix and B_matrix saved in LDS, C_matrix saved in register
-    // A_matrix[C,K] is a sub-matrix of wei_block[S,R,C,K]
+    // A_matrix[C,K] is a sub-matrix of wei_block[C,S,R,K]
     // B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
     // C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
     const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
diff --git a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
index 65d508e9ea..d33142028b 100644
--- a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
+++ b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
@@ -70,35 +70,32 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
     const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
     const unsigned b_block_data_begin = b_block_work_id * BPerBlock;

-#if 0
-    if(get_thread_local_1d_id() == 0)
-    {
-        printf("K %u B %u, BGhostRead %u\n", K, B, BGhostRead);
-
-        printf("%u %u, KBlockWork %u BBlockWork %u, k_block_data_begin %u b_block_data_begin %u\n",
-               get_block_1d_id(),
-               get_thread_local_1d_id(),
-               KBlockWork,
-               BBlockWork,
-               k_block_data_begin,
-               b_block_data_begin);
-    }
-#endif
-
     // flattend (2d) tensor view of gridwise input
-    constexpr auto in_cb_global_desc = make_ConstantTensorDescriptor(Sequence{});
-
+    constexpr auto in_cb_global_desc = make_ConstantTensorDescriptor(Sequence{});
     constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence{});

     // tensor view of blockwise input and weight
+#if 0
     constexpr auto in_cb_block_desc = make_ConstantTensorDescriptor(Sequence{});
+#else
+    constexpr auto in_cb_block_desc = make_ConstantTensorDescriptor_aligned(
+        Sequence{}, Number{});
+#endif

+#if 0
     constexpr auto wei_ek_block_desc = make_ConstantTensorDescriptor(Sequence{});
     constexpr auto wei_csrk_block_desc = make_ConstantTensorDescriptor(Sequence{});
+#else
+    constexpr auto wei_ek_block_desc = make_ConstantTensorDescriptor_aligned(
+        Sequence{}, Number{});
+
+    constexpr auto wei_csrk_block_desc = make_ConstantTensorDescriptor_aligned(
+        Sequence{}, Number{});
+#endif

     // tensor view of threadwise output in register
     constexpr auto out_kb_thread_desc =
@@ -107,8 +104,16 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
 #if 0
     if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
     {
+        print_ConstantTensorDescriptor(in_cnhw_global_desc, "in_cnhw_global_desc");
+        print_ConstantTensorDescriptor(wei_csrk_global_desc, "wei_csrk_global_desc");
+        print_ConstantTensorDescriptor(out_knhw_global_desc, "out_knhw_global_desc");
+
+        print_ConstantTensorDescriptor(in_cb_global_desc, "in_cb_global_desc");
+        print_ConstantTensorDescriptor(wei_ek_global_desc, "wei_ek_global_desc");
+
         print_ConstantTensorDescriptor(in_cb_block_desc, "in_cb_block_desc");
         print_ConstantTensorDescriptor(wei_csrk_block_desc, "wei_csrk_block_desc");
+        print_ConstantTensorDescriptor(wei_ek_block_desc, "wei_ek_block_desc");
         print_ConstantTensorDescriptor(out_kb_thread_desc, "out_kb_thread_desc");

         printf("KPerBlock %u\n", KPerBlock);
@@ -120,10 +125,10 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
 #if 0
     const auto blockwise_in_copy = Blockwise2dTensorCopy1{};
+                                   Float,
+                                   decltype(in_cb_global_desc),
+                                   decltype(in_cb_block_desc),
+                                   decltype(in_cb_block_desc.GetLengths())>{};
 #elif 0
     const auto blockwise_in_copy = Blockwise2dTensorCopy2{}, Number{}); // constexpr doesn't compile
+        Number{},
+        Number{},
+        Number{}); // constexpr doesn't compile

     const auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
         Number{},
@@ -217,7 +224,7 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
     for(unsigned c_block_data_begin = 0; c_block_data_begin < C;
         c_block_data_begin += CPerBlock,
         p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0),
-        p_wei_global_block_offset += CPerBlock * wei_csrk_global_desc.GetStride(I2),
+        p_wei_global_block_offset += CPerBlock * wei_csrk_global_desc.GetStride(I0),
         __syncthreads())
     {
         // input: global mem to LDS,
@@ -233,9 +240,9 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
         {
             for(unsigned r = 0; r < R; ++r)
             {
-                auto f_accum = [](auto& c, const auto&& ab) { c += ab; };
+                auto f_accum = [](auto& acc, const auto&& v) { acc += v; };

-                blockwise_gemm.Run(p_wei_block + wei_csrk_block_desc.Get1dIndex(s, r, 0, 0),
+                blockwise_gemm.Run(p_wei_block + wei_csrk_block_desc.Get1dIndex(0, s, r, 0),
                                    p_in_block + s * Wi + r,
                                    p_out_thread,
                                    f_accum);
diff --git a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline.cuh b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline.cuh
index 465971a7f4..70cb8a465f 100644
--- a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline.cuh
+++ b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline.cuh
@@ -259,7 +259,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline
             {
                 auto f_accum = [](auto& c, const auto&& ab) { c += ab; };

-                blockwise_gemm.Run(p_wei_block_now + wei_csrk_block_desc.Get1dIndex(s, r, 0, 0),
+                blockwise_gemm.Run(p_wei_block_now + wei_csrk_block_desc.Get1dIndex(0, s, r, 0),
                                    p_in_block_now + s * Wi + r,
                                    p_out_thread,
                                    f_accum);
diff --git a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
index 20a6dff81f..c60108e003 100644
--- a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
+++ b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
@@ -108,7 +108,7 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc,
     // blockwise in copy
     // formmat is [CPerBlock,BPerBlock + BGhostRead]
-#if 1
+#if 0
     const auto blockwise_in_copy = Blockwise2dTensorCopy1