From 61ac08661d47ecf84f4e0afc59d3261c035d4226 Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Mon, 11 Feb 2019 22:36:17 -0600
Subject: [PATCH] tune for 1x1

---
 driver/conv.cu                                |  10 +-
 ...icit_gemm_convolution_2_cnhw_csrk_knhw.cuh |  48 +++-
 src/include/blockwise_2d_tensor_op.cuh        |  34 +--
 src/include/blockwise_gemm.cuh                | 240 ++++++++++++++----
 ...icit_gemm_convolution_1_chwn_csrk_khwn.cuh |   8 +-
 ...icit_gemm_convolution_2_cnhw_csrk_knhw.cuh |  14 +-
 ...ion_2_cnhw_csrk_knhw_lds_double_buffer.cuh |  18 +-
 src/include/threadwise_gemm.cuh               |  29 ++-
 8 files changed, 284 insertions(+), 117 deletions(-)

diff --git a/driver/conv.cu b/driver/conv.cu
index 249e9d0562..5d039a4f86 100644
--- a/driver/conv.cu
+++ b/driver/conv.cu
@@ -391,7 +391,7 @@ int main()
     constexpr unsigned HPad = 0;
     constexpr unsigned WPad = 0;
 
-#elif 1
+#elif 0
     // 3x3, 34x34
     constexpr unsigned N = 64;
     constexpr unsigned C = 256;
@@ -490,7 +490,7 @@ int main()
     constexpr unsigned HPad = 1;
     constexpr unsigned WPad = 1;
 
-#elif 0
+#elif 1
     // 1x1 filter, 28x28 image
     constexpr unsigned N = 16;
     constexpr unsigned C = 256;
@@ -582,7 +582,7 @@ int main()
     wei_kcsr.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
 #endif
 
-    unsigned nrepeat = 100;
+    unsigned nrepeat = 200;
 
 #if 1
 #if 0
@@ -593,11 +593,11 @@ int main()
     device_implicit_gemm_convolution_1_nchw_kcsr
 #elif 0
     device_implicit_gemm_convolution_1_nchw_srck_nkhw
-#elif 1
+#elif 0
    device_implicit_gemm_convolution_1_chwn_csrk_khwn
 #elif 0
     device_implicit_gemm_convolution_2_cnhw_srck_knhw
-#elif 0
+#elif 1
     device_implicit_gemm_convolution_2_cnhw_csrk_knhw
 #endif
     (in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device, nrepeat);
diff --git a/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh b/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
index de439b54bf..e801e15b2b 100644
--- a/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
+++ b/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
@@ -67,7 +67,7 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
 
     Tensor out_knhw(make_TensorDescriptor(out_knhw_desc));
 
-#if 1
+#if 0
     // 3x3, 34x34
     constexpr unsigned BPerBlock = 128;
     constexpr unsigned KPerBlock = 64;
@@ -78,9 +78,9 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
 
     constexpr unsigned GemmMPerThreadSubC = 4;
     constexpr unsigned GemmNPerThreadSubC = 4;
-    constexpr unsigned GemmMLevel0Cluster = 8;
+    constexpr unsigned GemmMLevel0Cluster = 4;
     constexpr unsigned GemmNLevel0Cluster = 2;
-    constexpr unsigned GemmMLevel1Cluster = 1;
+    constexpr unsigned GemmMLevel1Cluster = 2;
     constexpr unsigned GemmNLevel1Cluster = 8;
     constexpr unsigned GemmKPerThreadLoop = 1;
 
@@ -98,7 +98,7 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
     constexpr unsigned BlockSize = 128;
 
 #elif 0
-    // 1x1, 28x28
+    // 1x1, 28x28, 64 threads
     constexpr unsigned BPerBlock = 64;
     constexpr unsigned KPerBlock = 64;
     constexpr unsigned CPerBlock = 8;
@@ -108,9 +108,9 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
 
     constexpr unsigned GemmMPerThreadSubC = 4;
     constexpr unsigned GemmNPerThreadSubC = 4;
-    constexpr unsigned GemmMLevel0Cluster = 8;
+    constexpr unsigned GemmMLevel0Cluster = 4;
     constexpr unsigned GemmNLevel0Cluster = 2;
-    constexpr unsigned GemmMLevel1Cluster = 1;
+    constexpr unsigned GemmMLevel1Cluster = 2;
     constexpr unsigned GemmNLevel1Cluster = 4;
     constexpr unsigned GemmKPerThreadLoop = 1;
 
@@ -128,7 +128,37 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
     constexpr unsigned BlockSize = 64;
 
 #elif 1
-    // 1x1, 28x28 try
+    // 1x1, 28x28, 128 threads
+    constexpr unsigned BPerBlock = 64;
+    constexpr unsigned KPerBlock = 128;
+    constexpr unsigned CPerBlock = 8;
+
+    constexpr unsigned BPerThread = 8;
+    constexpr unsigned KPerThread = 8;
+
+    constexpr unsigned GemmMPerThreadSubC = 4;
+    constexpr unsigned GemmNPerThreadSubC = 4;
+    constexpr unsigned GemmMLevel0Cluster = 4;
+    constexpr unsigned GemmNLevel0Cluster = 2;
+    constexpr unsigned GemmMLevel1Cluster = 4;
+    constexpr unsigned GemmNLevel1Cluster = 4;
+    constexpr unsigned GemmKPerThreadLoop = 1;
+
+    constexpr unsigned GemmThreadPerColumnPerCluster = 8;
+    constexpr unsigned GemmThreadPerRowPerCluster = 8;
+
+    constexpr unsigned InBlockCopyThreadPerDim0 = 4;
+    constexpr unsigned InBlockCopyThreadPerDim1 = 16;
+
+    constexpr unsigned WeiBlockCopyThreadPerDim0 = 4;
+    constexpr unsigned WeiBlockCopyThreadPerDim1 = 16;
+
+    constexpr unsigned InBlockCopyDataPerRead = 4;
+    constexpr unsigned WeiBlockCopyDataPerRead = 4;
+
+    constexpr unsigned BlockSize = 128;
+#elif 1
+    // 1x1, 28x28, 256 threads
     constexpr unsigned BPerBlock = 128;
     constexpr unsigned KPerBlock = 128;
     constexpr unsigned CPerBlock = 8;
@@ -138,9 +168,9 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
 
     constexpr unsigned GemmMPerThreadSubC = 4;
     constexpr unsigned GemmNPerThreadSubC = 4;
-    constexpr unsigned GemmMLevel0Cluster = 8;
+    constexpr unsigned GemmMLevel0Cluster = 4;
     constexpr unsigned GemmNLevel0Cluster = 4;
-    constexpr unsigned GemmMLevel1Cluster = 2;
+    constexpr unsigned GemmMLevel1Cluster = 4;
     constexpr unsigned GemmNLevel1Cluster = 4;
     constexpr unsigned GemmKPerThreadLoop = 1;
 
diff --git a/src/include/blockwise_2d_tensor_op.cuh b/src/include/blockwise_2d_tensor_op.cuh
index 593b761e43..1231a3863b 100644
--- a/src/include/blockwise_2d_tensor_op.cuh
+++ b/src/include/blockwise_2d_tensor_op.cuh
@@ -70,7 +70,7 @@ template
 
 __device__ void blockwise_2d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src(
     SrcDesc,
-    Float* const __restrict__ p_src,
+    const Float* __restrict__ p_src,
     DstDesc,
     Float* __restrict__ p_dst,
     SrcOpLengths,
@@ -149,7 +149,7 @@ template
 
 __device__ void
 blockwise_2d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc,
-                                                     Float* const __restrict__ p_src,
+                                                     const Float* __restrict__ p_src,
                                                      DstDesc,
                                                      Float* __restrict__ p_dst,
                                                      SrcOpLengths,
@@ -164,7 +164,7 @@ blockwise_2d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc,
 
 template
 struct Blockwise2dTensorCopy1
 {
-    __device__ void Run(Float* const __restrict__ p_src, Float* __restrict__ p_dst) const
+    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
     {
         constexpr auto dst_from_src_reorder = Sequence<0, 1>{};
@@ -199,7 +199,7 @@ struct Blockwise2dTensorCopy2
         mThreadId1 = get_thread_local_1d_id() - mThreadId0 * ThreadPerDim1;
     }
 
-    __device__ void Run(Float* const __restrict__ p_src, Float* __restrict__ p_dst) const
+    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
     {
         static_assert(is_same<Float, float>::value,
                      "wrong! only support float!\n");
@@ -253,7 +253,7 @@ struct Blockwise2dTensorCopy2
             const unsigned dindex = dst_desc.Get1dIndex(did0, did1);
 
             *(reinterpret_cast<float2*>(p_dst + dindex)) =
-                *(reinterpret_cast<float2*>(p_src + sindex));
+                *(reinterpret_cast<const float2*>(p_src + sindex));
         }
 
         // v2
@@ -266,7 +266,7 @@ struct Blockwise2dTensorCopy2
             const unsigned dindex = dst_desc.Get1dIndex(did0, did1);
 
             *(reinterpret_cast<float2*>(p_dst + dindex)) =
-                *(reinterpret_cast<float2*>(p_src + sindex));
+                *(reinterpret_cast<const float2*>(p_src + sindex));
         }
 
         // v1
@@ -314,7 +314,7 @@ struct Blockwise2dTensorCopy2
             const unsigned dindex = dst_desc.Get1dIndex(did0, did1);
 
             *(reinterpret_cast<float4*>(p_dst + dindex)) =
-                *(reinterpret_cast<float4*>(p_src + sindex));
+                *(reinterpret_cast<const float4*>(p_src + sindex));
         }
 
         // v2
@@ -327,7 +327,7 @@ struct Blockwise2dTensorCopy2
             const unsigned dindex = dst_desc.Get1dIndex(did0, did1);
 
             *(reinterpret_cast<float4*>(p_dst + dindex)) =
-                *(reinterpret_cast<float4*>(p_src + sindex));
+                *(reinterpret_cast<const float4*>(p_src + sindex));
         }
 
         // v1
@@ -422,7 +422,7 @@ struct Blockwise2dTensorCopy3
         mDstMyThreadOffset = DstDesc{}.Get1dIndex(thread_id_d0, thread_id_d1 * DataPerRead);
     }
 
-    __device__ void Run(Float* const __restrict__ p_src, Float* __restrict__ p_dst) const
+    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
     {
         static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
 
@@ -463,14 +463,14 @@ struct Blockwise2dTensorCopy3
             else if(DataPerRead == 2)
             {
                 *(reinterpret_cast<float2*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
-                    *(reinterpret_cast<float2*>(p_src + mSrcMyThreadOffset +
-                                                iloop * src_loop_stride));
+                    *(reinterpret_cast<const float2*>(p_src + mSrcMyThreadOffset +
+                                                      iloop * src_loop_stride));
             }
             else if(DataPerRead == 4)
             {
                 *(reinterpret_cast<float4*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
-                    *(reinterpret_cast<float4*>(p_src + mSrcMyThreadOffset +
-                                                iloop * src_loop_stride));
+                    *(reinterpret_cast<const float4*>(p_src + mSrcMyThreadOffset +
+                                                      iloop * src_loop_stride));
             }
             else
             {
@@ -495,15 +495,15 @@ struct Blockwise2dTensorCopy3
                 {
                     *(reinterpret_cast<float2*>(p_dst + mDstMyThreadOffset +
                                                 nloop_d0 * dst_loop_stride)) =
-                        *(reinterpret_cast<float2*>(p_src + mSrcMyThreadOffset +
-                                                    nloop_d0 * src_loop_stride));
+                        *(reinterpret_cast<const float2*>(p_src + mSrcMyThreadOffset +
+                                                          nloop_d0 * src_loop_stride));
                 }
                 else if(DataPerRead == 4)
                 {
                     *(reinterpret_cast<float4*>(p_dst + mDstMyThreadOffset +
                                                 nloop_d0 * dst_loop_stride)) =
-                        *(reinterpret_cast<float4*>(p_src + mSrcMyThreadOffset +
-                                                    nloop_d0 * src_loop_stride));
+                        *(reinterpret_cast<const float4*>(p_src + mSrcMyThreadOffset +
+                                                          nloop_d0 * src_loop_stride));
                 }
                 else
                 {
diff --git a/src/include/blockwise_gemm.cuh b/src/include/blockwise_gemm.cuh
index ef5c092de4..802bade82c 100644
--- a/src/include/blockwise_gemm.cuh
+++ b/src/include/blockwise_gemm.cuh
@@ -29,8 +29,8 @@ struct Blockwise1dStridedBatchedGemmBlockABlockBThreadC
 
     __device__ Blockwise1dStridedBatchedGemmBlockABlockBThreadC()
     {
-        const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
-        const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
+        constexpr auto a_block_mtx = BlockMatrixA{};
+        constexpr auto b_block_mtx = BlockMatrixB{};
 
         const auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());
 
@@ -66,8 +66,8 @@ struct Blockwise1dStridedBatchedGemmBlockABlockBThreadC
 
         if(TransA && (!TransB) && (!TransC))
         {
-            const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
-            const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
+            constexpr auto a_block_mtx = BlockMatrixA{};
+            constexpr auto b_block_mtx = BlockMatrixB{};
 
             static_assert(a_block_mtx.NRow() == b_block_mtx.NRow(),
                          "wrong! k dimension not consistent!");
@@ -75,7 +75,7 @@ struct Blockwise1dStridedBatchedGemmBlockABlockBThreadC
             constexpr unsigned MPerBlock = a_block_mtx.NCol();
             constexpr unsigned NPerBlock = b_block_mtx.NCol();
 
-            const auto c_thread_mtx = ThreadMatrixC{}; // constexpr doesn't compile
+            constexpr auto c_thread_mtx = ThreadMatrixC{};
 
             // divide thread work
             constexpr unsigned MPerThread = c_thread_mtx.NRow();
@@ -117,9 +117,9 @@ struct Blockwise1dStridedBatchedGemmBlockABlockBThreadC
     }
 
     template <class FloatA, class FloatB, class FloatC, class Accumulator>
-    __device__ void Run(FloatA* const p_a_block,
-                        FloatB* const p_b_block,
-                        FloatC* p_c_thread,
+    __device__ void Run(const FloatA* __restrict__ p_a_block,
+                        const FloatB* __restrict__ p_b_block,
+                        FloatC* __restrict__ p_c_thread,
                         Accumulator f_accum) const
     {
         if(TransA && (!TransB) && (!TransC))
@@ -243,8 +243,8 @@ struct BlockwiseGemmBlockABlockBThreadC
 
     __device__ BlockwiseGemmBlockABlockBThreadC()
     {
-        const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
-        const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
+        constexpr auto a_block_mtx = BlockMatrixA{};
+        constexpr auto b_block_mtx = BlockMatrixB{};
 
         const auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());
 
@@ -278,8 +278,8 @@ struct BlockwiseGemmBlockABlockBThreadC
 
         if(TransA && (!TransB) && (!TransC))
        {
-            constexpr auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
-            constexpr auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
+            constexpr auto a_block_mtx = BlockMatrixA{};
+            constexpr auto b_block_mtx = BlockMatrixB{};
 
             static_assert(a_block_mtx.NRow() == b_block_mtx.NRow(),
                           "wrong! k dimension not consistent!");
@@ -287,7 +287,7 @@ struct BlockwiseGemmBlockABlockBThreadC
             constexpr unsigned MPerBlock = a_block_mtx.NCol();
             constexpr unsigned NPerBlock = b_block_mtx.NCol();
 
-            constexpr auto c_thread_mtx = ThreadMatrixC{}; // constexpr doesn't compile
+            constexpr auto c_thread_mtx = ThreadMatrixC{};
 
             // divide thread work
             constexpr unsigned MPerThread = c_thread_mtx.NRow();
@@ -367,9 +367,9 @@ struct BlockwiseGemmBlockABlockBThreadC
     }
 
     template <class FloatA, class FloatB, class FloatC, class Accumulator>
-    __device__ void Run(FloatA* const p_a_block,
-                        FloatB* const p_b_block,
-                        FloatC* p_c_thread,
+    __device__ void Run(const FloatA* __restrict__ p_a_block,
+                        const FloatB* __restrict__ p_b_block,
+                        FloatC* __restrict__ p_c_thread,
                         Accumulator f_accum) const
     {
         if(TransA && (!TransB) && (!TransC))
@@ -459,9 +459,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
 
         static_assert(BlockSize == ThreadPerLevel1Cluster, "wrong! wrong blocksize\n");
 
-        const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
-        const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
-        const auto c_thread_mtx = ThreadMatrixC{}; // constexpr doesn't compile
+        constexpr auto a_block_mtx = BlockMatrixA{};
+        constexpr auto b_block_mtx = BlockMatrixB{};
+        constexpr auto c_thread_mtx = ThreadMatrixC{};
 
         static_assert(a_block_mtx.NRow() == b_block_mtx.NRow(),
                      "wrong! K dimension not consistent\n");
@@ -529,7 +529,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
     __device__ static MatrixIndex GetDistanceFromBeginOfThreadMatrixC(unsigned m_in_c,
                                                                       unsigned n_in_c)
     {
-        const auto c_thread_mtx = ThreadMatrixC{}; // constexpr doesn't compile
+        constexpr auto c_thread_mtx = ThreadMatrixC{};
 
         constexpr unsigned MPerThread = c_thread_mtx.NRow();
         constexpr unsigned NPerThread = c_thread_mtx.NCol();
@@ -551,17 +551,17 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
     }
 
     template <class FloatA, class FloatB, class FloatC, class Accumulator>
-    __device__ void Run(FloatA* const p_a_block,
-                        FloatB* const p_b_block,
-                        FloatC* p_c_thread,
+    __device__ void Run(const FloatA* __restrict__ p_a_block,
+                        const FloatB* __restrict__ p_b_block,
+                        FloatC* __restrict__ p_c_thread,
                         Accumulator f_accum) const
     {
         constexpr auto True = integral_constant<bool, true>{};
         constexpr auto False = integral_constant<bool, false>{};
 
-        const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
-        const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
-        const auto c_thread_mtx = ThreadMatrixC{}; // constexpr doesn't compile
+        constexpr auto a_block_mtx = BlockMatrixA{};
+        constexpr auto b_block_mtx = BlockMatrixB{};
+        constexpr auto c_thread_mtx = ThreadMatrixC{};
 
         constexpr unsigned M = a_block_mtx.NCol();
         constexpr unsigned N = b_block_mtx.NCol();
@@ -571,22 +571,18 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         constexpr unsigned NPerThread = c_thread_mtx.NCol();
 
         // thread A, B for GEMM
-        const auto a_thread_mtx = make_ConstantMatrixDescriptor(
-            Number<KPerThreadLoop>{}, Number<MPerThread>{}); // constexpr doesn't compile
+        constexpr auto a_thread_mtx =
+            make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<MPerThread>{});
 
-        const auto b_thread_mtx = make_ConstantMatrixDescriptor(
-            Number<KPerThreadLoop>{}, Number<NPerThread>{}); // constexpr doesn't compile
+        constexpr auto b_thread_mtx =
+            make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<NPerThread>{});
 
         // thread A-sub, B-sub for copy
-        const auto a_thread_sub_mtx =
-            make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{},
-                                          Number<MPerThreadSubC>{},
-                                          Number<MPerThread>{}); // constexpr doesn't compile
+        constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor(
+            Number<KPerThreadLoop>{}, Number<MPerThreadSubC>{}, Number<MPerThread>{});
 
-        const auto b_thread_sub_mtx =
-            make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{},
-                                          Number<NPerThreadSubC>{},
-                                          Number<NPerThread>{}); // constexpr doesn't compile
+        constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor(
+            Number<KPerThreadLoop>{}, Number<NPerThreadSubC>{}, Number<NPerThread>{});
 
         FloatA p_a_thread[a_thread_mtx.GetElementSpace()];
         FloatB p_b_thread[b_thread_mtx.GetElementSpace()];
@@ -606,26 +602,26 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
             // copy A-sub to form A
             for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
            {
-                threadwise_matrix_copy(a_block_mtx,
-                                       p_a_block + mMyThreadOffsetA +
-                                           k_begin * a_block_mtx.RowStride() +
-                                           m_repeat * MPerLevel1Cluster,
-                                       a_thread_sub_mtx,
-                                       p_a_thread + m_repeat * MPerThreadSubC,
-                                       a_thread_sub_mtx.GetLengths());
+                threadwise_matrix_copy(
+                    a_block_mtx,
+                    p_a_block + a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) +
+                        mMyThreadOffsetA,
+                    a_thread_mtx,
+                    p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
+                    a_thread_sub_mtx.GetLengths());
             }
 
 #pragma unroll
             // copy B-sub to form B
             for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
             {
-                threadwise_matrix_copy(b_block_mtx,
-                                       p_b_block + mMyThreadOffsetB +
-                                           k_begin * b_block_mtx.RowStride() +
-                                           n_repeat * NPerLevel1Cluster,
-                                       b_thread_sub_mtx,
-                                       p_b_thread + n_repeat * NPerThreadSubC,
-                                       b_thread_sub_mtx.GetLengths());
+                threadwise_matrix_copy(
+                    b_block_mtx,
+                    p_b_block + b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) +
+                        mMyThreadOffsetB,
+                    b_thread_mtx,
+                    p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
+                    b_thread_sub_mtx.GetLengths());
             }
 
             // C = A * B
@@ -778,4 +774,144 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                 f_accum);
         }
     }
+
+    template <class FloatA, class FloatB, class FloatC, class Accumulator>
+    __device__ void Run_v2(const FloatA* __restrict__ p_a_block,
+                           const FloatB* __restrict__ p_b_block,
+                           FloatC* __restrict__ p_c_thread,
+                           Accumulator f_accum) const
+    {
+        constexpr auto True = integral_constant<bool, true>{};
+        constexpr auto False = integral_constant<bool, false>{};
+
+        constexpr auto a_block_mtx = BlockMatrixA{};
+        constexpr auto b_block_mtx = BlockMatrixB{};
+        constexpr auto c_thread_mtx = ThreadMatrixC{};
+
+        constexpr unsigned M = a_block_mtx.NCol();
+        constexpr unsigned N = b_block_mtx.NCol();
+        constexpr unsigned K = a_block_mtx.NRow();
+
+        constexpr unsigned MPerThread = c_thread_mtx.NRow();
+        constexpr unsigned NPerThread = c_thread_mtx.NCol();
+
+        // thread A-sub, B-sub, C-sub
+        constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor(
+            Number<KPerThreadLoop>{}, Number<MPerThreadSubC>{}, Number<MPerThread>{});
+
+        constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor(
+            Number<KPerThreadLoop>{}, Number<NPerThreadSubC>{}, Number<NPerThread>{});
+
+        constexpr auto c_thread_sub_mtx = make_ConstantMatrixDescriptor(
+            Number<MPerThreadSubC>{}, Number<NPerThreadSubC>{}, Number<NPerThread>{});
+
+        // thread A, B
+        constexpr auto a_thread_mtx =
+            make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<MPerThread>{});
+
+        constexpr auto b_thread_mtx =
+            make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<NPerThread>{});
+
+        FloatA p_a_thread[a_thread_mtx.GetElementSpace()];
+        FloatB p_b_thread[b_thread_mtx.GetElementSpace()];
+
+        constexpr unsigned MPerLevel1Cluster = MPerThreadSubC * MLevel0Cluster * MLevel1Cluster;
+        constexpr unsigned NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster;
+
+        constexpr unsigned MRepeat = MPerThread / MPerThreadSubC;
+        constexpr unsigned NRepeat = NPerThread / NPerThreadSubC;
+
+#pragma unroll
+        // loop over k
+        for(unsigned k_begin = 0; k_begin < K; k_begin += KPerThreadLoop)
+        {
+            // C-sub(s) in first row-wise subblock of C
+            {
+                // copy first A-sub
+                threadwise_matrix_copy(a_block_mtx,
+                                       p_a_block + a_block_mtx.Get1dIndex(k_begin, 0) +
+                                           mMyThreadOffsetA,
+                                       a_thread_mtx,
+                                       p_a_thread,
+                                       a_thread_sub_mtx.GetLengths());
+
+                // copy first B-sub
+                threadwise_matrix_copy(b_block_mtx,
+                                       p_b_block + b_block_mtx.Get1dIndex(k_begin, 0) +
+                                           mMyThreadOffsetB,
+                                       b_thread_mtx,
+                                       p_b_thread,
+                                       b_thread_sub_mtx.GetLengths());
+
+                // do first sub GEMM
+                threadwise_gemm(a_thread_sub_mtx,
+                                True,
+                                p_a_thread,
+                                b_thread_sub_mtx,
+                                False,
+                                p_b_thread,
+                                c_thread_sub_mtx,
+                                False,
+                                p_c_thread,
+                                f_accum);
+
+#pragma unroll
+                // copy next B-sub, and do GEMM
+                for(unsigned n_repeat = 1; n_repeat < NRepeat; ++n_repeat)
+                {
+                    threadwise_matrix_copy(
+                        b_block_mtx,
+                        p_b_block + b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) +
+                            mMyThreadOffsetB,
+                        b_thread_mtx,
+                        p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
+                        b_thread_sub_mtx.GetLengths());
+
+                    threadwise_gemm(
+                        a_thread_sub_mtx,
+                        True,
+                        p_a_thread,
+                        b_thread_sub_mtx,
+                        False,
+                        p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
+                        c_thread_sub_mtx,
+                        False,
+                        p_c_thread + c_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
+                        f_accum);
+                }
+
+#pragma unroll
+                // loop over the rest of the row-wise subblocks;
+                // all B-subs have been copied, so only A-subs need to be copied
+                for(unsigned m_repeat = 1; m_repeat < MRepeat; ++m_repeat)
+                {
+                    // copy an A-sub
+                    threadwise_matrix_copy(
+                        a_block_mtx,
+                        p_a_block + a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) +
+                            mMyThreadOffsetA,
+                        a_thread_mtx,
+                        p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
+                        a_thread_sub_mtx.GetLengths());
+
+                    // do some GEMMs
+                    for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
+                    {
+                        threadwise_gemm(
+                            a_thread_sub_mtx,
+                            True,
+                            p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC),
+                            b_thread_sub_mtx,
+                            False,
+                            p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
+                            c_thread_sub_mtx,
+                            False,
+                            p_c_thread + c_thread_mtx.Get1dIndex(m_repeat * MPerThreadSubC,
+                                                                 n_repeat * NPerThreadSubC),
+                            f_accum);
+                    }
+                }
+            }
+        }
+    }
 };
diff --git a/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh b/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh
index 326e5939ef..bc58bf77ad 100644
--- a/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh
+++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh
@@ -201,11 +201,11 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(InGlobalDesc,
     // set threadwise output tensor to 0
     threadwise_4d_tensor_set_zero(out_hkwn_thread_desc, p_out_thread);
 
-    Float* p_in_global_block_begin =
+    const Float* p_in_global_block_begin =
         p_in_global + in_chwn_global_desc.Get1dIndex(
                           0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin);
 
-    Float* p_wei_global_block_begin =
+    const Float* p_wei_global_block_begin =
         p_wei_global + wei_csrk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
 
     for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
@@ -213,15 +213,11 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(InGlobalDesc,
         p_wei_global_block_begin += CPerBlock * wei_csrk_global_desc.GetStride(I0),
         __syncthreads())
     {
-#if 1
         // input: global mem to LDS,
         blockwise_in_copy.Run(p_in_global_block_begin, p_in_block);
-#endif
 
-#if 1
         // weight: global mem to LDS,
         blockwise_wei_copy.Run(p_wei_global_block_begin, p_wei_block);
-#endif
 
         __syncthreads();
 
diff --git a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
index 5e9bbd7a29..b2533448f6 100644
--- a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
+++ b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
@@ -36,11 +36,11 @@ template
 
 __global__ void
 gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
-                                                    Float* const __restrict__ p_in_global,
+                                                    const Float* const __restrict__ p_in_global,
                                                     WeiGlobalDesc,
-                                                    Float* const __restrict__ p_wei_global,
+                                                    const Float* const __restrict__ p_wei_global,
                                                     OutGlobalDesc,
-                                                    Float* __restrict__ p_out_global)
+                                                    Float* const __restrict__ p_out_global)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
@@ -228,10 +228,10 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
     // set threadwise output tensor to 0
     threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread);
 
-    Float* p_in_global_block_offset =
+    const Float* p_in_global_block_offset =
         p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin);
 
-    Float* p_wei_global_block_offset =
+    const Float* p_wei_global_block_offset =
         p_wei_global + wei_csrk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
 
     for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
@@ -256,7 +256,9 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
 
 #if 1
                 blockwise_gemm.Run
-#else
+#elif 0
+                blockwise_gemm.Run_v2
+#elif 0
                 blockwise_gemm.Run_RegisterDoubleBuffer
 #endif
                     (p_wei_block + wei_csrk_block_desc.Get1dIndex(0, s, r, 0),
diff --git a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.cuh b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.cuh
index fd021957dc..7e10a62faf 100644
--- a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.cuh
+++ b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.cuh
@@ -123,7 +123,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b
                                               decltype(in_cb_global_desc),
                                               decltype(in_cb_block_desc),
                                               decltype(in_cb_block_desc.GetLengths())>{};
-#elif 1
+#elif 0
     const auto blockwise_in_copy = Blockwise2dTensorCopy2{};
-#elif 1
+#elif 0
     const auto blockwise_wei_copy = Blockwise2dTensorCopy2
diff --git a/src/include/threadwise_gemm.cuh b/src/include/threadwise_gemm.cuh
--- a/src/include/threadwise_gemm.cuh
+++ b/src/include/threadwise_gemm.cuh
-__device__ void
-threadwise_matrix_copy(SrcMatrix, Float* const p_src, DstMatrix, Float* p_dst, Sequence)
+__device__ void threadwise_matrix_copy(SrcMatrix,
+                                       const Float* __restrict__ p_src,
+                                       DstMatrix,
+                                       Float* __restrict__ p_dst,
+                                       Sequence)
 {
-    const auto src_mtx = SrcMatrix{}; // constexpr doesn't compile
-    const auto dst_mtx = DstMatrix{}; // constexpr doesn't compile
+    constexpr auto src_mtx = SrcMatrix{};
+    constexpr auto dst_mtx = DstMatrix{};
 
     for(unsigned i = 0; i < NRow; ++i)
     {
@@ -31,30 +34,30 @@ template
 __device__ void threadwise_gemm(MatrixA,
                                 integral_constant<bool, TransA>,
-                                FloatA* const p_a_thread,
+                                const FloatA* __restrict__ p_a_thread,
                                 MatrixB,
                                 integral_constant<bool, TransB>,
-                                FloatB* const p_b_thread,
+                                const FloatB* __restrict__ p_b_thread,
                                 MatrixC,
                                 integral_constant<bool, TransC>,
-                                FloatC* p_c_thread,
+                                FloatC* __restrict__ p_c_thread,
                                 Accumulator f_accum)
 {
     if(TransA && (!TransB) && (!TransC))
     {
-        const auto a_mtx = MatrixA{}; // constexpr doesn't compile
-        const auto b_mtx = MatrixB{}; // constexpr doesn't compile
-        const auto c_mtx = MatrixC{}; // constexpr doesn't compile
+        constexpr auto a_mtx = MatrixA{};
+        constexpr auto b_mtx = MatrixB{};
+        constexpr auto c_mtx = MatrixC{};
 
         constexpr unsigned M = c_mtx.NRow();
         constexpr unsigned N = c_mtx.NCol();
         constexpr unsigned K = a_mtx.NRow();
 
         // A is transposed
-        for(unsigned i = 0; i < M; ++i)
+        for(unsigned k = 0; k < K; ++k)
         {
-            for(unsigned j = 0; j < N; ++j)
+            for(unsigned i = 0; i < M; ++i)
             {
-                for(unsigned k = 0; k < K; ++k)
+                for(unsigned j = 0; j < N; ++j)
                 {
                     const unsigned aindex = a_mtx.Get1dIndex(k, i); // A is transposed
                     const unsigned bindex = b_mtx.Get1dIndex(k, j);
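
Reviewer note (not part of the patch): the new 1x1 configs all preserve the constraint that the blockwise GEMM asserts via `BlockSize == ThreadPerLevel1Cluster`. Below is a minimal standalone sketch of that arithmetic for the "1x1, 28x28, 128 threads" config added above. It assumes, as the kernel naming suggests, that `KPerBlock`/`KPerThread` play the GEMM M role and `BPerBlock`/`BPerThread` the N role; the `MPerLevel1Cluster`/`NPerLevel1Cluster` formulas are copied from `Run`/`Run_v2`.

```cpp
// Sanity-check sketch (illustrative only, not from the repository).
#include <cstdio>

int main()
{
    // Values from the 128-thread 1x1 config in this patch.
    constexpr unsigned BPerBlock = 64, KPerBlock = 128; // GEMM N / M per block (assumed mapping)
    constexpr unsigned BPerThread = 8, KPerThread = 8;  // GEMM N / M per thread
    constexpr unsigned MPerThreadSubC = 4, NPerThreadSubC = 4;
    constexpr unsigned MLevel0Cluster = 4, NLevel0Cluster = 2;
    constexpr unsigned MLevel1Cluster = 4, NLevel1Cluster = 4;
    constexpr unsigned BlockSize = 128;

    // Mirrors the "wrong! wrong blocksize" static_assert in the blockwise GEMM.
    constexpr unsigned ThreadPerLevel1Cluster =
        MLevel0Cluster * NLevel0Cluster * MLevel1Cluster * NLevel1Cluster;
    static_assert(BlockSize == ThreadPerLevel1Cluster, "wrong! wrong blocksize");

    // Cluster footprints, exactly as computed in Run/Run_v2.
    constexpr unsigned MPerLevel1Cluster = MPerThreadSubC * MLevel0Cluster * MLevel1Cluster; // 64
    constexpr unsigned NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster; // 32

    // Each thread owns MRepeat x NRepeat sub-tiles; together they must tile the block tile.
    constexpr unsigned MRepeat = KPerThread / MPerThreadSubC; // 2
    constexpr unsigned NRepeat = BPerThread / NPerThreadSubC; // 2
    static_assert(MRepeat * MPerLevel1Cluster == KPerBlock, "M tiling mismatch");
    static_assert(NRepeat * NPerLevel1Cluster == BPerBlock, "N tiling mismatch");

    std::printf("OK: %u threads tile a %ux%u block GEMM tile\n", BlockSize, KPerBlock, BPerBlock);
    return 0;
}
```

The same product rule holds for the other configs touched here (3x3: 4 * 2 * 2 * 8 = 128; 64-thread 1x1: 4 * 2 * 2 * 4 = 64; 256-thread 1x1: 4 * 4 * 4 * 4 = 256). Separately, the new `Run_v2` does not change how much data moves per k-step; it interleaves the per-sub-tile `threadwise_matrix_copy` calls with the sub-GEMMs (the B-sub row first, then one A-sub per remaining row) instead of copying the whole A/B thread tiles up front as `Run` does, presumably to overlap LDS reads with the math.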