From 6614729a680f1f0b2161a22e3c0fd951107707c3 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 5 Feb 2019 17:06:53 -0600 Subject: [PATCH] add another version of blockwise 2d copy, refactor --- ...icit_gemm_convolution_2_cnhw_csrk_knhw.cuh | 7 +- src/include/blockwise_2d_tensor_op.cuh | 184 ++++++++++-------- src/include/blockwise_4d_tensor_op.cuh | 39 +--- src/include/blockwise_gemm.cuh | 12 +- src/include/gridwise_direct_convolution_1.cuh | 36 ++-- src/include/gridwise_direct_convolution_2.cuh | 24 +-- ...icit_gemm_convolution_1_chwn_csrk_khwn.cuh | 55 +++--- ...mm_convolution_1_chwn_csrk_khwn_padded.cuh | 82 ++++---- ...n_1_chwn_csrk_khwn_padded_lds_pipeline.cuh | 80 ++++---- ..._implicit_gemm_convolution_1_nchw_kcsr.cuh | 51 +++-- ...icit_gemm_convolution_1_nchw_srck_nkhw.cuh | 43 ++-- ...icit_gemm_convolution_2_cnhw_csrk_knhw.cuh | 91 +++++---- ...volution_2_cnhw_csrk_knhw_lds_pipeline.cuh | 87 +++++---- ...icit_gemm_convolution_2_cnhw_srck_knhw.cuh | 64 +++--- ...volution_2_cnhw_srck_knhw_lds_pipeline.cuh | 70 ++++--- 15 files changed, 467 insertions(+), 458 deletions(-) diff --git a/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh b/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh index f17411e68b..0d066dbdb7 100644 --- a/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh +++ b/driver/device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh @@ -86,6 +86,9 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc, constexpr unsigned WeiBlockCopyThreadPerDim0 = 4; constexpr unsigned WeiBlockCopyThreadPerDim1 = 16; + constexpr unsigned InBlockCopyDataPerRead = 4; + constexpr unsigned WeiBlockCopyDataPerRead = 4; + constexpr unsigned BlockSize = 64; #endif @@ -137,7 +140,9 @@ void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc, InBlockCopyThreadPerDim0, InBlockCopyThreadPerDim1, WeiBlockCopyThreadPerDim0, - WeiBlockCopyThreadPerDim1> + WeiBlockCopyThreadPerDim1, + InBlockCopyDataPerRead, + WeiBlockCopyDataPerRead> <<>>(in_cnhw_desc, static_cast(in_cnhw_device_buf.GetDeviceBuffer()), wei_csrk_desc, diff --git a/src/include/blockwise_2d_tensor_op.cuh b/src/include/blockwise_2d_tensor_op.cuh index cbc42e01d0..dd440fe1a2 100644 --- a/src/include/blockwise_2d_tensor_op.cuh +++ b/src/include/blockwise_2d_tensor_op.cuh @@ -162,9 +162,9 @@ blockwise_2d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc, } template -struct blockwise_2d_tensor_copy_1 +struct Blockwise2dTensorCopy1 { - __device__ void run(Float* const __restrict__ p_src, Float* __restrict__ p_dst) const + __device__ void Run(Float* const __restrict__ p_src, Float* __restrict__ p_dst) const { constexpr auto dst_from_src_reorder = Sequence<0, 1>{}; @@ -173,6 +173,8 @@ struct blockwise_2d_tensor_copy_1 } }; +// need to be aligned to float4 and float2 +// stride1 need to be 1 for both source and destination template -struct blockwise_2d_tensor_copy_2 +struct Blockwise2dTensorCopy2 { unsigned mThreadId0; unsigned mThreadId1; - __device__ blockwise_2d_tensor_copy_2() + __device__ Blockwise2dTensorCopy2() { - static_assert(is_same::value, "wrong! type is not float!\n"); + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + static_assert(SrcDesc{}.GetStride(I1) == 1 && DstDesc{}.GetStride(I1) == 1, + "wrong! stride is not 1!\n"); mThreadId0 = get_thread_local_1d_id() / ThreadPerDim1; mThreadId1 = get_thread_local_1d_id() - mThreadId0 * ThreadPerDim1; } - __device__ void run(Float* const __restrict__ p_src, Float* __restrict__ p_dst) const + __device__ void Run(Float* const __restrict__ p_src, Float* __restrict__ p_dst) const { + static_assert(is_same::value, "wrong! only support float!\n"); + if(get_thread_local_1d_id() >= ThreadPerDim0 * ThreadPerDim1) return; @@ -227,22 +235,12 @@ struct blockwise_2d_tensor_copy_2 for(unsigned d1v4loop = 0; d1v4loop < Dim1V4Loop; ++d1v4loop) { unsigned did1 = d1v4loop * 4 * ThreadPerDim1 + 4 * mThreadId1; -#if 1 + const unsigned sindex = src_desc.Get1dIndex(did0, did1); const unsigned dindex = dst_desc.Get1dIndex(did0, did1); *(reinterpret_cast(p_dst + dindex)) = *(reinterpret_cast(p_src + sindex)); - -#else - for(unsigned i = 0; i < 4; ++i) - { - const unsigned sindex = src_desc.Get1dIndex(did0, did1 + i); - const unsigned dindex = dst_desc.Get1dIndex(did0, did1 + i); - - p_dst[dindex] = p_src[sindex]; - } -#endif } // v2 @@ -251,22 +249,11 @@ struct blockwise_2d_tensor_copy_2 unsigned did1 = Dim1V4Loop * 4 * ThreadPerDim1 + d1v2loop * 2 * ThreadPerDim1 + 2 * mThreadId1; -#if 1 const unsigned sindex = src_desc.Get1dIndex(did0, did1); const unsigned dindex = dst_desc.Get1dIndex(did0, did1); *(reinterpret_cast(p_dst + dindex)) = *(reinterpret_cast(p_src + sindex)); - -#else - for(unsigned i = 0; i < 2; ++i) - { - const unsigned sindex = src_desc.Get1dIndex(did0, did1 + i); - const unsigned dindex = dst_desc.Get1dIndex(did0, did1 + i); - - p_dst[dindex] = p_src[sindex]; - } -#endif } // v1 @@ -310,22 +297,11 @@ struct blockwise_2d_tensor_copy_2 { unsigned did1 = d1v4loop * 4 * ThreadPerDim1 + 4 * mThreadId1; -#if 1 const unsigned sindex = src_desc.Get1dIndex(did0, did1); const unsigned dindex = dst_desc.Get1dIndex(did0, did1); *(reinterpret_cast(p_dst + dindex)) = *(reinterpret_cast(p_src + sindex)); - -#else - for(unsigned i = 0; i < 4; ++i) - { - const unsigned sindex = src_desc.Get1dIndex(did0, did1 + i); - const unsigned dindex = dst_desc.Get1dIndex(did0, did1 + i); - - p_dst[dindex] = p_src[sindex]; - } -#endif } // v2 @@ -334,22 +310,11 @@ struct blockwise_2d_tensor_copy_2 unsigned did1 = Dim1V4Loop * 4 * ThreadPerDim1 + d1v2loop * 2 * ThreadPerDim1 + 2 * mThreadId1; -#if 1 const unsigned sindex = src_desc.Get1dIndex(did0, did1); const unsigned dindex = dst_desc.Get1dIndex(did0, did1); *(reinterpret_cast(p_dst + dindex)) = *(reinterpret_cast(p_src + sindex)); - -#else - for(unsigned i = 0; i < 2; ++i) - { - const unsigned sindex = src_desc.Get1dIndex(did0, did1 + i); - const unsigned dindex = dst_desc.Get1dIndex(did0, did1 + i); - - p_dst[dindex] = p_src[sindex]; - } -#endif } // v1 @@ -385,49 +350,104 @@ struct blockwise_2d_tensor_copy_2 } }; -template -struct blockwise_2d_tensor_copy_dummy_1 +// starting point need to be aligned to float4 or float2 or float +// stride1 need to be 1 for both source and destination +template +struct Blockwise2dTensorCopy3 { - unsigned mBegin; + unsigned mSrcMyThreadOffset; + unsigned mDstMyThreadOffset; - __device__ blockwise_2d_tensor_copy_dummy_1() + __device__ Blockwise2dTensorCopy3() { - constexpr unsigned n_total = - make_ConstantTensorDescriptor(SrcOpLengths{}).GetElementSpace(); + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; - constexpr unsigned n_per_thread = n_total / BlockSize; + static_assert(SrcDesc{}.GetStride(I1) == 1 && DstDesc{}.GetStride(I1) == 1, + "wrong! only support stride1 == 1!\n"); - mBegin = n_per_thread * get_thread_local_1d_id(); + static_assert(DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4, + "wrong! only support DataPerRead == 1, 2 or 4!\n"); + + constexpr unsigned L0 = SrcOpLengths{}.Get(I0); + constexpr unsigned L1 = SrcOpLengths{}.Get(I1); + + static_assert(L1 % DataPerRead == 0, "wrong! only support mod(L1, DataPerRead) == 0\n"); + + constexpr unsigned thread_per_d1 = L1 / DataPerRead; + constexpr unsigned thread_per_d0 = BlockSize / thread_per_d1; + + static_assert(thread_per_d1 <= BlockSize, + "wrong! not enough threads to cover L1 dimension\n"); + + const unsigned thread_id_d0 = get_thread_local_1d_id() / thread_per_d1; + const unsigned thread_id_d1 = get_thread_local_1d_id() - thread_id_d0 * thread_per_d1; + + mSrcMyThreadOffset = SrcDesc{}.Get1dIndex(thread_id_d0, thread_id_d1 * DataPerRead); + mDstMyThreadOffset = DstDesc{}.Get1dIndex(thread_id_d0, thread_id_d1 * DataPerRead); } - __device__ void run(Float* const __restrict__ p_src, Float* __restrict__ p_dst) const + __device__ void Run(Float* const __restrict__ p_src, Float* __restrict__ p_dst) const { - constexpr unsigned n_total = - make_ConstantTensorDescriptor(SrcOpLengths{}).GetElementSpace(); + static_assert(is_same::value, "wrong! only support float!\n"); - constexpr unsigned n_per_thread = n_total / BlockSize; + using Float2 = float2; + using Float4 = float4; - for(unsigned i = 0; i < n_per_thread; ++i) + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + constexpr unsigned L0 = SrcOpLengths{}.Get(I0); + constexpr unsigned L1 = SrcOpLengths{}.Get(I1); + + constexpr unsigned thread_per_d1 = L1 / DataPerRead; + constexpr unsigned thread_per_d0 = BlockSize / thread_per_d1; + + constexpr unsigned num_active_thread = thread_per_d0 * thread_per_d1; + + if(BlockSize > num_active_thread) { - p_dst[mBegin + i] = p_src[mBegin + i]; - } - } -}; - -template -struct blockwise_2d_tensor_copy_dummy_2 -{ - __device__ void run(Float* const __restrict__ p_src, Float* __restrict__ p_dst) const - { - constexpr unsigned n_total = - make_ConstantTensorDescriptor(SrcOpLengths{}).GetElementSpace(); - - constexpr unsigned n_per_thread = n_total / BlockSize; - - for(unsigned i = 0; i < n_per_thread; ++i) - { - unsigned index = get_thread_local_1d_id() + BlockSize * i; - p_dst[index] = p_src[index]; + if(get_thread_local_1d_id() > num_active_thread) + { + return; + } + } + + constexpr unsigned nloop_d0 = L0 / thread_per_d0; + + constexpr bool has_tail_d0 = (L0 > nloop_d0 * thread_per_d0); + + constexpr unsigned src_loop_stride = SrcDesc{}.GetStride(I0) * thread_per_d0; + constexpr unsigned dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0; + + for(unsigned iloop = 0; iloop < nloop_d0; ++iloop) + { + if(DataPerRead == 1) + { + p_dst[mDstMyThreadOffset + iloop * dst_loop_stride] = + p_src[mSrcMyThreadOffset + iloop * src_loop_stride]; + } + else if(DataPerRead == 2) + { + *(reinterpret_cast(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) = + *(reinterpret_cast(p_src + mSrcMyThreadOffset + + iloop * src_loop_stride)); + } + else if(DataPerRead == 4) + { + *(reinterpret_cast(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) = + *(reinterpret_cast(p_src + mSrcMyThreadOffset + + iloop * src_loop_stride)); + } + else + { + assert(false); + } } } }; diff --git a/src/include/blockwise_4d_tensor_op.cuh b/src/include/blockwise_4d_tensor_op.cuh index eca267e2e3..5cc53977cb 100644 --- a/src/include/blockwise_4d_tensor_op.cuh +++ b/src/include/blockwise_4d_tensor_op.cuh @@ -200,9 +200,9 @@ blockwise_4d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc, } template -struct blockwise_4d_tensor_copy_1 +struct Blockwise4dTensorCopy1 { - __device__ void run(Float* const __restrict__ p_src, Float* __restrict__ p_dst) const + __device__ void Run(Float* const __restrict__ p_src, Float* __restrict__ p_dst) const { constexpr auto dst_from_src_reorder = Sequence<0, 1, 2, 3>{}; @@ -217,9 +217,9 @@ template -struct blockwise_chwn_tensor_copy_with_padding +struct BlockwiseChwnTensorCopyPadded { - __device__ void run(Float* const __restrict__ p_src, + __device__ void Run(Float* const __restrict__ p_src, unsigned c_block_data_begin, unsigned ho_block_data_begin, unsigned wo_block_data_begin, @@ -336,33 +336,4 @@ struct blockwise_chwn_tensor_copy_with_padding } } } -}; - -template -struct blockwise_4d_tensor_copy_dummy -{ - unsigned mBegin; - - __device__ blockwise_4d_tensor_copy_dummy() - { - constexpr unsigned n_total = - make_ConstantTensorDescriptor(SrcOpLengths{}).GetElementSpace(); - - constexpr unsigned n_per_thread = n_total / BlockSize; - - mBegin = n_per_thread * get_thread_local_1d_id(); - } - - __device__ void run(Float* const __restrict__ p_src, Float* __restrict__ p_dst) const - { - constexpr unsigned n_total = - make_ConstantTensorDescriptor(SrcOpLengths{}).GetElementSpace(); - - constexpr unsigned n_per_thread = n_total / BlockSize; - - for(unsigned i = 0; i < n_per_thread; ++i) - { - p_dst[mBegin + i] = p_src[mBegin + i]; - } - } -}; +}; \ No newline at end of file diff --git a/src/include/blockwise_gemm.cuh b/src/include/blockwise_gemm.cuh index ded4cdd017..7d99a96eab 100644 --- a/src/include/blockwise_gemm.cuh +++ b/src/include/blockwise_gemm.cuh @@ -15,7 +15,7 @@ template -struct blockwise_1d_strided_batched_gemm_block_a_block_b_thread_c +struct Blockwise1dStridedBatchedGemmBlockABlockBThreadC { unsigned mMyThreadOffsetA = 0; unsigned mMyThreadOffsetB = 0; @@ -27,7 +27,7 @@ struct blockwise_1d_strided_batched_gemm_block_a_block_b_thread_c unsigned col_begin; }; - __device__ blockwise_1d_strided_batched_gemm_block_a_block_b_thread_c() + __device__ Blockwise1dStridedBatchedGemmBlockABlockBThreadC() { const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile @@ -117,7 +117,7 @@ struct blockwise_1d_strided_batched_gemm_block_a_block_b_thread_c } template - __device__ void run(FloatA* const p_a_block, + __device__ void Run(FloatA* const p_a_block, FloatB* const p_b_block, FloatC* p_c_thread, Accumulator f_accum) const @@ -230,7 +230,7 @@ template -struct blockwise_gemm_block_a_block_b_thread_c +struct BlockwiseGemmBlockABlockBThreadC { unsigned mMyThreadOffsetA = 0; unsigned mMyThreadOffsetB = 0; @@ -241,7 +241,7 @@ struct blockwise_gemm_block_a_block_b_thread_c unsigned col_begin; }; - __device__ blockwise_gemm_block_a_block_b_thread_c() + __device__ BlockwiseGemmBlockABlockBThreadC() { const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile @@ -360,7 +360,7 @@ struct blockwise_gemm_block_a_block_b_thread_c } template - __device__ void run(FloatA* const p_a_block, + __device__ void Run(FloatA* const p_a_block, FloatB* const p_b_block, FloatC* p_c_thread, Accumulator f_accum) const diff --git a/src/include/gridwise_direct_convolution_1.cuh b/src/include/gridwise_direct_convolution_1.cuh index e8a99536bb..ab7f14dd06 100644 --- a/src/include/gridwise_direct_convolution_1.cuh +++ b/src/include/gridwise_direct_convolution_1.cuh @@ -122,25 +122,25 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc, #endif constexpr auto blockwise_in_copy = - blockwise_4d_tensor_copy_1{}; + Blockwise4dTensorCopy1{}; constexpr auto blockwise_wei_copy = - blockwise_4d_tensor_copy_1{}; + Blockwise4dTensorCopy1{}; constexpr auto blockwise_out_copy = - blockwise_4d_tensor_copy_1{}; + Blockwise4dTensorCopy1{}; // set output tensor in LDS to 0 blockwise_4d_tensor_set_zero(out_block_desc, p_out_block); @@ -149,14 +149,14 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc, c_block_work_begin += CPerBlock) { // copy input tensor to LDS - blockwise_in_copy.run(p_in_global + in_global_desc.Get1dIndex(n_block_work_begin, + blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_work_begin, c_block_work_begin, hi_block_work_begin, wi_block_work_begin), p_in_block); // copy weight tensor to LDS - blockwise_wei_copy.run( + blockwise_wei_copy.Run( p_wei_global + wei_global_desc.Get1dIndex(k_block_work_begin, c_block_work_begin, 0, 0), p_wei_block); @@ -179,7 +179,7 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc, } // copy output tensor from LDS to device mem - blockwise_out_copy.run(p_out_block, + blockwise_out_copy.Run(p_out_block, p_out_global + out_global_desc.Get1dIndex(n_block_work_begin, k_block_work_begin, ho_block_work_begin, diff --git a/src/include/gridwise_direct_convolution_2.cuh b/src/include/gridwise_direct_convolution_2.cuh index aeafbacca3..408fb5e05a 100644 --- a/src/include/gridwise_direct_convolution_2.cuh +++ b/src/include/gridwise_direct_convolution_2.cuh @@ -145,18 +145,18 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc, #endif constexpr auto blockwise_in_copy = - blockwise_4d_tensor_copy_1{}; + Blockwise4dTensorCopy1{}; constexpr auto blockwise_wei_copy = - blockwise_4d_tensor_copy_1{}; + Blockwise4dTensorCopy1{}; // set threadwise output tensor to 0 threadwise_4d_tensor_set_zero(out_thread_desc, p_out_thread); @@ -165,14 +165,14 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc, c_block_data_begin += CPerBlock, __syncthreads()) { // copy input tensor to LDS - blockwise_in_copy.run(p_in_global + in_global_desc.Get1dIndex(n_block_data_begin, + blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_data_begin, c_block_data_begin, hi_block_data_begin, wi_block_data_begin), p_in_block); // copy weight tensor to LDS - blockwise_wei_copy.run( + blockwise_wei_copy.Run( p_wei_global + wei_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0), p_wei_block); diff --git a/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh b/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh index b3bc395af6..0f32ca42ec 100644 --- a/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh +++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh @@ -106,19 +106,19 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(InGlobalDesc, // blockwise copy // input: format is [C, Hi, Wi, N] constexpr auto blockwise_in_copy = - blockwise_4d_tensor_copy_1{}; + Blockwise4dTensorCopy1{}; // weight: format is [S,R,C,K] constexpr auto blockwise_wei_copy = - blockwise_4d_tensor_copy_1{}; + Blockwise4dTensorCopy1{}; // a series of blockwise batched GEMM // C_matrix += transpose(A_matrix) * B_matrix @@ -140,21 +140,20 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(InGlobalDesc, Number{}, Number{}); // constexpr doesn't compile const auto blockwise_batch_gemm = - blockwise_1d_strided_batched_gemm_block_a_block_b_thread_c{}; + Blockwise1dStridedBatchedGemmBlockABlockBThreadC{}; // LDS constexpr unsigned in_block_size = in_chwn_block_desc.GetElementSpace(); @@ -183,12 +182,12 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(InGlobalDesc, { #if 1 // input: global mem to LDS, - blockwise_in_copy.run(p_in_global_block_begin, p_in_block); + blockwise_in_copy.Run(p_in_global_block_begin, p_in_block); #endif #if 1 // weight: global mem to LDS, - blockwise_wei_copy.run(p_wei_global_block_begin, p_wei_block); + blockwise_wei_copy.Run(p_wei_global_block_begin, p_wei_block); #endif __syncthreads(); @@ -200,7 +199,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(InGlobalDesc, { auto f_accum = [](auto& acc, const auto&& v) { acc += v; }; - blockwise_batch_gemm.run(p_wei_block + wei_csrk_block_desc.Get1dIndex(0, s, r, 0), + blockwise_batch_gemm.Run(p_wei_block + wei_csrk_block_desc.Get1dIndex(0, s, r, 0), p_in_block + in_chwn_block_desc.Get1dIndex(0, s, r, 0), p_out_thread, f_accum); diff --git a/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh b/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh index 5bb1f67579..e5fcdb9aaf 100644 --- a/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh +++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh @@ -136,39 +136,38 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri #endif constexpr auto blockwise_in_copy = - blockwise_chwn_tensor_copy_with_padding{}; + BlockwiseChwnTensorCopyPadded{}; #if 1 // weight: format is [C,S,R,K] constexpr auto blockwise_wei_copy = - blockwise_4d_tensor_copy_1{}; + Blockwise4dTensorCopy1{}; #elif 1 // weight: format is [C*S*R,K] constexpr auto blockwise_wei_copy = - blockwise_2d_tensor_copy_1{}; + Blockwise2dTensorCopy1{}; #elif 1 // weight: format is [C*S*R,K] - const auto blockwise_wei_copy = - blockwise_2d_tensor_copy_2{}; + const auto blockwise_wei_copy = Blockwise2dTensorCopy2{}; #endif // a series of blockwise batched GEMM @@ -191,21 +190,20 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri Number{}, Number{}); // constexpr doesn't compile const auto blockwise_batch_gemm = - blockwise_1d_strided_batched_gemm_block_a_block_b_thread_c{}; + Blockwise1dStridedBatchedGemmBlockABlockBThreadC{}; // LDS constexpr unsigned in_block_size = in_chwn_block_desc.GetElementSpace(); @@ -229,7 +227,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri { #if 1 // input: global mem to LDS, - blockwise_in_copy.run(p_in_global, + blockwise_in_copy.Run(p_in_global, c_block_data_begin, ho_block_data_begin, wo_block_data_begin, @@ -243,7 +241,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri #if 1 // weight: global mem to LDS, - blockwise_wei_copy.run(p_wei_global_block_begin, p_wei_block); + blockwise_wei_copy.Run(p_wei_global_block_begin, p_wei_block); #endif __syncthreads(); @@ -255,7 +253,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri { auto f_accum = [](auto& acc, const auto&& v) { acc += v; }; - blockwise_batch_gemm.run(p_wei_block + wei_csrk_block_desc.Get1dIndex(0, s, r, 0), + blockwise_batch_gemm.Run(p_wei_block + wei_csrk_block_desc.Get1dIndex(0, s, r, 0), p_in_block + in_chwn_block_desc.Get1dIndex(0, s, r, 0), p_out_thread, f_accum); diff --git a/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.cuh b/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.cuh index c92b500f72..b1be45968a 100644 --- a/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.cuh +++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.cuh @@ -136,17 +136,17 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p #endif constexpr auto blockwise_in_copy = - blockwise_chwn_tensor_copy_with_padding{}; + BlockwiseChwnTensorCopyPadded{}; #if 0 // weight: format is [C,S,R,K] constexpr auto blockwise_wei_copy = - blockwise_4d_tensor_copy_1{}; + Blockwise2dTensorCopy1{}; #elif 1 // weight: format is [C*S*R,K] - const auto blockwise_wei_copy = - blockwise_2d_tensor_copy_2{}; + const auto blockwise_wei_copy = Blockwise2dTensorCopy2{}; #endif // a series of blockwise batched GEMM @@ -191,21 +190,20 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p Number{}, Number{}); // constexpr doesn't compile const auto blockwise_batch_gemm = - blockwise_1d_strided_batched_gemm_block_a_block_b_thread_c{}; + Blockwise1dStridedBatchedGemmBlockABlockBThreadC{}; // LDS constexpr unsigned in_block_size = in_chwn_block_desc.GetElementSpace(); @@ -229,7 +227,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p // prelog: load data // input: global mem to LDS, - blockwise_in_copy.run(p_in_global, + blockwise_in_copy.Run(p_in_global, 0, ho_block_data_begin, wo_block_data_begin, @@ -241,7 +239,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p w_block_pad_up); // weight: global mem to LDS, - blockwise_wei_copy.run(p_wei_global_block_begin, p_wei_block_0); + blockwise_wei_copy.Run(p_wei_global_block_begin, p_wei_block_0); p_wei_global_block_begin += CPerBlock * wei_ek_global_desc.GetStride(I0); @@ -263,7 +261,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p // preload next data #if 1 // input: global mem to LDS, - blockwise_in_copy.run(p_in_global, + blockwise_in_copy.Run(p_in_global, c_block_data_begin, ho_block_data_begin, wo_block_data_begin, @@ -277,7 +275,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p #if 1 // weight: global mem to LDS, - blockwise_wei_copy.run(p_wei_global_block_begin, p_wei_block_next); + blockwise_wei_copy.Run(p_wei_global_block_begin, p_wei_block_next); #endif // a series of batched GEMM @@ -287,7 +285,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p { auto f_accum = [](auto& acc, const auto&& v) { acc += v; }; - blockwise_batch_gemm.run(p_wei_block_now + + blockwise_batch_gemm.Run(p_wei_block_now + wei_csrk_block_desc.Get1dIndex(0, s, r, 0), p_in_block_now + in_chwn_block_desc.Get1dIndex(0, s, r, 0), p_out_thread, @@ -310,7 +308,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p { auto f_accum = [](auto& acc, const auto&& v) { acc += v; }; - blockwise_batch_gemm.run(p_wei_block_now + + blockwise_batch_gemm.Run(p_wei_block_now + wei_csrk_block_desc.Get1dIndex(0, s, r, 0), p_in_block_now + in_chwn_block_desc.Get1dIndex(0, s, r, 0), p_out_thread, diff --git a/src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh b/src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh index b822f72d7b..a7e6810883 100644 --- a/src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh +++ b/src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh @@ -127,21 +127,20 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr(InGlobalDesc, Number{}, Number{}); // constexpr doesn't compile const auto blockwise_batch_gemm = - blockwise_1d_strided_batched_gemm_block_a_block_b_thread_c{}; + Blockwise1dStridedBatchedGemmBlockABlockBThreadC{}; // LDS constexpr unsigned in_block_size = in_chwn_block_desc.GetElementSpace(); @@ -175,15 +174,15 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr(InGlobalDesc, #else // input: global mem to LDS, // no format conversion, this is wrong, for performance study only! - blockwise_4d_tensor_copy(in_nchw_global_desc, - p_in_global + - in_nchw_global_desc.Get1dIndex(n_block_data_begin, - c_block_data_begin, - hi_block_data_begin, - wi_block_data_begin), - in_nchw_block_desc, - p_in_block, - in_nchw_block_desc.GetLengths()); + Blockwise4dTensorCopy(in_nchw_global_desc, + p_in_global + + in_nchw_global_desc.Get1dIndex(n_block_data_begin, + c_block_data_begin, + hi_block_data_begin, + wi_block_data_begin), + in_nchw_block_desc, + p_in_block, + in_nchw_block_desc.GetLengths()); #endif #if 1 @@ -200,7 +199,7 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr(InGlobalDesc, #else // weight: global mem to LDS, // no format conversion, this is wrong, for performance study only! - blockwise_4d_tensor_copy( + Blockwise4dTensorCopy( wei_kcsr_global_desc, p_wei_global + wei_kcsr_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0), @@ -219,7 +218,7 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr(InGlobalDesc, { auto f_accum = [](auto& c, const auto&& ab) { c += ab; }; - blockwise_batch_gemm.run(p_wei_block + wei_srck_block_desc.Get1dIndex(s, r, 0, 0), + blockwise_batch_gemm.Run(p_wei_block + wei_srck_block_desc.Get1dIndex(s, r, 0, 0), p_in_block + in_chwn_block_desc.Get1dIndex(0, s, r, 0), p_out_thread, f_accum); diff --git a/src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh b/src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh index 7b954956cf..abd143c02d 100644 --- a/src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh +++ b/src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh @@ -109,11 +109,11 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(InGlobalDesc, // blockwise copy // wei: format is [S,R,C,K], no conversion needed constexpr auto blockwise_wei_copy = - blockwise_4d_tensor_copy_1{}; + Blockwise4dTensorCopy1{}; // a series of blockwise batched GEMM // C_matrix += transpose(A_matrix) * B_matrix @@ -133,21 +133,20 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(InGlobalDesc, Number{}, Number{}); // constexpr doesn't compile const auto blockwise_batch_gemm = - blockwise_1d_strided_batched_gemm_block_a_block_b_thread_c{}; + Blockwise1dStridedBatchedGemmBlockABlockBThreadC{}; // LDS constexpr unsigned in_block_size = in_chwn_block_desc.GetElementSpace(); @@ -183,7 +182,7 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(InGlobalDesc, #if 1 // weight: global mem to LDS, // format is [S,R,C,K], no conversion needed - blockwise_wei_copy.run(p_wei_global + wei_srck_global_desc.Get1dIndex( + blockwise_wei_copy.Run(p_wei_global + wei_srck_global_desc.Get1dIndex( 0, 0, c_block_data_begin, k_block_data_begin), p_wei_block); #endif @@ -197,7 +196,7 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(InGlobalDesc, { auto f_accum = [](auto& c, const auto&& ab) { c += ab; }; - blockwise_batch_gemm.run(p_wei_block + wei_srck_block_desc.Get1dIndex(s, r, 0, 0), + blockwise_batch_gemm.Run(p_wei_block + wei_srck_block_desc.Get1dIndex(s, r, 0, 0), p_in_block + in_chwn_block_desc.Get1dIndex(0, s, r, 0), p_out_thread, f_accum); diff --git a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh index 49e4643794..59c0aa419a 100644 --- a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh +++ b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh @@ -25,7 +25,9 @@ template + unsigned WeiBlockCopyThreadPerDim1, + unsigned InBlockCopyDataPerRead, + unsigned WeiBlockCopyDataPerRead> __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc, Float* const __restrict__ p_in_global, @@ -117,40 +119,52 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc, // formmat is [CPerBlock,BPerBlock + BGhostRead] #if 0 const auto blockwise_in_copy = - blockwise_2d_tensor_copy_1{}; +#elif 0 + const auto blockwise_in_copy = Blockwise2dTensorCopy2{}; #elif 1 - const auto blockwise_in_copy = - blockwise_2d_tensor_copy_2{}; + const auto blockwise_in_copy = Blockwise2dTensorCopy3{}; #endif // blockwise wei copy // format is [CPerBlock*S*R,KPerBlock] #if 0 const auto blockwise_wei_copy = - blockwise_2d_tensor_copy_1{}; + Blockwise2dTensorCopy1{}; +#elif 0 + const auto blockwise_wei_copy = Blockwise2dTensorCopy2{}; #elif 1 - const auto blockwise_wei_copy = - blockwise_2d_tensor_copy_2{}; + const auto blockwise_wei_copy = Blockwise2dTensorCopy3{}; #endif // a series of blockwise GEMM @@ -170,18 +184,17 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc, const auto c_kxb_thread_mtx_desc = make_ConstantMatrixDescriptor( Number{}, Number{}); // constexpr doesn't compile - const auto blockwise_gemm = - blockwise_gemm_block_a_block_b_thread_c{}; + const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadC{}; // LDS constexpr unsigned in_block_size = in_cb_block_desc.GetElementSpace(); @@ -208,10 +221,10 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc, __syncthreads()) { // input: global mem to LDS, - blockwise_in_copy.run(p_in_global_block_offset, p_in_block); + blockwise_in_copy.Run(p_in_global_block_offset, p_in_block); // weight: global mem to LDS, - blockwise_wei_copy.run(p_wei_global_block_offset, p_wei_block); + blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block); __syncthreads(); @@ -222,7 +235,7 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc, { auto f_accum = [](auto& c, const auto&& ab) { c += ab; }; - blockwise_gemm.run(p_wei_block + wei_csrk_block_desc.Get1dIndex(s, r, 0, 0), + blockwise_gemm.Run(p_wei_block + wei_csrk_block_desc.Get1dIndex(s, r, 0, 0), p_in_block + s * Wi + r, p_out_thread, f_accum); @@ -283,10 +296,8 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc, #endif if(n_data < N && h_data < Ho && w_data < Wo) { -#if 1 p_out_global[out_knhw_global_desc.Get1dIndex(k_data, n_data, h_data, w_data)] = p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)]; -#endif } } } diff --git a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline.cuh b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline.cuh index 78c0d5a419..0222752f17 100644 --- a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline.cuh +++ b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline.cuh @@ -25,7 +25,9 @@ template + unsigned WeiBlockCopyThreadPerDim1, + unsigned InBlockCopyDataPerRead, + unsigned WeiBlockCopyDataPerRead> __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline( InGlobalDesc, Float* const __restrict__ p_in_global, @@ -117,40 +119,52 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline // formmat is [CPerBlock,BPerBlock + BGhostRead] #if 0 const auto blockwise_in_copy = - blockwise_2d_tensor_copy_1{}; +#elif 0 + const auto blockwise_in_copy = Blockwise2dTensorCopy2{}; #elif 1 - const auto blockwise_in_copy = - blockwise_2d_tensor_copy_2{}; + const auto blockwise_in_copy = Blockwise2dTensorCopy3{}; #endif // blockwise wei copy // format is [CPerBlock*S*R,KPerBlock] #if 0 const auto blockwise_wei_copy = - blockwise_2d_tensor_copy_1{}; +#elif 0 + const auto blockwise_wei_copy = Blockwise2dTensorCopy2{}; #elif 1 - const auto blockwise_wei_copy = - blockwise_2d_tensor_copy_2{}; + const auto blockwise_wei_copy = Blockwise2dTensorCopy3{}; #endif // a series of blockwise GEMM @@ -170,18 +184,17 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline const auto c_kxb_thread_mtx_desc = make_ConstantMatrixDescriptor( Number{}, Number{}); // constexpr doesn't compile - const auto blockwise_gemm = - blockwise_gemm_block_a_block_b_thread_c{}; + const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadC{}; // LDS constexpr unsigned in_block_size = in_cb_block_desc.GetElementSpace(); @@ -205,10 +218,10 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline // prelog : preload data // input: global mem to LDS, - blockwise_in_copy.run(p_in_global_block_offset, p_in_block_0); + blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_0); // weight: global mem to LDS, - blockwise_wei_copy.run(p_wei_global_block_offset, p_wei_block_0); + blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_0); p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0); @@ -234,10 +247,10 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline Float* p_wei_block_next = even_loop ? p_wei_block_1 : p_wei_block_0; // input: global mem to LDS, - blockwise_in_copy.run(p_in_global_block_offset, p_in_block_next); + blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_next); // weight: global mem to LDS, - blockwise_wei_copy.run(p_wei_global_block_offset, p_wei_block_next); + blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_next); // a series of GEMM for(unsigned s = 0; s < S; ++s) @@ -246,7 +259,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline { auto f_accum = [](auto& c, const auto&& ab) { c += ab; }; - blockwise_gemm.run(p_wei_block_now + wei_csrk_block_desc.Get1dIndex(s, r, 0, 0), + blockwise_gemm.Run(p_wei_block_now + wei_csrk_block_desc.Get1dIndex(s, r, 0, 0), p_in_block_now + s * Wi + r, p_out_thread, f_accum); @@ -268,7 +281,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline { auto f_accum = [](auto& c, const auto&& ab) { c += ab; }; - blockwise_gemm.run(p_wei_block_now + wei_csrk_block_desc.Get1dIndex(s, r, 0, 0), + blockwise_gemm.Run(p_wei_block_now + wei_csrk_block_desc.Get1dIndex(s, r, 0, 0), p_in_block_now + s * Wi + r, p_out_thread, f_accum); diff --git a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh index 883f00d120..5acb9d13be 100644 --- a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh +++ b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh @@ -110,30 +110,29 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc, // formmat is [CPerBlock,BPerBlock + BGhostRead] #if 1 const auto blockwise_in_copy = - blockwise_2d_tensor_copy_1{}; + Blockwise2dTensorCopy1{}; #elif 1 - const auto blockwise_in_copy = - blockwise_2d_tensor_copy_2{}; + const auto blockwise_in_copy = Blockwise2dTensorCopy2{}; #endif // blockwise wei copy // format is [S,R,CPerBlock,KPerBlock] const auto blockwise_wei_copy = - blockwise_4d_tensor_copy_1{}; + Blockwise4dTensorCopy1{}; // a series of blockwise GEMM // c_mtx += transpose(a_mtx) * b_mtx @@ -152,18 +151,17 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc, const auto c_kxb_thread_mtx_desc = make_ConstantMatrixDescriptor( Number{}, Number{}); // constexpr doesn't compile - const auto blockwise_gemm = - blockwise_gemm_block_a_block_b_thread_c{}; + const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadC{}; // LDS constexpr unsigned in_block_size = in_cb_block_desc.GetElementSpace(); @@ -191,12 +189,12 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc, { #if 1 // input: global mem to LDS, - blockwise_in_copy.run(p_in_global_block_offset, p_in_block); + blockwise_in_copy.Run(p_in_global_block_offset, p_in_block); #endif #if 1 // weight: global mem to LDS, - blockwise_wei_copy.run(p_wei_global_block_offset, p_wei_block); + blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block); #endif __syncthreads(); @@ -209,7 +207,7 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc, { auto f_accum = [](auto& c, const auto&& ab) { c += ab; }; - blockwise_gemm.run(p_wei_block + wei_srck_block_desc.Get1dIndex(s, r, 0, 0), + blockwise_gemm.Run(p_wei_block + wei_srck_block_desc.Get1dIndex(s, r, 0, 0), p_in_block + s * Wi + r, p_out_thread, f_accum); diff --git a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh index d9f8ba258f..9d9ff8f3fb 100644 --- a/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh +++ b/src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh @@ -110,20 +110,19 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline // formmat is [CPerBlock,BPerBlock + BGhostRead] #if 1 const auto blockwise_in_copy = - blockwise_2d_tensor_copy_1{}; + Blockwise2dTensorCopy1{}; #elif 1 - const auto blockwise_in_copy = - blockwise_2d_tensor_copy_2{}; + const auto blockwise_in_copy = Blockwise2dTensorCopy2{}; #elif 0 const auto blockwise_in_copy = blockwise_2d_tensor_copy_dummy_2{}; + Blockwise4dTensorCopy1{}; #else const auto blockwise_wei_copy = blockwise_4d_tensor_copy_dummy{}, Number{}); // constexpr doesn't compile - const auto blockwise_gemm = - blockwise_gemm_block_a_block_b_thread_c{}; + const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadC{}; // LDS constexpr unsigned in_block_size = in_cb_block_desc.GetElementSpace(); @@ -201,13 +199,13 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline // prelog: load data #if 1 // input: global mem to LDS, - blockwise_in_copy.run(p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin), + blockwise_in_copy.Run(p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin), p_in_block_0); #endif #if 1 // weight: global mem to LDS, - blockwise_wei_copy.run( + blockwise_wei_copy.Run( p_wei_global + wei_srck_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin), p_wei_block_0); #endif @@ -227,14 +225,14 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline #if 1 // preload next data // input: global mem to LDS, - blockwise_in_copy.run(p_in_global + in_cb_global_desc.Get1dIndex( + blockwise_in_copy.Run(p_in_global + in_cb_global_desc.Get1dIndex( c_block_data_begin + CPerBlock, b_block_data_begin), p_in_block_next); #endif #if 1 // weight: global mem to LDS, - blockwise_wei_copy.run(p_wei_global + + blockwise_wei_copy.Run(p_wei_global + wei_srck_global_desc.Get1dIndex( 0, 0, c_block_data_begin + CPerBlock, k_block_data_begin), p_wei_block_next); @@ -247,7 +245,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline { auto f_accum = [](auto& c, const auto&& ab) { c += ab; }; - blockwise_gemm.run(p_wei_block_now + wei_srck_block_desc.Get1dIndex(s, r, 0, 0), + blockwise_gemm.Run(p_wei_block_now + wei_srck_block_desc.Get1dIndex(s, r, 0, 0), p_in_block_now + s * Wi + r, p_out_thread, f_accum); @@ -269,7 +267,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline { auto f_accum = [](auto& c, const auto&& ab) { c += ab; }; - blockwise_gemm.run(p_wei_block_now + wei_srck_block_desc.Get1dIndex(s, r, 0, 0), + blockwise_gemm.Run(p_wei_block_now + wei_srck_block_desc.Get1dIndex(s, r, 0, 0), p_in_block_now + s * Wi + r, p_out_thread, f_accum);