From e9733a9f8934fefbc2776a0859142520858f6554 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Fri, 2 Aug 2019 02:30:43 -0500 Subject: [PATCH] experimenting TensorCoordinate and new merged tensor copy operator --- ...tion_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp | 14 +++- ...tion_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp | 78 ++++++++----------- driver/src/driver.cpp | 2 +- 3 files changed, 46 insertions(+), 48 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp index 3e0654b5ba..d25469ba21 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp @@ -299,7 +299,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw blockwise_in_copy.Run(p_in_global, p_in_block); blockwise_wei_copy.Run(p_wei_global, p_wei_block); #else - using InSrcMergedDimSubLengthsHack = Sequence<1, 1, 1, 1>; + using InSrcMergedDimSubLengthsHack = Sequence; using InDstMergedDimSubLengthsHack = Sequence<1, 1, 1, 1>; blockwise_in_copy.Run_hack(p_in_global, p_in_block, @@ -388,6 +391,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw arithmetic_sequence_gen<0, 8, 1>::type{}, Number<1>{}); #else + + using OutSrcMergedDimSliceLengthsHack = Sequence<1, 1, 1, 1, 1, 1, 1, 1>; + using OutDstMergedDimSliceLengthsHack = Sequence<1, 1, 1, 1, 1, 1, 1, 1>; + ThreadwiseGenericTensorSliceCopy_v2< Float, decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc), @@ -396,7 +403,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw MergedTensorCoordinate, decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths())>( {0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0}) - .Run(p_out_thread, p_out_thread_on_global); + .Run_hack(p_out_thread, + p_out_thread_on_global, + OutSrcMergedDimSliceLengthsHack{}, + OutDstMergedDimSliceLengthsHack{}); #endif } } diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp index 918f71073d..4aace546c4 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp @@ -237,8 +237,24 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw for(index_t e_block_data_begin = 0; e_block_data_begin < E; e_block_data_begin += EPerBlock) { +#if 0 blockwise_in_copy.Run(p_in_global, p_in_block); blockwise_wei_copy.Run(p_wei_global, p_wei_block); +#else + using InSrcMergedDimSubLengthsHack = InBlockCopySubLengths_E_B; + using InDstMergedDimSubLengthsHack = Sequence<1, 1>; + blockwise_in_copy.Run_hack(p_in_global, + p_in_block, + InSrcMergedDimSubLengthsHack{}, + InDstMergedDimSubLengthsHack{}); + + using WeiSrcMergedDimSubLengthsHack = Sequence<1, 1>; + using WeiDstMergedDimSubLengthsHack = Sequence<1, 1>; + blockwise_wei_copy.Run_hack(p_wei_global, + p_wei_block, + WeiSrcMergedDimSubLengthsHack{}, + WeiDstMergedDimSubLengthsHack{}); +#endif __syncthreads(); @@ -272,36 +288,6 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw const index_t b_thread_data_on_global = b_block_data_on_global + c_thread_mtx_on_block.col; -#if 0 - // origin of dst in device memory - Float* p_out_thread_on_global = p_out_global + - out_k_b_global_desc.GetOffsetFromMultiIndex( - k_thread_data_on_global, b_thread_data_on_global); - - // dst descriptor - constexpr auto out_k0_k1_b0_b1_global_desc = - out_k_b_global_desc.Fold(I1, Number{}).Fold(I0, Number{}); - - // src descriptor - constexpr auto out_k0_k1_b0_b1_thread_desc = make_ConstantTensorDescriptor_packed( - Sequence{}); - - const auto threadwise_out_copy = - ThreadwiseGenericTensorSliceCopy_v2::type, - 1, - 1>({0, 0, 0, 0}, - {k_thread_data_on_global / K1, - k_thread_data_on_global % K1, - b_thread_data_on_global / B1, - b_thread_data_on_global % B1}); - - threadwise_out_copy.Run(p_out_thread, p_out_thread_on_global); -#elif 1 // This is a hack, because slicing a merged dimension is not supported yet. // This should be replaced with logic above, once slicing a merged dimension support // become available @@ -316,35 +302,37 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw constexpr auto out_k0_k1_b_thread_desc = make_ConstantTensorDescriptor_packed( Sequence{}); + using OutThreadCopySliceLengths = + Sequence; + auto threadwise_out_copy = ThreadwiseGenericTensorSliceCopy_v2< Float, -#if 1 // debug decltype(out_k0_k1_b_thread_desc), decltype(out_k0_k1_b_global_desc), NormalTensorCoordinate, MergedTensorCoordinate, -#else - decltype(out_k0_k1_b_thread_desc), - decltype( - make_ConstantTensorDescriptor_packed(out_k0_k1_b_global_desc.GetLengths())), - NormalTensorCoordinate, - NormalTensorCoordinate, -#endif - Sequence>( - {0, 0, 0}, - {k_thread_data_on_global / K1, - k_thread_data_on_global % K1, - b_thread_data_on_global}); + OutThreadCopySliceLengths>({0, 0, 0}, + {k_thread_data_on_global / K1, + k_thread_data_on_global % K1, + b_thread_data_on_global}); for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat) { +#if 0 threadwise_out_copy.Run(p_out_thread, p_out_global); +#else + using OutSrcMergedDimSubLengthsHack = Sequence<1, 1, 1>; + using OutDstMergedDimSubLengthsHack = + Sequence<1, 1, OutThreadCopySliceLengths{}[2]>; + threadwise_out_copy.Run_hack(p_out_thread, + p_out_global, + OutSrcMergedDimSubLengthsHack{}, + OutDstMergedDimSubLengthsHack{}); +#endif threadwise_out_copy.MoveSrcSlicingWindow({0, 0, GemmNPerThreadSubC}, true); threadwise_out_copy.MoveDstSlicingWindow({0, 0, B1}, true); } -#endif } } }; diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index c9488b211a..8749fc1ae9 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -532,7 +532,7 @@ int main(int argc, char* argv[]) #elif 0 device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw( (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); -#elif 1 +#elif 0 device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc, in_nchw, wei_kcyx_desc,