diff --git a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp index 39fcaef9be..b89a6d9bd3 100644 --- a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp +++ b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp @@ -57,7 +57,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc, wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data()); out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); -#if 1 +#if 0 // for 3x3, 34x34, v1r3, Pascal constexpr index_t BlockSize = 128; @@ -127,7 +127,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc, constexpr index_t WeiBlockCopyDataPerRead_K = 4; constexpr index_t OutThreadCopyDataPerWrite_W = 4; -#elif 0 +#elif 1 // for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 16 constexpr index_t BlockSize = 256; @@ -313,7 +313,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc, for(index_t i = 0; i < nrepeat; ++i) { constexpr auto gridwise_conv = -#if 0 +#if 1 GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw #else GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw diff --git a/driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp b/driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp index 4c20376e1e..3319e59c9a 100644 --- a/driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp +++ b/driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp @@ -62,7 +62,6 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc, constexpr index_t B = (N * Ho * Wo) / (N1 * N2); #if 1 - // for 3x3, 28x28, v3 constexpr index_t BlockSize = 256; constexpr index_t BPerBlock = 16; diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp index 7c525b1c17..444622e84e 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp +++ 
b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp @@ -83,7 +83,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock); constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock); - constexpr auto block_work_desc = make_ConstantTensorDescriptor( + constexpr auto block_work_desc = make_ConstantTensorDescriptor_default_rank_packed( Sequence{}); const auto block_work_multi_id = @@ -99,7 +99,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw // global tensor view constexpr auto wei_c_k_global_desc = - make_ConstantTensorDescriptor(Sequence{}, Sequence{}); + make_ConstantTensorDescriptor_default_rank(Sequence{}, Sequence{}); // LDS tensor view // be careful of alignment @@ -108,7 +108,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw GemmDataPerReadA, GemmDataPerReadB); - constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned( + constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( Sequence{}, Number{}); @@ -117,12 +117,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0, "GemmDataPerReadB alignment requirement is not meet"); - constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned( + constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( Sequence{}, Number{}); // tensor view of threadwise output in register - constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor( + constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_default_rank_packed( Sequence{}); // blockwise copy @@ -140,7 +140,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw decltype(map_chwn2nchw), InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW, InBlockReorderDataPerRead_W, - InBlockReorderDataPerWrite_N>{}; + 
InBlockReorderDataPerWrite_N>({0, 0, 0, 0}, {0, 0, 0, 0}); // blockwise wei copy // format is [CPerBlock, KPerBlock] @@ -150,7 +150,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw decltype(wei_c_k_global_desc), decltype(wei_c_k_block_desc), decltype(wei_c_k_block_desc.GetLengths()), - WeiBlockCopyDataPerRead_K>{}; + WeiBlockCopyDataPerRead_K>({0, 0}, {0, 0}); // a series of blockwise batched GEMM // C_matrix += transpose(A_matrix) * B_matrix @@ -194,7 +194,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw // choose GEMM implementation here const auto run_blockwise_batch_gemm = [&](auto... Xs) { -#if 0 +#if 1 return blockwise_batch_gemm.Run(Xs...); #elif 0 return blockwise_batch_gemm.Run_asm(Xs...); @@ -249,7 +249,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw { for(index_t x = 0; x < X; ++x) { -#if 1 blockwise_in_copy_reorder.Run(p_in_global_block_offset + in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, x), p_in_block); @@ -257,23 +256,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw blockwise_wei_copy.Run(p_wei_global_block_offset + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, 0), p_wei_block); -#else - Float p_in_clipboard[blockwise_in_copy_reorder.GetRegisterClipboardSize()]; - Float p_wei_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; - - blockwise_in_copy_reorder.RunLoadRegisterClipboard( - p_in_global_block_offset + in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, x), - p_in_clipboard); - - blockwise_wei_copy.RunLoadRegisterClipboard( - p_wei_global_block_offset + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, 0), - p_wei_clipboard); - - blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_clipboard, p_wei_block); - - blockwise_in_copy_reorder.RunStoreRegisterClipboard(p_in_clipboard, p_in_block); - -#endif __syncthreads(); @@ -304,24 +286,9 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw p_wei_global_block_offset += CPerBlock * 
wei_c_y_x_k_global_desc.GetStride(I0)) { -#if 0 - blockwise_in_copy_reorder.Run(p_in_global_block_offset, - p_in_block); + blockwise_in_copy_reorder.Run(p_in_global_block_offset, p_in_block); - blockwise_wei_copy.Run(p_wei_global_block_offset, - p_wei_block); -#else - Float p_in_clipboard[blockwise_in_copy_reorder.GetRegisterClipboardSize()]; - Float p_wei_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; - - blockwise_in_copy_reorder.RunLoadRegisterClipboard(p_in_global_block_offset, - p_in_clipboard); - blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, - p_wei_clipboard); - - blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_clipboard, p_wei_block); - blockwise_in_copy_reorder.RunStoreRegisterClipboard(p_in_clipboard, p_in_block); -#endif + blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block); __syncthreads(); @@ -342,13 +309,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw const index_t wo_thread_data_begin = c_thread_mtx_begin.col / NPerBlock; const index_t n_thread_data_begin = c_thread_mtx_begin.col % NPerBlock; - static_if{}([&](auto f_dummy) { // f_dummy do nothing but - // perfect forwarding. - // Using this trick to - // make this lambda a generic lambda, so it won't be compiled until - // instantiated + static_if{}([&](auto fwd) { + // fwd does nothing but perfect forwarding. 
+ // Using this trick to make this lambda a generic lambda, so it won't be compiled until + // being instantiated here static_assert( - (f_dummy(GemmNPerThreadSubC) <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0), + (fwd(GemmNPerThreadSubC) <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0), "wrong!"); // output is a 10d tensor @@ -356,38 +322,33 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw constexpr index_t N1 = NPerBlock / N2; constexpr index_t W2 = - (GemmNLevel0Cluster * GemmNLevel1Cluster) / f_dummy(NPerBlock / GemmNPerThreadSubC); + (GemmNLevel0Cluster * GemmNLevel1Cluster) / fwd(NPerBlock / GemmNPerThreadSubC); constexpr index_t W1 = WoPerBlock / W2; constexpr index_t K2 = GemmMPerThreadSubC; constexpr index_t K1 = KPerBlock / KPerThread; - constexpr auto out_10d_global_desc = - make_ConstantTensorDescriptor(Sequence{}); + constexpr auto out_10d_global_desc = fwd(out_n_k_h_w_global_desc) + .Fold(I3, Number{}, Number{}) + .Fold(I1, Number{}, Number{}) + .Fold(I0, Number{}, Number{}); - constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( - Sequence{}); + constexpr auto out_10d_thread_desc = fwd(out_k_h_w_n_thread_desc) + .Fold(I3, Number<1>{}, Number{}) + .Fold(I2, Number{}, Number<1>{}) + .Fold(I0, Number<1>{}, Number{}); #if 0 - if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - print_ConstantTensorDescriptor(out_k_h_w_n_thread_desc, - "out_k_h_w_n_thread_desc"); - print_ConstantTensorDescriptor(out_10d_thread_desc, "out_10d_thread_desc"); + if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) + { + print_ConstantTensorDescriptor(out_k_h_w_n_thread_desc, + "a: out_k_h_w_n_thread_desc"); + print_ConstantTensorDescriptor(out_10d_thread_desc, "a: out_10d_thread_desc"); - print_ConstantTensorDescriptor(out_k_h_w_n_global_desc, - "out_k_h_w_n_global_desc"); - print_ConstantTensorDescriptor(out_10d_global_desc, "out_10d_global_desc"); - } + print_ConstantTensorDescriptor(out_n_k_h_w_global_desc, + "a: 
out_n_k_h_w_global_desc"); + print_ConstantTensorDescriptor(out_10d_global_desc, "a: out_10d_global_desc"); + } #endif constexpr auto map_out_global2thread = Sequence<7, 8, 9, 0, 1, 2, 3, 4, 5, 6>{}; @@ -405,8 +366,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw out_10d_thread_desc.GetLengths(), map_out_global2thread); // Number{}); - }).else_([&](auto f_dummy) { - static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock && + }).else_([&](auto fwd) { + static_assert(fwd(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock && GemmNPerThreadSubC % NPerThread == 0, "wrong!"); @@ -415,34 +376,40 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw constexpr index_t W3 = GemmNPerThreadSubC / NPerBlock; constexpr index_t W2 = GemmNLevel0Cluster * GemmNLevel1Cluster; - constexpr index_t W1 = WoPerBlock / f_dummy(W2 * W3); + constexpr index_t W1 = WoPerBlock / fwd(W2 * W3); constexpr index_t K2 = GemmMPerThreadSubC; constexpr index_t K1 = KPerBlock / KPerThread; - constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor( - Sequence{}); + constexpr auto out_10d_global_desc = + fwd(out_n_k_h_w_global_desc) + .Fold(I3, Number{}, Number{}, Number{}) + .Fold(I1, Number{}, Number{}) + .Fold(I0, Number{}); - constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( - Sequence{}); + constexpr auto out_10d_thread_desc = + fwd(out_k_h_w_n_thread_desc) + .Fold(I3, Number{}) + .Fold(I2, Number{}, Number<1>{}, Number{}) + .Fold(I0, Number<1>{}, Number{}); #if 0 - if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - print_ConstantTensorDescriptor(out_k_h_w_n_thread_desc, - "out_k_h_w_n_thread_desc"); - print_ConstantTensorDescriptor(out_10d_thread_desc, "out_10d_thread_desc"); + if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) + { + print_ConstantTensorDescriptor(out_k_h_w_n_thread_desc, + "b: out_k_h_w_n_thread_desc"); + print_ConstantTensorDescriptor(out_10d_thread_desc, "b: 
out_10d_thread_desc"); - print_ConstantTensorDescriptor(out_k_h_w_n_global_desc, - "out_k_h_w_n_global_desc"); - print_ConstantTensorDescriptor(out_10d_global_desc, "out_10d_global_desc"); - - } + print_ConstantTensorDescriptor(out_n_k_h_w_global_desc, + "b: out_n_k_h_w_global_desc"); + print_ConstantTensorDescriptor(out_10d_global_desc, "b: out_10d_global_desc"); + } #endif constexpr auto map_out_global2thread = Sequence<8, 9, 0, 1, 2, 3, 4, 5, 6, 7>{}; - threadwise_tensor_slice_copy_reorder_given_dst2src_v2( +#if 0 + threadwise_tensor_slice_copy_reorder_given_dst2src_v3( out_10d_thread_desc, p_out_thread, out_10d_global_desc, @@ -453,8 +420,24 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw ho_block_data_begin + ho_thread_data_begin, wo_block_data_begin + wo_thread_data_begin), out_10d_thread_desc.GetLengths(), - map_out_global2thread); - // Number{}); + map_out_global2thread, + Number{}); +#else + threadwise_tensor_slice_copy_generic( + out_10d_thread_desc.ReorderGivenNew2Old(map_out_global2thread), + p_out_thread, + make_zero_array(), + out_10d_global_desc, + p_out_global + + out_n_k_h_w_global_desc.GetOffsetFromMultiIndex( + n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), + make_zero_array(), + out_10d_thread_desc.GetLengths().ReorderGivenNew2Old(map_out_global2thread), + arithmetic_sequence_gen<0, 10, 1>::SeqType{}); +#endif }); } }; diff --git a/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp index 8446374049..c59ca57aad 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp @@ -151,6 +151,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw // slice a tensor, and copy it into another tensor // this 
copy operator already have blockwise offset built-in const auto blockwise_wei_copy = +#if 0 BlockwiseTensorSliceCopy_generic_v1( {0, k_block_data_on_global}, {0, 0}); +#else + Blockwise2dTensorCopy3({0, k_block_data_on_global}, + {0, 0}); +#endif - // GEMM definition - // c_mtx += transpose(a_mtx) * b_mtx - // a_mtx[CPerBlock, KPerBlock] is in LDS - // b_mtx[CPerBlocl, N1 * BPerBlock * N2] is in LDS - // c_mtx[KPerBlock, N1 * BPerBlock * N2] is distributed among threads, and saved in - // register - constexpr auto a_c_k_block_mtx_desc = make_ConstantMatrixDescriptor( - Number{}, Number{}, Number{}); + // GEMM definition + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx[CPerBlock, KPerBlock] is in LDS + // b_mtx[CPerBlock, N1 * BPerBlock * N2] is in LDS + // c_mtx[KPerBlock, N1 * BPerBlock * N2] is distributed among threads, and saved in + // register + constexpr auto a_c_k_block_mtx_desc = + make_ConstantMatrixDescriptor(Number{}, + Number{}, + Number{}); constexpr auto b_c_n1bn2_block_mtx_desc = make_ConstantMatrixDescriptor(Number{},