From 1480375fa643624ebfc40ad7633c311711d33e2a Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sun, 14 Jul 2019 01:32:40 -0500 Subject: [PATCH] adding implicit GEMM v4r2 --- ..._v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp | 49 ++------------- ...tion_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp | 61 +++++++++++++++---- 2 files changed, 55 insertions(+), 55 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp index 499cc1f0df..3dfce55cc2 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -29,9 +29,6 @@ template {}); + make_ConstantTensorDescriptor_packed( + Sequence{}); // input blockwise copy // slice a merged tensor, reorder and copy to a normal tensor @@ -251,9 +242,8 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer // c_thread_mtx definition: this is a mess // TODO:: more elegent way of defining c_thread_mtx constexpr auto c_k0k2_n0ho0wo0n2ho2wo2_thread_mtx_desc = - make_ConstantMatrixDescriptor_packed( - Number{}, - Number{}); + make_ConstantMatrixDescriptor_packed(Number{}, + Number{}); const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2< BlockSize, @@ -384,18 +374,8 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer // define tensor descriptor for threadwise copy // output memory layout descriptor in register constexpr auto out_k0_k1_k2_n0_ho0_wo0_n1_ho1_wo1_n2_ho2_wo2_thread_mem_desc = - make_ConstantTensorDescriptor_packed(Sequence{}); + make_ConstantTensorDescriptor_packed( + Sequence{}); // output tensor descriptor in register, src of threadwise copy constexpr auto out_n0_n1_n2_k0_k1_k2_ho0_ho1_ho2_wo0_wo1_wo2_thread_desc = @@ -440,7 +420,6 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer out_k_n0_ho0_wo0_b_n2_ho2_wo2_global_merged_desc.GetOffsetFromMultiIndex( k_thread_data_on_global, 0, 0, 0, b_thread_data_on_global, 0, 0, 0); -#if 1 threadwise_generic_tensor_slice_copy_v1( out_n0_n1_n2_k0_k1_k2_ho0_ho1_ho2_wo0_wo1_wo2_thread_desc, p_out_thread, @@ -451,22 +430,6 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer out_n0_n1_n2_k0_k1_k2_ho0_ho1_ho2_wo0_wo1_wo2_thread_desc.GetLengths(), arithmetic_sequence_gen<0, 12, 1>::type{}, Number<1>{}); -#else - if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - print_ConstantTensorDescriptor( - "out thread: ", out_n0_n1_n2_k0_k1_k2_ho0_ho1_ho2_wo0_wo1_wo2_thread_desc); - printf("size: %d\n", - out_n0_n1_n2_k0_k1_k2_ho0_ho1_ho2_wo0_wo1_wo2_thread_desc.GetElementSize()); - - for(index_t i = 0; - i < out_n0_n1_n2_k0_k1_k2_ho0_ho1_ho2_wo0_wo1_wo2_thread_desc.GetElementSize(); - ++i) - { - p_out_global[0] = p_out_thread[i]; - } - } -#endif } } }; diff --git a/driver/include/device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp index c10c54793d..6947698f57 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp @@ -53,15 +53,14 @@ void device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(InDesc, wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data()); out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); -#if 1 +#if 0 // 1x1 filter, 8x8 image constexpr index_t N0 = 1; - constexpr index_t N2 = 1; - constexpr index_t Ho0 = 1; - constexpr index_t Ho2 = 1; - constexpr index_t Wo0 = 2; + + constexpr index_t N2 = 1; + constexpr index_t Ho2 = 1; constexpr index_t Wo2 = 4; constexpr index_t BlockSize = 256; @@ -70,10 +69,6 @@ void device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(InDesc, constexpr index_t KPerBlock = 128; constexpr index_t EPerBlock = 8; - constexpr index_t N0PerBlock = 1; - constexpr index_t Ho0PerBlock = 1; - constexpr index_t Wo0PerBlock = 2; - constexpr index_t GemmMPerThreadSubC = 4; constexpr index_t GemmNPerThreadSubC = 4; constexpr index_t GemmMLevel0Cluster = 4; @@ -101,6 +96,51 @@ void device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(InDesc, using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E] using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K] + constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; + constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; +#elif 1 + // 1x1 filter, 8x8 image + constexpr index_t N0 = 1; + constexpr index_t Ho0 = 2; + constexpr index_t Wo0 = 1; + + constexpr index_t N2 = 2; + constexpr index_t Ho2 = 2; + constexpr index_t Wo2 = 1; + + constexpr index_t BlockSize = 256; + + constexpr index_t BPerBlock = 16; + constexpr index_t KPerBlock = 128; + constexpr index_t EPerBlock = 8; + + constexpr index_t GemmMPerThreadSubC = 4; + constexpr index_t GemmNPerThreadSubC = 4; + constexpr index_t GemmMLevel0Cluster = 4; + constexpr index_t GemmNLevel0Cluster = 4; + constexpr index_t GemmMLevel1Cluster = 4; + constexpr index_t GemmNLevel1Cluster = 4; + constexpr index_t GemmKPerThreadLoop = 1; + constexpr index_t GemmDataPerReadA = 4; + constexpr index_t GemmDataPerReadB = 4; + + using InBlockCopySubLengths_E_N0_Ho0_Wo0_B_N2_Ho2_Wo2 = Sequence<1, 1, 2, 1, 1, 2, 1, 1>; + using InBlockCopyClusterLengths_E_N0_Ho0_Wo0_B_N2_Ho2_Wo2 = Sequence<8, 1, 1, 1, 16, 1, 2, 1>; + using InBlockCopyThreadClusterArrangeOrder = + Sequence<0, 1, 5, 2, 6, 3, 4, 7>; // [E, N0, N2, Ho0, Ho2, Wo0, B, Wo2] + using InBlockCopySrcAccessOrder = + Sequence<0, 1, 5, 2, 6, 3, 4, 7>; // [E, N0, N2, Ho0, Ho2, Wo0, B, Wo2] + using InBlockCopyDstAccessOrder = + Sequence<0, 1, 2, 3, 4, 5, 6, 7>; // [E, N0, Ho0, Wo0, B, N2, Ho2, Wo2] + + constexpr index_t InBlockCopyDataPerAccess_W2 = 1; + + using WeiBlockCopySubLengths_E_K = Sequence<4, 1>; + using WeiBlockCopyClusterLengths_E_K = Sequence<2, 128>; + using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E] + using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E] + using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K] + constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; #endif @@ -137,9 +177,6 @@ void device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(InDesc, BPerBlock, KPerBlock, EPerBlock, - N0PerBlock, - Ho0PerBlock, - Wo0PerBlock, GemmMPerThreadSubC, GemmNPerThreadSubC, GemmMLevel0Cluster,