From 3ce77700b62df4bb17832c048bbbcb965a457833 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Fri, 26 Apr 2019 15:34:55 -0500 Subject: [PATCH] debugging ds_read asm --- ...lution_implicit_gemm_v1_chwn_cyxk_khwn.hpp | 2 +- ...lution_implicit_gemm_v1_nchw_cyxk_khwn.hpp | 109 ++++++++++++------ driver/driver.hip.cpp | 2 +- ..._implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp | 27 +++-- ...3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp | 47 +++----- ..._implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp | 15 ++- 6 files changed, 118 insertions(+), 84 deletions(-) diff --git a/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp b/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp index abd771285d..938fba037a 100644 --- a/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp +++ b/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp @@ -475,7 +475,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn #elif 0 GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn -#elif 1 +#elif 0 GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn #elif 1 GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn diff --git a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp index 6fe414b89d..22658d35ef 100644 --- a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp +++ b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp @@ -65,6 +65,76 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc, out_khwn_device_buf.ToDevice(out_khwn.mData.data()); #if 0 + // for 3x3, 34x34, v1r3, Pascal + constexpr index_t BlockSize = 128; + + constexpr index_t NPerBlock = 2; + constexpr index_t KPerBlock = 128; + constexpr index_t CPerBlock = 8; + constexpr index_t HoPerBlock = 2; + constexpr index_t WoPerBlock = 16; + + constexpr index_t NPerThread = 2; + constexpr index_t KPerThread = 8; + constexpr index_t HoPerThread = 1; + constexpr index_t WoPerThread = 4; + + constexpr index_t GemmMPerThreadSubC = 4; + constexpr index_t GemmNPerThreadSubC = 4; + constexpr index_t GemmMLevel0Cluster = 4; + constexpr index_t GemmNLevel0Cluster = 2; + constexpr index_t GemmMLevel1Cluster = 4; + constexpr index_t GemmNLevel1Cluster = 2; + constexpr index_t GemmKPerThreadLoop = 1; + constexpr index_t GemmDataPerReadA = 4; + constexpr index_t GemmDataPerReadB = 4; + + using InBlockReorderSrcSubLengths_NCHW = Sequence<2, 1, 2, 1>; + using InBlockReorderSrcClusterLengths_NCHW = Sequence<1, 8, 1, 16>; + using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>; + constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW + constexpr index_t InBlockReorderDataPerWrite_N = 2; + + using WeiBlockCopyClusterLengths = Sequence<0, 0>; // not used + constexpr index_t WeiBlockCopyDataPerRead_K = 4; + + constexpr index_t OutThreadCopyDataPerWrite_N = 2; +#elif 1 + // for 3x3, 34x34, v1r3, Vega 20 + constexpr index_t BlockSize = 256; + + constexpr index_t NPerBlock = 2; + constexpr index_t KPerBlock = 128; + constexpr index_t CPerBlock = 8; + constexpr index_t HoPerBlock = 4; + constexpr index_t WoPerBlock = 16; + + constexpr index_t NPerThread = 2; + constexpr index_t KPerThread = 8; + constexpr index_t HoPerThread = 1; + constexpr index_t WoPerThread = 4; + + constexpr index_t GemmMPerThreadSubC = 4; + constexpr index_t GemmNPerThreadSubC = 4; + constexpr index_t GemmMLevel0Cluster = 4; + constexpr index_t GemmNLevel0Cluster = 2; + constexpr index_t GemmMLevel1Cluster = 4; + constexpr index_t GemmNLevel1Cluster = 2; + constexpr index_t GemmKPerThreadLoop = 1; + constexpr index_t GemmDataPerReadA = 4; + constexpr index_t GemmDataPerReadB = 4; + + using InBlockReorderSrcSubLengths_NCHW = Sequence<2, 1, 2, 1>; + using InBlockReorderSrcClusterLengths_NCHW = Sequence<1, 8, 2, 16>; + using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>; + constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW + constexpr index_t InBlockReorderDataPerWrite_N = 2; + + using WeiBlockCopyClusterLengths = Sequence<0, 0>; // not used + constexpr index_t WeiBlockCopyDataPerRead_K = 4; + + constexpr index_t OutThreadCopyDataPerWrite_N = 2; +#elif 0 // for 3x3, 28x28, v1r2, Pascal constexpr index_t BlockSize = 128; @@ -133,41 +203,6 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc, using WeiBlockCopyClusterLengths = Sequence<0, 0>; // not used constexpr index_t WeiBlockCopyDataPerRead_K = 4; - constexpr index_t OutThreadCopyDataPerWrite_N = 2; -#elif 1 - // for 3x3, 34x34, v1r3, Pascal - constexpr index_t BlockSize = 128; - - constexpr index_t NPerBlock = 2; - constexpr index_t KPerBlock = 128; - constexpr index_t CPerBlock = 8; - constexpr index_t HoPerBlock = 2; - constexpr index_t WoPerBlock = 16; - - constexpr index_t NPerThread = 2; - constexpr index_t KPerThread = 8; - constexpr index_t HoPerThread = 1; - constexpr index_t WoPerThread = 4; - - constexpr index_t GemmMPerThreadSubC = 4; - constexpr index_t GemmNPerThreadSubC = 4; - constexpr index_t GemmMLevel0Cluster = 4; - constexpr index_t GemmNLevel0Cluster = 2; - constexpr index_t GemmMLevel1Cluster = 4; - constexpr index_t GemmNLevel1Cluster = 2; - constexpr index_t GemmKPerThreadLoop = 1; - constexpr index_t GemmDataPerReadA = 4; - constexpr index_t GemmDataPerReadB = 4; - - using InBlockReorderSrcSubLengths_NCHW = Sequence<2, 1, 2, 1>; - using InBlockReorderSrcClusterLengths_NCHW = Sequence<1, 8, 1, 16>; - using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>; - constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW - constexpr index_t InBlockReorderDataPerWrite_N = 2; - - using WeiBlockCopyClusterLengths = Sequence<0, 0>; // not used - constexpr index_t WeiBlockCopyDataPerRead_K = 4; - constexpr index_t OutThreadCopyDataPerWrite_N = 2; #endif @@ -182,9 +217,9 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc, constexpr auto gridwise_conv = #if 0 GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn -#elif 0 - GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn #elif 1 + GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn +#elif 0 GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn #endif {}; + // choose GEMM implementation here + const auto run_blockwise_batch_gemm = [&](auto... Xs) { +#if 1 + return blockwise_batch_gemm.Run(Xs...); +#elif 0 + return blockwise_batch_gemm.Run_asm(Xs...); +#else + return blockwise_batch_gemm.Run_asm_v2(Xs...); +#endif + }; + // LDS: be careful of alignment // TODO:: need to properly implement tensor descriptor with alignment constexpr index_t in_block_space = @@ -241,13 +252,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn __syncthreads(); -#if 1 - blockwise_batch_gemm.Run(p_wei_block, p_in_block, p_out_thread); -#elif 0 - blockwise_batch_gemm.Run_asm(p_wei_block, p_in_block, p_out_thread); -#elif 1 - blockwise_batch_gemm.Run_asm_v2(p_wei_block, p_in_block, p_out_thread); -#endif + run_blockwise_batch_gemm(p_wei_block, p_in_block, p_out_thread); __syncthreads(); } @@ -279,13 +284,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn __syncthreads(); -#if 1 - blockwise_batch_gemm.Run(p_wei_block, p_in_block, p_out_thread); -#elif 0 - blockwise_batch_gemm.Run_asm(p_wei_block, p_in_block, p_out_thread); -#elif 1 - blockwise_batch_gemm.Run_asm_v2(p_wei_block, p_in_block, p_out_thread); -#endif + run_blockwise_batch_gemm(p_wei_block, p_in_block, p_out_thread); __syncthreads(); } diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp index 5595d596e9..a266b2de1e 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp @@ -199,6 +199,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn GemmDataPerReadA, GemmDataPerReadB>{}; + // choose GEMM implementation here + const auto run_blockwise_batch_gemm = [&](auto... Xs) { +#if 0 + return blockwise_batch_gemm.Run(Xs...); +#elif 0 + return blockwise_batch_gemm.Run_asm(Xs...); +#else + return blockwise_batch_gemm.Run_asm_v2(Xs...); +#endif + }; + // LDS: be careful of alignment constexpr index_t in_block_space = in_c_h_w_n_block_desc.GetElementSpace(Number{}); @@ -293,15 +304,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, p_wei_register_clipboard); -// LDS double buffer: GEMM on current data -#if 1 - blockwise_batch_gemm.Run -#elif 0 - blockwise_batch_gemm.Run_asm -#else - blockwise_batch_gemm.Run_asm_v2 -#endif - (p_wei_block_now, p_in_block_now, p_out_thread); + run_blockwise_batch_gemm(p_wei_block_now, p_in_block_now, p_out_thread); // LDS double buffer: store next data to LDS blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, @@ -328,15 +331,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, p_wei_register_clipboard); -// LDS double buffer: GEMM on current data -#if 1 - blockwise_batch_gemm.Run -#elif 0 - blockwise_batch_gemm.Run_asm -#else - blockwise_batch_gemm.Run_asm_v2 -#endif - (p_wei_block_double, p_in_block_double, p_out_thread); + // LDS double buffer: GEMM on current data + run_blockwise_batch_gemm(p_wei_block_double, p_in_block_double, p_out_thread); // LDS double buffer: store next data to LDS blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, @@ -347,17 +343,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn // odd iteration __syncthreads(); -// LDS double buffer: GEMM on current data -#if 1 - blockwise_batch_gemm.Run -#elif 0 - blockwise_batch_gemm.Run_asm -#else - blockwise_batch_gemm.Run_asm_v2 -#endif - (p_wei_block_double + wei_block_space, - p_in_block_double + in_block_space, - p_out_thread); + // LDS double buffer: GEMM on current data + run_blockwise_batch_gemm(p_wei_block_double + wei_block_space, + p_in_block_double + in_block_space, + p_out_thread); } } } diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp index 062d3c4540..5e6fde6ef2 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp @@ -193,6 +193,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn GemmDataPerReadA, GemmDataPerReadB>{}; + // choose GEMM implementation here + const auto run_blockwise_batch_gemm = [&](auto... Xs) { +#if 0 + return blockwise_batch_gemm.Run(Xs...); +#elif 0 + return blockwise_batch_gemm.Run_asm(Xs...); +#else + return blockwise_batch_gemm.Run_asm_v2(Xs...); +#endif + }; + // LDS: be careful of alignment constexpr index_t in_block_space = in_c_h_w_n_block_desc.GetElementSpace(Number{}); @@ -267,7 +278,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn __syncthreads(); - blockwise_batch_gemm.Run(p_wei_block, p_in_block, p_out_thread); + run_blockwise_batch_gemm(p_wei_block, p_in_block, p_out_thread); __syncthreads(); } @@ -314,7 +325,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn __syncthreads(); - blockwise_batch_gemm.Run(p_wei_block, p_in_block, p_out_thread); + run_blockwise_batch_gemm(p_wei_block, p_in_block, p_out_thread); __syncthreads(); }