diff --git a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp index 8bd57049e2..1bf7921abe 100644 --- a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp +++ b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp @@ -191,7 +191,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, constexpr index_t WeiBlockCopyDataPerRead = 4; constexpr index_t BlockSize = 256; -#elif 0 +#elif 1 // 1x1, 14x14, Pascal, enable lds_double_buffer, disable register double buffer constexpr index_t BPerBlock = 64; constexpr index_t KPerBlock = 128; @@ -216,9 +216,10 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, constexpr index_t InBlockCopyDataPerRead = 4; constexpr index_t WeiBlockCopyDataPerRead = 4; + constexpr index_t OutThreadCopyDataPerWrite = 4; constexpr index_t BlockSize = 128; -#elif 1 +#elif 0 // 1x1, 14x14, Vega 20, enable lds_double_buffer, disable register_double_buffer constexpr index_t BPerBlock = 128; constexpr index_t KPerBlock = 128; @@ -243,7 +244,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, constexpr index_t InBlockCopyDataPerRead = 4; constexpr index_t WeiBlockCopyDataPerRead = 4; - constexpr index_t OutThreadCopyDataPerWrite = 4; constexpr index_t BlockSize = 256; diff --git a/src/include/gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hip.hpp index 14b090f809..64490c765b 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hip.hpp @@ -169,20 +169,19 @@ struct GridwiseConvolutionImplicitGemm_v1_chwn_cyxk_khwn HoPerThread>{}; // LDS: be careful of alignment - constexpr index_t in_block_element_size = - in_chwn_block_desc.GetElementSpace(Number{}); + constexpr index_t max_align = + mod_conv::max(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead); - constexpr index_t wei_block_element_size = - wei_cyxk_block_desc.GetElementSpace(Number{}); + constexpr index_t in_block_space = + in_chwn_block_desc.GetElementSpace(Number{}); - constexpr index_t max_align = InBlockCopyDataPerRead > WeiBlockCopyDataPerRead - ? InBlockCopyDataPerRead - : WeiBlockCopyDataPerRead; + constexpr index_t wei_block_space = + wei_cyxk_block_desc.GetElementSpace(Number{}); __shared__ Float - p_in_block[max_align * ((in_block_element_size + max_align - 1) / max_align)]; + p_in_block[in_block_space]; __shared__ Float - p_wei_block[max_align * ((wei_block_element_size + max_align - 1) / max_align)]; + p_wei_block[wei_block_space]; // register Float p_out_thread[out_khwn_thread_desc.GetElementSpace()]; diff --git a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp index 234f750aca..554624bf48 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp @@ -208,11 +208,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, p_wei_register_clipboard); -#if 0 +#if 1 blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_double); blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, p_wei_block_double); -#elif 1 +#else vmcnt(0); blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard, p_in_block_double); @@ -266,11 +266,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer { for(index_t x = 0; x < X; ++x) { -#if 0 +#if 1 blockwise_gemm.Run #elif 0 blockwise_gemm.Run_RegisterDoubleBuffer -#elif 1 +#elif 0 blockwise_gemm.Run_asm #endif (p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), @@ -279,12 +279,12 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer } } -#if 0 +#if 1 blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_next); blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, p_wei_block_next); -#elif 1 +#else vmcnt(0); blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard, p_in_block_next); @@ -315,11 +315,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer { for(index_t x = 0; x < X; ++x) { -#if 0 +#if 1 blockwise_gemm.Run #elif 0 blockwise_gemm.Run_RegisterDoubleBuffer -#elif 1 +#elif 0 blockwise_gemm.Run_asm #endif (p_wei_block_double + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), @@ -328,7 +328,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer } } -#if 0 +#if 1 blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_double + in_block_space); @@ -349,11 +349,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer { for(index_t x = 0; x < X; ++x) { -#if 0 +#if 1 blockwise_gemm.Run #elif 0 blockwise_gemm.Run_RegisterDoubleBuffer -#elif 1 +#elif 0 blockwise_gemm.Run_asm #endif (p_wei_block_double + wei_block_space +