diff --git a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp index cbe913b65e..eee1dd63ad 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp @@ -215,17 +215,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset, p_in_register_clipboard); - blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset, p_wei_register_clipboard); -#if 1 + blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block); blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, p_wei_block); -#else - vmcnt(0); - blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard, p_in_block); - blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard, p_wei_block); -#endif __syncthreads(); diff --git a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp index 82c3b11a4f..023cfc68e5 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp @@ -262,11 +262,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer { for(index_t x = 0; x < X; ++x) { -#if 0 +#if 1 blockwise_gemm.Run #elif 0 blockwise_gemm.Run_RegisterDoubleBuffer -#elif 1 +#elif 0 blockwise_gemm.Run_asm #endif (p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), @@ -303,11 +303,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer { for(index_t x = 0; x < X; ++x) { -#if 0 +#if 1 blockwise_gemm.Run #elif 0 blockwise_gemm.Run_RegisterDoubleBuffer -#elif 1 +#elif 0 blockwise_gemm.Run_asm #endif (p_wei_block_double + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), @@ -328,11 +328,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer { for(index_t x = 0; x < X; ++x) { -#if 0 +#if 1 blockwise_gemm.Run #elif 0 blockwise_gemm.Run_RegisterDoubleBuffer -#elif 1 +#elif 0 blockwise_gemm.Run_asm #endif (p_wei_block_double + wei_block_space + @@ -350,9 +350,8 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer const index_t k_thread_data_begin = k_block_data_begin + c_thread_mtx_begin.row; const index_t b_thread_data_begin = b_block_data_begin + c_thread_mtx_begin.col; -#if 1 if(Y == 1 && X == 1) - { // pure 1x1 conv + { // pure 1x1 conv (non padding, 1x1 stride) constexpr index_t K2_ = GemmMPerThreadSubC; constexpr index_t K1_ = KPerBlock / KPerThread; constexpr index_t B2_ = GemmNPerThreadSubC; @@ -376,7 +375,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer Number{}); } else -#endif { for(index_t k = 0; k < out_kb_thread_desc.GetLength(I0); ++k) {