From 0983d205ad16eceaaae8c38527e4712882c2c5af Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Fri, 5 Apr 2019 18:54:26 -0500 Subject: [PATCH] debugging --- ...icit_gemm_convolution_2_chwn_cyxk_khwn.hpp | 5 +- driver/driver.hip.cpp | 4 +- ...on_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp | 56 ++++--------- ...2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp | 81 +++++++++---------- src/include/threadwise_gemm.hip.hpp | 25 +----- 5 files changed, 62 insertions(+), 109 deletions(-) diff --git a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp index 0612052e00..e98cc11350 100644 --- a/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp +++ b/driver/device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp @@ -191,7 +191,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, constexpr index_t WeiBlockCopyDataPerRead = 4; constexpr index_t BlockSize = 256; -#elif 0 +#elif 1 // 1x1, 14x14, Vega 20, disable lds_double_buffer, enable register double buffer constexpr index_t BPerBlock = 64; constexpr index_t KPerBlock = 128; @@ -208,9 +208,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc, constexpr index_t GemmNLevel1Cluster = 4; constexpr index_t GemmKPerThreadLoop = 1; - constexpr index_t GemmThreadPerColumnPerCluster = 8; - constexpr index_t GemmThreadPerRowPerCluster = 8; - constexpr index_t InBlockCopyThreadPerDim0 = 4; constexpr index_t InBlockCopyThreadPerDim1 = 16; diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index 1607a9e802..a83e4082c7 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -580,7 +580,7 @@ int main(int argc, char* argv[]) constexpr index_t HPad = 0; constexpr index_t WPad = 0; -#elif 1 +#elif 0 // 1x1 filter, 14x14 image, C = 2048 constexpr index_t N = 128; constexpr index_t C = 2048; @@ -592,7 +592,7 @@ int main(int argc, char* argv[]) constexpr index_t HPad = 0; constexpr index_t WPad = 0; -#elif 0 +#elif 1 // 1x1 filter, 14x14 image, C = 512 constexpr index_t N = 128; constexpr index_t C = 512; diff --git a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp index dfde97d39a..48b8298cd3 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp @@ -19,8 +19,6 @@ template {}); -#if 0 - if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - print_ConstantTensorDescriptor(in_chwn_global_desc, "in_chwn_global_desc"); - print_ConstantTensorDescriptor(wei_cyxk_global_desc, "wei_cyxk_global_desc"); - print_ConstantTensorDescriptor(out_khwn_global_desc, "out_khwn_global_desc"); - - print_ConstantTensorDescriptor(in_cb_global_desc, "in_cb_global_desc"); - print_ConstantTensorDescriptor(wei_ek_global_desc, "wei_ek_global_desc"); - - print_ConstantTensorDescriptor(in_cb_block_desc, "in_cb_block_desc"); - print_ConstantTensorDescriptor(wei_cyxk_block_desc, "wei_cyxk_block_desc"); - print_ConstantTensorDescriptor(wei_ek_block_desc, "wei_ek_block_desc"); - print_ConstantTensorDescriptor(out_kb_thread_desc, "out_kb_thread_desc"); - - printf("KPerBlock %u\n", KPerBlock); - } -#endif - // blockwise in copy // formmat is [CPerBlock,BPerBlock + BGhostRead] #if 0 - const auto blockwise_in_copy = - Blockwise2dTensorCopy1{}; + const auto blockwise_in_copy = + Blockwise2dTensorCopy1{}; #elif 0 const auto blockwise_in_copy = Blockwise2dTensorCopy2{}; + const auto blockwise_wei_copy = + Blockwise2dTensorCopy1{}; #elif 0 const auto blockwise_wei_copy = Blockwise2dTensorCopy2{}); + constexpr index_t in_block_space = in_cb_block_desc.GetElementSpace(Number{}); - constexpr index_t wei_block_element_space = + constexpr index_t wei_block_space = wei_cyxk_block_desc.GetElementSpace(Number{}); - __shared__ Float p_in_block[in_block_element_space]; - __shared__ Float p_wei_block[wei_block_element_space]; + __shared__ Float p_in_block[in_block_space]; + __shared__ Float p_wei_block[wei_block_space]; const Float* p_in_global_block_offset = p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin); @@ -229,7 +207,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn __syncthreads()) { // load data -#if 0 +#if 1 blockwise_in_copy.Run(p_in_global_block_offset, p_in_block); blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block); #elif 0 diff --git a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp index 0f0d72864f..b919036df8 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp @@ -67,6 +67,8 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer constexpr index_t B = N * Hi * Wi; constexpr index_t BGhostRead = (Y - 1) * Wi + (X - 1); + static_assert(C % (2 * CPerBlock) == 0, "C cannot be evenly divided"); + // divide block work by 2d: [K, B] constexpr index_t KBlockWork = (K + KPerBlock - 1) / KPerBlock; constexpr index_t BBlockWork = (B + BPerBlock - 1) / BPerBlock; @@ -184,15 +186,14 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer constexpr index_t max_align = mod_conv::max(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead); - constexpr index_t in_block_element_space = - in_cb_block_desc.GetElementSpace(Number{}); + constexpr index_t in_block_space = in_cb_block_desc.GetElementSpace(Number{}); - constexpr index_t wei_block_element_space = + constexpr index_t wei_block_space = wei_cyxk_block_desc.GetElementSpace(Number{}); // LDS double buffer - __shared__ Float p_in_block_double[2 * in_block_element_space]; - __shared__ Float p_wei_block_double[2 * wei_block_element_space]; + __shared__ Float p_in_block_double[2 * in_block_space]; + __shared__ Float p_wei_block_double[2 * wei_block_space]; const Float* p_in_global_block_offset = p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin); @@ -202,10 +203,10 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer // preload data into LDS { -#if 0 +#if 1 blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_double); blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_double); -#elif 1 +#elif 0 Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; @@ -237,22 +238,22 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer const bool even_loop = (iloop % 2 == 0); Float* p_in_block_now = - even_loop ? p_in_block_double : p_in_block_double + in_block_element_space; + even_loop ? p_in_block_double : p_in_block_double + in_block_space; Float* p_wei_block_now = - even_loop ? p_wei_block_double : p_wei_block_double + wei_block_element_space; + even_loop ? p_wei_block_double : p_wei_block_double + wei_block_space; Float* p_in_block_next = - even_loop ? p_in_block_double + in_block_element_space : p_in_block_double; + even_loop ? p_in_block_double + in_block_space : p_in_block_double; Float* p_wei_block_next = - even_loop ? p_wei_block_double + wei_block_element_space : p_wei_block_double; - - p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0); - p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0); + even_loop ? p_wei_block_double + wei_block_space : p_wei_block_double; // load next data Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; + p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0); + p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0); + __syncthreads(); blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset, @@ -267,25 +268,25 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer { for(index_t x = 0; x < X; ++x) { -#if 0 +#if 1 blockwise_gemm.Run #elif 0 blockwise_gemm.Run_RegisterDoubleBuffer #elif 1 blockwise_gemm.Run_asm #endif - (p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), - p_in_block_now + y * Wi + x, - p_out_thread); + (p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), + p_in_block_now + y * Wi + x, + p_out_thread); } } -#if 0 +#if 1 blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_next); blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, p_wei_block_next); -#elif 1 +#elif 0 // if work with RunLoadRegisterClipboard_asm, need to wait vmcnt(0); @@ -298,7 +299,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer } // tail - if(C % 2 == 0) { // even p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0); @@ -319,34 +319,34 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer { for(index_t x = 0; x < X; ++x) { -#if 0 +#if 1 blockwise_gemm.Run #elif 0 blockwise_gemm.Run_RegisterDoubleBuffer #elif 1 blockwise_gemm.Run_asm #endif - (p_wei_block_double + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), - p_in_block_double + y * Wi + x, - p_out_thread); + (p_wei_block_double + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), + p_in_block_double + y * Wi + x, + p_out_thread); } } -#if 0 +#if 1 blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, - p_in_block_double + in_block_element_space); + p_in_block_double + in_block_space); - blockwise_wei_copy.RunStoreRegisterClipboard( - p_wei_register_clipboard, p_wei_block_double + wei_block_element_space); + blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, + p_wei_block_double + wei_block_space); #else // if work with RunLoadRegisterClipboard_asm, need to wait vmcnt(0); - blockwise_in_copy.RunStoreRegisterClipboard_asm( - p_in_register_clipboard, p_in_block_double + in_block_element_space); + blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard, + p_in_block_double + in_block_space); - blockwise_wei_copy.RunStoreRegisterClipboard_asm( - p_wei_register_clipboard, p_wei_block_double + wei_block_element_space); + blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard, + p_wei_block_double + wei_block_space); #endif // odd @@ -356,25 +356,20 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer { for(index_t x = 0; x < X; ++x) { -#if 0 +#if 1 blockwise_gemm.Run #elif 0 blockwise_gemm.Run_RegisterDoubleBuffer #elif 1 blockwise_gemm.Run_asm #endif - (p_wei_block_double + in_block_element_space + - wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), - p_in_block_double + wei_block_element_space + y * Wi + x, - p_out_thread); + (p_wei_block_double + in_block_space + + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), + p_in_block_double + wei_block_space + y * Wi + x, + p_out_thread); } } } - else - { - // not implemented - assert(false); - } // output: register to global mem, const auto c_thread_mtx_begin = diff --git a/src/include/threadwise_gemm.hip.hpp b/src/include/threadwise_gemm.hip.hpp index 0797fe0b8c..79fc1bf699 100644 --- a/src/include/threadwise_gemm.hip.hpp +++ b/src/include/threadwise_gemm.hip.hpp @@ -14,29 +14,12 @@ __device__ void threadwise_matrix_copy(SrcMatrix, for(index_t i = 0; i < NRow; ++i) { - // optimize for vector-4 load - if(NCol % 4 == 0) + for(index_t j = 0; j < NCol; ++j) { - using vector_t = typename vector_type::MemoryType; + const index_t src_index = src_mtx.Get1dIndex(i, j); + const index_t dst_index = dst_mtx.Get1dIndex(i, j); - for(index_t j = 0; j < NCol / 4; ++j) - { - const index_t src_index = src_mtx.Get1dIndex(i, 4 * j); - const index_t dst_index = dst_mtx.Get1dIndex(i, 4 * j); - - *reinterpret_cast(&p_dst[dst_index]) = - *reinterpret_cast(&p_src[src_index]); - } - } - else - { - for(index_t j = 0; j < NCol; ++j) - { - const index_t src_index = src_mtx.Get1dIndex(i, j); - const index_t dst_index = dst_mtx.Get1dIndex(i, j); - - p_dst[dst_index] = p_src[src_index]; - } + p_dst[dst_index] = p_src[src_index]; } } }