From 8669e242ad424187bd3818128f8570e359c66903 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Mon, 15 Jul 2019 22:00:48 -0500 Subject: [PATCH] debugging --- ..._v4r3_nchw_kcyx_nkhw_lds_double_buffer.hpp | 35 +++++++++++++++---- ...tion_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp | 8 ++--- driver/src/driver.cpp | 2 +- 3 files changed, 33 insertions(+), 12 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer.hpp index af6d89bcf2..5b55ed46a5 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -100,7 +100,8 @@ struct GridwiseConvolutionImplicitGemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer constexpr index_t B = N0 * Ho0 * Wo0; - static_assert(N == N0 * N1 * N2 && Ho == Ho * Ho1 * Ho2 && Wo == Wo0 * Wo1 * Wo2, "wrong!"); + static_assert(N == N0 * N1 * N2 && Ho == Ho0 * Ho1 * Ho2 && Wo == Wo0 * Wo1 * Wo2, + "wrong!"); static_assert((X == 1 || ConvDilationW % InBlockCopyDataPerAccess_W2 == 0), "wrong! aligment requirement for vectorized global load of input tensor will " @@ -179,12 +180,6 @@ struct GridwiseConvolutionImplicitGemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer InBlockCopyDataPerAccess_W2>({0, 0, 0, 0, b_block_data_on_global, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0}); -#if 0 - { - printf("id (%d %d), in offset: %d %d\n", get_block_1d_id(), get_thread_local_1d_id(), blockwise_in_copy.mThreadSrcOffset, blockwise_in_copy.mThreadDstOffset); - } -#endif - // weight tensor // tensor descriptor in device memory, src of blockwise copy constexpr auto wei_e_k_global_desc = @@ -214,6 +209,19 @@ struct GridwiseConvolutionImplicitGemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer WeiBlockCopyDstDataPerWrite_K>( {0, k_block_data_on_global}, {0, 0}); +#if 0 + if(get_block_1d_id() == 0) + { + printf("id (%d %d), in offset: %d %d, wei offset %d %d\n", + get_block_1d_id(), + get_thread_local_1d_id(), + blockwise_in_copy.mThreadSrcOffset, + blockwise_in_copy.mThreadDstOffset, + blockwise_wei_copy.mThreadSrcOffset, + blockwise_wei_copy.mThreadDstOffset); + } +#endif + // GEMM definition // c_mtx += transpose(a_mtx) * b_mtx // a_mtx[EPerBlock, KPerBlock] is in LDS @@ -324,6 +332,19 @@ struct GridwiseConvolutionImplicitGemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_block_on_global, p_wei_register_clipboard); +#if 1 + if(get_block_1d_id() == 0) + { + printf("tid (%d %d), %f %f %f %f\n", + get_block_1d_id(), + get_thread_local_1d_id(), + p_wei_register_clipboard[0], + p_wei_register_clipboard[1], + p_wei_register_clipboard[2], + p_wei_register_clipboard[3]); + } +#endif + // LDS double buffer: GEMM on current data blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread); diff --git a/driver/include/device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp index 6e9d240d02..5669ea7243 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp @@ -90,14 +90,14 @@ void device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw(InDesc, constexpr index_t InBlockCopyDataPerAccess_W2 = 4; - using WeiBlockCopySubLengths_E_K = Sequence<4, 1>; - using WeiBlockCopyClusterLengths_E_K = Sequence<2, 128>; + using WeiBlockCopySubLengths_E_K = Sequence<2, 2>; + using WeiBlockCopyClusterLengths_E_K = Sequence<4, 64>; using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E] using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E] using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K] - constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; - constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; + constexpr index_t WeiBlockCopySrcDataPerRead_E = 1; + constexpr index_t WeiBlockCopyDstDataPerWrite_K = 2; #endif constexpr index_t N0 = N / (N1 * N2); diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 3e0158bb42..125adf6b83 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -491,7 +491,7 @@ int main(int argc, char* argv[]) if(do_verification) { -#if 0 +#if 1 in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread); wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread); #elif 0