From 2a48812edb1a7c3e280159637fa89b7a0bbfb86b Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Tue, 21 May 2019 16:43:56 -0500
Subject: [PATCH] behavior has changed (better and worse), figuring out why

---
 ...e_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp |  4 ++--
 src/include/ConstantTensorDescriptor.hip.hpp         |  7 +++----
 src/include/blockwise_tensor_slice_op.hip.hpp        | 12 ++++++++----
 ...emm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp |  4 ++--
 ...emm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp |  2 +-
 5 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
index 6a005c5dd6..7ab9fd7c75 100644
--- a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
+++ b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
@@ -57,7 +57,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
     wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
     out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
 
-#if 1
+#if 0
     // for 3x3, 34x34, v1r3, Pascal
     constexpr index_t BlockSize = 128;
 
@@ -162,7 +162,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
     constexpr index_t WeiBlockCopyDataPerRead_K = 4;
     constexpr index_t OutThreadCopyDataPerWrite_W = 2;
 
-#elif 0
+#elif 1
     // for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 8
     constexpr index_t BlockSize = 256;
 
diff --git a/src/include/ConstantTensorDescriptor.hip.hpp b/src/include/ConstantTensorDescriptor.hip.hpp
index 880ea5038f..ff422de6e5 100644
--- a/src/include/ConstantTensorDescriptor.hip.hpp
+++ b/src/include/ConstantTensorDescriptor.hip.hpp
@@ -286,10 +286,9 @@ struct ConstantTensorDescriptor
                           "wrong! dimensions to be unfolded need to be packed");
 
             // checkt ranks
-            static_assert(GetMemoryRank(IDim_p1) = GetMemoryRank(IDim) + 1,
-                          "wrong! ranks of dimensions to be "
-                          "unfolded need to be in increasing "
-                          "and continuous ranks");
+            static_assert(GetMemoryRank(IDim_p1) == GetMemoryRank(IDim) + 1,
+                          "wrong! ranks of dimensions to be unfolded need to be in increasing and "
+                          "continuous ranks");
         });
 
         // left and right
diff --git a/src/include/blockwise_tensor_slice_op.hip.hpp b/src/include/blockwise_tensor_slice_op.hip.hpp
index 6a32754a28..0285528648 100644
--- a/src/include/blockwise_tensor_slice_op.hip.hpp
+++ b/src/include/blockwise_tensor_slice_op.hip.hpp
@@ -39,7 +39,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
         constexpr auto thread_cluster_lengths =
             src_cluster_lengths.ReorderGivenNew2Old(map_thread_cluster_2_src_cluster);
 
-        constexpr auto thread_cluster_desc = make_packed_ConstantTensorDescriptor(thread_cluster_lengths);
+        constexpr auto thread_cluster_desc =
+            make_packed_ConstantTensorDescriptor(thread_cluster_lengths);
 
         // sanity check: data type
         static_assert(is_same<Float, float>::value, "wrong! only support float for now!\n");
@@ -147,7 +148,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
 
         constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
 
-        constexpr auto thread_tensor_desc = make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
+        constexpr auto thread_tensor_desc =
+            make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
 
         return thread_tensor_desc.GetElementSpace();
     }
@@ -167,7 +169,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
 
         constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
 
-        constexpr auto thread_tensor_desc = make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
+        constexpr auto thread_tensor_desc =
+            make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
 
         static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
             constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
@@ -204,7 +207,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
 
         constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
 
-        constexpr auto thread_tensor_desc = make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
+        constexpr auto thread_tensor_desc =
+            make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
 
         static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
             constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp
index 7e1f08c42f..f721b92af1 100644
--- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp
@@ -362,8 +362,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
         const index_t n_thread_data_begin = c_thread_mtx_begin.col % NPerBlock;
 
         static_if{}([&](auto fwd) { // fwd do nothing but
-            // perfect forwarding. 
-            // Using this trick to 
+            // perfect forwarding.
+            // Using this trick to
             // make this lambda a generic lambda, so it won't be compiled until
             // instantiated
             static_assert(
diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp
index b8689b9257..8549f30a01 100644
--- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp
@@ -196,7 +196,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
 
         // choose GEMM implementation here
         const auto run_blockwise_batch_gemm = [&](auto... Xs) {
-#if 1
+#if 0
             return blockwise_batch_gemm.Run(Xs...);
 #elif 0
             return blockwise_batch_gemm.Run_asm(Xs...);