From a68b16a5d9c03951cf52c59ea65ccab15f6f581b Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 6 Jun 2019 19:49:40 -0500 Subject: [PATCH] refactor --- driver/driver.hip.cpp | 4 +-- .../blockwise_generic_tensor_slice_op.hip.hpp | 29 +++---------------- 2 files changed, 6 insertions(+), 27 deletions(-) diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index db13ffbba8..40cd4fdd3f 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -646,9 +646,9 @@ int main(int argc, char* argv[]) device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw #elif 0 device_convolution_implicit_gemm_v2_chwn_cyxk_khwn -#elif 1 - device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw #elif 0 + device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw +#elif 1 device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw #endif (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); diff --git a/src/include/blockwise_generic_tensor_slice_op.hip.hpp b/src/include/blockwise_generic_tensor_slice_op.hip.hpp index ef49f7a33e..70b35fa00f 100644 --- a/src/include/blockwise_generic_tensor_slice_op.hip.hpp +++ b/src/include/blockwise_generic_tensor_slice_op.hip.hpp @@ -203,7 +203,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths); static_ford{}([&](auto repeat_multi_id_) { -#if 1 +#if 0 constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){}); const auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims; @@ -215,7 +215,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 const index_t clipboard_offset = thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin); -#else +#else // HIP compiler performs better with these codes constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; constexpr auto src_thread_data_multi_id_begin = @@ -256,7 +256,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths); static_ford{}([&](auto repeat_multi_id_) { -#if 1 +#if 0 constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){}); const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; @@ -267,7 +267,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin); const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin); -#else +#else // HIP compiler performs better with these codes constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; constexpr auto clipboard_data_multi_id_begin = @@ -338,27 +338,6 @@ struct BlockwiseGenericTensorSliceCopy_v1 src_partial_original_desc.UpdateMultiIndexGivenStepSizeOf1dIndex( old_src_partial_original_multi_id, StepSize, direction); -#if 0 - { - if(debug_flag && get_block_1d_id() == 0) - { - printf("id %5u %5u: " - "old_src_partial_original_multi_id %u %u %u, " - "new_src_partial_original_multi_id %u %u %u, " - "mThreadSrcOffset %u, mThreadDstOffset %u \n", - get_block_1d_id(), - get_thread_local_1d_id(), - old_src_partial_original_multi_id[0], - old_src_partial_original_multi_id[1], - old_src_partial_original_multi_id[2], - new_src_partial_original_multi_id[0], - new_src_partial_original_multi_id[1], - new_src_partial_original_multi_id[2] - ); - } - } -#endif - // update "mThreadSrcOriginalMultiId" static_for<0, decltype(src_partial_original_dims)::GetSize(), 1>{}([&](auto I_) { constexpr auto I = decltype(I_){};