From 00089cd6e579d159bf452334e54510a521327cf8 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 26 Sep 2019 21:39:28 -0500 Subject: [PATCH] clean up --- ..._v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp | 10 +- ...chw_kcyx_nkhw_padded_lds_double_buffer.hpp | 5 +- ..._v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp | 10 +- ...chw_kcyx_nkhw_padded_lds_double_buffer.hpp | 5 +- .../blockwise_generic_tensor_slice_copy.hpp | 8 +- .../threadwise_generic_tensor_slice_copy.hpp | 8 +- ...e_generic_tensor_slice_copy_deprecated.hpp | 231 +----------------- driver/src/driver.cpp | 4 +- 8 files changed, 18 insertions(+), 263 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp index c7375766da..f2ab9cad2d 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -396,14 +396,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer 0, b_thread_data_on_global, 0}) -#if 0 - .Run -#else // tweaking - .template Run_optimized_address_calculation -#endif - (p_out_thread, p_out_global); + .template Run( + p_out_thread, p_out_global); } } }; diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp index 917dca9e9c..faf8764507 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp @@ -427,10 +427,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf b_thread_data_on_global, 0}) #if 1 - .template Run_generic + .template Run #else // tweaking .template Run_optimized_dst_address_calculation -#endif - (p_out_thread, p_out_global); + .template Run( + p_out_thread, p_out_global); threadwise_out_copy.MoveSrcSliceWindow(Sequence<0, 0, GemmNPerThreadSubC>{}, True); threadwise_out_copy.MoveDstSliceWindow(Sequence<0, 0, B1>{}, True); diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp index c0022462c6..bee553f62c 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp @@ -390,10 +390,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf b_thread_data_on_global / B1, b_thread_data_on_global % B1}) #if 1 - .template Run_generic + .template Run #else // tweaking .template Run_optimized_dst_address_calculation(p_block_src, @@ -95,8 +95,8 @@ struct BlockwiseGenericTensorSliceCopy_v4 __device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer, BlockDstData* p_block_dst) const { -#if 1 - mThreadwiseStore.template Run_generic(p_thread_buffer, p_block_dst); diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index 931210c558..aaf85435e6 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -78,7 +78,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 typename DstData, address_space_t SrcAddressSpace = address_space_t::generic, address_space_t DstAddressSpace = address_space_t::generic> - __device__ void Run_generic(const SrcData* p_src, DstData* p_dst) const + __device__ void Run(const SrcData* p_src, DstData* p_dst) const { using src_vector_t = typename vector_type::MemoryType; using dst_vector_t = typename vector_type::MemoryType; @@ -130,7 +130,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE *reinterpret_cast(&p_src_long_vector[buffer_offset]) = __buffer_load( - p_src, src_coord.GetOffset(), 0); + p_src, 0, src_coord.GetOffset()); #else *reinterpret_cast(&p_src_long_vector[buffer_offset]) = *reinterpret_cast(&p_src[src_coord.GetOffset()]); @@ -172,8 +172,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 __buffer_store( *reinterpret_cast(&p_dst_long_vector[buffer_offset]), p_dst, - dst_coord.GetOffset(), - 0); + 0, + dst_coord.GetOffset()); #else *reinterpret_cast(&p_dst[dst_coord.GetOffset()]) = *reinterpret_cast(&p_dst_long_vector[buffer_offset]); diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp index f942422cf0..f5d0c9734d 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp @@ -538,235 +538,10 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 } }; - template - __device__ void Run(const TData* p_src, TData* p_dst) const - { - constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{}); - - TData p_buffer_[buffer_desc.GetElementSpace()]; - TData* p_buffer = p_buffer_; - - // copy data from src into buffer - { - using src_vector_t = typename vector_type::MemoryType; - - constexpr auto src_vector_access_dim = Number{}; - constexpr auto src_data_per_access = Number{}; - - constexpr auto src_access_lengths = SliceLengths::Modify( - src_vector_access_dim, - SliceLengths::Get(src_vector_access_dim) / src_data_per_access); - - // Offset w.r.t merged dimensions need to be calculated at run-time. Offset w.r.t - // normal dimensions is known at compile time. - // Below is a hack to isolate merged dimension id from normal dimension id, so the - // corresponding offset can be calculated seperately at run-time and compile-time. - // src_merged_dim_access_lengths has the same value as src_access_lengths on src's - // merged dimensions, and has value = 1 on normal dimensions; - // src_merged_dim_access_lengths has the same value as src_access_lengths on src's - // normal dimensions, and has value = 1 on merged dimensions; - constexpr auto src_merged_dim_access_lengths = typename sequence_gen< - nDim, - IsolateMergedDimLengths>::type{}; - - constexpr auto src_normal_dim_access_lengths = - src_access_lengths + Number<1>{} - src_merged_dim_access_lengths; - -#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 - // offset w.r.t. merged dimension need to be computed at run-time - static_ford{}([&]( - auto src_merged_dim_access_id_) { - - constexpr auto src_merged_dim_access_id = decltype(src_merged_dim_access_id_){}; - - constexpr auto src_merged_dim_data_id = src_merged_dim_access_id.Modify( - src_vector_access_dim, - src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access); - - const TData* p_src_tmp = - p_src + (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset(); - - // offset w.r.t. normal dimension can be computed at compile-time - static_ford{}([&]( - auto src_normal_dim_access_id_) { - - constexpr auto src_normal_dim_access_id = decltype(src_normal_dim_access_id_){}; - - constexpr auto src_normal_dim_data_id = src_normal_dim_access_id.Modify( - src_vector_access_dim, - src_normal_dim_access_id[src_vector_access_dim] * src_data_per_access); - - constexpr index_t src_normal_offset = - SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id); - - // load vector from src - const src_vector_t vector_data = - *reinterpret_cast(&p_src_tmp[src_normal_offset]); - - // unpack vector into buffer - static_for<0, SrcDataPerAccess, 1>{}([&](auto i) { - constexpr auto scalar_id = - typename uniform_sequence_gen::type{}.Modify( - src_vector_access_dim, i); - - constexpr index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex( - src_merged_dim_data_id + src_normal_dim_data_id + scalar_id); - - p_buffer[buffer_offset] = reinterpret_cast(&vector_data)[i]; - }); - }); - }); -#else - ford{}([&]( - auto src_merged_dim_access_id) { - - auto src_merged_dim_data_id = src_merged_dim_access_id; - src_merged_dim_data_id(src_vector_access_dim) = - src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access; - - const TData* p_src_tmp = - p_src + (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset(); - - // these should be compile-time known - ford{}([&]( - auto src_normal_dim_access_id) { - - auto src_normal_dim_data_id = src_normal_dim_access_id; - src_normal_dim_data_id(src_vector_access_dim) = - src_normal_dim_access_id[src_vector_access_dim] * src_data_per_access; - - const index_t src_normal_offset = - SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id); - - // load vector from src - const src_vector_t vector_data = - *reinterpret_cast(&p_src_tmp[src_normal_offset]); - - // unpack vector into buffer - for(index_t i = 0; i < SrcDataPerAccess; ++i) - { - auto scalar_id = make_zero_array(); - scalar_id(src_vector_access_dim) = i; - - const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex( - src_merged_dim_data_id + src_normal_dim_data_id + scalar_id); - - p_buffer[buffer_offset] = reinterpret_cast(&vector_data)[i]; - } - }); - }); -#endif - } - - // copy data from buffer into dst - { - using dst_vector_t = typename vector_type::MemoryType; - - constexpr auto dst_vector_access_dim = Number{}; - constexpr auto dst_data_per_access = Number{}; - - constexpr auto dst_access_lengths = SliceLengths::Modify( - dst_vector_access_dim, - SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access); - - constexpr auto dst_merged_dim_access_lengths = typename sequence_gen< - nDim, - IsolateMergedDimLengths>::type{}; - - constexpr auto dst_normal_dim_access_lengths = - dst_access_lengths + Number<1>{} - dst_merged_dim_access_lengths; - -#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 - // offset w.r.t. merged dimension need to be computed at run-time - static_ford{}([&]( - auto dst_merged_dim_access_id_) { - - constexpr auto dst_merged_dim_access_id = decltype(dst_merged_dim_access_id_){}; - - constexpr auto dst_merged_dim_data_id = dst_merged_dim_access_id.Modify( - dst_vector_access_dim, - dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access); - - TData* p_dst_tmp = p_dst + (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset(); - - // offset w.r.t. normal dimension can be computed at compile-time - static_ford{}([&]( - auto dst_normal_dim_access_id_) { - constexpr auto dst_normal_dim_access_id = decltype(dst_normal_dim_access_id_){}; - - constexpr auto dst_normal_dim_data_id = dst_normal_dim_access_id.Modify( - dst_vector_access_dim, - dst_normal_dim_access_id[dst_vector_access_dim] * dst_data_per_access); - - dst_vector_t vector_data; - - // pack vector from buffer - static_for<0, DstDataPerAccess, 1>{}([&](auto i) { - constexpr auto scalar_id = - typename uniform_sequence_gen::type{}.Modify( - dst_vector_access_dim, i); - - constexpr index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex( - dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id); - - reinterpret_cast(&vector_data)[i] = p_buffer[buffer_offset]; - }); - - constexpr index_t dst_normal_offset = - DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id); - - // write vector into dst - *reinterpret_cast(&p_dst_tmp[dst_normal_offset]) = vector_data; - }); - }); -#else - // offset w.r.t. merged dimension need to be computed at run-time - ford{}([&]( - auto dst_merged_dim_access_id) { - - auto dst_merged_dim_data_id = dst_merged_dim_access_id; - dst_merged_dim_data_id(dst_vector_access_dim) = - dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access; - - TData* p_dst_tmp = p_dst + (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset(); - - // offset w.r.t. normal dimension can be computed at compile-time - ford{}([&]( - auto dst_normal_dim_access_id) { - - auto dst_normal_dim_data_id = dst_normal_dim_access_id; - dst_normal_dim_data_id(dst_vector_access_dim) = - dst_normal_dim_access_id[dst_vector_access_dim] * dst_data_per_access; - - dst_vector_t vector_data; - - // pack vector from buffer - for(index_t i = 0; i < DstDataPerAccess; ++i) - { - auto scalar_id = make_zero_array(); - scalar_id(dst_vector_access_dim) = i; - - const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex( - dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id); - - reinterpret_cast(&vector_data)[i] = p_buffer[buffer_offset]; - } - - const index_t dst_normal_offset = - DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id); - - // write vector into dst - *reinterpret_cast(&p_dst_tmp[dst_normal_offset]) = vector_data; - }); - }); -#endif - } - } - template - __device__ void Run_optimized_address_calculation(const TData* p_src, TData* p_dst) const + __device__ void Run(const TData* p_src, TData* p_dst) const { constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{}); @@ -841,9 +616,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 static_if{}([&](auto) { #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE vector_data = __buffer_load( - p_src, - static_cast(src_merged_offset), - static_cast(src_normal_offset)); + p_src, src_merged_offset, src_normal_offset); #else vector_data = *reinterpret_cast( &p_src[src_normal_offset + src_merged_offset]); diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 55d8471f50..4bcad1e92c 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -448,7 +448,7 @@ int main(int argc, char* argv[]) ConvStrides{}, ConvDilations{}, nrepeat); -#elif 0 +#elif 1 device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(in_nchw_desc, in_nchw, wei_kcyx_desc, @@ -490,7 +490,7 @@ int main(int argc, char* argv[]) ConvStrides{}, ConvDilations{}, nrepeat); -#elif 1 +#elif 0 device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(in_nchw_desc, in_nchw, wei_kcyx_desc,