From 0f52c4c0e4c512c728261c05c5e2a367283bd18f Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Thu, 26 Sep 2019 00:00:25 -0500
Subject: [PATCH] added type conversion in threadwise and blockwise copy

---
 ...chw_kcyx_nkhw_padded_lds_double_buffer.hpp |  3 +-
 .../blockwise_generic_tensor_slice_copy.hpp   |  4 +-
 .../threadwise_generic_tensor_slice_copy.hpp  | 84 ++++++++++++-------
 3 files changed, 58 insertions(+), 33 deletions(-)

diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
index 7f66251156..24dfddda16 100644
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
@@ -437,9 +437,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
                                       Float,
                                       address_space_t::generic,
                                       address_space_t::global>
-#elif 1
+#else
+                // tweaking
                 .template Run_optimized_dst_address_calculation
 #endif
                 (p_out_thread, p_out_global);
diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
index 825d9d021d..c7005515a1 100644
--- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
@@ -772,7 +772,7 @@ struct BlockwiseGenericTensorSliceCopy_v4
                                              BlockSrcAddressSpace,
                                              ThreadBufferAddressSpace>(p_block_src, p_thread_buffer);
-#else
+#else // tweaking
         mThreadwiseLoad.template Run_optimized_src_address_calculation
                                              (p_thread_buffer, p_block_dst);
-#else
+#else // tweaking
         mThreadwiseStore.template Run_optimized_dst_address_calculation
diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
-    __device__ void Run_optimized_src_address_calculation(const TData* p_src, TData* p_dst) const
+    __device__ void Run_optimized_src_address_calculation(const SrcData* p_src,
+                                                          DstData* p_dst) const
     {
-        using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
-        using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
+        using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
+        using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;
 
         constexpr auto vector_access_dim = Number<VectorAccessDim>{};
@@ -1327,9 +1331,10 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
         constexpr auto src_linear_dim_mask = SrcDesc::GetLinearDimensionMask();
         constexpr auto src_nonlinear_dim_mask = SrcDesc::GetNonLinearDimensionMask();
 
-        static_assert(
-            src_linear_dim_mask.At(VectorAccessDim) || long_vector_size == SrcDataPerAccess,
-            "Warning! VectorAccessDim is not SrcDesc's linear dimension, performance would drop");
+        static_assert(src_linear_dim_mask.At(VectorAccessDim) ||
+                          long_vector_size == SrcDataPerAccess,
+                      "Warning! VectorAccessDim is not SrcDesc's linear dimension, performance "
+                      "would drop");
 
         // separate steps into linear and non-linear components, accoording to src tensor
         constexpr auto linear_long_vector_access_lengths =
@@ -1361,12 +1366,12 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 long_vector_size * linear_dim_long_vector_access_id[vector_access_dim];
 
             // buffer to hold a long-vector
-            TData p_long_vector[long_vector_size];
+            SrcData p_src_long_vector[long_vector_size];
 
             // zero out buffer
             for(index_t i = 0; i < long_vector_size; ++i)
            {
-                p_long_vector[i] = 0;
+                p_src_long_vector[i] = 0;
            }
 
             // Loop over VectorAccessDim, and load data from src to the
@@ -1399,22 +1404,30 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 {
                     static_if<SrcAddressSpace == address_space_t::global>{}([&](auto) {
 #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
-                        *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
-                            __buffer_load<TData, SrcDataPerAccess>(
+                        *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
+                            __buffer_load<SrcData, SrcDataPerAccess>(
                                 p_src, src_nonlinear_coord.GetOffset(), src_linear_offset);
 #else
-                        *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
+                        *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
                             *reinterpret_cast<const src_vector_t*>(
                                 &p_src[src_nonlinear_coord.GetOffset() + src_linear_offset]);
 #endif
                     }).Else([&](auto) {
-                        *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
+                        *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
                             *reinterpret_cast<const src_vector_t*>(
                                 &p_src[src_nonlinear_coord.GetOffset() + src_linear_offset]);
                     });
                 }
             }
 
+            // SrcData to DstData conversion
+            DstData p_dst_long_vector[long_vector_size];
+
+            for(index_t i = 0; i < long_vector_size; ++i)
+            {
+                p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
+            }
+
             // store data from the long-vector buffer to dst
             for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
             {
@@ -1434,7 +1447,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 if(dst_coord.IsUpperIndexMappedToValidOffset())
                 {
                     *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
-                        *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
+                        *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
                 }
             }
         });
@@ -1447,13 +1460,15 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
     // Will do padding check on src data: Read 0 if src data is in padding area.
     // Will do padding check on dst data: No write if dst data is in paddin area.
     // This version is optimized for address calculation of dst tensor
-    template
-    __device__ void Run_optimized_dst_address_calculation(const TData* p_src, TData* p_dst) const
+    __device__ void Run_optimized_dst_address_calculation(const SrcData* p_src,
+                                                          DstData* p_dst) const
     {
-        using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
-        using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
+        using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
+        using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;
 
         constexpr auto vector_access_dim = Number<VectorAccessDim>{};
@@ -1469,9 +1484,10 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
         constexpr auto dst_linear_dim_mask = DstDesc::GetLinearDimensionMask();
         constexpr auto dst_nonlinear_dim_mask = DstDesc::GetNonLinearDimensionMask();
 
-        static_assert(
-            dst_linear_dim_mask.At(VectorAccessDim) || long_vector_size == DstDataPerAccess,
-            "Warning! VectorAccessDim is not DstDesc's linear dimension, performance would drop");
+        static_assert(dst_linear_dim_mask.At(VectorAccessDim) ||
+                          long_vector_size == DstDataPerAccess,
+                      "Warning! VectorAccessDim is not DstDesc's linear dimension, performance "
+                      "would drop");
 
         // separate steps into linear and non-linear components, accoording to dst tensor
         constexpr auto linear_long_vector_access_lengths =
@@ -1503,12 +1519,12 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 long_vector_size * linear_dim_long_vector_access_id[vector_access_dim];
 
             // buffer to hold a long-vector
-            TData p_long_vector[long_vector_size];
+            SrcData p_src_long_vector[long_vector_size];
 
             // zero out buffer
             for(index_t i = 0; i < long_vector_size; ++i)
            {
-                p_long_vector[i] = 0;
+                p_src_long_vector[i] = 0;
            }
 
             // Loop over VectorAccessDim, and load data from src to the
@@ -1535,11 +1551,19 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 // the src vector has the same padding situation
                 if(src_coord.IsUpperIndexMappedToValidOffset())
                 {
-                    *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
+                    *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
                         *reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
                 }
             }
 
+            // SrcData to DstData conversion
+            DstData p_dst_long_vector[long_vector_size];
+
+            for(index_t i = 0; i < long_vector_size; ++i)
+            {
+                p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
+            }
+
             // store data from the long-vector buffer to dst
             for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
             {
@@ -1564,20 +1588,20 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 {
                     static_if<DstAddressSpace == address_space_t::global>{}([&](auto) {
 #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
-                        __buffer_store<TData, DstDataPerAccess>(
-                            *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]),
+                        __buffer_store<DstData, DstDataPerAccess>(
+                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
                             p_dst,
                             dst_nonlinear_coord.GetOffset(),
                             dst_linear_offset);
 #else
                         *reinterpret_cast<dst_vector_t*>(
                             &p_dst[dst_nonlinear_coord.GetOffset() + dst_linear_offset]) =
-                            *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
+                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
 #endif
                     }).Else([&](auto) {
                         *reinterpret_cast<dst_vector_t*>(
                             &p_dst[dst_nonlinear_coord.GetOffset() + dst_linear_offset]) =
-                            *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
+                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
                     });
                 }
             }
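
Note on the pattern this patch introduces: the copy no longer stages data in a single TData long-vector. It reads into a SrcData staging buffer, converts element by element, and only then writes DstData out. Below is a minimal host-side C++ sketch of that stage-then-convert-then-store sequence. The type_convert functor here is a simplified stand-in (a plain static_cast); the buffer names mirror the patch, while copy_long_vector and the example types are hypothetical and not part of the library.

// Minimal sketch of the staging-and-convert pattern (assumed simplification
// of the patch; type_convert here is just a static_cast stand-in).
#include <cstdio>

template <typename DstData>
struct type_convert
{
    template <typename SrcData>
    DstData operator()(SrcData v) const
    {
        return static_cast<DstData>(v);
    }
};

template <typename SrcData, typename DstData, int long_vector_size>
void copy_long_vector(const SrcData* p_src, DstData* p_dst)
{
    // stage the source elements in a SrcData buffer (p_src_long_vector in the patch)
    SrcData p_src_long_vector[long_vector_size];
    for(int i = 0; i < long_vector_size; ++i)
        p_src_long_vector[i] = p_src[i];

    // SrcData to DstData conversion before the store, as in the patch
    DstData p_dst_long_vector[long_vector_size];
    for(int i = 0; i < long_vector_size; ++i)
        p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);

    // write the already-converted buffer to the destination
    for(int i = 0; i < long_vector_size; ++i)
        p_dst[i] = p_dst_long_vector[i];
}

int main()
{
    const float src[4] = {0.5f, 1.5f, 2.5f, 3.5f};
    int dst[4]         = {};
    copy_long_vector<float, int, 4>(src, dst);
    for(int v : dst)
        std::printf("%d ", v); // prints: 0 1 2 3
    std::printf("\n");
}

Compiled as ordinary C++ this prints 0 1 2 3: the narrowing happens in the explicit conversion step rather than implicitly at the final store, which is what the patch's split into p_src_long_vector and p_dst_long_vector makes explicit in the threadwise copy.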