diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
index 7fe29324a4..d3a0753444 100644
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
@@ -431,9 +431,9 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
                                                      b_thread_data_on_global,
                                                      0})
 #if 1
-                .template Run_generic<Float, AddressSpace::generic, AddressSpace::global>
+                .template Run_generic<Float, Float, AddressSpace::generic, AddressSpace::global>
 #elif 1
-                .template Run_optimized_dst_address_calculation<Float, AddressSpace::generic, AddressSpace::global>
+                .template Run_optimized_dst_address_calculation<Float, Float, AddressSpace::generic, AddressSpace::global>
 #endif
                 (p_out_thread, p_out_global);
     }
diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
index 2c9e5e2045..97f31ce622 100644
--- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
@@ -734,43 +734,46 @@ struct BlockwiseGenericTensorSliceCopy_v4
         return RegisterBufferDesc::GetElementSpace();
     }

-    template <typename TData>
-    __device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
+    template <typename SrcData, typename BufferData>
+    __device__ void RunLoadRegisterBuffer(const SrcData* p_src, BufferData* p_buffer) const
     {
 #if 1
-        mThreadwiseLoad.template Run_generic<TData>(
+        mThreadwiseLoad.template Run_generic<SrcData, BufferData>(
             p_src, p_buffer);
 #else
-        mThreadwiseLoad.template Run_optimized_src_address_calculation<TData>(
-            p_src, p_buffer);
+        mThreadwiseLoad.template Run_optimized_src_address_calculation<SrcData, BufferData>(
+            p_src, p_buffer);
 #endif
     }

-    template <typename TData>
-    __device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
+    template <typename BufferData, typename DstData>
+    __device__ void RunStoreRegisterBuffer(const BufferData* p_buffer, DstData* p_dst) const
     {
 #if 1
-        mThreadwiseStore.template Run_generic<TData>(
+        mThreadwiseStore.template Run_generic<BufferData, DstData>(
             p_buffer, p_dst);
 #else
-        mThreadwiseStore.template Run_optimized_dst_address_calculation<TData>(p_buffer, p_dst);
+        mThreadwiseStore.template Run_optimized_dst_address_calculation<BufferData, DstData>(
+            p_buffer, p_dst);
 #endif
     }

-    template <typename TData>
-    __device__ void Run(const TData* p_src, TData* p_dst) const
+    template <typename SrcData, typename DstData>
+    __device__ void Run(const SrcData* p_src, DstData* p_dst) const
     {
-        TData p_buffer[GetRegisterBufferSize()];
+        SrcData p_src_buffer[GetRegisterBufferSize()];

-        RunLoadRegisterBuffer(p_src, p_buffer);
-        RunStoreRegisterBuffer(p_buffer, p_dst);
+        RunLoadRegisterBuffer(p_src, p_src_buffer);
+        RunStoreRegisterBuffer(p_src_buffer, p_dst);
     }

     template <typename T, bool PositiveDirection>
diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
index 4094b1f094..8c3e92c712 100644
--- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
@@ -1179,13 +1179,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
     // Will do padding check on src data: Read 0 if src data is in padding area.
     // Will do padding check on dst data: No write if dst data is in padding area.
-    template <typename TData,
-              AddressSpace SrcAddressSpace = AddressSpace::generic,
-              AddressSpace DstAddressSpace = AddressSpace::generic>
-    __device__ void Run_generic(const TData* p_src, TData* p_dst) const
+    template <typename SrcData,
+              typename DstData,
+              AddressSpace SrcAddressSpace = AddressSpace::generic,
+              AddressSpace DstAddressSpace = AddressSpace::generic>
+    __device__ void Run_generic(const SrcData* p_src, DstData* p_dst) const
     {
-        using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
-        using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
+        using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
+        using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;

         constexpr auto vector_access_dim = Number<VectorAccessDim>{};

@@ -1205,13 +1206,13 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
             long_vector_data_begin_id(vector_access_dim) =
                 long_vector_size * long_vector_access_id[vector_access_dim];

-            // buffer to hold a long-vector
-            TData p_long_vector[long_vector_size];
+            // buffer to hold a src long-vector
+            SrcData p_src_long_vector[long_vector_size];

             // zero out buffer
             for(index_t i = 0; i < long_vector_size; ++i)
             {
-                p_long_vector[i] = 0;
+                p_src_long_vector[i] = 0;
             }

             // load data from src to the long-vector buffer
@@ -1231,20 +1232,28 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 {
                     static_if<SrcAddressSpace == AddressSpace::global>{}([&](auto) {
 #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
-                        *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
-                            __buffer_load<TData, SrcDataPerAccess>(p_src, src_coord.GetOffset(), 0);
+                        *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
+                            __buffer_load<SrcData, SrcDataPerAccess>(p_src, src_coord.GetOffset(), 0);
 #else
-                        *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
+                        *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
                             *reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
 #endif
                     }).Else([&](auto) {
                         // src can be all kinds of memory-space.
-                        *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
+                        *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
                             *reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
                     });
                 }
             }

+            // SrcData to DstData conversion
+            DstData p_dst_long_vector[long_vector_size];
+
+            for(index_t i = 0; i < long_vector_size; ++i)
+            {
+                p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);
+            }
+
             // store data from the long-vector buffer to dst
             for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
             {
@@ -1262,19 +1271,19 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 {
                     static_if<DstAddressSpace == AddressSpace::global>{}([&](auto) {
 #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
-                        __buffer_store<TData, DstDataPerAccess>(
-                            *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]),
+                        __buffer_store<DstData, DstDataPerAccess>(
+                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
                             p_dst,
                             dst_coord.GetOffset(),
                             0);
 #else
                         *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
-                            *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
+                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
 #endif
                     }).Else([&](auto) {
                         // dst can be all kinds of memory-space
                         *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
-                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
+                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
                     });
                 }
             }
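The conversion step added above is the core of the patch: data is staged in a SrcData-typed long-vector buffer, converted element-wise with type_convert, and only then written out through a DstData-typed buffer, so the vectorized load path only ever touches src_vector_t and the store path only dst_vector_t. Below is a minimal host-side sketch of that staging pattern, runnable as ordinary C++; the type_convert functor and the float -> int16_t pairing are illustrative stand-ins, not CK's actual device code.

#include <cstdint>
#include <cstdio>

// Stand-in for CK's type_convert functor: per-element cast from SrcData to DstData.
template <typename DstData>
struct type_convert
{
    template <typename SrcData>
    DstData operator()(SrcData x) const
    {
        return static_cast<DstData>(x);
    }
};

// Stand-in for Run_generic's inner loop: load into a src-typed buffer,
// convert element-wise, then store from a dst-typed buffer.
template <typename SrcData, typename DstData, int LongVectorSize>
void copy_long_vector(const SrcData* p_src, DstData* p_dst)
{
    SrcData p_src_long_vector[LongVectorSize];
    DstData p_dst_long_vector[LongVectorSize];

    // load data from src to the src long-vector buffer
    for(int i = 0; i < LongVectorSize; ++i)
        p_src_long_vector[i] = p_src[i];

    // SrcData to DstData conversion, as in the patched Run_generic
    for(int i = 0; i < LongVectorSize; ++i)
        p_dst_long_vector[i] = type_convert<DstData>{}(p_src_long_vector[i]);

    // store data from the dst long-vector buffer to dst
    for(int i = 0; i < LongVectorSize; ++i)
        p_dst[i] = p_dst_long_vector[i];
}

int main()
{
    const float src[4] = {1.25f, 2.5f, 3.75f, 5.0f};
    std::int16_t dst[4] = {};

    copy_long_vector<float, std::int16_t, 4>(src, dst);

    std::printf("%d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]); // prints: 1 2 3 5
}

Keeping two separately typed buffers, rather than converting in place, is what allows the load side and the store side to keep their own vector widths: SrcData and DstData generally differ in size, so a single buffer could not be reinterpreted as both src_vector_t and dst_vector_t.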