diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
index f2ab9cad2d..204b7ab867 100644
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
@@ -265,10 +265,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
         // LDS double buffer: preload data into LDS
         {
-            blockwise_in_copy.template Run(p_in_global,
-                                           p_in_block_double);
-            blockwise_wei_copy.template Run(p_wei_global,
-                                            p_wei_block_double);
+            blockwise_in_copy.template Run(
+                p_in_global, p_in_block_double);
+            blockwise_wei_copy.template Run(
+                p_wei_global, p_wei_block_double);
         }
 
         // LDS double buffer: main body
@@ -299,10 +299,12 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                 __syncthreads();
 
                 // LDS double buffer: load next data from device mem
-                blockwise_in_copy.template RunLoadThreadBuffer(
-                    p_in_global, p_in_thread_buffer);
-                blockwise_wei_copy.template RunLoadThreadBuffer(
-                    p_wei_global, p_wei_thread_buffer);
+                blockwise_in_copy
+                    .template RunLoadThreadBuffer(
+                        p_in_global, p_in_thread_buffer);
+                blockwise_wei_copy
+                    .template RunLoadThreadBuffer(
+                        p_wei_global, p_wei_thread_buffer);
 
                 // LDS double buffer: GEMM on current data
                 blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread);
@@ -325,9 +327,9 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             __syncthreads();
 
             // LDS double buffer: load next data from device mem
-            blockwise_in_copy.template RunLoadThreadBuffer(
+            blockwise_in_copy.template RunLoadThreadBuffer(
                 p_in_global, p_in_thread_buffer);
-            blockwise_wei_copy.template RunLoadThreadBuffer(
+            blockwise_wei_copy.template RunLoadThreadBuffer(
                 p_wei_global, p_wei_thread_buffer);
 
             // LDS double buffer: GEMM on current data
@@ -396,7 +398,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                                  0,
                                  b_thread_data_on_global,
                                  0})
-                .template Run(
+                .template Run(
                     p_out_thread, p_out_global);
         }
     }
diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
index e9cc89fca0..5e4ee81d2d 100644
--- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
@@ -120,6 +120,8 @@ struct BlockwiseGenericTensorSliceCopy_v4
                                      BlockSrcData,
                                      BlockSrcAddressSpace,
                                      address_space_t::generic>(p_block_src, p_thread_buffer);
+
+        // if there is type conversion, it's done during store (RunStoreThreadBuffer)
     }
 
-    __device__ void RunLoadThreadBuffer(const TData* p_block_src, TData* p_thread_buffer) const
+    __device__ void RunLoadThreadBuffer(const SrcData* p_block_src, DstData* p_thread_buffer) const
     {
-        mThreadwiseLoad.template Run(
-            p_block_src, p_thread_buffer);
+        mThreadwiseLoad
+            .template Run(
+                p_block_src, p_thread_buffer);
     }
 
-    template
-    __device__ void RunStoreThreadBuffer(const TData* p_thread_buffer, TData* p_block_dst) const
+    __device__ void RunStoreThreadBuffer(const SrcData* p_thread_buffer, DstData* p_block_dst) const
     {
-        mThreadwiseStore.template Run(
-            p_thread_buffer, p_block_dst);
+        mThreadwiseStore
+            .template Run(
+                p_thread_buffer, p_block_dst);
     }
 
-    template
-    __device__ void Run(const TData* p_block_src, TData* p_block_dst) const
+    __device__ void Run(const SrcData* p_block_src, DstData* p_block_dst) const
     {
-        TData p_thread_buffer[GetThreadBufferSize()];
+        SrcData p_thread_buffer[GetThreadBufferSize()];
 
-        RunLoadThreadBuffer(p_block_src,
-                            p_thread_buffer);
-        RunStoreThreadBuffer(p_thread_buffer,
-                             p_block_dst);
+        RunLoadThreadBuffer(
+            p_block_src, p_thread_buffer);
+
+        // if there is type conversion, it's done during store
+        RunStoreThreadBuffer(
+            p_thread_buffer, p_block_dst);
     }
 
     template
diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp
index 1e8932d251..c271c65535 100644
--- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp
@@ -537,19 +537,20 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
         }
     };
 
-    template
-    __device__ void Run(const TData* p_src, TData* p_dst) const
+    __device__ void Run(const SrcData* p_src, DstData* p_dst) const
     {
         constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
 
-        TData p_buffer_[buffer_desc.GetElementSpace()];
-        TData* p_buffer = p_buffer_;
+        SrcData p_src_buffer_[buffer_desc.GetElementSpace()];
+        SrcData* p_src_buffer = p_src_buffer_;
 
         // copy data from src into buffer
         {
-            using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
+            using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
 
             constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
             constexpr auto src_data_per_access   = Number<SrcDataPerAccess>{};
@@ -573,77 +574,88 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
             constexpr auto src_normal_dim_access_lengths =
                 src_access_lengths + Number<1>{} - src_merged_dim_access_lengths;
 
-            ford<decltype(src_merged_dim_access_lengths)>{}([&](
-                auto src_merged_dim_access_id) {
+            ford<decltype(src_merged_dim_access_lengths)>{}(
+                [&](auto src_merged_dim_access_id) {
 
-                auto src_merged_dim_data_id = src_merged_dim_access_id;
-                src_merged_dim_data_id(src_vector_access_dim) =
-                    src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access;
+                    auto src_merged_dim_data_id = src_merged_dim_access_id;
+                    src_merged_dim_data_id(src_vector_access_dim) =
+                        src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access;
 
-                // offset w.r.t. merged dimension needs to be computed at run-time
-                const index_t src_merged_offset =
-                    (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset();
+                    // offset w.r.t. merged dimension needs to be computed at run-time
+                    const index_t src_merged_offset =
+                        (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset();
 
-                ford<decltype(src_normal_dim_access_lengths)>{}([&](
-                    auto src_normal_dim_access_id) {
+                    ford<decltype(src_normal_dim_access_lengths)>{}([&](
+                        auto src_normal_dim_access_id) {
 
-                    auto src_normal_dim_data_id = src_normal_dim_access_id;
-                    src_normal_dim_data_id(src_vector_access_dim) =
-                        src_normal_dim_access_id[src_vector_access_dim] * src_data_per_access;
+                        auto src_normal_dim_data_id = src_normal_dim_access_id;
+                        src_normal_dim_data_id(src_vector_access_dim) =
+                            src_normal_dim_access_id[src_vector_access_dim] * src_data_per_access;
 
-                    // offset w.r.t. normal dimension is known at compile-time
-                    const index_t src_normal_offset =
-                        SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id);
+                        // offset w.r.t. normal dimension is known at compile-time
+                        const index_t src_normal_offset =
+                            SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id);
 
-                    src_vector_t vector_data;
+                        src_vector_t vector_data;
 
-                    // Read vector from src.
-                    // 1. Source code version can take src of all kinds of memory-space
-                    // 2. Intrinsic version using buffer_load can only take
-                    //    src from global-memory
-                    //
-                    // Comment for loading from global-memory:
-                    // When:
-                    //   1) using source code, in order for compiler to emit optimal
-                    //      load instruction, or
-                    //   2) using buffer_load intrinsic, in order for ISA to be valid,
-                    // the following assumptions need to be satisfied:
-                    //   1. p_src needs to be block-invariant (assumption)
-                    //   2. src_normal_offset must be calculated at compile time (guaranteed by
-                    //      algorithm)
-                    //   3. src_merged_offset can be a runtime value (no assumption imposed)
-                    static_if<SrcAddressSpace == address_space_t::global>{}([&](auto) {
+                        // Read vector from src.
+                        // 1. Source code version can take src of all kinds of memory-space
+                        // 2. Intrinsic version using buffer_load can only take
+                        //    src from global-memory
+                        //
+                        // Comment for loading from global-memory:
+                        // When:
+                        //   1) using source code, in order for compiler to emit optimal
+                        //      load instruction, or
+                        //   2) using buffer_load intrinsic, in order for ISA to be valid,
+                        // the following assumptions need to be satisfied:
+                        //   1. p_src needs to be block-invariant (assumption)
+                        //   2. src_normal_offset must be calculated at compile time (guaranteed by
+                        //      algorithm)
+                        //   3. src_merged_offset can be a runtime value (no assumption imposed)
+                        static_if<SrcAddressSpace == address_space_t::global>{}([&](auto) {
 #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
-                        vector_data = __buffer_load<TData, SrcDataPerAccess>(
-                            p_src, src_merged_offset, src_normal_offset);
+                            vector_data = __buffer_load<SrcData, SrcDataPerAccess>(
+                                p_src, src_merged_offset, src_normal_offset);
 #else
-                        vector_data = *reinterpret_cast<const src_vector_t*>(
-                            &p_src[src_normal_offset + src_merged_offset]);
+                            vector_data = *reinterpret_cast<const src_vector_t*>(
+                                &p_src[src_normal_offset + src_merged_offset]);
 #endif
-                    }).Else([&](auto) {
-                        // src can be all kinds of memory-space.
-                        vector_data = *reinterpret_cast<const src_vector_t*>(
-                            &p_src[src_normal_offset + src_merged_offset]);
-                    });
+                        }).Else([&](auto) {
+                            // src can be all kinds of memory-space.
+                            vector_data = *reinterpret_cast<const src_vector_t*>(
+                                &p_src[src_normal_offset + src_merged_offset]);
+                        });
+
+                        // unpack vector into buffer
+                        for(index_t i = 0; i < SrcDataPerAccess; ++i)
+                        {
+                            auto scalar_id = make_zero_array<index_t, nDim>();
+                            scalar_id(src_vector_access_dim) = i;
+
+                            const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
+                                src_merged_dim_data_id + src_normal_dim_data_id + scalar_id);
+
+                            p_src_buffer[buffer_offset] =
+                                reinterpret_cast<SrcData*>(&vector_data)[i];
+                        }
+                    });
 
-                    // unpack vector into buffer
-                    for(index_t i = 0; i < SrcDataPerAccess; ++i)
-                    {
-                        auto scalar_id = make_zero_array<index_t, nDim>();
-                        scalar_id(src_vector_access_dim) = i;
 
-                        const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
-                            src_merged_dim_data_id + src_normal_dim_data_id + scalar_id);
 
-                        p_buffer[buffer_offset] = reinterpret_cast<TData*>(&vector_data)[i];
-                    }
-                });
-            });
+                });
         }
 
+        // type conversion
+        // TODO: would compiler do a good job reusing register for buffer?
+        DstData p_dst_buffer_[buffer_desc.GetElementSpace()];
+        DstData* p_dst_buffer = p_dst_buffer_;
+
+        ford<SliceLengths>{}([&](auto idx) {
+            p_dst_buffer[buffer_desc.GetOffsetFromMultiIndex(idx)] =
+                type_convert<DstData>{}(p_src_buffer[buffer_desc.GetOffsetFromMultiIndex(idx)]);
+        });
+
         // copy data from buffer into dst
         {
-            using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
+            using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;
 
             constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
             constexpr auto dst_data_per_access   = Number<DstDataPerAccess>{};
@@ -659,72 +671,72 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
             constexpr auto dst_normal_dim_access_lengths =
                 dst_access_lengths + Number<1>{} - dst_merged_dim_access_lengths;
 
-            ford<decltype(dst_merged_dim_access_lengths)>{}(
-                [&](auto dst_merged_dim_access_id) {
+            ford<decltype(dst_merged_dim_access_lengths)>{}([&](
+                auto dst_merged_dim_access_id) {
 
-                    auto dst_merged_dim_data_id = dst_merged_dim_access_id;
-                    dst_merged_dim_data_id(dst_vector_access_dim) =
-                        dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access;
+                auto dst_merged_dim_data_id = dst_merged_dim_access_id;
+                dst_merged_dim_data_id(dst_vector_access_dim) =
+                    dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access;
 
-                    // offset w.r.t. merged dimension needs to be computed at run-time
-                    const index_t dst_merged_offset =
-                        (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset();
+                // offset w.r.t. merged dimension needs to be computed at run-time
+                const index_t dst_merged_offset =
+                    (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset();
 
-                    ford<decltype(dst_normal_dim_access_lengths)>{}([&](
-                        auto dst_normal_dim_access_id) {
+                ford<decltype(dst_normal_dim_access_lengths)>{}([&](
+                    auto dst_normal_dim_access_id) {
 
-                        auto dst_normal_dim_data_id = dst_normal_dim_access_id;
-                        dst_normal_dim_data_id(dst_vector_access_dim) =
-                            dst_normal_dim_access_id[dst_vector_access_dim] * dst_data_per_access;
+                    auto dst_normal_dim_data_id = dst_normal_dim_access_id;
+                    dst_normal_dim_data_id(dst_vector_access_dim) =
+                        dst_normal_dim_access_id[dst_vector_access_dim] * dst_data_per_access;
 
-                        dst_vector_t vector_data;
+                    dst_vector_t vector_data;
 
-                        // pack vector from buffer
-                        for(index_t i = 0; i < DstDataPerAccess; ++i)
-                        {
-                            auto scalar_id = make_zero_array<index_t, nDim>();
-                            scalar_id(dst_vector_access_dim) = i;
+                    // pack vector from buffer
+                    for(index_t i = 0; i < DstDataPerAccess; ++i)
+                    {
+                        auto scalar_id = make_zero_array<index_t, nDim>();
+                        scalar_id(dst_vector_access_dim) = i;
 
-                            const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
-                                dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id);
+                        const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
+                            dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id);
 
-                            reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
-                        }
+                        reinterpret_cast<DstData*>(&vector_data)[i] = p_dst_buffer[buffer_offset];
+                    }
 
-                        // offset w.r.t. normal dimension is known at compile-time
-                        const index_t dst_normal_offset =
-                            DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);
+                    // offset w.r.t. normal dimension is known at compile-time
+                    const index_t dst_normal_offset =
+                        DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);
 
-                        // Write vector into dst.
-                        // 1. Source code version can take dst of all kinds of memory-space
-                        // 2. Intrinsic version using buffer_store can only take
-                        //    dst from global-memory
-                        //
-                        // Comment for storing into global-memory:
-                        // When:
-                        //   1) using source code, in order for compiler to emit optimal
-                        //      store instruction, or
-                        //   2) using buffer_store intrinsic, in order for ISA to be valid,
-                        // the following assumptions need to be satisfied:
-                        //   1. p_dst needs to be block-invariant (assumption)
-                        //   2. dst_normal_offset must be calculated at compile time (guaranteed by
-                        //      algorithm)
-                        //   3. dst_merged_offset can be a runtime value (no assumption imposed)
-                        static_if<DstAddressSpace == address_space_t::global>{}([&](auto) {
+                    // Write vector into dst.
+                    // 1. Source code version can take dst of all kinds of memory-space
+                    // 2. Intrinsic version using buffer_store can only take
+                    //    dst from global-memory
+                    //
+                    // Comment for storing into global-memory:
+                    // When:
+                    //   1) using source code, in order for compiler to emit optimal
+                    //      store instruction, or
+                    //   2) using buffer_store intrinsic, in order for ISA to be valid,
+                    // the following assumptions need to be satisfied:
+                    //   1. p_dst needs to be block-invariant (assumption)
+                    //   2. dst_normal_offset must be calculated at compile time (guaranteed by
+                    //      algorithm)
+                    //   3. dst_merged_offset can be a runtime value (no assumption imposed)
+                    static_if<DstAddressSpace == address_space_t::global>{}([&](auto) {
 #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
-                            __buffer_store<TData, DstDataPerAccess>(
-                                vector_data, p_dst, dst_merged_offset, dst_normal_offset);
+                        __buffer_store<DstData, DstDataPerAccess>(
+                            vector_data, p_dst, dst_merged_offset, dst_normal_offset);
 #else
-                            *reinterpret_cast<dst_vector_t*>(
-                                &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
+                        *reinterpret_cast<dst_vector_t*>(
+                            &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
 #endif
-                        }).Else([&](auto) {
-                            // dst can be all kinds of memory-space
-                            *reinterpret_cast<dst_vector_t*>(
-                                &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
-                        });
+                    }).Else([&](auto) {
+                        // dst can be all kinds of memory-space
+                        *reinterpret_cast<dst_vector_t*>(
+                            &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
+                    });
                 });
             });
         }
     }
diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp
index 4319c4f7d6..ab5b8826ab 100644
--- a/driver/src/driver.cpp
+++ b/driver/src/driver.cpp
@@ -295,7 +295,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;
 
-#elif 0
+#elif 1
     // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
     // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
     constexpr index_t N = 128;
@@ -341,7 +341,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<3, 0>;
     using RightPads = Sequence<3, 0>;
 
-#elif 1
+#elif 0
     // 1x7 filter, 0x3 pad, 17x17 input
     constexpr index_t N = 128;
     constexpr index_t C = 128;
@@ -438,7 +438,7 @@ int main(int argc, char* argv[])
 #elif 0
     device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(
         in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 0
+#elif 1
     device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc, in_nchw, wei_kcyx_desc,
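
Note for reviewers: the behavioral change in this patch (as opposed to the clang-format reflow) is that the copy now stages data in a SrcData-typed register buffer and converts to DstData only on the store side. The standalone, host-side C++ sketch below illustrates that load-then-convert-on-store pattern in isolation; type_cast and load_convert_store are illustrative stand-ins (assumed names, not ck APIs) for type_convert and the RunLoadThreadBuffer/RunStoreThreadBuffer split.

    // Standalone sketch of "conversion happens during store", under the
    // assumption that a single conversion functor is the only place where
    // SrcData becomes DstData (mirroring ck's type_convert).
    #include <cstdio>

    // single conversion point, analogous to ck's type_convert functor;
    // could be specialized for narrowing conversions such as float -> fp16
    template <typename Dst>
    struct type_cast
    {
        template <typename Src>
        Dst operator()(Src s) const
        {
            return static_cast<Dst>(s);
        }
    };

    // The staging buffer keeps the source type (as in the patch:
    // "SrcData p_thread_buffer[GetThreadBufferSize()]"); conversion to the
    // destination type happens only when writing out.
    template <typename SrcData, typename DstData, int N>
    void load_convert_store(const SrcData* p_src, DstData* p_dst)
    {
        SrcData thread_buffer[N];

        // "load": plain SrcData -> SrcData copy, no conversion yet
        for(int i = 0; i < N; ++i)
            thread_buffer[i] = p_src[i];

        // "store": if there is type conversion, it's done here
        for(int i = 0; i < N; ++i)
            p_dst[i] = type_cast<DstData>{}(thread_buffer[i]);
    }

    int main()
    {
        const float src[4] = {0.5f, 1.5f, 2.5f, 3.5f};
        int dst[4]         = {};

        load_convert_store<float, int, 4>(src, dst);

        for(int v : dst)
            std::printf("%d ", v); // prints: 0 1 2 3
        std::printf("\n");
        return 0;
    }

Keeping the staging buffer in SrcData means the (possibly lossy) conversion is applied exactly once per element, at the last moment before the destination write, which is why the added comment "if there is type conversion, it's done during store" appears at both the blockwise and the fused Run call sites.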