diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
index 316f1e46b5..555e68e594 100644
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
@@ -449,18 +449,28 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                 out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
                 arithmetic_sequence_gen<0, 8, 1>::type{},
                 Number<1>{});
-#elif 1
-            ThreadwiseGenericTensorSliceCopy_v1<
+#elif 0
+            ThreadwiseGenericTensorSliceCopy_v1r1<
                 decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
                 decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
                 decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths()),
                 arithmetic_sequence_gen<0, 8, 1>::type,
                 arithmetic_sequence_gen<0, 8, 1>::type,
-                0,
-                0,
+                7,
+                7,
                 1,
-                1>({0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0})
+                1>(make_zero_array<index_t, 8>(), make_zero_array<index_t, 8>())
                 .Run(p_out_thread, p_out_thread_on_global);
+#elif 1
+            ThreadwiseGenericTensorSliceCopy_v1r2<
+                decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
+                decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
+                decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths()),
+                arithmetic_sequence_gen<0, 8, 1>::type,
+                7,
+                1,
+                1>(make_zero_array<index_t, 8>(), make_zero_array<index_t, 8>())
+                .Run_non_static(p_out_thread, p_out_thread_on_global);
 #elif 0
             ThreadwiseGenericTensorSliceCopy_v2<
                 decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
index 4a1acd102b..d2161cb4ff 100644
--- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
@@ -245,7 +245,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
             // dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on
             // the merged dimension need to be 1. These sanity checks are performed in constructor
             // of BlockwiseGenericTensorSliceCopy_v1
-#if 0 // debug
+#if 0
             threadwise_generic_tensor_slice_copy_v1(SrcDesc{},
                                                     p_src + src_offset + mThreadSrcOffset,
                                                     make_zero_array<index_t, nDim>(),
@@ -255,18 +255,28 @@ struct BlockwiseGenericTensorSliceCopy_v1
                                                     thread_sub_tensor_lengths,
                                                     SrcDimAccessOrder{},
                                                     Number<SrcDataPerAccess>{});
-#else
-            ThreadwiseGenericTensorSliceCopy_v1<SrcDesc,
-                                                decltype(thread_buffer_desc),
-                                                SubLengths,
-                                                SrcDimAccessOrder,
-                                                typename arithmetic_sequence_gen<0, nDim, 1>::type,
-                                                SrcVectorAccessDim,
-                                                0,
-                                                SrcDataPerAccess,
-                                                1>(make_zero_array<index_t, nDim>(),
-                                                   make_zero_array<index_t, nDim>())
+#elif 0
+            ThreadwiseGenericTensorSliceCopy_v1r1<
+                SrcDesc,
+                decltype(thread_buffer_desc),
+                SubLengths,
+                SrcDimAccessOrder,
+                typename arithmetic_sequence_gen<0, nDim, 1>::type,
+                SrcVectorAccessDim,
+                0,
+                SrcDataPerAccess,
+                1>(make_zero_array<index_t, nDim>(), make_zero_array<index_t, nDim>())
                 .Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset);
+#elif 1
+            ThreadwiseGenericTensorSliceCopy_v1r2<SrcDesc,
+                                                  decltype(thread_buffer_desc),
+                                                  SubLengths,
+                                                  SrcDimAccessOrder,
+                                                  SrcVectorAccessDim,
+                                                  SrcDataPerAccess,
+                                                  1>(make_zero_array<index_t, nDim>(),
+                                                     make_zero_array<index_t, nDim>())
+                .Run_non_static(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset);
 #endif
         });
     }
@@ -312,7 +322,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
             // By setting SubLengths = 1 at the merged dimension, this is always true;
             // If in the future, you want to enable SubLengths > 1 at the merged dimension,
             // special care in implementation is needed
-#if 0 // debug
+#if 0
             threadwise_generic_tensor_slice_copy_v1(thread_buffer_desc,
                                                     p_buffer + buffer_offset,
                                                     make_zero_array<index_t, nDim>(),
@@ -322,18 +332,29 @@ struct BlockwiseGenericTensorSliceCopy_v1
                                                     thread_sub_tensor_lengths,
                                                     DstDimAccessOrder{},
                                                     Number<DstDataPerAccess>{});
-#else
-            ThreadwiseGenericTensorSliceCopy_v1<decltype(thread_buffer_desc),
-                                                DstDesc,
-                                                SubLengths,
-                                                typename arithmetic_sequence_gen<0, nDim, 1>::type,
-                                                DstDimAccessOrder,
-                                                0,
-                                                DstVectorAccessDim,
-                                                1,
-                                                DstDataPerAccess>(make_zero_array<index_t, nDim>(),
-                                                                  make_zero_array<index_t, nDim>())
+#elif 0
+            ThreadwiseGenericTensorSliceCopy_v1r1<
+                decltype(thread_buffer_desc),
+                DstDesc,
+                SubLengths,
+                typename arithmetic_sequence_gen<0, nDim, 1>::type,
+                DstDimAccessOrder,
+                0,
+                DstVectorAccessDim,
+                1,
+                DstDataPerAccess>(make_zero_array<index_t, nDim>(),
+                                  make_zero_array<index_t, nDim>())
                 .Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset);
+#elif 1
+            ThreadwiseGenericTensorSliceCopy_v1r2<decltype(thread_buffer_desc),
+                                                  DstDesc,
+                                                  SubLengths,
+                                                  DstDimAccessOrder,
+                                                  DstVectorAccessDim,
+                                                  1,
+                                                  DstDataPerAccess>(make_zero_array<index_t, nDim>(),
+                                                                    make_zero_array<index_t, nDim>())
+                .Run_non_static(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset);
 #endif
         });
     }
diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
index ce620bcf88..31de9f2fa0 100644
--- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
@@ -106,7 +106,15 @@ __device__ void threadwise_generic_tensor_slice_copy_v1(
 #endif
 }
 
-#if 1
+// This threadwise copy allows vector access of src and dst.
+// It allows the dimensions of vector access to be different on src and dst.
+// It also allows the vector size to be different on src and dst.
+// It also allows the order of access to be different on src and dst.
+// It uses registers as a buffer to hold all data moving from src to dst.
+// It is designed for copying a small amount of data, where src and dst are
+// device memory or LDS.
+// When copying a large amount of data, let's hope the compiler will reduce the
+// registers used for the buffer.
 template <class SrcDesc,
           class DstDesc,
           class SliceLengths,
           class SrcDimAccessOrder,
           class DstDimAccessOrder,
           index_t SrcVectorAccessDim,
           index_t DstVectorAccessDim,
           index_t SrcDataPerAccess,
           index_t DstDataPerAccess>
-struct ThreadwiseGenericTensorSliceCopy_v1
+struct ThreadwiseGenericTensorSliceCopy_v1r1
 {
     static constexpr index_t nDim = SliceLengths::GetSize();
 
-    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1(Array<index_t, nDim> src_slice_origin,
-                                                             Array<index_t, nDim> dst_slice_origin)
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r1(
+        Array<index_t, nDim> src_slice_origin, Array<index_t, nDim> dst_slice_origin)
         : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
     {
         static_assert(nDim == SrcDesc::GetNumOfDimension() &&
@@ -145,7 +153,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1
         static_if<!SrcDesc::ContainMultipleOriginalDimensions()>{}(
             [&](auto fwd) {
                 static_assert(
-                    (fwd(SrcDesc{}).GetStrides()[SrcVectorAccessDim] == 1 || SrcDataPerAccess == 1),
+                    (fwd(SrcDesc{}).GetStride(src_vector_access_dim) == 1 || SrcDataPerAccess == 1),
                     "wrong! vectorized access is allowed only if stride == 1");
             })
             .Else([&](auto fwd) {
@@ -158,7 +166,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1
         static_if<!DstDesc::ContainMultipleOriginalDimensions()>{}(
             [&](auto fwd) {
                 static_assert(
-                    (fwd(DstDesc{}).GetStrides()[DstVectorAccessDim] == 1 || DstDataPerAccess == 1),
+                    (fwd(DstDesc{}).GetStride(dst_vector_access_dim) == 1 || DstDataPerAccess == 1),
                     "wrong! vectorized access is allowed only if stride == 1");
             })
             .Else([&](auto fwd) {
@@ -169,9 +177,9 @@ struct ThreadwiseGenericTensorSliceCopy_v1
             });
     }
 
-    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1()
-        : ThreadwiseGenericTensorSliceCopy_v1(make_zero_array<index_t, nDim>(),
-                                              make_zero_array<index_t, nDim>())
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r1()
+        : ThreadwiseGenericTensorSliceCopy_v1r1(make_zero_array<index_t, nDim>(),
+                                                make_zero_array<index_t, nDim>())
     {
     }
 
@@ -205,12 +213,12 @@ struct ThreadwiseGenericTensorSliceCopy_v1
             SliceLengths::Get(src_vector_access_dim) / src_data_per_access);
 
         static_ford<decltype(src_access_lengths)>{}([&](auto src_access_id) {
-            constexpr auto src_data_id = src_access_id.Modify(
+            constexpr auto src_data_begin_id = src_access_id.Modify(
                 src_vector_access_dim,
                 src_access_id[src_vector_access_dim] * src_data_per_access);
 
             const index_t src_offset =
-                SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_id);
+                SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_begin_id);
 
             // load vector from src
             const vector_t vector_data = *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
@@ -222,7 +230,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1
                     i);
 
                 constexpr index_t buffer_offset =
-                    buffer_desc.GetOffsetFromMultiIndex(src_data_id + scalar_id);
+                    buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id);
 
                 p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
             });
@@ -241,7 +249,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1
             SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);
 
         static_ford<decltype(dst_access_lengths)>{}([&](auto dst_access_id) {
-            constexpr auto dst_data_id = dst_access_id.Modify(
+            constexpr auto dst_data_begin_id = dst_access_id.Modify(
                 dst_vector_access_dim,
                 dst_access_id[dst_vector_access_dim] * dst_data_per_access);
 
@@ -254,13 +262,13 @@ struct ThreadwiseGenericTensorSliceCopy_v1
                     i);
 
                 constexpr index_t buffer_offset =
-                    buffer_desc.GetOffsetFromMultiIndex(dst_data_id + scalar_id);
+                    buffer_desc.GetOffsetFromMultiIndex(dst_data_begin_id + scalar_id);
 
                 reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
             });
 
             const index_t dst_offset =
-                DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_id);
+                DstDesc::GetOffsetFromMultiIndex(mDstSliceOrigin + dst_data_begin_id);
 
             // store vector into dst
             *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = vector_data;
@@ -272,7 +280,196 @@ struct ThreadwiseGenericTensorSliceCopy_v1
 
     Array<index_t, nDim> mSrcSliceOrigin;
     Array<index_t, nDim> mDstSliceOrigin;
 };
-#endif
+
+// This threadwise copy allows vector access of src and dst.
+// It allows the vector size to be different on src and dst.
+// The dimension of vector access should be the same on src and dst.
+// The dimension access order should be the same on src and dst.
+// It is designed for cases where one of src and dst is a register, and
+// the other is device memory or LDS.
+template <class SrcDesc,
+          class DstDesc,
+          class SliceLengths,
+          class DimAccessOrder,
+          index_t VectorAccessDim,
+          index_t SrcDataPerAccess,
+          index_t DstDataPerAccess>
+struct ThreadwiseGenericTensorSliceCopy_v1r2
+{
+    static constexpr index_t nDim = SliceLengths::GetSize();
+
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2(
+        Array<index_t, nDim> src_slice_origin, Array<index_t, nDim> dst_slice_origin)
+        : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
+    {
+        static_assert(nDim == SrcDesc::GetNumOfDimension() &&
+                          nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
+                          nDim == DimAccessOrder::GetSize(),
+                      "wrong! # of dimensions not the same");
+
+        static_assert(is_valid_sequence_map<DimAccessOrder>::value, "wrong! map is not valid");
+
+        static_assert(
+            SliceLengths{}[VectorAccessDim] % math::lcm(SrcDataPerAccess, DstDataPerAccess) == 0,
+            "wrong! cannot evenly divide");
+
+        // check vectorized memory access
+        constexpr auto vector_access_dim = Number<VectorAccessDim>{};
+
+        static_if<!SrcDesc::ContainMultipleOriginalDimensions()>{}([&](auto fwd) {
+            static_assert(
+                (fwd(SrcDesc{}).GetStride(vector_access_dim) == 1 || SrcDataPerAccess == 1),
+                "wrong! vectorized access is allowed only if stride == 1");
+        }).Else([&](auto fwd) {
+            static_assert((fwd(SrcDesc{}).GetLastOriginalDimensionStride(vector_access_dim) == 1 ||
+                           SrcDataPerAccess == 1),
+                          "wrong! vectorized access is allowed only if stride == 1");
+        });
+
+        static_if<!DstDesc::ContainMultipleOriginalDimensions()>{}([&](auto fwd) {
+            static_assert(
+                (fwd(DstDesc{}).GetStride(vector_access_dim) == 1 || DstDataPerAccess == 1),
+                "wrong! vectorized access is allowed only if stride == 1");
+        }).Else([&](auto fwd) {
+            static_assert((fwd(DstDesc{}).GetLastOriginalDimensionStride(vector_access_dim) == 1 ||
+                           DstDataPerAccess == 1),
+                          "wrong! vectorized access is allowed only if stride == 1");
+        });
+    }
+
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2()
+        : ThreadwiseGenericTensorSliceCopy_v1r2(make_zero_array<index_t, nDim>(),
+                                                make_zero_array<index_t, nDim>())
+    {
+    }
+
+    __device__ void SetSrcSliceOrigin(Array<index_t, nDim> src_slice_origin)
+    {
+        mSrcSliceOrigin = src_slice_origin;
+    }
+
+    __device__ void SetDstSliceOrigin(Array<index_t, nDim> dst_slice_origin)
+    {
+        mDstSliceOrigin = dst_slice_origin;
+    }
+
+    template <class TData>
+    __device__ void Run(const TData* p_src, TData* p_dst) const
+    {
+        using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
+        using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
+
+        constexpr auto vector_access_dim = Number<VectorAccessDim>{};
+
+        constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
+        constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
+
+        constexpr auto long_vector_size = Number<math::lcm(SrcDataPerAccess, DstDataPerAccess)>{};
+
+        constexpr auto long_vector_access_lengths = SliceLengths::Modify(
+            vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
+
+        static_ford<decltype(long_vector_access_lengths)>{}([&](
+            auto long_vector_access_id) {
+            // data id w.r.t slicing-window
+            constexpr auto long_vector_data_begin_id = long_vector_access_id.Modify(
+                vector_access_dim, long_vector_access_id[vector_access_dim] * long_vector_size);
+
+            // buffer to hold a long-vector
+            TData p_long_vector[long_vector_size];
+
+            // load data from src to the long-vector buffer
+            static_for<0, long_vector_size / src_data_per_access, 1>{}([&](auto i) {
+                constexpr auto scalar_id = typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
+                    vector_access_dim, i * src_data_per_access);
+
+                const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(
+                    mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id));
+
+                constexpr index_t buffer_offset = i * src_data_per_access;
+
+                *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
+                    *reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
+            });
+
+            // store data from the long-vector buffer to dst
+            static_for<0, long_vector_size / dst_data_per_access, 1>{}([&](auto i) {
+                constexpr auto scalar_id = typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
+                    vector_access_dim, i * dst_data_per_access);
+
+                constexpr index_t buffer_offset = i * dst_data_per_access;
+
+                const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(
+                    mDstSliceOrigin + (long_vector_data_begin_id + scalar_id));
+
+                *reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) =
+                    *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
+            });
+        });
+    }
+
+    template <class TData>
+    __device__ void Run_non_static(const TData* p_src, TData* p_dst) const
+    {
+        using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
+        using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
+
+        constexpr auto vector_access_dim = Number<VectorAccessDim>{};
+
+        constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
+        constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
+
+        constexpr auto long_vector_size = Number<math::lcm(SrcDataPerAccess, DstDataPerAccess)>{};
+
+        constexpr auto long_vector_access_lengths = SliceLengths::Modify(
+            vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
+
+        ford<decltype(long_vector_access_lengths)>{}(
+            [&](auto long_vector_access_id) {
+                // data id w.r.t slicing-window
+                auto long_vector_data_begin_id = long_vector_access_id;
+                long_vector_data_begin_id(vector_access_dim) =
+                    long_vector_size * long_vector_access_id[vector_access_dim];
+
+                // buffer to hold a long-vector
+                TData p_long_vector[long_vector_size];
+
+                // load data from src to the long-vector buffer
+                for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
+                {
+                    auto scalar_id = make_zero_array<index_t, nDim>();
+                    scalar_id(vector_access_dim) = i * src_data_per_access;
+
+                    const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(
+                        mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id));
+
+                    const index_t buffer_offset = i * src_data_per_access;
+
+                    *reinterpret_cast<src_vector_t*>(&p_long_vector[buffer_offset]) =
+                        *reinterpret_cast<const src_vector_t*>(&p_src[src_offset]);
+                }
+
+                // store data from the long-vector buffer to dst
+                for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i)
+                {
+                    auto scalar_id = make_zero_array<index_t, nDim>();
+                    scalar_id(vector_access_dim) = i * dst_data_per_access;
+
+                    const index_t buffer_offset = i * dst_data_per_access;
+
+                    const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(
+                        mDstSliceOrigin + (long_vector_data_begin_id + scalar_id));
+
+                    *reinterpret_cast<dst_vector_t*>(&p_dst[dst_offset]) =
+                        *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
+                }
+            });
+    }
+
+    private:
+    Array<index_t, nDim> mSrcSliceOrigin;
+    Array<index_t, nDim> mDstSliceOrigin;
+};
 
 template

-    using InBlockCopySubLengths_E_N1_B_N2      = Sequence<1, 1, 1, 4>;
-    using InBlockCopyClusterLengths_E_N1_B_N2  = Sequence<8, 2, 16, 1>;
+    using InBlockCopySubLengths_E_N1_B_N2      = Sequence<1, 1, 4, 1>;
+    using InBlockCopyClusterLengths_E_N1_B_N2  = Sequence<8, 2, 4, 4>;
     using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
     using InBlockCopySrcAccessOrder            = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
     using InBlockCopyDstAccessOrder            = Sequence<0, 1, 2, 3>; // [E, N1, B, N2]
 
-    constexpr index_t InBlockCopySrcDataPerRead_B   = 1;
-    constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4;
+    constexpr index_t InBlockCopySrcDataPerRead_B   = 4;
+    constexpr index_t InBlockCopyDstDataPerWrite_N2 = 1;
 
     using WeiBlockCopySubLengths_E_K     = Sequence<2, 2>;
     using WeiBlockCopyClusterLengths_E_K = Sequence<4, 64>;
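
The core idea behind ThreadwiseGenericTensorSliceCopy_v1r2 above is to stage each strip of the slice through a "long vector" register buffer whose length is lcm(SrcDataPerAccess, DstDataPerAccess), so the source can be read in SrcDataPerAccess-sized chunks and the destination written in DstDataPerAccess-sized chunks even when the two differ. The following standalone host-side sketch (hypothetical names, plain C++ instead of the repo's descriptor/Array machinery) illustrates that re-vectorization scheme on a 1-D strip under those assumptions; it is not the repo's API.

```cpp
#include <cassert>
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

// Hypothetical illustration of the v1r2 "long vector" staging idea on a 1-D strip:
// read the source in SrcPerAccess-sized chunks, hold them in a small buffer of
// length lcm(SrcPerAccess, DstPerAccess), then write the destination in
// DstPerAccess-sized chunks.
template <std::size_t SrcPerAccess, std::size_t DstPerAccess>
void copy_strip_revectorized(const float* src, float* dst, std::size_t n)
{
    constexpr std::size_t long_vector_size = std::lcm(SrcPerAccess, DstPerAccess);
    assert(n % long_vector_size == 0); // mirrors the "cannot evenly divide" static_assert

    for(std::size_t begin = 0; begin < n; begin += long_vector_size)
    {
        float long_vector[long_vector_size]; // stand-in for the register buffer

        // gather: long_vector_size / SrcPerAccess source accesses
        for(std::size_t i = 0; i < long_vector_size / SrcPerAccess; ++i)
            for(std::size_t j = 0; j < SrcPerAccess; ++j)
                long_vector[i * SrcPerAccess + j] = src[begin + i * SrcPerAccess + j];

        // scatter: long_vector_size / DstPerAccess destination accesses
        for(std::size_t i = 0; i < long_vector_size / DstPerAccess; ++i)
            for(std::size_t j = 0; j < DstPerAccess; ++j)
                dst[begin + i * DstPerAccess + j] = long_vector[i * DstPerAccess + j];
    }
}

int main()
{
    std::vector<float> src(24), dst(24, 0.f);
    for(std::size_t i = 0; i < src.size(); ++i)
        src[i] = static_cast<float>(i);

    // e.g. 4-wide reads and 1-wide writes, as in the InBlockCopy change above
    copy_strip_revectorized<4, 1>(src.data(), dst.data(), src.size());

    std::cout << (dst == src ? "match\n" : "mismatch\n");
    return 0;
}
```

On a GPU the inner scalar loops would be single vector load/store instructions; the sketch only shows why the buffer length is the least common multiple of the two access widths.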