diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index ed3049341e..0c1e5af052 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -199,7 +199,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 } __device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src, - Float* __restrict__ p_Buffer) const + Float* __restrict__ p_buffer) const { constexpr auto thread_sub_tensor_lengths = SubLengths{}; @@ -216,24 +216,24 @@ struct BlockwiseGenericTensorSliceCopy_v1 constexpr auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims; - constexpr auto Buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; + constexpr auto buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; constexpr index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(src_thread_data_multi_id_begin); - constexpr index_t Buffer_offset = - thread_tensor_desc.GetOffsetFromMultiIndex(Buffer_data_multi_id_begin); + constexpr index_t buffer_offset = + thread_tensor_desc.GetOffsetFromMultiIndex(buffer_data_multi_id_begin); #else ford{}([&](auto repeat_multi_id) { const auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims; - const auto Buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; + const auto buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(src_thread_data_multi_id_begin); - const index_t Buffer_offset = - thread_tensor_desc.GetOffsetFromMultiIndex(Buffer_data_multi_id_begin); + const index_t buffer_offset = + thread_tensor_desc.GetOffsetFromMultiIndex(buffer_data_multi_id_begin); #endif // By position the origin of the per-thread window at the 
point, where multi-index @@ -247,7 +247,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 p_src + src_offset + mThreadSrcOffset, make_zero_array(), thread_tensor_desc, - p_Buffer + Buffer_offset, + p_buffer + buffer_offset, make_zero_array(), thread_sub_tensor_lengths, SrcAccessOrder{}, @@ -255,7 +255,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 }); } - __device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_Buffer, + __device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_buffer, Float* __restrict__ p_dst) const { constexpr auto thread_sub_tensor_lengths = SubLengths{}; @@ -270,23 +270,23 @@ struct BlockwiseGenericTensorSliceCopy_v1 #if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 static_ford{}([&](auto repeat_multi_id) { - constexpr auto Buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; + constexpr auto buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; constexpr auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims; - constexpr index_t Buffer_offset = - thread_tensor_desc.GetOffsetFromMultiIndex(Buffer_data_multi_id_begin); + constexpr index_t buffer_offset = + thread_tensor_desc.GetOffsetFromMultiIndex(buffer_data_multi_id_begin); constexpr index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_multi_id_begin); #else ford{}([&](auto repeat_multi_id) { - const auto Buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; + const auto buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; const auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims; - const index_t Buffer_offset = - thread_tensor_desc.GetOffsetFromMultiIndex(Buffer_data_multi_id_begin); + const index_t buffer_offset = + thread_tensor_desc.GetOffsetFromMultiIndex(buffer_data_multi_id_begin); const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_multi_id_begin); #endif @@ -299,7 +299,7 @@ struct 
BlockwiseGenericTensorSliceCopy_v1 // If in the future, you want to enable SubLengths > 1 at the merged dimension, // special care in implementation is needed threadwise_generic_tensor_slice_copy_v1(thread_tensor_desc, - p_Buffer + Buffer_offset, + p_buffer + buffer_offset, make_zero_array(), DstDesc{}, p_dst + dst_offset + mThreadDstOffset, @@ -312,10 +312,10 @@ struct BlockwiseGenericTensorSliceCopy_v1 __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const { - Float p_Buffer[GetRegisterBufferSize()]; + Float p_buffer[GetRegisterBufferSize()]; - RunLoadRegisterBuffer(p_src, p_Buffer); - RunStoreRegisterBuffer(p_Buffer, p_dst); + RunLoadRegisterBuffer(p_src, p_buffer); + RunStoreRegisterBuffer(p_buffer, p_dst); } // When moving the slicing windows along a merged dimension, if the strides of the diff --git a/composable_kernel/include/utility/functional3.hpp b/composable_kernel/include/utility/functional3.hpp index 73674aa039..7bd2b8d0cd 100644 --- a/composable_kernel/include/utility/functional3.hpp +++ b/composable_kernel/include/utility/functional3.hpp @@ -24,105 +24,120 @@ struct is_static> : integral_constant }; // RemainLengths: Sequence<...> -template +// Orders: Sequence<...> +template struct static_ford_impl { - // F signature: F(Sequence<...> multi_id) - // CurrentMultiIndex: Sequence<...> - template - __host__ __device__ constexpr void operator()(F f, CurrentMultiIndex) const + __host__ __device__ constexpr static_ford_impl() { static_assert(RemainLengths::GetSize() > 0, "wrong! 
should not get here"); + } + // F signature: F(Sequence<...>) + // CurrentOrderedId: Sequence<...> + template + __host__ __device__ constexpr void operator()(F f, CurrentOrderedId) const + { static_for<0, RemainLengths::Front(), 1>{}([=](auto I) { - static_ford_impl{}(f, - CurrentMultiIndex::PushBack(I)); + static_ford_impl{}( + f, CurrentOrderedId::PushBack(I)); }); } }; -template <> -struct static_ford_impl> +template +struct static_ford_impl, Orders> { - // F signature: F(Sequence<...> multi_id) - // CurrentMultiIndex: Sequence<...> - template - __host__ __device__ constexpr void operator()(F f, CurrentMultiIndex) const + // F signature: F(Sequence<...>) + // OrderedId: Sequence<...> + template + __host__ __device__ constexpr void operator()(F f, OrderedId) const { - f(CurrentMultiIndex{}); + // retrieve unordered Id + f(OrderedId::ReorderGivenOld2New(Orders{})); } }; -// Lengths is Sequence<...> -template +// Lengths is Sequence<...>, it is the length of each dimension for N-dimensional loop +// Orders is Sequence<...>, it is the order of dimension in which static_ford will loop over each +// dimension +template ::type> struct static_ford { - // F signature: F(Sequence<...> multi_id) - template - __host__ __device__ constexpr void operator()(F f) const + __host__ __device__ constexpr static_ford() { static_assert(Lengths::GetSize() > 0, "wrong! Lengths is empty"); - - static_ford_impl{}(f, Sequence<>{}); + static_assert(Lengths::GetSize() == Orders::GetSize(), "wrong!
inconsistent size"); } -}; -template -struct ford_impl -{ - // F signature: F(Array<...> multi_id) - // CurrentMultiIndex: Array<...> - // RemainLengths: Sequence<...> - template - __host__ __device__ constexpr void - operator()(F f, CurrentMultiIndex current_multi_id, RemainLengths) const - { - static_assert(RemainLengths::GetSize() == RemainDim, "wrong!"); - static_assert(RemainDim > 1, "wrong!"); - - constexpr auto next_length = RemainLengths{}.Front(); - - for(index_t i = 0; i < next_length; ++i) - { - ford_impl{}(f, current_multi_id.PushBack(i), RemainLengths{}.PopFront()); - } - } -}; - -template <> -struct ford_impl<1> -{ - // F signature: F(Array<...> multi_id) - // CurrentMultiIndex: Array<...> - // RemainLengths: Sequence<...> - template - __host__ __device__ constexpr void - operator()(F f, CurrentMultiIndex current_multi_id, RemainLengths) const - { - static_assert(RemainLengths::GetSize() == 1, "wrong!"); - - constexpr index_t last_length = RemainLengths{}.Front(); - - for(index_t i = 0; i < last_length; ++i) - { - f(current_multi_id.PushBack(i)); - } - } -}; - -// Lengths is Sequence<...> -template -struct ford -{ - // F signature: F(Array<...> multi_id) + // F signature: F(Sequence<...> multi_id) + // multi_id is the unordered multi-index template __host__ __device__ constexpr void operator()(F f) const { - constexpr index_t first_length = Lengths{}.Front(); + constexpr auto ordered_lengths = Lengths::ReorderGivenNew2Old(Orders{}); + static_ford_impl{}(f, Sequence<>{}); + } +}; - for(index_t i = 0; i < first_length; ++i) +// RemainLengths: Sequence<...> +// Orders: Sequence<...> +template +struct ford_impl +{ + __host__ __device__ constexpr ford_impl() + { + static_assert(RemainLengths::GetSize() > 0, "wrong! 
should not get here"); + } + + // F signature: F(Array<...> multi_id) + // CurrentOrderedId: Array<...> + template + __host__ __device__ constexpr void operator()(F f, CurrentOrderedId current_ordered_id) const + { + for(index_t i = 0; i < RemainLengths::Front(); ++i) { - ford_impl{}(f, Array{i}, Lengths{}.PopFront()); + ford_impl{}( + f, current_ordered_id.PushBack(i)); + } + } +}; + +template +struct ford_impl, Orders> +{ + // F signature: F(Array<...> multi_id) + // CurrentOrderedId: Array<...> + template + __host__ __device__ constexpr void operator()(F f, CurrentOrderedId current_ordered_id) const + { + // retrieve unordered Id + f(reorder_array_given_old2new(current_ordered_id, Orders{})); + } +}; + +// Lengths is Sequence<...>, it is the length of each dimension for N-dimensional loop +// Orders is Sequence<...>, it is the order of dimension in which ford will loop over each +// dimension +template ::type> +struct ford +{ + __host__ __device__ constexpr ford() + { + static_assert(Lengths::GetSize() > 0, "wrong! Lengths is empty"); + static_assert(Lengths::GetSize() == Orders::GetSize(), "wrong! inconsistent size"); + } + + // F signature: F(Array<...> multi_id) + // multi_id is the unordered multi-index + template + __host__ __device__ constexpr void operator()(F f) const + { + for(index_t i = 0; i < Lengths::Front(); ++i) + { + ford_impl{}(f, Array{i}); + } + } +};