From 12dfba3d03f402c051e2129fa21f33264f4d26e5 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sat, 5 Mar 2022 08:19:44 -0600 Subject: [PATCH] revert changes in threadwise copy due to PR #101 (space filling curve used in threadwise copy) (#111) --- .../threadwise_tensor_slice_transfer.hpp | 441 +++++++++++++++--- .../threadwise_tensor_slice_transfer_v3r1.hpp | 322 +++++++++++-- .../threadwise_tensor_slice_transfer_v6r1.hpp | 192 ++++++-- .../threadwise_tensor_slice_transfer_v6r2.hpp | 201 ++++++-- .../threadwise_tensor_slice_transfer_v6r3.hpp | 213 +++++++-- 5 files changed, 1193 insertions(+), 176 deletions(-) diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp index 4ee7bf3256..f914847192 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp @@ -4,7 +4,6 @@ #include "common_header.hpp" #include "tensor_descriptor.hpp" #include "tensor_descriptor_helper.hpp" -#include "tensor_space_filling_curve.hpp" namespace ck { @@ -68,6 +67,8 @@ struct ThreadwiseTensorSliceTransfer_v1r3 using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})); + using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{})); + __device__ constexpr ThreadwiseTensorSliceTransfer_v1r3( const DstDesc& dst_desc, const Index& dst_slice_origin_idx, @@ -84,12 +85,16 @@ struct ThreadwiseTensorSliceTransfer_v1r3 dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx); } - template + template __device__ void Run(const SrcDesc&, const SrcSliceOriginIdx&, const SrcBuffer& src_buf, const DstDesc& dst_desc, - DstBuffer& dst_buf) + DstBuffer& dst_buf, + const DstStepHacks& dst_step_hacks) { static_assert(SrcDesc::IsKnownAtCompileTime(), "wrong! SrcDesc need to known at compile-time"); @@ -103,6 +108,9 @@ struct ThreadwiseTensorSliceTransfer_v1r3 constexpr auto src_desc = remove_cvref_t{}; constexpr auto src_slice_origin_idx = to_multi_index(SrcSliceOriginIdx{}); + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + // scalar per access on each dim // TODO: don't use lambda_scalar_per_access constexpr auto dst_scalar_per_access = generate_sequence( @@ -111,26 +119,85 @@ struct ThreadwiseTensorSliceTransfer_v1r3 constexpr auto dst_scalar_step_in_vector = generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); - using SpaceFillingCurve = SpaceFillingCurve>; + constexpr auto access_lengths = SliceLengths{} / dst_scalar_per_access; - // TODO: Use SpaceFillingCurve::ScalarsPerAccess instread of DstScalarPerVector? - static_assert(DstScalarPerVector == SpaceFillingCurve::ScalarPerVector, - "wrong!DstScalarPerVector != SpaceFillingCurve::ScalarPerVector"); - typename vector_type_maker::type dst_vector; - using dst_vector_t = typename vector_type_maker::type::type; + constexpr auto dim_access_order = DimAccessOrder{}; - constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess(); + constexpr auto ordered_access_lengths = + container_reorder_given_new2old(access_lengths, dim_access_order); - static_for<0, num_accesses, 1>{}([&](auto idx_1d) { - constexpr auto idx_md = SpaceFillingCurve::GetIndex(idx_1d); + // make forward steps + const auto dst_forward_steps = generate_tuple( + [&](auto i) { + Index forward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step_idx(j) = (i.value == j.value) ? 
dst_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step( + dst_desc, forward_step_idx, dst_step_hacks[I0][i]); + }, + Number{}); + + // make backward steps + const auto dst_backward_steps = generate_tuple( + [&](auto i) { + Index backward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step( + dst_desc, backward_step_idx, dst_step_hacks[I1][i]); + }, + Number{}); + + // loop over tensor and copy + static_ford{}([&](auto ordered_access_idx) { + // judge move forward or move backward + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_access_idx[I0]; + + static_for<1, i, 1>{}([&](auto j) { + tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j]; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate dst data index + constexpr auto dst_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] + ? ordered_access_idx[i] + : ordered_access_lengths[i] - 1 - ordered_access_idx[i]; + }); + + return container_reorder_given_old2new(ordered_idx, dim_access_order) * + dst_scalar_per_access; + }(); + + typename vector_type_maker::type dst_vector; + + using dst_vector_t = + typename vector_type_maker::type::type; // copy data from src_buf into dst_vector - // TODO: It's a hack here to use \p dst_scalar_step_in_vector. Use SpaceFillingCurve? static_for<0, DstScalarPerVector, 1>{}([&](auto i) { constexpr index_t src_offset = src_desc.CalculateOffset( - src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector); + src_slice_origin_idx + dst_data_idx + i * dst_scalar_step_in_vector); SrcData dst_v; @@ -145,18 +212,69 @@ struct ThreadwiseTensorSliceTransfer_v1r3 coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); // copy data from dst_vector into dst_buf - dst_buf.template Update( - dst_coord_.GetOffset(), - is_dst_valid, - dst_vector.template AsType()[Number<0>{}]); - - if constexpr(idx_1d.value != num_accesses - 1) + if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Set) { - constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d); - - move_tensor_coordinate( - dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step)); + dst_buf.template Set( + dst_coord_.GetOffset(), + is_dst_valid, + dst_vector.template AsType()[Number<0>{}]); } + else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::AtomicAdd) + { + dst_buf.template AtomicAdd( + dst_coord_.GetOffset(), + is_dst_valid, + dst_vector.template AsType()[Number<0>{}]); + } + else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Add) + { + + typename vector_type_maker::type tmp; + tmp.template AsType()(Number<0>{}) = + dst_buf.template Get(dst_coord_.GetOffset(), is_dst_valid); + + static_for<0, DstScalarPerVector, 1>{}([&](auto t) { + dst_vector.template AsType()(t) += tmp.template AsType()[t]; + }); + + dst_buf.template Set( + dst_coord_.GetOffset(), + is_dst_valid, + dst_vector.template AsType()[Number<0>{}]); + } + + constexpr auto move_on_dim = [&]() constexpr + { + StaticallyIndexedArray move_on_dim_; + + static_for<0, nDim, 1>{}([&](auto i) { + move_on_dim_(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1; + + static_for{}([&](auto j) { + move_on_dim_(i) &= ordered_access_idx[j] 
== ordered_access_lengths[j] - 1; + }); + }); + + return move_on_dim_; + } + (); + + // move + static_for<0, nDim, 1>{}([&](auto i) { + if constexpr(move_on_dim[i]) + { + if constexpr(forward_sweep[i]) + { + move_tensor_coordinate( + dst_desc, dst_coord_, dst_forward_steps[dim_access_order[i]]); + } + else + { + move_tensor_coordinate( + dst_desc, dst_coord_, dst_backward_steps[dim_access_order[i]]); + } + } + }); }); // move dst coordinate back to slice origin (or not) @@ -169,20 +287,82 @@ struct ThreadwiseTensorSliceTransfer_v1r3 } } + template + __device__ void Run(const SrcDesc&, + const SrcSliceOriginIdx&, + const SrcBuffer& src_buf, + const DstDesc& dst_desc, + DstBuffer& dst_buf) + { + constexpr index_t ntransform_dst = remove_cvref_t::GetNumOfTransform(); + + constexpr auto zeros = typename uniform_sequence_gen::type{}; + + constexpr auto dst_step_hacks = + make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), + generate_tuple([&](auto) { return zeros; }, Number{})); + + Run(SrcDesc{}, SrcSliceOriginIdx{}, src_buf, dst_desc, dst_buf, dst_step_hacks); + } + __device__ static constexpr auto GetDstCoordinateResetStep() { + constexpr auto I0 = Number<0>{}; + + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access constexpr auto dst_scalar_per_access = generate_sequence( detail::lambda_scalar_per_access{}, Number{}); - using SpaceFillingCurve = SpaceFillingCurve>; + constexpr auto access_lengths = SliceLengths{} / dst_scalar_per_access; - constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess(); - constexpr auto reset_step = - SpaceFillingCurve::GetStepBetween(Number{}, Number<0>{}); + constexpr auto dim_access_order = DimAccessOrder{}; - return reset_step; + constexpr auto ordered_access_lengths = + container_reorder_given_new2old(access_lengths, dim_access_order); + + // judge move forward or move backward during the last iteration + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_access_lengths[I0] - 1; + + static_for<1, i, 1>{}([&](auto j) { + tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate dst data index after last iteration in Run(), if it has not being reset by + // RunWrite() + constexpr auto dst_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? ordered_access_lengths[i] - 1 : 0; + }); + + return container_reorder_given_old2new(ordered_idx, dim_access_order) * + dst_scalar_per_access; + }(); + + // + constexpr auto reset_dst_data_step = [&]() { + Index reset_dst_data_step_; + + static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; }); + + return reset_dst_data_step_; + }(); + + return reset_dst_data_step; } // dst_slice_origin_step_idx need to be known at compile-time, for performance reason @@ -203,7 +383,7 @@ struct ThreadwiseTensorSliceTransfer_v1r3 private: DstCoord dst_coord_; const DstElementwiseOperation dst_element_op_; -}; // struct ThreadwiseTensorSliceTransfer_v1r3 +}; // namespace ck // Assume: // 1. 
src: @@ -248,12 +428,16 @@ struct ThreadwiseTensorSliceTransfer_v2 src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx); } - template + template __device__ void Run(const SrcDesc& src_desc, const SrcBuffer& src_buf, const DstDesc&, const DstSliceOriginIdx&, - DstBuffer& dst_buf) + DstBuffer& dst_buf, + const SrcStepHacks& src_step_hacks) { static_assert(DstDesc::IsKnownAtCompileTime(), "wrong! DstDesc need to known at compile-time"); @@ -269,6 +453,9 @@ struct ThreadwiseTensorSliceTransfer_v2 constexpr auto dst_desc = remove_cvref_t{}; constexpr auto dst_slice_origin_idx = DstSliceOriginIdx{}; + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + // scalar per access on each dim // TODO: don't use lambda_scalar_per_access constexpr auto src_scalar_per_access = generate_sequence( @@ -277,19 +464,80 @@ struct ThreadwiseTensorSliceTransfer_v2 constexpr auto src_scalar_step_in_vector = generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); - using SpaceFillingCurve = SpaceFillingCurve>; + constexpr auto access_lengths = SliceLengths{} / src_scalar_per_access; + + constexpr auto dim_access_order = DimAccessOrder{}; + + constexpr auto ordered_access_lengths = + container_reorder_given_new2old(access_lengths, dim_access_order); + + // make forward steps + const auto src_forward_steps = generate_tuple( + [&](auto i) { + Index forward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step( + src_desc, forward_step_idx, src_step_hacks[I0][i]); + }, + Number{}); + + // make backward steps + const auto src_backward_steps = generate_tuple( + [&](auto i) { + Index backward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step_idx(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step( + src_desc, backward_step_idx, src_step_hacks[I1][i]); + }, + Number{}); // loop over tensor and copy - constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess(); + static_ford{}([&](auto ordered_access_idx) { + // judge move forward or move backward + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_access_idx[I0]; + + static_for<1, i, 1>{}([&](auto j) { + tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j]; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate src data index + constexpr auto src_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] + ? 
ordered_access_idx[i] + : ordered_access_lengths[i] - 1 - ordered_access_idx[i]; + }); + + return container_reorder_given_old2new(ordered_idx, dim_access_order) * + src_scalar_per_access; + }(); - static_for<0, num_accesses, 1>{}([&](auto idx_1d) { typename vector_type_maker::type src_vector; using src_vector_t = typename vector_type_maker::type::type; - constexpr auto src_data_idx = SpaceFillingCurve::GetIndex(idx_1d); const bool is_src_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_); @@ -307,13 +555,38 @@ struct ThreadwiseTensorSliceTransfer_v2 dst_buf(Number{}) = src_vector.template AsType()[i]; }); - if constexpr(idx_1d.value != num_accesses - 1) + constexpr auto move_on_dim = [&]() constexpr { - constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d); + StaticallyIndexedArray move_on_dim_; - move_tensor_coordinate( - src_desc, src_coord_, make_tensor_coordinate_step(src_desc, forward_step)); + static_for<0, nDim, 1>{}([&](auto i) { + move_on_dim_(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1; + + static_for{}([&](auto j) { + move_on_dim_(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1; + }); + }); + + return move_on_dim_; } + (); + + // move + static_for<0, nDim, 1>{}([&](auto i) { + if constexpr(move_on_dim[i]) + { + if constexpr(forward_sweep[i]) + { + move_tensor_coordinate( + src_desc, src_coord_, src_forward_steps[dim_access_order[i]]); + } + else + { + move_tensor_coordinate( + src_desc, src_coord_, src_backward_steps[dim_access_order[i]]); + } + } + }); }); // move src coordinate back to slice origin (or not) @@ -326,20 +599,82 @@ struct ThreadwiseTensorSliceTransfer_v2 } } + template + __device__ void Run(const SrcDesc& src_desc, + const SrcBuffer& src_buf, + const DstDesc&, + const DstSliceOriginIdx&, + DstBuffer& dst_buf) + { + constexpr index_t ntransform_src = SrcDesc::GetNumOfTransform(); + + constexpr auto zeros = typename uniform_sequence_gen::type{}; + + constexpr auto src_step_hacks = + make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), + generate_tuple([&](auto) { return zeros; }, Number{})); + + Run(src_desc, src_buf, DstDesc{}, DstSliceOriginIdx{}, dst_buf, src_step_hacks); + } + __device__ static constexpr auto GetSrcCoordinateResetStep() { + constexpr auto I0 = Number<0>{}; + + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access constexpr auto src_scalar_per_access = generate_sequence( detail::lambda_scalar_per_access{}, Number{}); - using SpaceFillingCurve = SpaceFillingCurve>; + constexpr auto access_lengths = SliceLengths{} / src_scalar_per_access; - constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess(); - constexpr auto reset_step = - SpaceFillingCurve::GetStepBetween(Number{}, Number<0>{}); + constexpr auto dim_access_order = DimAccessOrder{}; - return reset_step; + constexpr auto ordered_access_lengths = + container_reorder_given_new2old(access_lengths, dim_access_order); + + // judge move forward or move backward during the last iteration + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_access_lengths[I0] - 1; + + static_for<1, i, 1>{}([&](auto j) { + tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate src data index after last iteration in Run(), if it has not being reset 
by + // RunWrite() + constexpr auto src_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? ordered_access_lengths[i] - 1 : 0; + }); + + return container_reorder_given_old2new(ordered_idx, dim_access_order) * + src_scalar_per_access; + }(); + + // + constexpr auto reset_src_data_step = [&]() { + Index reset_src_data_step_; + + static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step_(i) = -src_data_idx[i]; }); + + return reset_src_data_step_; + }(); + + return reset_src_data_step; } // dst_slice_origin_step_idx need to be known at compile-time, for performance reason diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r1.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r1.hpp index 0cc8aa2edd..b20b391196 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r1.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r1.hpp @@ -5,7 +5,6 @@ #include "tensor_descriptor.hpp" #include "tensor_descriptor_helper.hpp" #include "static_tensor.hpp" -#include "tensor_space_filling_curve.hpp" namespace ck { @@ -124,16 +123,73 @@ struct ThreadwiseTensorSliceTransfer_v3r1 constexpr auto src_scalar_per_access = generate_sequence( detail::lambda_scalar_per_access{}, Number{}); - using SpaceFillingCurve = SpaceFillingCurve>; + constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; - // loop over space-filling curve - constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess(); + constexpr auto src_dim_access_order = SrcDimAccessOrder{}; + + constexpr auto ordered_src_access_lengths = + container_reorder_given_new2old(src_access_lengths, src_dim_access_order); + + // make forward steps + const auto src_forward_steps = generate_tuple( + [&](auto i) { + Index forward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step(src_desc, forward_step_idx); + }, + Number{}); + + // make backward steps + const auto src_backward_steps = generate_tuple( + [&](auto i) { + Index backward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step_idx(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step(src_desc, backward_step_idx); + }, + Number{}); // loop over tensor and copy - static_for<0, num_accesses, 1>{}([&](auto idx_1d) { - constexpr auto src_data_idx = SpaceFillingCurve::GetIndex(idx_1d); + static_ford{}([&](auto ordered_src_access_idx) { + // judge move forward or move backward + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_src_access_idx[I0]; + + static_for<1, i, 1>{}([&](auto j) { + tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j]; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate src data index + constexpr auto src_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? 
ordered_src_access_idx[i] + : ordered_src_access_lengths[i] - 1 - + ordered_src_access_idx[i]; + }); + + return container_reorder_given_old2new(ordered_idx, src_dim_access_order) * + src_scalar_per_access; + }(); constexpr auto src_data_idx_seq = generate_sequence_v2( [&](auto i) { return Number{}; }, Number{}); @@ -162,13 +218,39 @@ struct ThreadwiseTensorSliceTransfer_v3r1 .template SetAsType( src_data_idx_seq, src_vector_container.template AsType()[I0]); - // move coordinate - if constexpr(idx_1d.value != num_accesses - 1) + constexpr auto move_on_dim = [&]() constexpr { - constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d); - move_tensor_coordinate( - src_desc, src_coord_, make_tensor_coordinate_step(src_desc, forward_step)); + StaticallyIndexedArray move_on_dim_; + + static_for<0, nDim, 1>{}([&](auto i) { + move_on_dim_(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1; + + static_for{}([&](auto j) { + move_on_dim_(i) &= + ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1; + }); + }); + + return move_on_dim_; } + (); + + // move src coord + static_for<0, nDim, 1>{}([&](auto i) { + if constexpr(move_on_dim[i]) + { + if constexpr(forward_sweep[i]) + { + move_tensor_coordinate( + src_desc, src_coord_, src_forward_steps[src_dim_access_order[i]]); + } + else + { + move_tensor_coordinate( + src_desc, src_coord_, src_backward_steps[src_dim_access_order[i]]); + } + } + }); }); // move src coordinate back to slice origin (or not) @@ -292,15 +374,73 @@ struct ThreadwiseTensorSliceTransfer_v3r1 constexpr auto dst_scalar_per_access = generate_sequence( detail::lambda_scalar_per_access{}, Number{}); - using SpaceFillingCurve = SpaceFillingCurve>; + constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; - constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess(); + constexpr auto dst_dim_access_order = DstDimAccessOrder{}; + + constexpr auto ordered_dst_access_lengths = + container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order); + + // make forward steps + const auto dst_forward_steps = generate_tuple( + [&](auto i) { + Index forward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step(dst_desc, forward_step_idx); + }, + Number{}); + + // make backward steps + const auto dst_backward_steps = generate_tuple( + [&](auto i) { + Index backward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step(dst_desc, backward_step_idx); + }, + Number{}); // loop over tensor and copy - static_for<0, num_accesses, 1>{}([&](auto idx_1d) { - constexpr auto dst_data_idx = SpaceFillingCurve::GetIndex(idx_1d); + static_ford{}([&](auto ordered_dst_access_idx) { + // judge move forward or move backward + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_dst_access_idx[I0]; + + static_for<1, i, 1>{}([&](auto j) { + tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j]; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate dst data index + constexpr auto dst_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? 
ordered_dst_access_idx[i] + : ordered_dst_access_lengths[i] - 1 - + ordered_dst_access_idx[i]; + }); + + return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) * + dst_scalar_per_access; + }(); constexpr auto dst_data_idx_seq = generate_sequence_v2( [&](auto i) { return Number{}; }, Number{}); @@ -330,13 +470,39 @@ struct ThreadwiseTensorSliceTransfer_v3r1 is_dst_valid, dst_vector_container.template AsType()[I0]); - // move coordinate - if constexpr(idx_1d.value != num_accesses - 1) + constexpr auto move_on_dim = [&]() constexpr { - constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d); - move_tensor_coordinate( - dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step)); + StaticallyIndexedArray move_on_dim_; + + static_for<0, nDim, 1>{}([&](auto i) { + move_on_dim_(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1; + + static_for{}([&](auto j) { + move_on_dim_(i) &= + ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1; + }); + }); + + return move_on_dim_; } + (); + + // move dst coord + static_for<0, nDim, 1>{}([&](auto i) { + if constexpr(move_on_dim[i]) + { + if constexpr(forward_sweep[i]) + { + move_tensor_coordinate( + dst_desc, dst_coord_, dst_forward_steps[dst_dim_access_order[i]]); + } + else + { + move_tensor_coordinate( + dst_desc, dst_coord_, dst_backward_steps[dst_dim_access_order[i]]); + } + } + }); }); // move dst coordinate back to slice origin (or not) @@ -356,15 +522,55 @@ struct ThreadwiseTensorSliceTransfer_v3r1 constexpr auto src_scalar_per_access = generate_sequence( detail::lambda_scalar_per_access{}, Number{}); - using SpaceFillingCurve = SpaceFillingCurve>; + constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; - constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess(); - constexpr auto reset_step = - SpaceFillingCurve::GetStepBetween(Number{}, Number<0>{}); + constexpr auto src_dim_access_order = SrcDimAccessOrder{}; - return reset_step; + constexpr auto ordered_src_access_lengths = + container_reorder_given_new2old(src_access_lengths, src_dim_access_order); + + // judge move forward or move backward during the last iteration + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_src_access_lengths[I0] - 1; + + static_for<1, i, 1>{}([&](auto j) { + tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate src data index after last iteration in RunRead(), if it has not being reset by + // RunRead() + constexpr auto src_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? 
ordered_src_access_lengths[i] - 1 : 0; + }); + + return container_reorder_given_old2new(ordered_idx, src_dim_access_order) * + src_scalar_per_access; + }(); + + // + constexpr auto reset_src_data_step = [&]() { + Index reset_src_data_step_; + + static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step_(i) = -src_data_idx[i]; }); + + return reset_src_data_step_; + }(); + + return reset_src_data_step; } __device__ static constexpr auto GetDstCoordinateResetStep() @@ -374,15 +580,55 @@ struct ThreadwiseTensorSliceTransfer_v3r1 constexpr auto dst_scalar_per_access = generate_sequence( detail::lambda_scalar_per_access{}, Number{}); - using SpaceFillingCurve = SpaceFillingCurve>; + constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; - constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess(); - constexpr auto reset_step = - SpaceFillingCurve::GetStepBetween(Number{}, Number<0>{}); + constexpr auto dst_dim_access_order = DstDimAccessOrder{}; - return reset_step; + constexpr auto ordered_dst_access_lengths = + container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order); + + // judge move forward or move backward during the last iteration + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_dst_access_lengths[I0] - 1; + + static_for<1, i, 1>{}([&](auto j) { + tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate dst data index after last iteration in RunWrite(), if it has not being reset by + // RunWrite() + constexpr auto dst_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? 
ordered_dst_access_lengths[i] - 1 : 0; + }); + + return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) * + dst_scalar_per_access; + }(); + + // + constexpr auto reset_dst_data_step = [&]() { + Index reset_dst_data_step_; + + static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; }); + + return reset_dst_data_step_; + }(); + + return reset_dst_data_step; } // src_slice_origin_step_idx need to be known at compile-time, for performance reason diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v6r1.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v6r1.hpp index 85baf060be..6cdb142e76 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v6r1.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v6r1.hpp @@ -4,7 +4,6 @@ #include "common_header.hpp" #include "tensor_descriptor.hpp" #include "tensor_descriptor_helper.hpp" -#include "tensor_space_filling_curve.hpp" namespace ck { @@ -41,6 +40,9 @@ struct ThreadwiseTensorSliceTransfer_v6r1 using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})); using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})); + using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})); + using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{})); + static constexpr auto I0 = Number<0>{}; __device__ constexpr ThreadwiseTensorSliceTransfer_v6r1(const SrcDesc& src_desc, @@ -77,14 +79,70 @@ struct ThreadwiseTensorSliceTransfer_v6r1 constexpr auto scalar_per_access = generate_sequence( detail::lambda_scalar_per_access{}, Number{}); - using SpaceFillingCurve = SpaceFillingCurve>; + constexpr auto access_lengths = SliceLengths{} / scalar_per_access; - // loop over space-filling curve - constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess(); + constexpr auto dim_access_order = DimAccessOrder{}; + + constexpr auto ordered_access_lengths = + container_reorder_given_new2old(access_lengths, dim_access_order); + + auto make_forward_steps = [&](auto desc) { + return generate_tuple( + [&](auto i) { + Index forward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step_idx(j) = (i.value == j.value) ? scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step(desc, forward_step_idx); + }, + Number{}); + }; + + auto make_backward_steps = [&](auto desc) { + return generate_tuple( + [&](auto i) { + Index backward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step_idx(j) = (i.value == j.value) ? 
-scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step(desc, backward_step_idx); + }, + Number{}); + }; + + // make forward steps + const auto src_forward_steps = make_forward_steps(src_desc); + const auto dst_forward_steps = make_forward_steps(dst_desc); + + // make backward steps + const auto src_backward_steps = make_backward_steps(src_desc); + const auto dst_backward_steps = make_backward_steps(dst_desc); + + // loop over slice window + static_ford{}([&](auto ordered_access_idx) { + // judge move forward or move backward + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_access_idx[I0]; + + static_for<1, i, 1>{}([&](auto j) { + tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j]; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); - static_for<0, num_accesses, 1>{}([&](auto idx_1d) { using src_vector_type = vector_type_maker_t; using src_vector_t = typename src_vector_type::type; @@ -110,20 +168,59 @@ struct ThreadwiseTensorSliceTransfer_v6r1 coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); // copy data from dst_vector into dst_buf - dst_buf.template Update( - dst_coord_.GetOffset(), - is_dst_valid, - dst_vector_container.template AsType()[I0]); + if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Set) + { + dst_buf.template Set( + dst_coord_.GetOffset(), + is_dst_valid, + dst_vector_container.template AsType()[I0]); + } + else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::AtomicAdd) + { + dst_buf.template AtomicAdd( + dst_coord_.GetOffset(), + is_dst_valid, + dst_vector_container.template AsType()[I0]); + } + + constexpr auto move_on_dim = [&]() constexpr + { + StaticallyIndexedArray move_on_dim_; + + static_for<0, nDim, 1>{}([&](auto i) { + move_on_dim_(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1; + + static_for{}([&](auto j) { + move_on_dim_(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1; + }); + }); + + return move_on_dim_; + } + (); // move coordinate - if constexpr(idx_1d.value != num_accesses - 1) - { - constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d); - move_tensor_coordinate( - src_desc, src_coord_, make_tensor_coordinate_step(src_desc, forward_step)); - move_tensor_coordinate( - dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step)); - } + static_for<0, nDim, 1>{}([&](auto i) { + if constexpr(move_on_dim[i]) + { + if constexpr(forward_sweep[i]) + { + move_tensor_coordinate( + src_desc, src_coord_, src_forward_steps[dim_access_order[i]]); + + move_tensor_coordinate( + dst_desc, dst_coord_, dst_forward_steps[dim_access_order[i]]); + } + else + { + move_tensor_coordinate( + src_desc, src_coord_, src_backward_steps[dim_access_order[i]]); + + move_tensor_coordinate( + dst_desc, dst_coord_, dst_backward_steps[dim_access_order[i]]); + } + } + }); }); // move coordinate back to slice origin (or not) @@ -146,18 +243,59 @@ struct ThreadwiseTensorSliceTransfer_v6r1 __device__ static constexpr auto GetCoordinateResetStep() { + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access constexpr auto scalar_per_access = generate_sequence( detail::lambda_scalar_per_access{}, Number{}); - using SpaceFillingCurve = SpaceFillingCurve>; + constexpr auto access_lengths = SliceLengths{} / scalar_per_access; - constexpr auto num_accesses = 
SpaceFillingCurve::GetNumOfAccess(); - constexpr auto reset_step = - SpaceFillingCurve::GetStepBetween(Number{}, Number<0>{}); + constexpr auto dim_access_order = DimAccessOrder{}; - return reset_step; + constexpr auto ordered_access_lengths = + container_reorder_given_new2old(access_lengths, dim_access_order); + + // judge move forward or move backward during the last iteration + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_access_lengths[I0] - 1; + + static_for<1, i, 1>{}([&](auto j) { + tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate data index after last iteration in Run(), if it has not being reset + constexpr auto data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? ordered_access_lengths[i] - 1 : 0; + }); + + return container_reorder_given_old2new(ordered_idx, dim_access_order) * + scalar_per_access; + }(); + + // + constexpr auto reset_data_step = [&]() { + Index reset_data_step_; + + static_for<0, nDim, 1>{}([&](auto i) { reset_data_step_(i) = -data_idx[i]; }); + + return reset_data_step_; + }(); + + return reset_data_step; } // src_slice_origin_step_idx need to be known at compile-time, for performance reason @@ -194,7 +332,7 @@ struct ThreadwiseTensorSliceTransfer_v6r1 SrcCoord src_coord_; DstCoord dst_coord_; const ElementwiseOperation element_op_; -}; // namespace ck +}; } // namespace ck #endif diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v6r2.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v6r2.hpp index 8e578ab989..a65c275744 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v6r2.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v6r2.hpp @@ -4,7 +4,6 @@ #include "common_header.hpp" #include "tensor_descriptor.hpp" #include "tensor_descriptor_helper.hpp" -#include "tensor_space_filling_curve.hpp" namespace ck { @@ -45,6 +44,10 @@ struct ThreadwiseTensorSliceTransfer_v6r2 using Src1Coord = decltype(make_tensor_coordinate(Src1Desc{}, Index{})); using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})); + using Src0CoordStep = decltype(make_tensor_coordinate_step(Src0Desc{}, Index{})); + using Src1CoordStep = decltype(make_tensor_coordinate_step(Src1Desc{}, Index{})); + using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{})); + static constexpr auto I0 = Number<0>{}; __device__ constexpr ThreadwiseTensorSliceTransfer_v6r2(const Src0Desc& src0_desc, @@ -93,14 +96,72 @@ struct ThreadwiseTensorSliceTransfer_v6r2 constexpr auto scalar_per_access = generate_sequence( detail::lambda_scalar_per_access{}, Number{}); - using SpaceFillingCurve = SpaceFillingCurve>; + constexpr auto access_lengths = SliceLengths{} / scalar_per_access; - constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess(); + constexpr auto dim_access_order = DimAccessOrder{}; + + constexpr auto ordered_access_lengths = + container_reorder_given_new2old(access_lengths, dim_access_order); + + auto make_forward_steps = [&](auto desc) { + return generate_tuple( + [&](auto i) { + Index forward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step_idx(j) = (i.value == j.value) ? 
scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step(desc, forward_step_idx); + }, + Number{}); + }; + + auto make_backward_steps = [&](auto desc) { + return generate_tuple( + [&](auto i) { + Index backward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step_idx(j) = (i.value == j.value) ? -scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step(desc, backward_step_idx); + }, + Number{}); + }; + + // make forward steps + const auto src0_forward_steps = make_forward_steps(src0_desc); + const auto src1_forward_steps = make_forward_steps(src1_desc); + const auto dst_forward_steps = make_forward_steps(dst_desc); + + // make backward steps + const auto src0_backward_steps = make_backward_steps(src0_desc); + const auto src1_backward_steps = make_backward_steps(src1_desc); + const auto dst_backward_steps = make_backward_steps(dst_desc); + + // loop over slice window + static_ford{}([&](auto ordered_access_idx) { + // judge move forward or move backward + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_access_idx[I0]; + + static_for<1, i, 1>{}([&](auto j) { + tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j]; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); - // loop over space-filling curve - static_for<0, num_accesses, 1>{}([&](auto idx_1d) { using src0_vector_type = vector_type_maker_t; using src0_vector_t = typename src0_vector_type::type; @@ -136,22 +197,65 @@ struct ThreadwiseTensorSliceTransfer_v6r2 coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); // copy data from dst_vector into dst_buf - dst_buf.template Update( - dst_coord_.GetOffset(), - is_dst_valid, - dst_vector_container.template AsType()[I0]); + if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Set) + { + dst_buf.template Set( + dst_coord_.GetOffset(), + is_dst_valid, + dst_vector_container.template AsType()[I0]); + } + else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::AtomicAdd) + { + dst_buf.template AtomicAdd( + dst_coord_.GetOffset(), + is_dst_valid, + dst_vector_container.template AsType()[I0]); + } + + constexpr auto move_on_dim = [&]() constexpr + { + StaticallyIndexedArray move_on_dim_; + + static_for<0, nDim, 1>{}([&](auto i) { + move_on_dim_(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1; + + static_for{}([&](auto j) { + move_on_dim_(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1; + }); + }); + + return move_on_dim_; + } + (); // move coordinate - if constexpr(idx_1d.value != num_accesses - 1) - { - constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d); - move_tensor_coordinate( - src0_desc, src0_coord_, make_tensor_coordinate_step(src0_desc, forward_step)); - move_tensor_coordinate( - src1_desc, src1_coord_, make_tensor_coordinate_step(src1_desc, forward_step)); - move_tensor_coordinate( - dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step)); - } + static_for<0, nDim, 1>{}([&](auto i) { + if constexpr(move_on_dim[i]) + { + if constexpr(forward_sweep[i]) + { + move_tensor_coordinate( + src0_desc, src0_coord_, src0_forward_steps[dim_access_order[i]]); + + move_tensor_coordinate( + src1_desc, src1_coord_, src1_forward_steps[dim_access_order[i]]); + + move_tensor_coordinate( + dst_desc, dst_coord_, dst_forward_steps[dim_access_order[i]]); + } + else + { + move_tensor_coordinate( + 
src0_desc, src0_coord_, src0_backward_steps[dim_access_order[i]]); + + move_tensor_coordinate( + src1_desc, src1_coord_, src1_backward_steps[dim_access_order[i]]); + + move_tensor_coordinate( + dst_desc, dst_coord_, dst_backward_steps[dim_access_order[i]]); + } + } + }); }); // move coordinate back to slice origin (or not) @@ -182,18 +286,59 @@ struct ThreadwiseTensorSliceTransfer_v6r2 __device__ static constexpr auto GetCoordinateResetStep() { + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access constexpr auto scalar_per_access = generate_sequence( detail::lambda_scalar_per_access{}, Number{}); - using SpaceFillingCurve = SpaceFillingCurve>; + constexpr auto access_lengths = SliceLengths{} / scalar_per_access; - constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess(); - constexpr auto reset_step = - SpaceFillingCurve::GetStepBetween(Number{}, Number<0>{}); + constexpr auto dim_access_order = DimAccessOrder{}; - return reset_step; + constexpr auto ordered_access_lengths = + container_reorder_given_new2old(access_lengths, dim_access_order); + + // judge move forward or move backward during the last iteration + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_access_lengths[I0] - 1; + + static_for<1, i, 1>{}([&](auto j) { + tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate data index after last iteration in Run(), if it has not being reset + constexpr auto data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? 
ordered_access_lengths[i] - 1 : 0; + }); + + return container_reorder_given_old2new(ordered_idx, dim_access_order) * + scalar_per_access; + }(); + + // + constexpr auto reset_data_step = [&]() { + Index reset_data_step_; + + static_for<0, nDim, 1>{}([&](auto i) { reset_data_step_(i) = -data_idx[i]; }); + + return reset_data_step_; + }(); + + return reset_data_step; } // src_slice_origin_step_idx need to be known at compile-time, for performance reason diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v6r3.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v6r3.hpp index 4c2398b093..c7590d904c 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v6r3.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v6r3.hpp @@ -4,7 +4,6 @@ #include "common_header.hpp" #include "tensor_descriptor.hpp" #include "tensor_descriptor_helper.hpp" -#include "tensor_space_filling_curve.hpp" namespace ck { @@ -49,6 +48,11 @@ struct ThreadwiseTensorSliceTransfer_v6r3 using Src2Coord = decltype(make_tensor_coordinate(Src2Desc{}, Index{})); using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})); + using Src0CoordStep = decltype(make_tensor_coordinate_step(Src0Desc{}, Index{})); + using Src1CoordStep = decltype(make_tensor_coordinate_step(Src1Desc{}, Index{})); + using Src2CoordStep = decltype(make_tensor_coordinate_step(Src2Desc{}, Index{})); + using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{})); + static constexpr auto I0 = Number<0>{}; __device__ constexpr ThreadwiseTensorSliceTransfer_v6r3(const Src0Desc& src0_desc, @@ -108,14 +112,74 @@ struct ThreadwiseTensorSliceTransfer_v6r3 constexpr auto scalar_per_access = generate_sequence( detail::lambda_scalar_per_access{}, Number{}); - using SpaceFillingCurve = SpaceFillingCurve>; + constexpr auto access_lengths = SliceLengths{} / scalar_per_access; - constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess(); + constexpr auto dim_access_order = DimAccessOrder{}; + + constexpr auto ordered_access_lengths = + container_reorder_given_new2old(access_lengths, dim_access_order); + + auto make_forward_steps = [&](auto desc) { + return generate_tuple( + [&](auto i) { + Index forward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step_idx(j) = (i.value == j.value) ? scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step(desc, forward_step_idx); + }, + Number{}); + }; + + auto make_backward_steps = [&](auto desc) { + return generate_tuple( + [&](auto i) { + Index backward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step_idx(j) = (i.value == j.value) ? 
-scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step(desc, backward_step_idx); + }, + Number{}); + }; + + // make forward steps + const auto src0_forward_steps = make_forward_steps(src0_desc); + const auto src1_forward_steps = make_forward_steps(src1_desc); + const auto src2_forward_steps = make_forward_steps(src2_desc); + const auto dst_forward_steps = make_forward_steps(dst_desc); + + // make backward steps + const auto src0_backward_steps = make_backward_steps(src0_desc); + const auto src1_backward_steps = make_backward_steps(src1_desc); + const auto src2_backward_steps = make_backward_steps(src2_desc); + const auto dst_backward_steps = make_backward_steps(dst_desc); + + // loop over slice window + static_ford{}([&](auto ordered_access_idx) { + // judge move forward or move backward + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_access_idx[I0]; + + static_for<1, i, 1>{}([&](auto j) { + tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j]; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); - // loop over space-filling curve - static_for<0, num_accesses, 1>{}([&](auto idx_1d) { using src0_vector_type = vector_type_maker_t; using src0_vector_t = typename src0_vector_type::type; @@ -160,24 +224,72 @@ struct ThreadwiseTensorSliceTransfer_v6r3 const bool is_dst_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); - dst_buf.template Update( - dst_coord_.GetOffset(), - is_dst_valid, - dst_vector_container.template AsType()[I0]); + // copy data from dst_vector into dst_buf + if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Set) + { + dst_buf.template Set( + dst_coord_.GetOffset(), + is_dst_valid, + dst_vector_container.template AsType()[I0]); + } + else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::AtomicAdd) + { + dst_buf.template AtomicAdd( + dst_coord_.GetOffset(), + is_dst_valid, + dst_vector_container.template AsType()[I0]); + } + + constexpr auto move_on_dim = [&]() constexpr + { + StaticallyIndexedArray move_on_dim_; + + static_for<0, nDim, 1>{}([&](auto i) { + move_on_dim_(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1; + + static_for{}([&](auto j) { + move_on_dim_(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1; + }); + }); + + return move_on_dim_; + } + (); // move coordinate - if constexpr(idx_1d.value != num_accesses - 1) - { - constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d); - move_tensor_coordinate( - src0_desc, src0_coord_, make_tensor_coordinate_step(src0_desc, forward_step)); - move_tensor_coordinate( - src1_desc, src1_coord_, make_tensor_coordinate_step(src1_desc, forward_step)); - move_tensor_coordinate( - src2_desc, src2_coord_, make_tensor_coordinate_step(src2_desc, forward_step)); - move_tensor_coordinate( - dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step)); - } + static_for<0, nDim, 1>{}([&](auto i) { + if constexpr(move_on_dim[i]) + { + if constexpr(forward_sweep[i]) + { + move_tensor_coordinate( + src0_desc, src0_coord_, src0_forward_steps[dim_access_order[i]]); + + move_tensor_coordinate( + src1_desc, src1_coord_, src1_forward_steps[dim_access_order[i]]); + + move_tensor_coordinate( + src2_desc, src2_coord_, src2_forward_steps[dim_access_order[i]]); + + move_tensor_coordinate( + dst_desc, dst_coord_, dst_forward_steps[dim_access_order[i]]); + } + else + 
{
+                    move_tensor_coordinate(
+                        src0_desc, src0_coord_, src0_backward_steps[dim_access_order[i]]);
+
+                    move_tensor_coordinate(
+                        src1_desc, src1_coord_, src1_backward_steps[dim_access_order[i]]);
+
+                    move_tensor_coordinate(
+                        src2_desc, src2_coord_, src2_backward_steps[dim_access_order[i]]);
+
+                    move_tensor_coordinate(
+                        dst_desc, dst_coord_, dst_backward_steps[dim_access_order[i]]);
+                }
+            }
+        });
         });
 
         // move coordinate back to slice origin (or not)
@@ -216,18 +328,59 @@
     __device__ static constexpr auto GetCoordinateResetStep()
     {
+        // scalar per access on each dim
+        // TODO: don't use lambda_scalar_per_access
         constexpr auto scalar_per_access = generate_sequence(
             detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});
 
-        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
-                                                    DimAccessOrder,
-                                                    remove_cv_t<decltype(scalar_per_access)>>;
+        constexpr auto access_lengths = SliceLengths{} / scalar_per_access;
 
-        constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess();
-        constexpr auto reset_step =
-            SpaceFillingCurve::GetStepBetween(Number<num_accesses - 1>{}, Number<0>{});
+        constexpr auto dim_access_order = DimAccessOrder{};
 
-        return reset_step;
+        constexpr auto ordered_access_lengths =
+            container_reorder_given_new2old(access_lengths, dim_access_order);
+
+        // judge move forward or move backward during the last iteration
+        constexpr auto forward_sweep = [&]() {
+            StaticallyIndexedArray<bool, nDim> forward_sweep_;
+
+            forward_sweep_(I0) = true;
+
+            static_for<1, nDim, 1>{}([&](auto i) {
+                index_t tmp = ordered_access_lengths[I0] - 1;
+
+                static_for<1, i, 1>{}([&](auto j) {
+                    tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1;
+                });
+
+                forward_sweep_(i) = tmp % 2 == 0;
+            });
+
+            return forward_sweep_;
+        }();
+
+        // calculate data index after the last iteration in Run(), if it has not been reset
+        constexpr auto data_idx = [&]() {
+            Index ordered_idx;
+
+            static_for<0, nDim, 1>{}([&](auto i) {
+                ordered_idx(i) = forward_sweep[i] ? ordered_access_lengths[i] - 1 : 0;
+            });
+
+            return container_reorder_given_old2new(ordered_idx, dim_access_order) *
+                   scalar_per_access;
+        }();
+
+        // step to move the coordinate back to the slice origin
+        constexpr auto reset_data_step = [&]() {
+            Index reset_data_step_;
+
+            static_for<0, nDim, 1>{}([&](auto i) { reset_data_step_(i) = -data_idx[i]; });
+
+            return reset_data_step_;
+        }();
+
+        return reset_data_step;
    }

    // src_slice_origin_step_idx need to be known at compile-time, for performance reason
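
Reviewer note on the restored logic: the forward_sweep bookkeeping this revert brings back walks the slice window in a snake (boustrophedon) order, so consecutive accesses differ by a single forward or backward step along exactly one dimension. Below is a minimal host-side C++ sketch of that ordering. It is not code from this patch: the 2x4 access grid and all variable names are hypothetical stand-ins for the compile-time machinery above (static_ford, StaticallyIndexedArray, make_tensor_coordinate_step).

    #include <cstdio>

    int main()
    {
        // hypothetical 2x4 grid of vector accesses ("access_lengths" in the patch)
        const int lengths[2] = {2, 4};

        for(int i0 = 0; i0 < lengths[0]; ++i0)
        {
            for(int i1 = 0; i1 < lengths[1]; ++i1)
            {
                // forward_sweep: dim 0 always sweeps forward; dim 1 sweeps
                // forward iff the flattened index of the dims before it
                // (here just i0) is even
                const bool forward1 = (i0 % 2 == 0);

                // data index actually visited ("data_idx" in the patch)
                const int idx1 = forward1 ? i1 : lengths[1] - 1 - i1;

                std::printf("(%d, %d)\n", i0, idx1);
            }
        }
        return 0;
    }

This prints (0,0) (0,1) (0,2) (0,3) (1,3) (1,2) (1,1) (1,0): each transition is one move_tensor_coordinate with a precomputed forward or backward step, which is why Run() never recomputes a full offset between accesses.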
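In the same spirit, GetCoordinateResetStep() can be read as: take the data index where the snake path ends, negate it component-wise, and that is the step back to the slice origin. A hedged sketch under the same hypothetical 2x4 grid, assuming a 4-wide vector access on the last dimension:

    #include <cstdio>

    int main()
    {
        const int lengths[2]           = {2, 4}; // "ordered_access_lengths"
        const int scalar_per_access[2] = {1, 4}; // 4-wide vector on dim 1

        // forward_sweep during the last iteration: tmp = lengths[0] - 1
        const bool forward1 = ((lengths[0] - 1) % 2 == 0);

        // data index where the final access leaves the coordinate
        const int data_idx[2] = {(lengths[0] - 1) * scalar_per_access[0],
                                 (forward1 ? lengths[1] - 1 : 0) * scalar_per_access[1]};

        // reset step = component-wise negation, a jump back to the slice origin
        std::printf("reset step: (%d, %d)\n", -data_idx[0], -data_idx[1]);
        return 0;
    }

With lengths {2, 4} the path ends at data index (1, 0), so the reset step is (-1, 0); this is the same quantity the removed SpaceFillingCurve::GetStepBetween(Number<num_accesses - 1>{}, Number<0>{}) used to compute.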