diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp index f914847192..524da47e24 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp @@ -4,6 +4,7 @@ #include "common_header.hpp" #include "tensor_descriptor.hpp" #include "tensor_descriptor_helper.hpp" +#include "tensor_space_filling_curve.hpp" namespace ck { @@ -85,16 +86,12 @@ struct ThreadwiseTensorSliceTransfer_v1r3 dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx); } - template + template __device__ void Run(const SrcDesc&, const SrcSliceOriginIdx&, const SrcBuffer& src_buf, const DstDesc& dst_desc, - DstBuffer& dst_buf, - const DstStepHacks& dst_step_hacks) + DstBuffer& dst_buf) { static_assert(SrcDesc::IsKnownAtCompileTime(), "wrong! SrcDesc need to known at compile-time"); @@ -108,9 +105,6 @@ struct ThreadwiseTensorSliceTransfer_v1r3 constexpr auto src_desc = remove_cvref_t{}; constexpr auto src_slice_origin_idx = to_multi_index(SrcSliceOriginIdx{}); - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - // scalar per access on each dim // TODO: don't use lambda_scalar_per_access constexpr auto dst_scalar_per_access = generate_sequence( @@ -119,85 +113,26 @@ struct ThreadwiseTensorSliceTransfer_v1r3 constexpr auto dst_scalar_step_in_vector = generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); - constexpr auto access_lengths = SliceLengths{} / dst_scalar_per_access; + using SpaceFillingCurve = SpaceFillingCurve>; - constexpr auto dim_access_order = DimAccessOrder{}; + // TODO: Use SpaceFillingCurve::ScalarsPerAccess instread of DstScalarPerVector? + static_assert(DstScalarPerVector == SpaceFillingCurve::ScalarPerVector, + "wrong!DstScalarPerVector != SpaceFillingCurve::ScalarPerVector"); + typename vector_type_maker::type dst_vector; + using dst_vector_t = typename vector_type_maker::type::type; - constexpr auto ordered_access_lengths = - container_reorder_given_new2old(access_lengths, dim_access_order); + constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess(); - // make forward steps - const auto dst_forward_steps = generate_tuple( - [&](auto i) { - Index forward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step( - dst_desc, forward_step_idx, dst_step_hacks[I0][i]); - }, - Number{}); - - // make backward steps - const auto dst_backward_steps = generate_tuple( - [&](auto i) { - Index backward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step( - dst_desc, backward_step_idx, dst_step_hacks[I1][i]); - }, - Number{}); - - // loop over tensor and copy - static_ford{}([&](auto ordered_access_idx) { - // judge move forward or move backward - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, nDim, 1>{}([&](auto i) { - index_t tmp = ordered_access_idx[I0]; - - static_for<1, i, 1>{}([&](auto j) { - tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j]; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); - - // calculate dst data index - constexpr auto dst_data_idx = [&]() { - Index ordered_idx; - - static_for<0, nDim, 1>{}([&](auto i) { - ordered_idx(i) = forward_sweep[i] - ? ordered_access_idx[i] - : ordered_access_lengths[i] - 1 - ordered_access_idx[i]; - }); - - return container_reorder_given_old2new(ordered_idx, dim_access_order) * - dst_scalar_per_access; - }(); - - typename vector_type_maker::type dst_vector; - - using dst_vector_t = - typename vector_type_maker::type::type; + static_for<0, num_access, 1>{}([&](auto idx_1d) { + constexpr auto idx_md = SpaceFillingCurve::GetIndex(idx_1d); // copy data from src_buf into dst_vector + // TODO: It's a hack here to use \p dst_scalar_step_in_vector. Use SpaceFillingCurve? static_for<0, DstScalarPerVector, 1>{}([&](auto i) { constexpr index_t src_offset = src_desc.CalculateOffset( - src_slice_origin_idx + dst_data_idx + i * dst_scalar_step_in_vector); + src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector); SrcData dst_v; @@ -212,69 +147,18 @@ struct ThreadwiseTensorSliceTransfer_v1r3 coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); // copy data from dst_vector into dst_buf - if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Set) + dst_buf.template Update( + dst_coord_.GetOffset(), + is_dst_valid, + dst_vector.template AsType()[Number<0>{}]); + + if constexpr(idx_1d.value != num_access - 1) { - dst_buf.template Set( - dst_coord_.GetOffset(), - is_dst_valid, - dst_vector.template AsType()[Number<0>{}]); + constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d); + + move_tensor_coordinate( + dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step)); } - else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::AtomicAdd) - { - dst_buf.template AtomicAdd( - dst_coord_.GetOffset(), - is_dst_valid, - dst_vector.template AsType()[Number<0>{}]); - } - else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Add) - { - - typename vector_type_maker::type tmp; - tmp.template AsType()(Number<0>{}) = - dst_buf.template Get(dst_coord_.GetOffset(), is_dst_valid); - - static_for<0, DstScalarPerVector, 1>{}([&](auto t) { - dst_vector.template AsType()(t) += tmp.template AsType()[t]; - }); - - dst_buf.template Set( - dst_coord_.GetOffset(), - is_dst_valid, - dst_vector.template AsType()[Number<0>{}]); - } - - constexpr auto move_on_dim = [&]() constexpr - { - StaticallyIndexedArray move_on_dim_; - - static_for<0, nDim, 1>{}([&](auto i) { - move_on_dim_(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1; - - static_for{}([&](auto j) { - move_on_dim_(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1; - }); - }); - - return move_on_dim_; - } - (); - - // move - static_for<0, nDim, 1>{}([&](auto i) { - if constexpr(move_on_dim[i]) - { - if constexpr(forward_sweep[i]) - { - move_tensor_coordinate( - dst_desc, dst_coord_, dst_forward_steps[dim_access_order[i]]); - } - else - { - move_tensor_coordinate( - dst_desc, dst_coord_, dst_backward_steps[dim_access_order[i]]); - } - } - }); }); // move dst coordinate back to slice origin (or not) @@ -287,82 +171,27 @@ struct ThreadwiseTensorSliceTransfer_v1r3 } } - template - __device__ void Run(const SrcDesc&, - const SrcSliceOriginIdx&, - const SrcBuffer& src_buf, - const DstDesc& dst_desc, - DstBuffer& dst_buf) - { - constexpr index_t ntransform_dst = remove_cvref_t::GetNumOfTransform(); - - constexpr auto zeros = typename uniform_sequence_gen::type{}; - - constexpr auto dst_step_hacks = - make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), - generate_tuple([&](auto) { return zeros; }, Number{})); - - Run(SrcDesc{}, SrcSliceOriginIdx{}, src_buf, dst_desc, dst_buf, dst_step_hacks); - } - __device__ static constexpr auto GetDstCoordinateResetStep() { - constexpr auto I0 = Number<0>{}; - - // scalar per access on each dim - // TODO: don't use lambda_scalar_per_access constexpr auto dst_scalar_per_access = generate_sequence( detail::lambda_scalar_per_access{}, Number{}); - constexpr auto access_lengths = SliceLengths{} / dst_scalar_per_access; + using SpaceFillingCurve = SpaceFillingCurve>; - constexpr auto dim_access_order = DimAccessOrder{}; + constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess(); + if constexpr(num_access == 0) + { + return typename SpaceFillingCurve::Index{}; + } + else + { + constexpr auto reset_step = + SpaceFillingCurve::GetStepBetween(Number{}, Number<0>{}); - constexpr auto ordered_access_lengths = - container_reorder_given_new2old(access_lengths, dim_access_order); - - // judge move forward or move backward during the last iteration - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, nDim, 1>{}([&](auto i) { - index_t tmp = ordered_access_lengths[I0] - 1; - - static_for<1, i, 1>{}([&](auto j) { - tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); - - // calculate dst data index after last iteration in Run(), if it has not being reset by - // RunWrite() - constexpr auto dst_data_idx = [&]() { - Index ordered_idx; - - static_for<0, nDim, 1>{}([&](auto i) { - ordered_idx(i) = forward_sweep[i] ? ordered_access_lengths[i] - 1 : 0; - }); - - return container_reorder_given_old2new(ordered_idx, dim_access_order) * - dst_scalar_per_access; - }(); - - // - constexpr auto reset_dst_data_step = [&]() { - Index reset_dst_data_step_; - - static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; }); - - return reset_dst_data_step_; - }(); - - return reset_dst_data_step; + return reset_step; + } } // dst_slice_origin_step_idx need to be known at compile-time, for performance reason @@ -383,7 +212,7 @@ struct ThreadwiseTensorSliceTransfer_v1r3 private: DstCoord dst_coord_; const DstElementwiseOperation dst_element_op_; -}; // namespace ck +}; // namespace ThreadwiseTensorSliceTransfer_v1r3 // Assume: // 1. src: @@ -428,16 +257,12 @@ struct ThreadwiseTensorSliceTransfer_v2 src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx); } - template + template __device__ void Run(const SrcDesc& src_desc, const SrcBuffer& src_buf, const DstDesc&, const DstSliceOriginIdx&, - DstBuffer& dst_buf, - const SrcStepHacks& src_step_hacks) + DstBuffer& dst_buf) { static_assert(DstDesc::IsKnownAtCompileTime(), "wrong! DstDesc need to known at compile-time"); @@ -453,9 +278,6 @@ struct ThreadwiseTensorSliceTransfer_v2 constexpr auto dst_desc = remove_cvref_t{}; constexpr auto dst_slice_origin_idx = DstSliceOriginIdx{}; - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - // scalar per access on each dim // TODO: don't use lambda_scalar_per_access constexpr auto src_scalar_per_access = generate_sequence( @@ -464,80 +286,19 @@ struct ThreadwiseTensorSliceTransfer_v2 constexpr auto src_scalar_step_in_vector = generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); - constexpr auto access_lengths = SliceLengths{} / src_scalar_per_access; - - constexpr auto dim_access_order = DimAccessOrder{}; - - constexpr auto ordered_access_lengths = - container_reorder_given_new2old(access_lengths, dim_access_order); - - // make forward steps - const auto src_forward_steps = generate_tuple( - [&](auto i) { - Index forward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step( - src_desc, forward_step_idx, src_step_hacks[I0][i]); - }, - Number{}); - - // make backward steps - const auto src_backward_steps = generate_tuple( - [&](auto i) { - Index backward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - backward_step_idx(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step( - src_desc, backward_step_idx, src_step_hacks[I1][i]); - }, - Number{}); + using SpaceFillingCurve = SpaceFillingCurve>; // loop over tensor and copy - static_ford{}([&](auto ordered_access_idx) { - // judge move forward or move backward - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, nDim, 1>{}([&](auto i) { - index_t tmp = ordered_access_idx[I0]; - - static_for<1, i, 1>{}([&](auto j) { - tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j]; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); - - // calculate src data index - constexpr auto src_data_idx = [&]() { - Index ordered_idx; - - static_for<0, nDim, 1>{}([&](auto i) { - ordered_idx(i) = forward_sweep[i] - ? ordered_access_idx[i] - : ordered_access_lengths[i] - 1 - ordered_access_idx[i]; - }); - - return container_reorder_given_old2new(ordered_idx, dim_access_order) * - src_scalar_per_access; - }(); + constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess(); + static_for<0, num_access, 1>{}([&](auto idx_1d) { typename vector_type_maker::type src_vector; using src_vector_t = typename vector_type_maker::type::type; + constexpr auto src_data_idx = SpaceFillingCurve::GetIndex(idx_1d); const bool is_src_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_); @@ -555,38 +316,13 @@ struct ThreadwiseTensorSliceTransfer_v2 dst_buf(Number{}) = src_vector.template AsType()[i]; }); - constexpr auto move_on_dim = [&]() constexpr + if constexpr(idx_1d.value != num_access - 1) { - StaticallyIndexedArray move_on_dim_; + constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d); - static_for<0, nDim, 1>{}([&](auto i) { - move_on_dim_(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1; - - static_for{}([&](auto j) { - move_on_dim_(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1; - }); - }); - - return move_on_dim_; + move_tensor_coordinate( + src_desc, src_coord_, make_tensor_coordinate_step(src_desc, forward_step)); } - (); - - // move - static_for<0, nDim, 1>{}([&](auto i) { - if constexpr(move_on_dim[i]) - { - if constexpr(forward_sweep[i]) - { - move_tensor_coordinate( - src_desc, src_coord_, src_forward_steps[dim_access_order[i]]); - } - else - { - move_tensor_coordinate( - src_desc, src_coord_, src_backward_steps[dim_access_order[i]]); - } - } - }); }); // move src coordinate back to slice origin (or not) @@ -599,82 +335,27 @@ struct ThreadwiseTensorSliceTransfer_v2 } } - template - __device__ void Run(const SrcDesc& src_desc, - const SrcBuffer& src_buf, - const DstDesc&, - const DstSliceOriginIdx&, - DstBuffer& dst_buf) - { - constexpr index_t ntransform_src = SrcDesc::GetNumOfTransform(); - - constexpr auto zeros = typename uniform_sequence_gen::type{}; - - constexpr auto src_step_hacks = - make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), - generate_tuple([&](auto) { return zeros; }, Number{})); - - Run(src_desc, src_buf, DstDesc{}, DstSliceOriginIdx{}, dst_buf, src_step_hacks); - } - __device__ static constexpr auto GetSrcCoordinateResetStep() { - constexpr auto I0 = Number<0>{}; - - // scalar per access on each dim - // TODO: don't use lambda_scalar_per_access constexpr auto src_scalar_per_access = generate_sequence( detail::lambda_scalar_per_access{}, Number{}); - constexpr auto access_lengths = SliceLengths{} / src_scalar_per_access; + using SpaceFillingCurve = SpaceFillingCurve>; - constexpr auto dim_access_order = DimAccessOrder{}; + constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess(); + if constexpr(num_access == 0) + { + return typename SpaceFillingCurve::Index{}; + } + else + { + constexpr auto reset_step = + SpaceFillingCurve::GetStepBetween(Number{}, Number<0>{}); - constexpr auto ordered_access_lengths = - container_reorder_given_new2old(access_lengths, dim_access_order); - - // judge move forward or move backward during the last iteration - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, nDim, 1>{}([&](auto i) { - index_t tmp = ordered_access_lengths[I0] - 1; - - static_for<1, i, 1>{}([&](auto j) { - tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); - - // calculate src data index after last iteration in Run(), if it has not being reset by - // RunWrite() - constexpr auto src_data_idx = [&]() { - Index ordered_idx; - - static_for<0, nDim, 1>{}([&](auto i) { - ordered_idx(i) = forward_sweep[i] ? ordered_access_lengths[i] - 1 : 0; - }); - - return container_reorder_given_old2new(ordered_idx, dim_access_order) * - src_scalar_per_access; - }(); - - // - constexpr auto reset_src_data_step = [&]() { - Index reset_src_data_step_; - - static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step_(i) = -src_data_idx[i]; }); - - return reset_src_data_step_; - }(); - - return reset_src_data_step; + return reset_step; + } } // dst_slice_origin_step_idx need to be known at compile-time, for performance reason diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp index 6cdb142e76..b180f7f432 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp @@ -4,6 +4,7 @@ #include "common_header.hpp" #include "tensor_descriptor.hpp" #include "tensor_descriptor_helper.hpp" +#include "tensor_space_filling_curve.hpp" namespace ck { @@ -40,9 +41,6 @@ struct ThreadwiseTensorSliceTransfer_v6r1 using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})); using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})); - using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})); - using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{})); - static constexpr auto I0 = Number<0>{}; __device__ constexpr ThreadwiseTensorSliceTransfer_v6r1(const SrcDesc& src_desc, @@ -79,70 +77,14 @@ struct ThreadwiseTensorSliceTransfer_v6r1 constexpr auto scalar_per_access = generate_sequence( detail::lambda_scalar_per_access{}, Number{}); - constexpr auto access_lengths = SliceLengths{} / scalar_per_access; + using SpaceFillingCurve = SpaceFillingCurve>; - constexpr auto dim_access_order = DimAccessOrder{}; - - constexpr auto ordered_access_lengths = - container_reorder_given_new2old(access_lengths, dim_access_order); - - auto make_forward_steps = [&](auto desc) { - return generate_tuple( - [&](auto i) { - Index forward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - forward_step_idx(j) = (i.value == j.value) ? scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step(desc, forward_step_idx); - }, - Number{}); - }; - - auto make_backward_steps = [&](auto desc) { - return generate_tuple( - [&](auto i) { - Index backward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - backward_step_idx(j) = (i.value == j.value) ? -scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step(desc, backward_step_idx); - }, - Number{}); - }; - - // make forward steps - const auto src_forward_steps = make_forward_steps(src_desc); - const auto dst_forward_steps = make_forward_steps(dst_desc); - - // make backward steps - const auto src_backward_steps = make_backward_steps(src_desc); - const auto dst_backward_steps = make_backward_steps(dst_desc); - - // loop over slice window - static_ford{}([&](auto ordered_access_idx) { - // judge move forward or move backward - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, nDim, 1>{}([&](auto i) { - index_t tmp = ordered_access_idx[I0]; - - static_for<1, i, 1>{}([&](auto j) { - tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j]; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); + // loop over space-filling curve + constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess(); + static_for<0, num_access, 1>{}([&](auto idx_1d) { using src_vector_type = vector_type_maker_t; using src_vector_t = typename src_vector_type::type; @@ -168,59 +110,20 @@ struct ThreadwiseTensorSliceTransfer_v6r1 coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); // copy data from dst_vector into dst_buf - if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Set) - { - dst_buf.template Set( - dst_coord_.GetOffset(), - is_dst_valid, - dst_vector_container.template AsType()[I0]); - } - else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::AtomicAdd) - { - dst_buf.template AtomicAdd( - dst_coord_.GetOffset(), - is_dst_valid, - dst_vector_container.template AsType()[I0]); - } - - constexpr auto move_on_dim = [&]() constexpr - { - StaticallyIndexedArray move_on_dim_; - - static_for<0, nDim, 1>{}([&](auto i) { - move_on_dim_(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1; - - static_for{}([&](auto j) { - move_on_dim_(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1; - }); - }); - - return move_on_dim_; - } - (); + dst_buf.template Update( + dst_coord_.GetOffset(), + is_dst_valid, + dst_vector_container.template AsType()[I0]); // move coordinate - static_for<0, nDim, 1>{}([&](auto i) { - if constexpr(move_on_dim[i]) - { - if constexpr(forward_sweep[i]) - { - move_tensor_coordinate( - src_desc, src_coord_, src_forward_steps[dim_access_order[i]]); - - move_tensor_coordinate( - dst_desc, dst_coord_, dst_forward_steps[dim_access_order[i]]); - } - else - { - move_tensor_coordinate( - src_desc, src_coord_, src_backward_steps[dim_access_order[i]]); - - move_tensor_coordinate( - dst_desc, dst_coord_, dst_backward_steps[dim_access_order[i]]); - } - } - }); + if constexpr(idx_1d.value != num_access - 1) + { + constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d); + move_tensor_coordinate( + src_desc, src_coord_, make_tensor_coordinate_step(src_desc, forward_step)); + move_tensor_coordinate( + dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step)); + } }); // move coordinate back to slice origin (or not) @@ -243,59 +146,25 @@ struct ThreadwiseTensorSliceTransfer_v6r1 __device__ static constexpr auto GetCoordinateResetStep() { - // scalar per access on each dim - // TODO: don't use lambda_scalar_per_access constexpr auto scalar_per_access = generate_sequence( detail::lambda_scalar_per_access{}, Number{}); - constexpr auto access_lengths = SliceLengths{} / scalar_per_access; + using SpaceFillingCurve = SpaceFillingCurve>; - constexpr auto dim_access_order = DimAccessOrder{}; + constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess(); + if constexpr(num_access == 0) + { + return typename SpaceFillingCurve::Index{}; + } + else + { + constexpr auto reset_step = + SpaceFillingCurve::GetStepBetween(Number{}, Number<0>{}); - constexpr auto ordered_access_lengths = - container_reorder_given_new2old(access_lengths, dim_access_order); - - // judge move forward or move backward during the last iteration - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, nDim, 1>{}([&](auto i) { - index_t tmp = ordered_access_lengths[I0] - 1; - - static_for<1, i, 1>{}([&](auto j) { - tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); - - // calculate data index after last iteration in Run(), if it has not being reset - constexpr auto data_idx = [&]() { - Index ordered_idx; - - static_for<0, nDim, 1>{}([&](auto i) { - ordered_idx(i) = forward_sweep[i] ? ordered_access_lengths[i] - 1 : 0; - }); - - return container_reorder_given_old2new(ordered_idx, dim_access_order) * - scalar_per_access; - }(); - - // - constexpr auto reset_data_step = [&]() { - Index reset_data_step_; - - static_for<0, nDim, 1>{}([&](auto i) { reset_data_step_(i) = -data_idx[i]; }); - - return reset_data_step_; - }(); - - return reset_data_step; + return reset_step; + } } // src_slice_origin_step_idx need to be known at compile-time, for performance reason @@ -332,7 +201,7 @@ struct ThreadwiseTensorSliceTransfer_v6r1 SrcCoord src_coord_; DstCoord dst_coord_; const ElementwiseOperation element_op_; -}; +}; // namespace ck } // namespace ck #endif diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp index a65c275744..67a2bc9bb2 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp @@ -4,6 +4,7 @@ #include "common_header.hpp" #include "tensor_descriptor.hpp" #include "tensor_descriptor_helper.hpp" +#include "tensor_space_filling_curve.hpp" namespace ck { @@ -44,10 +45,6 @@ struct ThreadwiseTensorSliceTransfer_v6r2 using Src1Coord = decltype(make_tensor_coordinate(Src1Desc{}, Index{})); using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})); - using Src0CoordStep = decltype(make_tensor_coordinate_step(Src0Desc{}, Index{})); - using Src1CoordStep = decltype(make_tensor_coordinate_step(Src1Desc{}, Index{})); - using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{})); - static constexpr auto I0 = Number<0>{}; __device__ constexpr ThreadwiseTensorSliceTransfer_v6r2(const Src0Desc& src0_desc, @@ -96,72 +93,14 @@ struct ThreadwiseTensorSliceTransfer_v6r2 constexpr auto scalar_per_access = generate_sequence( detail::lambda_scalar_per_access{}, Number{}); - constexpr auto access_lengths = SliceLengths{} / scalar_per_access; + using SpaceFillingCurve = SpaceFillingCurve>; - constexpr auto dim_access_order = DimAccessOrder{}; - - constexpr auto ordered_access_lengths = - container_reorder_given_new2old(access_lengths, dim_access_order); - - auto make_forward_steps = [&](auto desc) { - return generate_tuple( - [&](auto i) { - Index forward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - forward_step_idx(j) = (i.value == j.value) ? scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step(desc, forward_step_idx); - }, - Number{}); - }; - - auto make_backward_steps = [&](auto desc) { - return generate_tuple( - [&](auto i) { - Index backward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - backward_step_idx(j) = (i.value == j.value) ? -scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step(desc, backward_step_idx); - }, - Number{}); - }; - - // make forward steps - const auto src0_forward_steps = make_forward_steps(src0_desc); - const auto src1_forward_steps = make_forward_steps(src1_desc); - const auto dst_forward_steps = make_forward_steps(dst_desc); - - // make backward steps - const auto src0_backward_steps = make_backward_steps(src0_desc); - const auto src1_backward_steps = make_backward_steps(src1_desc); - const auto dst_backward_steps = make_backward_steps(dst_desc); - - // loop over slice window - static_ford{}([&](auto ordered_access_idx) { - // judge move forward or move backward - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, nDim, 1>{}([&](auto i) { - index_t tmp = ordered_access_idx[I0]; - - static_for<1, i, 1>{}([&](auto j) { - tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j]; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); + constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess(); + // loop over space-filling curve + static_for<0, num_access, 1>{}([&](auto idx_1d) { using src0_vector_type = vector_type_maker_t; using src0_vector_t = typename src0_vector_type::type; @@ -197,65 +136,22 @@ struct ThreadwiseTensorSliceTransfer_v6r2 coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); // copy data from dst_vector into dst_buf - if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Set) - { - dst_buf.template Set( - dst_coord_.GetOffset(), - is_dst_valid, - dst_vector_container.template AsType()[I0]); - } - else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::AtomicAdd) - { - dst_buf.template AtomicAdd( - dst_coord_.GetOffset(), - is_dst_valid, - dst_vector_container.template AsType()[I0]); - } - - constexpr auto move_on_dim = [&]() constexpr - { - StaticallyIndexedArray move_on_dim_; - - static_for<0, nDim, 1>{}([&](auto i) { - move_on_dim_(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1; - - static_for{}([&](auto j) { - move_on_dim_(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1; - }); - }); - - return move_on_dim_; - } - (); + dst_buf.template Update( + dst_coord_.GetOffset(), + is_dst_valid, + dst_vector_container.template AsType()[I0]); // move coordinate - static_for<0, nDim, 1>{}([&](auto i) { - if constexpr(move_on_dim[i]) - { - if constexpr(forward_sweep[i]) - { - move_tensor_coordinate( - src0_desc, src0_coord_, src0_forward_steps[dim_access_order[i]]); - - move_tensor_coordinate( - src1_desc, src1_coord_, src1_forward_steps[dim_access_order[i]]); - - move_tensor_coordinate( - dst_desc, dst_coord_, dst_forward_steps[dim_access_order[i]]); - } - else - { - move_tensor_coordinate( - src0_desc, src0_coord_, src0_backward_steps[dim_access_order[i]]); - - move_tensor_coordinate( - src1_desc, src1_coord_, src1_backward_steps[dim_access_order[i]]); - - move_tensor_coordinate( - dst_desc, dst_coord_, dst_backward_steps[dim_access_order[i]]); - } - } - }); + if constexpr(idx_1d.value != num_access - 1) + { + constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d); + move_tensor_coordinate( + src0_desc, src0_coord_, make_tensor_coordinate_step(src0_desc, forward_step)); + move_tensor_coordinate( + src1_desc, src1_coord_, make_tensor_coordinate_step(src1_desc, forward_step)); + move_tensor_coordinate( + dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step)); + } }); // move coordinate back to slice origin (or not) @@ -286,59 +182,25 @@ struct ThreadwiseTensorSliceTransfer_v6r2 __device__ static constexpr auto GetCoordinateResetStep() { - // scalar per access on each dim - // TODO: don't use lambda_scalar_per_access constexpr auto scalar_per_access = generate_sequence( detail::lambda_scalar_per_access{}, Number{}); - constexpr auto access_lengths = SliceLengths{} / scalar_per_access; + using SpaceFillingCurve = SpaceFillingCurve>; - constexpr auto dim_access_order = DimAccessOrder{}; + constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess(); + if constexpr(num_access == 0) + { + return typename SpaceFillingCurve::Index{}; + } + else + { + constexpr auto reset_step = + SpaceFillingCurve::GetStepBetween(Number{}, Number<0>{}); - constexpr auto ordered_access_lengths = - container_reorder_given_new2old(access_lengths, dim_access_order); - - // judge move forward or move backward during the last iteration - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, nDim, 1>{}([&](auto i) { - index_t tmp = ordered_access_lengths[I0] - 1; - - static_for<1, i, 1>{}([&](auto j) { - tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); - - // calculate data index after last iteration in Run(), if it has not being reset - constexpr auto data_idx = [&]() { - Index ordered_idx; - - static_for<0, nDim, 1>{}([&](auto i) { - ordered_idx(i) = forward_sweep[i] ? ordered_access_lengths[i] - 1 : 0; - }); - - return container_reorder_given_old2new(ordered_idx, dim_access_order) * - scalar_per_access; - }(); - - // - constexpr auto reset_data_step = [&]() { - Index reset_data_step_; - - static_for<0, nDim, 1>{}([&](auto i) { reset_data_step_(i) = -data_idx[i]; }); - - return reset_data_step_; - }(); - - return reset_data_step; + return reset_step; + } } // src_slice_origin_step_idx need to be known at compile-time, for performance reason diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp index c7590d904c..fd3a5151fb 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp @@ -4,6 +4,7 @@ #include "common_header.hpp" #include "tensor_descriptor.hpp" #include "tensor_descriptor_helper.hpp" +#include "tensor_space_filling_curve.hpp" namespace ck { @@ -48,11 +49,6 @@ struct ThreadwiseTensorSliceTransfer_v6r3 using Src2Coord = decltype(make_tensor_coordinate(Src2Desc{}, Index{})); using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})); - using Src0CoordStep = decltype(make_tensor_coordinate_step(Src0Desc{}, Index{})); - using Src1CoordStep = decltype(make_tensor_coordinate_step(Src1Desc{}, Index{})); - using Src2CoordStep = decltype(make_tensor_coordinate_step(Src2Desc{}, Index{})); - using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{})); - static constexpr auto I0 = Number<0>{}; __device__ constexpr ThreadwiseTensorSliceTransfer_v6r3(const Src0Desc& src0_desc, @@ -112,74 +108,14 @@ struct ThreadwiseTensorSliceTransfer_v6r3 constexpr auto scalar_per_access = generate_sequence( detail::lambda_scalar_per_access{}, Number{}); - constexpr auto access_lengths = SliceLengths{} / scalar_per_access; + using SpaceFillingCurve = SpaceFillingCurve>; - constexpr auto dim_access_order = DimAccessOrder{}; - - constexpr auto ordered_access_lengths = - container_reorder_given_new2old(access_lengths, dim_access_order); - - auto make_forward_steps = [&](auto desc) { - return generate_tuple( - [&](auto i) { - Index forward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - forward_step_idx(j) = (i.value == j.value) ? scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step(desc, forward_step_idx); - }, - Number{}); - }; - - auto make_backward_steps = [&](auto desc) { - return generate_tuple( - [&](auto i) { - Index backward_step_idx; - - static_for<0, nDim, 1>{}([&](auto j) { - backward_step_idx(j) = (i.value == j.value) ? -scalar_per_access[i] : 0; - }); - - return make_tensor_coordinate_step(desc, backward_step_idx); - }, - Number{}); - }; - - // make forward steps - const auto src0_forward_steps = make_forward_steps(src0_desc); - const auto src1_forward_steps = make_forward_steps(src1_desc); - const auto src2_forward_steps = make_forward_steps(src2_desc); - const auto dst_forward_steps = make_forward_steps(dst_desc); - - // make backward steps - const auto src0_backward_steps = make_backward_steps(src0_desc); - const auto src1_backward_steps = make_backward_steps(src1_desc); - const auto src2_backward_steps = make_backward_steps(src2_desc); - const auto dst_backward_steps = make_backward_steps(dst_desc); - - // loop over slice window - static_ford{}([&](auto ordered_access_idx) { - // judge move forward or move backward - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, nDim, 1>{}([&](auto i) { - index_t tmp = ordered_access_idx[I0]; - - static_for<1, i, 1>{}([&](auto j) { - tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j]; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); + constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess(); + // loop over space-filling curve + static_for<0, num_access, 1>{}([&](auto idx_1d) { using src0_vector_type = vector_type_maker_t; using src0_vector_t = typename src0_vector_type::type; @@ -224,72 +160,24 @@ struct ThreadwiseTensorSliceTransfer_v6r3 const bool is_dst_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); - // copy data from dst_vector into dst_buf - if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Set) - { - dst_buf.template Set( - dst_coord_.GetOffset(), - is_dst_valid, - dst_vector_container.template AsType()[I0]); - } - else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::AtomicAdd) - { - dst_buf.template AtomicAdd( - dst_coord_.GetOffset(), - is_dst_valid, - dst_vector_container.template AsType()[I0]); - } - - constexpr auto move_on_dim = [&]() constexpr - { - StaticallyIndexedArray move_on_dim_; - - static_for<0, nDim, 1>{}([&](auto i) { - move_on_dim_(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1; - - static_for{}([&](auto j) { - move_on_dim_(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1; - }); - }); - - return move_on_dim_; - } - (); + dst_buf.template Update( + dst_coord_.GetOffset(), + is_dst_valid, + dst_vector_container.template AsType()[I0]); // move coordinate - static_for<0, nDim, 1>{}([&](auto i) { - if constexpr(move_on_dim[i]) - { - if constexpr(forward_sweep[i]) - { - move_tensor_coordinate( - src0_desc, src0_coord_, src0_forward_steps[dim_access_order[i]]); - - move_tensor_coordinate( - src1_desc, src1_coord_, src1_forward_steps[dim_access_order[i]]); - - move_tensor_coordinate( - src2_desc, src2_coord_, src2_forward_steps[dim_access_order[i]]); - - move_tensor_coordinate( - dst_desc, dst_coord_, dst_forward_steps[dim_access_order[i]]); - } - else - { - move_tensor_coordinate( - src0_desc, src0_coord_, src0_backward_steps[dim_access_order[i]]); - - move_tensor_coordinate( - src1_desc, src1_coord_, src1_backward_steps[dim_access_order[i]]); - - move_tensor_coordinate( - src2_desc, src2_coord_, src2_backward_steps[dim_access_order[i]]); - - move_tensor_coordinate( - dst_desc, dst_coord_, dst_backward_steps[dim_access_order[i]]); - } - } - }); + if constexpr(idx_1d.value != num_access - 1) + { + constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d); + move_tensor_coordinate( + src0_desc, src0_coord_, make_tensor_coordinate_step(src0_desc, forward_step)); + move_tensor_coordinate( + src1_desc, src1_coord_, make_tensor_coordinate_step(src1_desc, forward_step)); + move_tensor_coordinate( + src2_desc, src2_coord_, make_tensor_coordinate_step(src2_desc, forward_step)); + move_tensor_coordinate( + dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step)); + } }); // move coordinate back to slice origin (or not) @@ -328,59 +216,25 @@ struct ThreadwiseTensorSliceTransfer_v6r3 __device__ static constexpr auto GetCoordinateResetStep() { - // scalar per access on each dim - // TODO: don't use lambda_scalar_per_access constexpr auto scalar_per_access = generate_sequence( detail::lambda_scalar_per_access{}, Number{}); - constexpr auto access_lengths = SliceLengths{} / scalar_per_access; + using SpaceFillingCurve = SpaceFillingCurve>; - constexpr auto dim_access_order = DimAccessOrder{}; + constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess(); + if constexpr(num_access == 0) + { + return typename SpaceFillingCurve::Index{}; + } + else + { + constexpr auto reset_step = + SpaceFillingCurve::GetStepBetween(Number{}, Number<0>{}); - constexpr auto ordered_access_lengths = - container_reorder_given_new2old(access_lengths, dim_access_order); - - // judge move forward or move backward during the last iteration - constexpr auto forward_sweep = [&]() { - StaticallyIndexedArray forward_sweep_; - - forward_sweep_(I0) = true; - - static_for<1, nDim, 1>{}([&](auto i) { - index_t tmp = ordered_access_lengths[I0] - 1; - - static_for<1, i, 1>{}([&](auto j) { - tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1; - }); - - forward_sweep_(i) = tmp % 2 == 0; - }); - - return forward_sweep_; - }(); - - // calculate data index after last iteration in Run(), if it has not being reset - constexpr auto data_idx = [&]() { - Index ordered_idx; - - static_for<0, nDim, 1>{}([&](auto i) { - ordered_idx(i) = forward_sweep[i] ? ordered_access_lengths[i] - 1 : 0; - }); - - return container_reorder_given_old2new(ordered_idx, dim_access_order) * - scalar_per_access; - }(); - - // - constexpr auto reset_data_step = [&]() { - Index reset_data_step_; - - static_for<0, nDim, 1>{}([&](auto i) { reset_data_step_(i) = -data_idx[i]; }); - - return reset_data_step_; - }(); - - return reset_data_step; + return reset_step; + } } // src_slice_origin_step_idx need to be known at compile-time, for performance reason diff --git a/test/space_filling_curve/space_filling_curve.cpp b/test/space_filling_curve/space_filling_curve.cpp index 2ec7df1c33..c104445319 100644 --- a/test/space_filling_curve/space_filling_curve.cpp +++ b/test/space_filling_curve/space_filling_curve.cpp @@ -95,13 +95,13 @@ void traverse_using_space_filling_curve() make_tuple(12, 2, 6), make_tuple(12, 0, 6)); - constexpr index_t num_accesses = SpaceFillingCurve::GetNumOfAccess(); + constexpr index_t num_access = SpaceFillingCurve::GetNumOfAccess(); - static_assert(num_accesses == reduce_on_sequence(TensorLengths{} / ScalarsPerAccess{}, - math::multiplies{}, - Number<1>{})); + static_assert(num_access == reduce_on_sequence(TensorLengths{} / ScalarsPerAccess{}, + math::multiplies{}, + Number<1>{})); - static_for<1, num_accesses, 1>{}([&](auto i) { + static_for<1, num_access, 1>{}([&](auto i) { constexpr auto idx_curr = SpaceFillingCurve::GetIndex(i); static_assert(idx_curr[I0] == expected[i][I0]); @@ -115,7 +115,7 @@ void traverse_using_space_filling_curve() static_assert(backward_step[I2] == expected_step[I2]); }); - static_for<0, num_accesses - 1, 1>{}([&](auto i) { + static_for<0, num_access - 1, 1>{}([&](auto i) { constexpr auto idx_curr = SpaceFillingCurve::GetIndex(i); static_assert(idx_curr[I0] == expected[i][I0]);