diff --git a/CMakeLists.txt b/CMakeLists.txt index 610f9c9d2a..ca7b62d8cb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -125,6 +125,9 @@ add_compile_options(-Wno-pass-failed) add_compile_options(-Wno-switch-default) add_compile_options(-Wno-unique-object-duplication) +# Increase the number of max elements in fold expressions +add_compile_options(-fbracket-depth=1024) + # add -Og -gdwarf64 for debug builds add_compile_options( "$<$:-Og>" diff --git a/CMakePresets.json b/CMakePresets.json index f81dbadb19..a8958b82ff 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -43,7 +43,7 @@ "CMAKE_PREFIX_PATH": "/opt/rocm/", "CMAKE_CXX_COMPILER": "/opt/rocm/llvm/bin/clang++", "CMAKE_HIP_COMPILER": "/opt/rocm/llvm/bin/clang++", - "CMAKE_CXX_FLAGS": "-ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker -fbracket-depth=512", + "CMAKE_CXX_FLAGS": "-ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker -fbracket-depth=1024", "CMAKE_BUILD_TYPE": "Release", "BUILD_DEV": "ON", "CMAKE_VERBOSE_MAKEFILE": "ON", diff --git a/include/ck/utility/functional3.hpp b/include/ck/utility/functional3.hpp index ffec810760..8cad72c66f 100644 --- a/include/ck/utility/functional3.hpp +++ b/include/ck/utility/functional3.hpp @@ -8,135 +8,233 @@ #include "ck/utility/functional2.hpp" #include "ck/utility/sequence.hpp" #include "ck/utility/multi_index.hpp" +#include "ck/utility/math.hpp" namespace ck { namespace detail { -// RemainLengths: Sequence<...> -// Orders: Sequence<...> -template -struct static_ford_impl +/** + * @brief Common base class for static_ford and ford. + * + * Provides shared compile-time constants and type aliases used by both + * static_ford (compile-time iteration) and ford (runtime iteration). + * + * @tparam Lengths Sequence specifying the size of each dimension + * @tparam Orders Sequence specifying the iteration order of dimensions. + * Orders[i] indicates which dimension is iterated at loop level i. + */ +template +struct ford_base { - __host__ __device__ constexpr static_ford_impl() - { - static_assert(RemainLengths::GetSize() > 0, "wrong! should not get here"); - } + /// Number of dimensions + static constexpr index_t NDim = Lengths::Size(); - // F signature: F(Sequence<...>) - // CurrentOrderedId: Sequence<...> - template - __host__ __device__ constexpr void operator()(F f, CurrentOrderedId) const + /// Total number of iterations (product of all lengths) + static constexpr index_t TotalSize = + reduce_on_sequence(Lengths{}, math::multiplies{}, Number<1>{}); + + /// Lengths reordered according to iteration order + static constexpr auto OrderedLengths = Lengths::ReorderGivenNew2Old(Orders{}); + + /// Type of OrderedLengths with cv-qualifiers removed + using OrderedLengthsType = remove_cvref_t; + + /// Mapping from loop level ("new" index) to original dimension ("old" index) + using New2Old = Orders; + + __host__ __device__ constexpr ford_base() { - static_for<0, RemainLengths::Front(), 1>{}([=](auto I) { - static_ford_impl{}( - f, CurrentOrderedId::PushBack(I)); - }); + static_assert(Lengths::GetSize() > 0, "wrong! Lengths is empty"); + static_assert(Lengths::GetSize() == Orders::GetSize(), "wrong! inconsistent size"); } }; -template -struct static_ford_impl, Orders> -{ - // F signature: F(Sequence<...>) - // OrderedId: Sequence<...> - template - __host__ __device__ constexpr void operator()(F f, OrderedId) const - { - // retrive unordered Id - f(OrderedId::ReorderGivenOld2New(Orders{})); - } -}; +/** + * @brief Helper for decomposing a linear index into multi-dimensional indices. + * + * Computes strides at compile time and provides both compile-time and runtime + * index decomposition. Used by static_ford and ford to convert a flat iteration + * index into N-dimensional coordinates. + * + * For OrderedLengths = Sequence: + * - strides = {L1*L2, L2, 1} + * - ordered_idx[i] = (linear_idx / strides[i]) % lengths[i] + * + * @tparam OrderedLengths Sequence<...> of dimension sizes in iteration order + * @tparam IndexSeq Sequence<0, 1, ..., NDim-1> for pack expansion + */ +template +struct index_decomposer; -// RemainLengths: Sequence<...> -// Orders: Sequence<...> -template -struct ford_impl +template +struct index_decomposer, Sequence> { - __host__ __device__ constexpr ford_impl() - { - static_assert(RemainLengths::GetSize() > 0, "wrong! should not get here"); - } + /// Number of dimensions + static constexpr index_t NDim = sizeof...(Ls); - // F signature: F(Array<...> multi_id) - // CurrentOrderdId: Array<...> - template - __host__ __device__ constexpr void operator()(F f, CurrentOrderedId current_ordered_id) const + /// Dimension lengths in iteration order + static constexpr index_array lengths = {{Ls...}}; + + /** + * @brief Compute all strides in a single O(N) pass. + * + * For dimensions with lengths [L0, L1, L2, ...]: + * strides[N-1] = 1 + * strides[i] = strides[i+1] * lengths[i+1] + * + * @return index_array containing computed strides + */ + static constexpr index_array compute_all_strides() { - for(index_t i = 0; i < RemainLengths::Front(); ++i) + index_array result{}; + if constexpr(NDim > 0) { - ford_impl{}( - f, container_push_back(current_ordered_id, i)); + result[NDim - 1] = 1; + for(index_t i = NDim - 2; i >= 0; --i) + { + result[i] = result[i + 1] * lengths[i + 1]; + } } + return result; } -}; -template -struct ford_impl, Orders> -{ - // F signature: F(Array<...> multi_id) - // CurrentOrderdId: Array<...> - template - __host__ __device__ constexpr void operator()(F f, CurrentOrderedId current_ordered_id) const + /// Pre-computed strides for each dimension + static constexpr index_array strides = compute_all_strides(); + + /** + * @brief Compile-time decomposition of a linear index. + * + * Returns a Sequence containing the multi-dimensional indices + * in iteration order. + * + * @tparam LinearIdx The linear index to decompose (compile-time constant) + */ + template + using decompose = Sequence<((LinearIdx / strides[Is]) % lengths[Is])...>; + + /** + * @brief Runtime decomposition of a linear index with reordering. + * + * Decomposes linear_idx into ordered indices, then reorders them + * to the original dimension order and stores in result. + * + * @tparam New2Old Sequence mapping iteration position to original dimension + * @tparam MultiIndex Type of the output multi-index container + * @param linear_idx The linear index to decompose + * @param[out] result Multi-index container to store the result + */ + template + __host__ __device__ static void decompose_runtime(index_t linear_idx, MultiIndex& result) { - // retrive unordered Id - f(container_reorder_given_old2new(current_ordered_id, Orders{})); + // Compute ordered indices and assign to result in original dimension order + ((result(Number{})>{}) = (linear_idx / strides[Is]) % lengths[Is]), + ...); } }; } // namespace detail -// Lengths is Sequence<...>, it is the length of each dimension for -// N-dimensional loop -// Orders is Sequence<...>, it is the order of dimension in which static_ford -// will loop over each -// dimension -template ::type> -struct static_ford +/** + * @brief Compile-time N-dimensional loop with static multi-indices. + * + * Iterates over an N-dimensional space where dimensions have sizes specified + * by Lengths. The iteration order is controlled by Orders. Each iteration + * provides a compile-time Sequence containing the current multi-index. + * + * Uses O(1) template instantiation depth via flat loop with index decomposition, + * avoiding recursive template structures. + * + * Example: + * @code + * // Iterate over 2x3 space in row-major order (dim 0 outer, dim 1 inner) + * static_ford>{}([](auto multi_id) { + * constexpr index_t i = multi_id[Number<0>{}]; // 0, 0, 0, 1, 1, 1 + * constexpr index_t j = multi_id[Number<1>{}]; // 0, 1, 2, 0, 1, 2 + * }); + * + * // Column-major order (dim 1 outer, dim 0 inner) + * static_ford, Sequence<1, 0>>{}([](auto multi_id) { + * // Visits: (0,0), (1,0), (0,1), (1,1), (0,2), (1,2) + * }); + * @endcode + * + * @tparam Lengths Sequence specifying dimension sizes + * @tparam Orders Sequence specifying iteration order + * (default: Sequence<0, 1, ..., N-1> for row-major) + */ +template > +struct static_ford : detail::ford_base { - __host__ __device__ constexpr static_ford() - { - static_assert(Lengths::GetSize() > 0, "wrong! Lengths is empty"); - static_assert(Lengths::GetSize() == Orders::GetSize(), "wrong! inconsistent size"); - } + using Base = detail::ford_base; + using Decomposer = detail::index_decomposer>; - // F signature: F(Sequence<...> multi_id) - // multi_id is the unordered multi-index + /** + * @brief Execute the N-dimensional loop. + * + * Calls f with a compile-time Sequence for each point + * in the iteration space. + * + * @tparam F Functor type with signature F(Sequence<...>) + * @param f The functor to call for each multi-index + */ template __host__ __device__ constexpr void operator()(F f) const { - constexpr auto ordered_lengths = Lengths::ReorderGivenNew2Old(Orders{}); - detail::static_ford_impl{}(f, Sequence<>{}); + static_for<0, Base::TotalSize, 1>{}([&](auto linear_idx) { + using OrderedIdx = typename Decomposer::template decompose; + f(OrderedIdx::ReorderGivenOld2New(Orders{})); + }); } }; -// Lengths is Sequence<...>, it is the length of each dimension for -// N-dimensional loop -// Orders is Sequence<...>, it is the order of dimension in which ford will loop -// over each -// dimension -template ::type> -struct ford +/** + * @brief Runtime N-dimensional loop with runtime multi-indices. + * + * Iterates over an N-dimensional space where dimensions have sizes specified + * by Lengths. The iteration order is controlled by Orders. Each iteration + * provides a runtime multi-index container. + * + * Uses O(1) template instantiation depth via flat for-loop with index decomposition, + * avoiding recursive template structures. + * + * Example: + * @code + * // Iterate over 2x3 space in row-major order + * ford>{}([](auto multi_id) { + * index_t i = multi_id[Number<0>{}]; // Runtime values + * index_t j = multi_id[Number<1>{}]; + * }); + * @endcode + * + * @tparam Lengths Sequence specifying dimension sizes + * @tparam Orders Sequence specifying iteration order + * (default: Sequence<0, 1, ..., N-1> for row-major) + */ +template > +struct ford : detail::ford_base { - __host__ __device__ constexpr ford() - { - static_assert(Lengths::GetSize() > 0, "wrong! Lengths is empty"); - static_assert(Lengths::GetSize() == Orders::GetSize(), "wrong! inconsistent size"); - } + using Base = detail::ford_base; + using Decomposer = detail::index_decomposer>; - // F signature: F(Array<...> multi_id) - // multi_id is the unordered multi-index + /** + * @brief Execute the N-dimensional loop. + * + * Calls f with a runtime multi-index for each point in the iteration space. + * + * @tparam F Functor type with signature F(MultiIndex) + * @param f The functor to call for each multi-index + */ template __host__ __device__ constexpr void operator()(F f) const { - constexpr auto ordered_lengths = Lengths::ReorderGivenNew2Old(Orders{}); - - for(index_t i = 0; i < ordered_lengths.Front(); ++i) + for(index_t linear_idx = 0; linear_idx < Base::TotalSize; ++linear_idx) { - detail::ford_impl{}(f, - make_multi_index(i)); + auto multi_id = make_zero_multi_index(); + Decomposer::template decompose_runtime(linear_idx, multi_id); + f(multi_id); } } }; diff --git a/include/ck/utility/sequence.hpp b/include/ck/utility/sequence.hpp index 3a45d52bd3..eb4373447c 100644 --- a/include/ck/utility/sequence.hpp +++ b/include/ck/utility/sequence.hpp @@ -38,6 +38,30 @@ __host__ __device__ constexpr auto sequence_pop_front(Sequence); template __host__ __device__ constexpr auto sequence_pop_back(Seq); +namespace detail { + +/** + * @brief Helper to generate integer sequences with custom Sequence class + */ +template +struct __integer_sequence; + +template +struct __integer_sequence +{ + using seq_type = Sequence; +}; + +} // namespace detail + +/** + * @brief Generate a Sequence class with index_t integers from 0 to N-1 + * @tparam N The size of the sequence to generate + */ +template +using make_index_sequence = + typename __make_integer_seq::seq_type; + template struct Sequence { @@ -157,18 +181,37 @@ struct Sequence return Sequence{})...>{}; } + /** + * @brief Modify the sequence at a specific index with a new value + * @tparam I The index of the element to modify + * @tparam X The new value to set at index I + * @return A new Sequence with the value at index I replaced by X + */ template __host__ __device__ static constexpr auto Modify(Number, Number) { - static_assert(I < Size(), "wrong!"); - - using seq_split = sequence_split; - constexpr auto seq_left = typename seq_split::left_type{}; - constexpr auto seq_right = typename seq_split::right_type{}.PopFront(); - - return seq_left.PushBack(Number{}).PushBack(seq_right); + // Generate and forward an index sequence that covers all elements + static_assert(I >= 0 && I < mSize, "Index I is out of bounds"); + return modify_impl(make_index_sequence{}, Number{}, Number{}); } + private: + /** + * @brief Helper function to modify the sequence at a specific index + * @tparam Idxs Indices of the sequence elements (0, 1, ..., Size-1) + * @tparam ModifyIdx The index of the value in the sequence to modify + * @tparam NewVal The new value to set at ModifyIdx + * @return A new Sequence with the value at ModifyIdx replaced by NewVal + */ + template + __host__ __device__ static constexpr auto + modify_impl(Sequence, Number, Number) + { + // For each index: if it equals ModifyIdx, use NewVal; otherwise use original value + return Sequence<(Idxs == ModifyIdx ? NewVal : At(Idxs))...>{}; + } + + public: template __host__ __device__ static constexpr auto Transform(F f) { @@ -184,21 +227,6 @@ struct Sequence } }; -namespace impl { -template -struct __integer_sequence; - -template -struct __integer_sequence -{ - using seq_type = Sequence; -}; -} // namespace impl - -template -using make_index_sequence = - typename __make_integer_seq::seq_type; - // merge sequence - optimized to avoid recursive instantiation // // Note: Unlike sequence_gen and uniform_sequence_gen which use __make_integer_seq for O(1) @@ -332,13 +360,7 @@ struct arithmetic_sequence_gen template struct arithmetic_sequence_gen<0, IEnd, 1> { - template - struct WrapSequence - { - using type = Sequence; - }; - // https://reviews.llvm.org/D13786 - using type = typename __make_integer_seq::type; + using type = make_index_sequence; }; // uniform sequence - optimized using __make_integer_seq @@ -367,26 +389,79 @@ struct uniform_sequence_gen<0, I> using type = Sequence<>; }; -// reverse inclusive scan (with init) sequence -template -struct sequence_reverse_inclusive_scan; +namespace detail { -template -struct sequence_reverse_inclusive_scan, Reduce, Init> +/** + * @brief A simple fixed-size array to hold intermediate results during constexpr computation + * @tparam N The size of the array + */ +template +struct index_array { - using old_scan = typename sequence_reverse_inclusive_scan, Reduce, Init>::type; + index_t data[N > 0 ? N : 1]; - static constexpr index_t new_reduce = Reduce{}(I, old_scan{}.Front()); - - using type = typename sequence_merge, old_scan>::type; + __host__ __device__ constexpr index_t& operator[](index_t i) { return data[i]; } + __host__ __device__ constexpr const index_t& operator[](index_t i) const { return data[i]; } }; -template -struct sequence_reverse_inclusive_scan, Reduce, Init> +/** + * @brief Compute the reverse inclusive scan of a sequence at compile time using a constexpr + * function + * @tparam Reduce The binary reduction functor + * @tparam Init The initial value for the reduction + * @tparam Vs The input sequence values + * @return An index_array containing the reverse inclusive scan results + */ +template +__host__ __device__ constexpr auto compute_reverse_inclusive_scan() { - using type = Sequence; + constexpr index_t N = sizeof...(Vs); + index_array result{}; + constexpr index_t input[N > 0 ? N : 1] = {Vs...}; + + if constexpr(N > 0) + { + result.data[N - 1] = Reduce{}(input[N - 1], Init); + for(index_t i = N - 2; i >= 0; --i) + { + result.data[i] = Reduce{}(input[i], result.data[i + 1]); + } + } + return result; +} + +// Build result sequence with O(1) instantiation depth +template +struct build_reverse_inclusive_scan; + +template +struct build_reverse_inclusive_scan, Sequence> +{ + static constexpr auto result = compute_reverse_inclusive_scan(); + + using type = Sequence; }; +} // namespace detail + +/** + * @brief Reverse inclusive scan of a sequence - main interface + * @tparam Seq The input sequence to scan + * @tparam Reduce The binary reduction functor + * @tparam Init The initial value for the reduction + */ +template +struct sequence_reverse_inclusive_scan +{ + using type = typename detail:: + build_reverse_inclusive_scan>::type; +}; + +/** + * @brief Specialization for empty sequence - returns empty sequence without computation + * @tparam Reduce The binary reduction functor + * @tparam Init The initial value for the reduction + */ template struct sequence_reverse_inclusive_scan, Reduce, Init> { @@ -406,28 +481,34 @@ struct sequence_split using right_type = decltype(Seq::Extract(range1{})); }; -// reverse sequence +// reverse sequence - optimized using direct pack expansion O(1) depth +namespace detail { + +template +struct sequence_reverse_impl; + +template +struct sequence_reverse_impl, Sequence> +{ + static constexpr index_t N = sizeof...(Is); + // Access elements in reverse order: index (N-1-i) for position i + using type = Sequence::At(Number{})...>; +}; + +} // namespace detail + template struct sequence_reverse { - static constexpr index_t NSize = Seq{}.Size(); - - using seq_split = sequence_split; - using type = typename sequence_merge< - typename sequence_reverse::type, - typename sequence_reverse::type>::type; + using type = + typename detail::sequence_reverse_impl>::type; }; -template -struct sequence_reverse> +// Empty sequence specialization +template <> +struct sequence_reverse> { - using type = Sequence; -}; - -template -struct sequence_reverse> -{ - using type = Sequence; + using type = Sequence<>; }; #if 1 @@ -597,31 +678,59 @@ struct is_valid_sequence_map : is_same -struct sequence_map_inverse +/** + * @brief Invert a permutation sequence: given X2Y = {a, b, c, ...}, compute Y2X where Y2X[X2Y[i]] + * = i Example: Sequence<2,0,1> (meaning pos0->2, pos1->0, pos2->1) inverts to Sequence<1,2,0> + * + * Why this implementation is faster to compile than recursive templates: + * + * The old recursive approach created a new template type for each element: + * sequence_map_inverse> -> sequence_map_inverse> -> + * sequence_map_inverse> + * Each "->" is a new type the compiler must create, track, and manage. For N elements, that's + * N template types, each with overhead (name mangling, debug info, symbol table entries). + * + * This implementation uses a constexpr for loop to build the inverse in O(N) operations: + * For input Sequence<2,0,1>, the loop sets result[input[pos]] = pos for each position: + * pos=0: result[2]=0, pos=1: result[0]=1, pos=2: result[1]=2 + * This builds the inverse permutation in a single pass with O(1) template instantiation depth. + * + * @tparam Is The input permutation sequence + */ +template +struct sequence_map_inverse> { - template - struct sequence_map_inverse_impl + static_assert(is_valid_sequence_map>::value, + "sequence_map_inverse requires SeqMap to be a valid permutation sequence map"); + + private: + static constexpr auto build_inverse() { - static constexpr auto new_y2x = - WorkingY2X::Modify(X2Y::At(Number{}), Number{}); + detail::index_array result{}; + constexpr index_t input[] = {Is...}; + for(index_t pos = 0; pos < static_cast(sizeof...(Is)); ++pos) + { + result[input[pos]] = pos; + } + return result; + } - using type = - typename sequence_map_inverse_impl:: - type; - }; + static constexpr auto inverse = build_inverse(); - template - struct sequence_map_inverse_impl + template + static constexpr auto compute(Sequence) { - using type = WorkingY2X; - }; + return Sequence{}; + } - using type = - typename sequence_map_inverse_impl::type, - 0, - SeqMap::Size()>::type; + public: + using type = decltype(compute(make_index_sequence{})); +}; + +template <> +struct sequence_map_inverse> +{ + using type = Sequence<>; }; template @@ -799,8 +908,8 @@ __host__ __device__ constexpr auto pick_sequence_elements_by_ids(Seq, Sequence{})...>{}; } -#if 1 namespace detail { + template struct pick_sequence_elements_by_mask_impl { @@ -856,7 +965,6 @@ __host__ __device__ constexpr auto modify_sequence_elements_by_ids(Seq, Values, return typename detail::modify_sequence_elements_by_ids_impl::type{}; } -#endif template __host__ __device__ constexpr index_t diff --git a/include/ck/utility/statically_indexed_array.hpp b/include/ck/utility/statically_indexed_array.hpp index f3d73e84a7..e953735385 100644 --- a/include/ck/utility/statically_indexed_array.hpp +++ b/include/ck/utility/statically_indexed_array.hpp @@ -5,7 +5,6 @@ #define CK_STATICALLY_INDEXED_ARRAY_HPP #include "functional2.hpp" -#include "sequence.hpp" #include "tuple.hpp" namespace ck { diff --git a/test/util/CMakeLists.txt b/test/util/CMakeLists.txt index bf0a444f18..ffec00674c 100644 --- a/test/util/CMakeLists.txt +++ b/test/util/CMakeLists.txt @@ -5,3 +5,9 @@ add_gtest_executable(unit_sequence unit_sequence.cpp) if(result EQUAL 0) target_link_libraries(unit_sequence PRIVATE utility) endif() + +add_gtest_executable(unit_ford unit_ford.cpp) +if(result EQUAL 0) + target_link_libraries(unit_ford PRIVATE utility) +endif() + \ No newline at end of file diff --git a/test/util/unit_ford.cpp b/test/util/unit_ford.cpp new file mode 100644 index 0000000000..487b47203d --- /dev/null +++ b/test/util/unit_ford.cpp @@ -0,0 +1,725 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include +#include +#include +#include "ck/utility/functional3.hpp" +#include "ck/utility/sequence.hpp" + +using namespace ck; + +// ============================================================================ +// static_ford Tests +// ============================================================================ + +// Test basic static_ford construction and iteration +TEST(StaticFord, BasicConstruction2D) +{ + std::vector> visited; + + static_ford>{}([&](auto multi_id) { + constexpr index_t i = multi_id[Number<0>{}]; + constexpr index_t j = multi_id[Number<1>{}]; + visited.emplace_back(i, j); + }); + + ASSERT_EQ(visited.size(), 6u); + EXPECT_EQ(visited[0], std::make_pair(0, 0)); + EXPECT_EQ(visited[1], std::make_pair(0, 1)); + EXPECT_EQ(visited[2], std::make_pair(0, 2)); + EXPECT_EQ(visited[3], std::make_pair(1, 0)); + EXPECT_EQ(visited[4], std::make_pair(1, 1)); + EXPECT_EQ(visited[5], std::make_pair(1, 2)); +} + +TEST(StaticFord, ReversedOrder2D) +{ + std::vector> visited; + + // Order (1, 0) means dim 1 is outer, dim 0 is inner (column-major) + static_ford, Sequence<1, 0>>{}([&](auto multi_id) { + constexpr index_t i = multi_id[Number<0>{}]; + constexpr index_t j = multi_id[Number<1>{}]; + visited.emplace_back(i, j); + }); + + ASSERT_EQ(visited.size(), 6u); + EXPECT_EQ(visited[0], std::make_pair(0, 0)); + EXPECT_EQ(visited[1], std::make_pair(1, 0)); + EXPECT_EQ(visited[2], std::make_pair(0, 1)); + EXPECT_EQ(visited[3], std::make_pair(1, 1)); + EXPECT_EQ(visited[4], std::make_pair(0, 2)); + EXPECT_EQ(visited[5], std::make_pair(1, 2)); +} + +TEST(StaticFord, BasicConstruction3D) +{ + std::vector> visited; + + static_ford>{}([&](auto multi_id) { + constexpr index_t i = multi_id[Number<0>{}]; + constexpr index_t j = multi_id[Number<1>{}]; + constexpr index_t k = multi_id[Number<2>{}]; + visited.emplace_back(i, j, k); + }); + + ASSERT_EQ(visited.size(), 8u); + EXPECT_EQ(visited[0], std::make_tuple(0, 0, 0)); + EXPECT_EQ(visited[1], std::make_tuple(0, 0, 1)); + EXPECT_EQ(visited[2], std::make_tuple(0, 1, 0)); + EXPECT_EQ(visited[3], std::make_tuple(0, 1, 1)); + EXPECT_EQ(visited[4], std::make_tuple(1, 0, 0)); + EXPECT_EQ(visited[5], std::make_tuple(1, 0, 1)); + EXPECT_EQ(visited[6], std::make_tuple(1, 1, 0)); + EXPECT_EQ(visited[7], std::make_tuple(1, 1, 1)); +} + +TEST(StaticFord, CustomOrder3D) +{ + std::vector> visited; + + // Order (2, 0, 1) means: dim 2 outer, dim 0 middle, dim 1 inner + static_ford, Sequence<2, 0, 1>>{}([&](auto multi_id) { + constexpr index_t i = multi_id[Number<0>{}]; + constexpr index_t j = multi_id[Number<1>{}]; + constexpr index_t k = multi_id[Number<2>{}]; + visited.emplace_back(i, j, k); + }); + + ASSERT_EQ(visited.size(), 8u); + // With order (2, 0, 1): k varies slowest, then i, then j fastest + EXPECT_EQ(visited[0], std::make_tuple(0, 0, 0)); + EXPECT_EQ(visited[1], std::make_tuple(0, 1, 0)); + EXPECT_EQ(visited[2], std::make_tuple(1, 0, 0)); + EXPECT_EQ(visited[3], std::make_tuple(1, 1, 0)); + EXPECT_EQ(visited[4], std::make_tuple(0, 0, 1)); + EXPECT_EQ(visited[5], std::make_tuple(0, 1, 1)); + EXPECT_EQ(visited[6], std::make_tuple(1, 0, 1)); + EXPECT_EQ(visited[7], std::make_tuple(1, 1, 1)); +} + +TEST(StaticFord, SingleDimension) +{ + std::vector visited; + + static_ford>{}([&](auto multi_id) { + constexpr index_t i = multi_id[Number<0>{}]; + visited.push_back(i); + }); + + ASSERT_EQ(visited.size(), 5u); + EXPECT_EQ(visited[0], 0); + EXPECT_EQ(visited[1], 1); + EXPECT_EQ(visited[2], 2); + EXPECT_EQ(visited[3], 3); + EXPECT_EQ(visited[4], 4); +} + +TEST(StaticFord, LargerDimensions) +{ + std::vector> visited; + + static_ford>{}([&](auto multi_id) { + constexpr index_t i = multi_id[Number<0>{}]; + constexpr index_t j = multi_id[Number<1>{}]; + visited.emplace_back(i, j); + }); + + ASSERT_EQ(visited.size(), 12u); + // Verify first and last + EXPECT_EQ(visited.front(), std::make_pair(0, 0)); + EXPECT_EQ(visited.back(), std::make_pair(2, 3)); +} + +TEST(StaticFord, CompileTimeMultiIndex) +{ + // Test that multi_id is truly compile-time by using it in constexpr context + static_ford>{}([](auto multi_id) { + // These should all be compile-time constants + constexpr index_t i = multi_id[Number<0>{}]; + constexpr index_t j = multi_id[Number<1>{}]; + constexpr index_t size = decltype(multi_id)::Size(); + static_assert(size == 2, "Multi-index should have 2 elements"); + (void)i; + (void)j; + }); + SUCCEED(); +} + +// ============================================================================ +// ford Tests +// ============================================================================ + +TEST(Ford, BasicConstruction2D) +{ + std::vector> visited; + + ford>{}([&](auto multi_id) { + index_t i = multi_id[Number<0>{}]; + index_t j = multi_id[Number<1>{}]; + visited.emplace_back(i, j); + }); + + ASSERT_EQ(visited.size(), 6u); + EXPECT_EQ(visited[0], std::make_pair(0, 0)); + EXPECT_EQ(visited[1], std::make_pair(0, 1)); + EXPECT_EQ(visited[2], std::make_pair(0, 2)); + EXPECT_EQ(visited[3], std::make_pair(1, 0)); + EXPECT_EQ(visited[4], std::make_pair(1, 1)); + EXPECT_EQ(visited[5], std::make_pair(1, 2)); +} + +TEST(Ford, ReversedOrder2D) +{ + std::vector> visited; + + ford, Sequence<1, 0>>{}([&](auto multi_id) { + index_t i = multi_id[Number<0>{}]; + index_t j = multi_id[Number<1>{}]; + visited.emplace_back(i, j); + }); + + ASSERT_EQ(visited.size(), 6u); + EXPECT_EQ(visited[0], std::make_pair(0, 0)); + EXPECT_EQ(visited[1], std::make_pair(1, 0)); + EXPECT_EQ(visited[2], std::make_pair(0, 1)); + EXPECT_EQ(visited[3], std::make_pair(1, 1)); + EXPECT_EQ(visited[4], std::make_pair(0, 2)); + EXPECT_EQ(visited[5], std::make_pair(1, 2)); +} + +TEST(Ford, BasicConstruction3D) +{ + std::vector> visited; + + ford>{}([&](auto multi_id) { + index_t i = multi_id[Number<0>{}]; + index_t j = multi_id[Number<1>{}]; + index_t k = multi_id[Number<2>{}]; + visited.emplace_back(i, j, k); + }); + + ASSERT_EQ(visited.size(), 8u); + EXPECT_EQ(visited[0], std::make_tuple(0, 0, 0)); + EXPECT_EQ(visited[1], std::make_tuple(0, 0, 1)); + EXPECT_EQ(visited[2], std::make_tuple(0, 1, 0)); + EXPECT_EQ(visited[3], std::make_tuple(0, 1, 1)); + EXPECT_EQ(visited[4], std::make_tuple(1, 0, 0)); + EXPECT_EQ(visited[5], std::make_tuple(1, 0, 1)); + EXPECT_EQ(visited[6], std::make_tuple(1, 1, 0)); + EXPECT_EQ(visited[7], std::make_tuple(1, 1, 1)); +} + +TEST(Ford, CustomOrder3D) +{ + std::vector> visited; + + ford, Sequence<2, 0, 1>>{}([&](auto multi_id) { + index_t i = multi_id[Number<0>{}]; + index_t j = multi_id[Number<1>{}]; + index_t k = multi_id[Number<2>{}]; + visited.emplace_back(i, j, k); + }); + + ASSERT_EQ(visited.size(), 8u); + EXPECT_EQ(visited[0], std::make_tuple(0, 0, 0)); + EXPECT_EQ(visited[1], std::make_tuple(0, 1, 0)); + EXPECT_EQ(visited[2], std::make_tuple(1, 0, 0)); + EXPECT_EQ(visited[3], std::make_tuple(1, 1, 0)); + EXPECT_EQ(visited[4], std::make_tuple(0, 0, 1)); + EXPECT_EQ(visited[5], std::make_tuple(0, 1, 1)); + EXPECT_EQ(visited[6], std::make_tuple(1, 0, 1)); + EXPECT_EQ(visited[7], std::make_tuple(1, 1, 1)); +} + +TEST(Ford, SingleDimension) +{ + std::vector visited; + + ford>{}([&](auto multi_id) { + index_t i = multi_id[Number<0>{}]; + visited.push_back(i); + }); + + ASSERT_EQ(visited.size(), 5u); + EXPECT_EQ(visited[0], 0); + EXPECT_EQ(visited[1], 1); + EXPECT_EQ(visited[2], 2); + EXPECT_EQ(visited[3], 3); + EXPECT_EQ(visited[4], 4); +} + +// ============================================================================ +// Consistency Tests (static_ford vs ford should produce same sequence) +// ============================================================================ + +TEST(FordConsistency, StaticFordMatchesFord2D) +{ + std::vector> static_visited; + std::vector> runtime_visited; + + static_ford>{}([&](auto multi_id) { + constexpr index_t i = multi_id[Number<0>{}]; + constexpr index_t j = multi_id[Number<1>{}]; + static_visited.emplace_back(i, j); + }); + + ford>{}([&](auto multi_id) { + index_t i = multi_id[Number<0>{}]; + index_t j = multi_id[Number<1>{}]; + runtime_visited.emplace_back(i, j); + }); + + ASSERT_EQ(static_visited.size(), runtime_visited.size()); + for(size_t idx = 0; idx < static_visited.size(); ++idx) + { + EXPECT_EQ(static_visited[idx], runtime_visited[idx]) << "Mismatch at index " << idx; + } +} + +TEST(FordConsistency, StaticFordMatchesFord3DWithOrder) +{ + std::vector> static_visited; + std::vector> runtime_visited; + + static_ford, Sequence<1, 2, 0>>{}([&](auto multi_id) { + constexpr index_t i = multi_id[Number<0>{}]; + constexpr index_t j = multi_id[Number<1>{}]; + constexpr index_t k = multi_id[Number<2>{}]; + static_visited.emplace_back(i, j, k); + }); + + ford, Sequence<1, 2, 0>>{}([&](auto multi_id) { + index_t i = multi_id[Number<0>{}]; + index_t j = multi_id[Number<1>{}]; + index_t k = multi_id[Number<2>{}]; + runtime_visited.emplace_back(i, j, k); + }); + + ASSERT_EQ(static_visited.size(), runtime_visited.size()); + for(size_t idx = 0; idx < static_visited.size(); ++idx) + { + EXPECT_EQ(static_visited[idx], runtime_visited[idx]) << "Mismatch at index " << idx; + } +} + +// ============================================================================ +// ford_base Tests +// ============================================================================ + +TEST(FordBase, TotalSizeComputation) +{ + // 2 * 3 = 6 + using Ford2D = static_ford>; + EXPECT_EQ(Ford2D::Base::TotalSize, 6); + + // 2 * 3 * 4 = 24 + using Ford3D = static_ford>; + EXPECT_EQ(Ford3D::Base::TotalSize, 24); + + // 5 + using Ford1D = static_ford>; + EXPECT_EQ(Ford1D::Base::TotalSize, 5); +} + +TEST(FordBase, NDimComputation) +{ + using Ford1D = static_ford>; + EXPECT_EQ(Ford1D::Base::NDim, 1); + + using Ford2D = static_ford>; + EXPECT_EQ(Ford2D::Base::NDim, 2); + + using Ford3D = static_ford>; + EXPECT_EQ(Ford3D::Base::NDim, 3); + + using Ford4D = static_ford>; + EXPECT_EQ(Ford4D::Base::NDim, 4); +} + +// ============================================================================ +// index_decomposer Tests +// ============================================================================ + +TEST(IndexDecomposer, DecomposeLinearIndex2D) +{ + // For Sequence<2, 3> (row-major): strides = {3, 1} + // linear 0: (0/3)%2=0, (0/1)%3=0 -> (0,0) + // linear 1: (1/3)%2=0, (1/1)%3=1 -> (0,1) + // linear 5: (5/3)%2=1, (5/1)%3=2 -> (1,2) + using Decomposer = detail::index_decomposer, Sequence<0, 1>>; + + using Idx0 = typename Decomposer::template decompose<0>; + EXPECT_EQ(Idx0::At(Number<0>{}), 0); + EXPECT_EQ(Idx0::At(Number<1>{}), 0); + + using Idx1 = typename Decomposer::template decompose<1>; + EXPECT_EQ(Idx1::At(Number<0>{}), 0); + EXPECT_EQ(Idx1::At(Number<1>{}), 1); + + using Idx5 = typename Decomposer::template decompose<5>; + EXPECT_EQ(Idx5::At(Number<0>{}), 1); + EXPECT_EQ(Idx5::At(Number<1>{}), 2); +} + +TEST(IndexDecomposer, DecomposeLinearIndex3D) +{ + // For Sequence<2, 3, 4>: strides = {12, 4, 1} + using Decomposer = detail::index_decomposer, Sequence<0, 1, 2>>; + + using Idx0 = typename Decomposer::template decompose<0>; + EXPECT_EQ(Idx0::At(Number<0>{}), 0); + EXPECT_EQ(Idx0::At(Number<1>{}), 0); + EXPECT_EQ(Idx0::At(Number<2>{}), 0); + + // linear 13 = 1*12 + 0*4 + 1 -> (1, 0, 1) + using Idx13 = typename Decomposer::template decompose<13>; + EXPECT_EQ(Idx13::At(Number<0>{}), 1); + EXPECT_EQ(Idx13::At(Number<1>{}), 0); + EXPECT_EQ(Idx13::At(Number<2>{}), 1); + + // linear 23 = 1*12 + 2*4 + 3 -> (1, 2, 3) + using Idx23 = typename Decomposer::template decompose<23>; + EXPECT_EQ(Idx23::At(Number<0>{}), 1); + EXPECT_EQ(Idx23::At(Number<1>{}), 2); + EXPECT_EQ(Idx23::At(Number<2>{}), 3); +} + +TEST(IndexDecomposer, StrideComputation) +{ + // For Sequence<2, 3, 4>: strides = {3*4=12, 4, 1} + using Decomposer = detail::index_decomposer, Sequence<0, 1, 2>>; + + EXPECT_EQ(Decomposer::strides[0], 12); + EXPECT_EQ(Decomposer::strides[1], 4); + EXPECT_EQ(Decomposer::strides[2], 1); +} + +// ============================================================================ +// Edge Cases +// ============================================================================ + +TEST(FordEdgeCases, DimensionWithSizeOne) +{ + std::vector> visited; + + // A dimension with size 1 should just contribute one iteration level + static_ford>{}([&](auto multi_id) { + constexpr index_t i = multi_id[Number<0>{}]; + constexpr index_t j = multi_id[Number<1>{}]; + constexpr index_t k = multi_id[Number<2>{}]; + visited.emplace_back(i, j, k); + }); + + ASSERT_EQ(visited.size(), 6u); // 2 * 1 * 3 = 6 + // j should always be 0 + for(const auto& v : visited) + { + EXPECT_EQ(std::get<1>(v), 0); + } +} + +TEST(FordEdgeCases, HigherDimensions4D) +{ + index_t count = 0; + + static_ford>{}([&](auto multi_id) { + (void)multi_id; + ++count; + }); + + EXPECT_EQ(count, 16); // 2^4 = 16 +} + +TEST(FordEdgeCases, VariousSizes) +{ + index_t count = 0; + + static_ford>{}([&](auto multi_id) { + (void)multi_id; + ++count; + }); + + EXPECT_EQ(count, 30); // 3 * 5 * 2 = 30 +} + +// ============================================================================ +// Regression Tests for Non-Trivial Orderings +// These tests specifically verify correct behavior for non-identity permutations +// which were previously broken (GitHub PR #4447 feedback) +// ============================================================================ + +// Regression test: Orders<2, 0, 1> should correctly map loop levels to dimensions +// This was broken when New2Old was incorrectly defined as sequence_map_inverse +TEST(FordRegression, NonTrivialOrder3D_201) +{ + // With Orders = <2, 0, 1>: + // - Loop level 0 iterates dimension 2 (outermost) + // - Loop level 1 iterates dimension 0 (middle) + // - Loop level 2 iterates dimension 1 (innermost) + // + // For Lengths = <2, 3, 2>: + // - Dimension 0 has size 2, Dimension 1 has size 3, Dimension 2 has size 2 + // - OrderedLengths = = <2, 2, 3> + // + // Iteration should proceed: + // (dim0, dim1, dim2) with dim2 slowest, dim0 middle, dim1 fastest + + std::vector> visited; + + static_ford, Sequence<2, 0, 1>>{}([&](auto multi_id) { + constexpr index_t i = multi_id[Number<0>{}]; + constexpr index_t j = multi_id[Number<1>{}]; + constexpr index_t k = multi_id[Number<2>{}]; + visited.emplace_back(i, j, k); + }); + + ASSERT_EQ(visited.size(), 12u); // 2 * 3 * 2 = 12 + + // Expected order with Orders<2, 0, 1>: + // dim2 varies slowest (0, then 1), dim0 varies in middle (0, 1), dim1 varies fastest (0, 1, 2) + size_t idx = 0; + for(index_t k = 0; k < 2; ++k) + { + for(index_t i = 0; i < 2; ++i) + { + for(index_t j = 0; j < 3; ++j) + { + EXPECT_EQ(visited[idx], std::make_tuple(i, j, k)) + << "Mismatch at idx=" << idx << " expected (i,j,k)=(" << i << "," << j << "," + << k << ")"; + ++idx; + } + } + } +} + +// Same regression test for ford (runtime version) +TEST(FordRegression, NonTrivialOrder3D_201_Runtime) +{ + std::vector> visited; + + ford, Sequence<2, 0, 1>>{}([&](auto multi_id) { + index_t i = multi_id[Number<0>{}]; + index_t j = multi_id[Number<1>{}]; + index_t k = multi_id[Number<2>{}]; + visited.emplace_back(i, j, k); + }); + + ASSERT_EQ(visited.size(), 12u); + + size_t idx = 0; + for(index_t k = 0; k < 2; ++k) + { + for(index_t i = 0; i < 2; ++i) + { + for(index_t j = 0; j < 3; ++j) + { + EXPECT_EQ(visited[idx], std::make_tuple(i, j, k)) + << "Mismatch at idx=" << idx << " expected (i,j,k)=(" << i << "," << j << "," + << k << ")"; + ++idx; + } + } + } +} + +// Regression test: Verify static_ford and ford produce identical results for all non-trivial +// orderings +TEST(FordRegression, ConsistencyWithNonTrivialOrder_201) +{ + std::vector> static_visited; + std::vector> runtime_visited; + + static_ford, Sequence<2, 0, 1>>{}([&](auto multi_id) { + constexpr index_t i = multi_id[Number<0>{}]; + constexpr index_t j = multi_id[Number<1>{}]; + constexpr index_t k = multi_id[Number<2>{}]; + static_visited.emplace_back(i, j, k); + }); + + ford, Sequence<2, 0, 1>>{}([&](auto multi_id) { + index_t i = multi_id[Number<0>{}]; + index_t j = multi_id[Number<1>{}]; + index_t k = multi_id[Number<2>{}]; + runtime_visited.emplace_back(i, j, k); + }); + + ASSERT_EQ(static_visited.size(), runtime_visited.size()); + for(size_t idx = 0; idx < static_visited.size(); ++idx) + { + EXPECT_EQ(static_visited[idx], runtime_visited[idx]) + << "static_ford and ford disagree at index " << idx; + } +} + +// Regression test: Another non-trivial ordering <1, 2, 0> +TEST(FordRegression, NonTrivialOrder3D_120) +{ + // Orders = <1, 2, 0>: + // - Loop level 0 iterates dimension 1 (outermost) + // - Loop level 1 iterates dimension 2 (middle) + // - Loop level 2 iterates dimension 0 (innermost) + + std::vector> visited; + + static_ford, Sequence<1, 2, 0>>{}([&](auto multi_id) { + constexpr index_t i = multi_id[Number<0>{}]; + constexpr index_t j = multi_id[Number<1>{}]; + constexpr index_t k = multi_id[Number<2>{}]; + visited.emplace_back(i, j, k); + }); + + ASSERT_EQ(visited.size(), 24u); // 2 * 3 * 4 = 24 + + // Expected: j slowest, k middle, i fastest + size_t idx = 0; + for(index_t j = 0; j < 3; ++j) + { + for(index_t k = 0; k < 4; ++k) + { + for(index_t i = 0; i < 2; ++i) + { + EXPECT_EQ(visited[idx], std::make_tuple(i, j, k)) + << "Mismatch at idx=" << idx << " expected (i,j,k)=(" << i << "," << j << "," + << k << ")"; + ++idx; + } + } + } +} + +// Regression test for ford with <1, 2, 0> ordering +TEST(FordRegression, NonTrivialOrder3D_120_Runtime) +{ + std::vector> visited; + + ford, Sequence<1, 2, 0>>{}([&](auto multi_id) { + index_t i = multi_id[Number<0>{}]; + index_t j = multi_id[Number<1>{}]; + index_t k = multi_id[Number<2>{}]; + visited.emplace_back(i, j, k); + }); + + ASSERT_EQ(visited.size(), 24u); + + size_t idx = 0; + for(index_t j = 0; j < 3; ++j) + { + for(index_t k = 0; k < 4; ++k) + { + for(index_t i = 0; i < 2; ++i) + { + EXPECT_EQ(visited[idx], std::make_tuple(i, j, k)) + << "Mismatch at idx=" << idx << " expected (i,j,k)=(" << i << "," << j << "," + << k << ")"; + ++idx; + } + } + } +} + +// Regression test: 4D with non-trivial ordering +TEST(FordRegression, NonTrivialOrder4D) +{ + // Orders = <3, 1, 0, 2>: + // - Level 0 iterates dim 3 (outermost) + // - Level 1 iterates dim 1 + // - Level 2 iterates dim 0 + // - Level 3 iterates dim 2 (innermost) + + std::vector> visited; + + static_ford, Sequence<3, 1, 0, 2>>{}([&](auto multi_id) { + constexpr index_t a = multi_id[Number<0>{}]; + constexpr index_t b = multi_id[Number<1>{}]; + constexpr index_t c = multi_id[Number<2>{}]; + constexpr index_t d = multi_id[Number<3>{}]; + visited.emplace_back(a, b, c, d); + }); + + ASSERT_EQ(visited.size(), 16u); + + // Expected: d slowest, b, a, c fastest + size_t idx = 0; + for(index_t d = 0; d < 2; ++d) + { + for(index_t b = 0; b < 2; ++b) + { + for(index_t a = 0; a < 2; ++a) + { + for(index_t c = 0; c < 2; ++c) + { + EXPECT_EQ(visited[idx], std::make_tuple(a, b, c, d)) + << "Mismatch at idx=" << idx; + ++idx; + } + } + } + } +} + +// Regression test: 4D with non-trivial ordering for ford (runtime) +TEST(FordRegression, NonTrivialOrder4D_Runtime) +{ + std::vector> visited; + + ford, Sequence<3, 1, 0, 2>>{}([&](auto multi_id) { + index_t a = multi_id[Number<0>{}]; + index_t b = multi_id[Number<1>{}]; + index_t c = multi_id[Number<2>{}]; + index_t d = multi_id[Number<3>{}]; + visited.emplace_back(a, b, c, d); + }); + + ASSERT_EQ(visited.size(), 16u); + + size_t idx = 0; + for(index_t d = 0; d < 2; ++d) + { + for(index_t b = 0; b < 2; ++b) + { + for(index_t a = 0; a < 2; ++a) + { + for(index_t c = 0; c < 2; ++c) + { + EXPECT_EQ(visited[idx], std::make_tuple(a, b, c, d)) + << "Mismatch at idx=" << idx; + ++idx; + } + } + } + } +} + +// Regression test: Asymmetric dimensions with non-trivial ordering +TEST(FordRegression, AsymmetricDimensionsWithOrder) +{ + // Lengths = <3, 4, 5> (all different), Orders = <1, 2, 0> + // dim1 (size 4) outermost, dim2 (size 5) middle, dim0 (size 3) innermost + + std::vector> visited; + + static_ford, Sequence<1, 2, 0>>{}([&](auto multi_id) { + constexpr index_t i = multi_id[Number<0>{}]; + constexpr index_t j = multi_id[Number<1>{}]; + constexpr index_t k = multi_id[Number<2>{}]; + visited.emplace_back(i, j, k); + }); + + ASSERT_EQ(visited.size(), 60u); // 3 * 4 * 5 = 60 + + size_t idx = 0; + for(index_t j = 0; j < 4; ++j) + { + for(index_t k = 0; k < 5; ++k) + { + for(index_t i = 0; i < 3; ++i) + { + EXPECT_EQ(visited[idx], std::make_tuple(i, j, k)) << "Mismatch at idx=" << idx; + ++idx; + } + } + } +}