mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-12 17:26:00 +00:00
This change significantly improves compile-time performance by reducing template instantiation depth for sequence generation and merging operations.

Optimizations:
- sequence_gen: Reduce instantiation depth from O(log N) to O(1) by using __make_integer_seq to generate indices in a single step, then applying the functor via pack expansion.
- uniform_sequence_gen: Similarly optimized to O(1) depth using __make_integer_seq with a helper that applies a constant value via pack expansion.
- sequence_merge: Reduce depth from O(N) to O(log N) using a binary tree reduction strategy. Added direct concatenation specializations for 1-4 sequences to avoid recursion in common cases, falling back to binary tree merging for 5+ sequences.

Documentation:
- Added extensive inline comments explaining why sequence_merge cannot achieve O(1) depth like sequence_gen (it requires computing cumulative sequence lengths from heterogeneous inputs, which inherently requires recursion).
- Documented the binary tree reduction approach and why it is superior to fold expressions for this use case.

Testing:
- Added comprehensive unit tests for uniform_sequence_gen with different values, sizes, and edge cases.
- Added tests for sequence_gen with custom functors (double, square, identity, constant) to verify that the new implementation works with arbitrary functors.
- Added tests for sequence_merge with 4, 5, and many sequences to verify both the direct concatenation path and the binary tree reduction path.
- Added tests for empty sequence edge cases.
107 lines
2.5 KiB
C++
107 lines
2.5 KiB
C++
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
#ifndef CK_STATICALLY_INDEXED_ARRAY_HPP
|
|
#define CK_STATICALLY_INDEXED_ARRAY_HPP
|
|
|
|
#include "functional2.hpp"
|
|
#include "sequence.hpp"
|
|
#include "tuple.hpp"
|
|
|
|
namespace ck {
|
|
|
|
namespace detail {
|
|
template <typename X, typename Y>
|
|
struct tuple_concat;
|
|
|
|
template <typename... Xs, typename... Ys>
|
|
struct tuple_concat<Tuple<Xs...>, Tuple<Ys...>>
|
|
{
|
|
using type = Tuple<Xs..., Ys...>;
|
|
};
|
|
|
|
// StaticallyIndexedArrayImpl uses binary split for O(log N) depth
|
|
template <typename T, index_t N>
|
|
struct StaticallyIndexedArrayImpl
|
|
{
|
|
using type =
|
|
typename tuple_concat<typename StaticallyIndexedArrayImpl<T, N / 2>::type,
|
|
typename StaticallyIndexedArrayImpl<T, N - N / 2>::type>::type;
|
|
};
|
|
|
|
template <typename T>
|
|
struct StaticallyIndexedArrayImpl<T, 0>
|
|
{
|
|
using type = Tuple<>;
|
|
};
|
|
|
|
template <typename T>
|
|
struct StaticallyIndexedArrayImpl<T, 1>
|
|
{
|
|
using type = Tuple<T>;
|
|
};
|
|
} // namespace detail
|
|
|
|
template <typename T, index_t N>
|
|
using StaticallyIndexedArray = typename detail::StaticallyIndexedArrayImpl<T, N>::type;
|
|
|
|
template <typename X, typename... Xs>
|
|
__host__ __device__ constexpr auto make_statically_indexed_array(const X& x, const Xs&... xs)
|
|
{
|
|
return StaticallyIndexedArray<X, sizeof...(Xs) + 1>(x, static_cast<X>(xs)...);
|
|
}
|
|
|
|
// make empty StaticallyIndexedArray
|
|
template <typename X>
|
|
__host__ __device__ constexpr auto make_statically_indexed_array()
|
|
{
|
|
return StaticallyIndexedArray<X, 0>();
|
|
}
|
|
|
|
template <typename T, index_t N>
|
|
struct StaticallyIndexedArray_v2
|
|
{
|
|
__host__ __device__ constexpr StaticallyIndexedArray_v2() = default;
|
|
|
|
__host__ __device__ static constexpr index_t Size() { return N; }
|
|
|
|
// read access
|
|
template <index_t I>
|
|
__host__ __device__ constexpr const auto& At(Number<I>) const
|
|
{
|
|
static_assert(I < N, "wrong! out of range");
|
|
|
|
return data_[I];
|
|
}
|
|
|
|
// write access
|
|
template <index_t I>
|
|
__host__ __device__ constexpr auto& At(Number<I>)
|
|
{
|
|
static_assert(I < N, "wrong! out of range");
|
|
|
|
return data_[I];
|
|
}
|
|
|
|
// read access
|
|
template <index_t I>
|
|
__host__ __device__ constexpr const auto& operator[](Number<I> i) const
|
|
{
|
|
return At(i);
|
|
}
|
|
|
|
// write access
|
|
template <index_t I>
|
|
__host__ __device__ constexpr auto& operator()(Number<I> i)
|
|
{
|
|
return At(i);
|
|
}
|
|
|
|
__host__ __device__ static constexpr bool IsStaticBuffer() { return true; }
|
|
|
|
T data_[N];
|
|
};
|
|
|
|
} // namespace ck
|
|
#endif
|