From 06fb85327913bfa5e6e6e75aba0536f448c70a96 Mon Sep 17 00:00:00 2001 From: "assistant-librarian[bot]" Date: Mon, 26 Jan 2026 18:17:51 +0000 Subject: [PATCH] Merge commit 'de59c0716c631edfa4742e4309ee11d4379ef6e8' into develop --- Dockerfile.manylinux | 101 ++++++++++++ include/ck/utility/sequence.hpp | 152 +++++++++++++----- .../ck/utility/statically_indexed_array.hpp | 1 + test/util/unit_sequence.cpp | 134 +++++++++++++++ 4 files changed, 348 insertions(+), 40 deletions(-) create mode 100644 Dockerfile.manylinux diff --git a/Dockerfile.manylinux b/Dockerfile.manylinux new file mode 100644 index 0000000000..0683bcd4a6 --- /dev/null +++ b/Dockerfile.manylinux @@ -0,0 +1,101 @@ +FROM ghcr.io/rocm/therock_build_manylinux_x86_64:latest +ARG DEBIAN_FRONTEND=noninteractive +ARG ROCMVERSION=7.2 +ARG compiler_version="" +ARG compiler_commit="" +ARG CK_SCCACHE="" +ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/ +ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn +ENV DEBIAN_FRONTEND=noninteractive + +USER root + +# Add rocm repository +RUN dnf clean all && dnf update -y && dnf -v install wget gnupg2 curl -y + +RUN wget https://repo.radeon.com/amdgpu-install/7.2/rhel/8.10/amdgpu-install-7.2.70200-1.el8.noarch.rpm && \ + dnf install ./amdgpu-install-7.2.70200-1.el8.noarch.rpm -y && \ + dnf update -y && \ + dnf install python3-setuptools python3-wheel -y && \ + dnf install rocm-dev -y + +## Sccache binary built from source for ROCm, only install if CK_SCCACHE is defined +ARG SCCACHE_REPO_URL=http://compute-artifactory.amd.com/artifactory/rocm-generic-experimental/rocm-sccache +ENV SCCACHE_INSTALL_LOCATION=/usr/local/.cargo/bin +ENV PATH=$PATH:${SCCACHE_INSTALL_LOCATION} +ENV CK_SCCACHE=$CK_SCCACHE +RUN if [ "$CK_SCCACHE" != "" ]; then \ + mkdir -p ${SCCACHE_INSTALL_LOCATION} && \ + curl ${SCCACHE_REPO_URL}/portable/0.2.16/sccache-0.2.16-alpha.1-rocm --output ${SCCACHE_INSTALL_LOCATION}/sccache && \ + chmod +x ${SCCACHE_INSTALL_LOCATION}/sccache; \ + fi + 
+# Install dependencies +RUN dnf update -y && DEBIAN_FRONTEND=noninteractive dnf install -y \ + cmake \ + clang-tools-extra \ + gcc-c++ \ + libstdc++ \ + libstdc++-devel \ + libstdc++-static \ + git \ + hip-rocclr \ + jq \ + mpich \ + net-tools \ + pkg-config \ + redis \ + sshpass \ + stunnel \ + vim \ + nano \ + zip \ + openssh-server \ + kmod && \ + dnf clean all && \ + rm -rf /var/lib/apt/lists/* && \ + rm -rf amdgpu-install* && \ +#Install latest ccache + git clone https://github.com/ccache/ccache.git && \ + cd ccache && mkdir build && cd build && cmake .. && make install && \ +#Install ClangBuildAnalyzer + git clone https://github.com/aras-p/ClangBuildAnalyzer.git && \ + cd ClangBuildAnalyzer/ && \ + make -f projects/make/Makefile && \ + cd / && \ +#Install latest cppcheck + git clone https://github.com/danmar/cppcheck.git && \ + cd cppcheck && mkdir build && cd build && cmake .. && cmake --build . && \ + cd / && \ +# Install packages for processing the performance results + pip3 install --break-system-packages --upgrade pytest pymysql pandas==2.2.3 sqlalchemy==2.0.3 setuptools-rust setuptools sshtunnel==0.4.0 && \ +# Add render group + groupadd -f render && \ +# Install the new rocm-cmake version + git clone -b master https://github.com/ROCm/rocm-cmake.git && \ + cd rocm-cmake && mkdir build && cd build && \ + cmake .. && cmake --build . && cmake --build . 
--target install + +WORKDIR / +# Add alternative compilers, if necessary +ENV compiler_version=$compiler_version +ENV compiler_commit=$compiler_commit +RUN sh -c "echo compiler version = '$compiler_version'" && \ + sh -c "echo compiler commit = '$compiler_commit'" + +RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" = "" ]; then \ + git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \ + cd llvm-project && mkdir build && cd build && \ + cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \ + make -j 8 ; \ + else echo "using the release compiler"; \ + fi + +RUN if ( [ "$compiler_version" = "amd-staging" ] || [ "$compiler_version" = "amd-mainline" ] ) && [ "$compiler_commit" != "" ]; then \ + git clone -b "$compiler_version" https://github.com/ROCm/llvm-project.git && \ + cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \ + cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm && \ + make -j 8 ; \ + else echo "using the release compiler"; \ + fi + diff --git a/include/ck/utility/sequence.hpp b/include/ck/utility/sequence.hpp index 6e68690048..3a45d52bd3 100644 --- a/include/ck/utility/sequence.hpp +++ b/include/ck/utility/sequence.hpp @@ -199,55 +199,113 @@ template using make_index_sequence = typename __make_integer_seq::seq_type; -// merge sequence -template -struct sequence_merge +// merge sequence - optimized to avoid recursive instantiation +// +// Note: Unlike sequence_gen and uniform_sequence_gen which use __make_integer_seq for O(1) +// instantiation depth, 
sequence_merge cannot achieve O(1) depth. Here's why: +// +// - sequence_gen and uniform_sequence_gen generate a SINGLE output sequence where each +// element can be computed independently: output[i] = f(i) +// +// - sequence_merge takes MULTIPLE input sequences with different, unknown lengths. +// To compute output[i], we need to know: +// 1. Which input sequence contains this index +// 2. The offset within that sequence +// This requires computing cumulative sequence lengths, which requires recursion/iteration. +// +// Instead, we use a binary tree reduction approach that achieves O(log N) instantiation depth: +// - Base cases handle 1-4 sequences directly (O(1) for common cases) +// - Recursive case merges pairs then combines: merge(s1,s2) + merge(s3,s4,...) +// - This gives O(log N) depth, which is optimal for merging heterogeneous sequences +// NOTE(review): as described, merge(s3,s4,...) recurses on all remaining sequences, so each +// level removes only a fixed number of inputs; that would make the depth grow linearly in N +// rather than O(log N) - confirm against the general-case specialization. +// +// Alternative considered: Fold expressions (... + sequences) would give O(N) depth due to +// linear dependency chain, so binary tree is superior. 
+// +namespace detail { + +// Helper to concatenate sequences: direct specializations for 1-4 inputs, recursive reduction otherwise +template +struct sequence_merge_impl; + +// Base case: single sequence +template +struct sequence_merge_impl> { - using type = typename sequence_merge::type>::type; + using type = Sequence; }; +// Two sequences: direct concatenation template -struct sequence_merge, Sequence> +struct sequence_merge_impl, Sequence> { using type = Sequence; }; -template -struct sequence_merge +// Three sequences: direct concatenation (avoids one level of recursion) +template +struct sequence_merge_impl, Sequence, Sequence> { - using type = Seq; + using type = Sequence; }; -// generate sequence +// Four sequences: direct concatenation +template +struct sequence_merge_impl, Sequence, Sequence, Sequence> +{ + using type = Sequence; +}; + +// General case: binary tree reduction (O(log N) depth instead of O(N)) +template +struct sequence_merge_impl +{ + // Merge pairs first, then recurse + using left = typename sequence_merge_impl::type; + using right = typename sequence_merge_impl::type; + using type = typename sequence_merge_impl::type; +}; + +} // namespace detail + +template +struct sequence_merge +{ + using type = typename detail::sequence_merge_impl::type; +}; + +template <> +struct sequence_merge<> +{ + using type = Sequence<>; +}; + +// generate sequence - optimized using __make_integer_seq to avoid recursive instantiation +namespace detail { + +// Helper that applies functor F to indices and produces a Sequence +// __make_integer_seq produces sequence_gen_helper +template +struct sequence_gen_helper +{ + // Apply a functor F to all indices at once via pack expansion (O(1) depth) + template + using apply = Sequence{})...>; +}; + +} // namespace detail + template struct sequence_gen { - template - struct sequence_gen_impl - { - static constexpr index_t NRemainLeft = NRemain / 2; - static constexpr index_t NRemainRight = NRemain - NRemainLeft; - static constexpr index_t IMiddle = 
IBegin + NRemainLeft; + using type = + typename __make_integer_seq::template apply; +}; - using type = typename sequence_merge< - typename sequence_gen_impl::type, - typename sequence_gen_impl::type>::type; - }; - - template - struct sequence_gen_impl - { - static constexpr index_t Is = G{}(Number{}); - using type = Sequence; - }; - - template - struct sequence_gen_impl - { - using type = Sequence<>; - }; - - using type = typename sequence_gen_impl<0, NSize, F>::type; +template +struct sequence_gen<0, F> +{ + using type = Sequence<>; }; // arithmetic sequence @@ -283,16 +341,30 @@ struct arithmetic_sequence_gen<0, IEnd, 1> using type = typename __make_integer_seq::type; }; -// uniform sequence +// uniform sequence - optimized using __make_integer_seq +namespace detail { + +template +struct uniform_sequence_helper +{ + // Apply a constant value to all indices via pack expansion + template + using apply = Sequence<((void)Is, Value)...>; +}; + +} // namespace detail + template struct uniform_sequence_gen { - struct F - { - __host__ __device__ constexpr index_t operator()(index_t) const { return I; } - }; + using type = typename __make_integer_seq:: + template apply; +}; - using type = typename sequence_gen::type; +template +struct uniform_sequence_gen<0, I> +{ + using type = Sequence<>; }; // reverse inclusive scan (with init) sequence diff --git a/include/ck/utility/statically_indexed_array.hpp b/include/ck/utility/statically_indexed_array.hpp index d0735a32f6..f3d73e84a7 100644 --- a/include/ck/utility/statically_indexed_array.hpp +++ b/include/ck/utility/statically_indexed_array.hpp @@ -20,6 +20,7 @@ struct tuple_concat, Tuple> using type = Tuple; }; +// StaticallyIndexedArrayImpl uses binary split for O(log N) depth template struct StaticallyIndexedArrayImpl { diff --git a/test/util/unit_sequence.cpp b/test/util/unit_sequence.cpp index f09fd86e06..9e62b9a6c0 100644 --- a/test/util/unit_sequence.cpp +++ b/test/util/unit_sequence.cpp @@ -229,6 +229,32 @@ 
TEST(SequenceGen, UniformSequenceZeroSize) EXPECT_TRUE((is_same::value)); } +TEST(SequenceGen, UniformSequenceSingleElement) +{ + using Result = typename uniform_sequence_gen<1, 99>::type; + using Expected = Sequence<99>; + EXPECT_TRUE((is_same::value)); +} + +TEST(SequenceGen, UniformSequenceDifferentValues) +{ + using Result1 = typename uniform_sequence_gen<3, 0>::type; + using Expected1 = Sequence<0, 0, 0>; + EXPECT_TRUE((is_same::value)); + + using Result2 = typename uniform_sequence_gen<4, -5>::type; + using Expected2 = Sequence<-5, -5, -5, -5>; + EXPECT_TRUE((is_same::value)); +} + +TEST(SequenceGen, UniformSequenceLargeSize) +{ + // Test with larger size to verify __make_integer_seq implementation + using Result = typename uniform_sequence_gen<16, 7>::type; + using Expected = Sequence<7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7>; + EXPECT_TRUE((is_same::value)); +} + // Test make_index_sequence TEST(SequenceGen, MakeIndexSequence) { @@ -244,6 +270,54 @@ TEST(SequenceGen, MakeIndexSequenceZero) EXPECT_TRUE((is_same::value)); } +// Test sequence_gen with custom functors +TEST(SequenceGen, SequenceGenWithDoubleFunctor) +{ + struct DoubleFunctor + { + __host__ __device__ constexpr index_t operator()(index_t i) const { return i * 2; } + }; + using Result = typename sequence_gen<5, DoubleFunctor>::type; + using Expected = Sequence<0, 2, 4, 6, 8>; + EXPECT_TRUE((is_same::value)); +} + +TEST(SequenceGen, SequenceGenWithSquareFunctor) +{ + struct SquareFunctor + { + __host__ __device__ constexpr index_t operator()(index_t i) const { return i * i; } + }; + using Result = typename sequence_gen<5, SquareFunctor>::type; + using Expected = Sequence<0, 1, 4, 9, 16>; + EXPECT_TRUE((is_same::value)); +} + +TEST(SequenceGen, SequenceGenZeroSize) +{ + struct IdentityFunctor + { + __host__ __device__ constexpr index_t operator()(index_t i) const { return i; } + }; + using Result = typename sequence_gen<0, IdentityFunctor>::type; + using Expected = Sequence<>; + 
EXPECT_TRUE((is_same::value)); + // Also verify non-zero size works with identity + using Result5 = typename sequence_gen<5, IdentityFunctor>::type; + EXPECT_TRUE((is_same>::value)); +} + +TEST(SequenceGen, SequenceGenSingleElement) +{ + struct ConstantFunctor + { + __host__ __device__ constexpr index_t operator()(index_t) const { return 42; } + }; + using Result = typename sequence_gen<1, ConstantFunctor>::type; + using Expected = Sequence<42>; + EXPECT_TRUE((is_same::value)); +} + // Test sequence_merge TEST(SequenceMerge, MergeTwoSequences) { @@ -272,6 +346,66 @@ TEST(SequenceMerge, MergeSingleSequence) EXPECT_TRUE((is_same::value)); } +TEST(SequenceMerge, MergeFourSequences) +{ + // Test the 4-sequence specialization + using Seq1 = Sequence<1>; + using Seq2 = Sequence<2, 3>; + using Seq3 = Sequence<4, 5, 6>; + using Seq4 = Sequence<7, 8>; + using Result = typename sequence_merge::type; + using Expected = Sequence<1, 2, 3, 4, 5, 6, 7, 8>; + EXPECT_TRUE((is_same::value)); +} + +TEST(SequenceMerge, MergeFiveSequences) +{ + // Test the binary tree reduction path (5+ sequences) + using Seq1 = Sequence<1>; + using Seq2 = Sequence<2>; + using Seq3 = Sequence<3>; + using Seq4 = Sequence<4>; + using Seq5 = Sequence<5>; + using Result = typename sequence_merge::type; + using Expected = Sequence<1, 2, 3, 4, 5>; + EXPECT_TRUE((is_same::value)); +} + +TEST(SequenceMerge, MergeManySequences) +{ + // Test with many sequences to stress the binary tree reduction + using Seq1 = Sequence<1>; + using Seq2 = Sequence<2>; + using Seq3 = Sequence<3, 4>; + using Seq4 = Sequence<5>; + using Seq5 = Sequence<6, 7>; + using Seq6 = Sequence<8>; + using Seq7 = Sequence<9, 10>; + using Seq8 = Sequence<11, 12>; + using Result = typename sequence_merge::type; + using Expected = Sequence<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12>; + EXPECT_TRUE((is_same::value)); +} + +TEST(SequenceMerge, MergeEmptySequences) +{ + // Test merging empty sequences + using Seq1 = Sequence<>; + using Seq2 = Sequence<1, 
2>; + using Seq3 = Sequence<>; + using Result = typename sequence_merge::type; + using Expected = Sequence<1, 2>; + EXPECT_TRUE((is_same::value)); +} + +TEST(SequenceMerge, MergeZeroSequences) +{ + // Test the empty specialization + using Result = typename sequence_merge<>::type; + using Expected = Sequence<>; + EXPECT_TRUE((is_same::value)); +} + // Test sequence_split TEST(SequenceSplit, SplitInMiddle) {