mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-19 22:39:03 +00:00
Dlejeune/ck tile 2d multiple reductions (#3147)
* WIP * Add Unit tests for the Multi Reduction Kernel * clang format * Rename multiblock to threadwise * Multiblock WIP * Fix multi reduce multi block unit tests * Multi Reduce Tile Engine: WIP * refactoring + try addressing precision error * Fix multiops examples * Cleanup * Clean up tile engine's reduce op * Update changelog * Fix remod/clang * Fix dates * Fix documentation & missing file * Fix comments * Use the update_tile api in the multi-block kernel * Unify threadwise/multiblock into a single kernel + default multiblock output to float in tests * Add TilePartitioner * Cleanup * Add warning when no data to process, in the example * Refactoring Reduce kernel Tile Partitioner + cleanup * Move the tile partitioner to its own file * Add missing includes * Fix copyright header with update_amd_copyright_headers.py * Fix change of interface in Reduce2dProblem --------- Co-authored-by: Damien Lejeune <damien.lejeune@amd.com> Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com>
This commit is contained in:
@@ -5,6 +5,7 @@
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/host_tensor.hpp"
|
||||
#include "ck_tile/ops/elementwise.hpp"
|
||||
#include <thread>
|
||||
|
||||
namespace ck_tile {
|
||||
@@ -108,4 +109,233 @@ CK_TILE_HOST void reference_reduce(const HostTensor<XDataType>& x_tensor,
|
||||
|
||||
make_ParallelTensorFunctor(f, total_kept_elements)(std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
template <typename XDataType,
|
||||
typename ComputeDataType,
|
||||
typename YDataType,
|
||||
typename YRefTuple,
|
||||
typename ReduceOps, // Expected type: ck_tile::tuple<...> containing reduce operations
|
||||
typename KeptDim, // Expected type: ck_tile::sequence<...> containing dimension indices to
|
||||
// keep
|
||||
typename ReduceDims, // Expected type: ck_tile::sequence<...> containing dimension indices
|
||||
// to reduce
|
||||
typename ElementWiseOps,
|
||||
typename AccElementWiseOps>
|
||||
CK_TILE_HOST void reference_multiple_reduce(const HostTensor<XDataType>& x_tensor,
|
||||
YRefTuple& y_tensor_tuple,
|
||||
ReduceOps reduce_ops,
|
||||
KeptDim kept_dim,
|
||||
ReduceDims reduce_dims,
|
||||
ElementWiseOps elementwise_ops,
|
||||
AccElementWiseOps accumulator_ops)
|
||||
{
|
||||
const auto& x_lengths = x_tensor.mDesc.get_lengths();
|
||||
|
||||
// Calculate total kept elements (product of all kept dimension lengths)
|
||||
index_t total_kept_elements = 1;
|
||||
static_for<0, kept_dim.size(), 1>{}(
|
||||
[&](auto i) { total_kept_elements *= x_lengths[kept_dim.at(i)]; });
|
||||
|
||||
// Calculate total reduce elements (product of all reduce dimension lengths)
|
||||
index_t total_reduce_elements = 1;
|
||||
static_for<0, reduce_dims.size(), 1>{}(
|
||||
[&](auto i) { total_reduce_elements *= x_lengths[reduce_dims.at(i)]; });
|
||||
|
||||
auto f = [&](auto linear_kept_idx) {
|
||||
// Initialize accumulators for each reduction operation
|
||||
auto v_acc_tuple = ck_tile::generate_tuple(
|
||||
[&](auto i) {
|
||||
return reduce_ops.template at<i>().template GetIdentityValue<ComputeDataType>();
|
||||
},
|
||||
number<reduce_ops.size()>{});
|
||||
|
||||
// Convert linear kept index to multi-dimensional kept indices
|
||||
std::vector<index_t> kept_indices(kept_dim.size());
|
||||
index_t temp_kept = linear_kept_idx;
|
||||
static_for<0, kept_dim.size(), 1>{}([&](auto i) {
|
||||
constexpr auto dim_idx = kept_dim.size() - 1 - i;
|
||||
constexpr auto dim = kept_dim.at(dim_idx);
|
||||
const auto len = x_lengths[dim];
|
||||
kept_indices[dim_idx] = temp_kept % len;
|
||||
temp_kept /= len;
|
||||
});
|
||||
|
||||
for(index_t reduce_idx = 0; reduce_idx < total_reduce_elements; ++reduce_idx)
|
||||
{
|
||||
// Convert linear reduce index to multi-dimensional reduce indices
|
||||
std::vector<index_t> reduce_indices(reduce_dims.size());
|
||||
index_t temp_reduce = reduce_idx;
|
||||
static_for<0, reduce_dims.size(), 1>{}([&](auto i) {
|
||||
constexpr auto dim_idx = reduce_dims.size() - 1 - i;
|
||||
constexpr auto dim = reduce_dims.at(dim_idx);
|
||||
const auto len = x_lengths[dim];
|
||||
reduce_indices[dim_idx] = temp_reduce % len;
|
||||
temp_reduce /= len;
|
||||
});
|
||||
|
||||
// Build full input tensor indices by combining kept and reduce indices
|
||||
std::vector<std::size_t> full_indices(x_lengths.size(), 0);
|
||||
static_for<0, kept_dim.size(), 1>{}(
|
||||
[&](auto i) { full_indices[kept_dim.at(i)] = kept_indices[i]; });
|
||||
static_for<0, reduce_dims.size(), 1>{}(
|
||||
[&](auto i) { full_indices[reduce_dims.at(i)] = reduce_indices[i]; });
|
||||
|
||||
// Access input tensor element
|
||||
auto v_a = type_convert<ComputeDataType>(x_tensor(full_indices));
|
||||
|
||||
// Apply each reduction operation
|
||||
static_for<0, reduce_ops.size(), 1>{}([&](auto i) {
|
||||
// Apply element-wise operation before reduction
|
||||
elementwise_ops.at(i)(v_a, v_a);
|
||||
|
||||
v_acc_tuple.template at<i>() =
|
||||
reduce_ops.template at<i>()(v_acc_tuple.template at<i>(), v_a);
|
||||
});
|
||||
}
|
||||
|
||||
static_for<0, reduce_ops.size(), 1>{}([&](auto i) {
|
||||
// Apply accumulator element-wise operation after reduction
|
||||
accumulator_ops.at(i)(v_acc_tuple.template at<i>(), v_acc_tuple.template at<i>());
|
||||
});
|
||||
|
||||
// Calculate output tensor index using kept indices
|
||||
// The output tensor has the same structure as the kept dimensions
|
||||
std::vector<std::size_t> y_indices(kept_dim.size());
|
||||
static_for<0, kept_dim.size(), 1>{}([&](auto i) { y_indices[i] = kept_indices[i]; });
|
||||
|
||||
// Store results for each reduction operation in the output tensor
|
||||
static_for<0, reduce_ops.size(), 1>{}([&](auto i) {
|
||||
y_tensor_tuple.template at<i>()(y_indices) =
|
||||
type_convert<YDataType>(v_acc_tuple.template at<i>());
|
||||
});
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f, total_kept_elements)(std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
template <typename XDataType,
|
||||
typename ComputeDataType,
|
||||
typename YDataType,
|
||||
typename YRefTuple,
|
||||
typename ReduceOps, // Expected type: ck_tile::tuple<...> containing reduce operations
|
||||
typename KeptDim, // Expected type: ck_tile::sequence<...> containing dimension indices to
|
||||
// keep
|
||||
typename ReduceDims, // Expected type: ck_tile::sequence<...> containing dimension indices
|
||||
// to reduce
|
||||
typename ElementWiseOps,
|
||||
typename AccElementWiseOps,
|
||||
typename InterBlockReduceOps>
|
||||
CK_TILE_HOST void reference_multiple_reduce_multiblock(const HostTensor<XDataType>& x_tensor,
|
||||
YRefTuple& y_tensor_tuple,
|
||||
ReduceOps reduce_ops,
|
||||
KeptDim kept_dim,
|
||||
ReduceDims reduce_dims,
|
||||
ElementWiseOps elementwise_ops,
|
||||
AccElementWiseOps accumulator_ops,
|
||||
InterBlockReduceOps inter_block_reduce_ops,
|
||||
ck_tile::index_t num_blocks)
|
||||
{
|
||||
const auto& x_lengths = x_tensor.mDesc.get_lengths();
|
||||
|
||||
// Calculate total kept elements (product of all kept dimension lengths)
|
||||
index_t total_kept_elements = 1;
|
||||
static_for<0, kept_dim.size(), 1>{}(
|
||||
[&](auto i) { total_kept_elements *= x_lengths[kept_dim.at(i)]; });
|
||||
|
||||
// Calculate total reduce elements (product of all reduce dimension lengths)
|
||||
index_t total_reduce_elements = 1;
|
||||
static_for<0, reduce_dims.size(), 1>{}(
|
||||
[&](auto i) { total_reduce_elements *= x_lengths[reduce_dims.at(i)]; });
|
||||
|
||||
// Initialize output tensors
|
||||
static_for<0, reduce_ops.size(), 1>{}([&](auto i) {
|
||||
auto& y_tensor = y_tensor_tuple.template at<i>();
|
||||
for(auto& val : y_tensor.mData)
|
||||
{
|
||||
val = inter_block_reduce_ops.template at<i>().template GetIdentityValue<YDataType>();
|
||||
}
|
||||
});
|
||||
|
||||
auto f = [&](auto linear_kept_idx) {
|
||||
// Convert linear kept index to multi-dimensional kept indices
|
||||
std::vector<index_t> kept_indices(kept_dim.size());
|
||||
index_t temp_kept = linear_kept_idx;
|
||||
static_for<0, kept_dim.size(), 1>{}([&](auto i) {
|
||||
constexpr auto dim_idx = kept_dim.size() - 1 - i;
|
||||
constexpr auto dim = kept_dim.at(dim_idx);
|
||||
const auto len = x_lengths[dim];
|
||||
kept_indices[dim_idx] = temp_kept % len;
|
||||
temp_kept /= len;
|
||||
});
|
||||
|
||||
// Calculate output tensor index using kept indices
|
||||
std::vector<std::size_t> y_indices(kept_dim.size());
|
||||
static_for<0, kept_dim.size(), 1>{}([&](auto i) { y_indices[i] = kept_indices[i]; });
|
||||
|
||||
const auto max_element_per_block = (total_reduce_elements + num_blocks - 1) / num_blocks;
|
||||
|
||||
for(index_t block_id = 0; block_id < num_blocks; ++block_id)
|
||||
{
|
||||
// Initialize accumulators for each reduction operation for the current block
|
||||
auto v_acc_tuple = ck_tile::generate_tuple(
|
||||
[&](auto i) {
|
||||
return reduce_ops.template at<i>().template GetIdentityValue<ComputeDataType>();
|
||||
},
|
||||
number<reduce_ops.size()>{});
|
||||
|
||||
const index_t element_offset = block_id * max_element_per_block;
|
||||
const index_t element_end =
|
||||
std::min(element_offset + max_element_per_block, total_reduce_elements);
|
||||
|
||||
for(index_t linear_reduce_idx = element_offset; linear_reduce_idx < element_end;
|
||||
++linear_reduce_idx)
|
||||
{
|
||||
// Convert linear reduce index to multi-dimensional reduce indices
|
||||
std::vector<index_t> reduce_indices(reduce_dims.size());
|
||||
index_t temp_reduce = linear_reduce_idx;
|
||||
static_for<0, reduce_dims.size(), 1>{}([&](auto i) {
|
||||
constexpr auto dim_idx = reduce_dims.size() - 1 - i;
|
||||
constexpr auto dim = reduce_dims.at(dim_idx);
|
||||
const auto len = x_lengths[dim];
|
||||
reduce_indices[dim_idx] = temp_reduce % len;
|
||||
temp_reduce /= len;
|
||||
});
|
||||
|
||||
// Build full input tensor indices by combining kept and reduce indices
|
||||
std::vector<std::size_t> full_indices(x_lengths.size(), 0);
|
||||
static_for<0, kept_dim.size(), 1>{}(
|
||||
[&](auto i) { full_indices[kept_dim.at(i)] = kept_indices[i]; });
|
||||
static_for<0, reduce_dims.size(), 1>{}(
|
||||
[&](auto i) { full_indices[reduce_dims.at(i)] = reduce_indices[i]; });
|
||||
|
||||
// Access input tensor element
|
||||
const auto v_a_in = type_convert<ComputeDataType>(x_tensor(full_indices));
|
||||
|
||||
// Apply each reduction operation
|
||||
static_for<0, reduce_ops.size(), 1>{}([&](auto i) {
|
||||
auto v_a = v_a_in;
|
||||
// Apply element-wise operation before reduction
|
||||
elementwise_ops.at(i)(v_a, v_a);
|
||||
|
||||
v_acc_tuple.template at<i>() =
|
||||
reduce_ops.template at<i>()(v_acc_tuple.template at<i>(), v_a);
|
||||
});
|
||||
}
|
||||
|
||||
static_for<0, reduce_ops.size(), 1>{}([&](auto i) {
|
||||
// Apply accumulator element-wise operation after reduction
|
||||
accumulator_ops.at(i)(v_acc_tuple.template at<i>(), v_acc_tuple.template at<i>());
|
||||
|
||||
// Update the output tensor with the partial result from this block
|
||||
auto& y_tensor = y_tensor_tuple.template at<i>();
|
||||
auto& y_val = y_tensor(y_indices);
|
||||
y_val = inter_block_reduce_ops.template at<i>()(
|
||||
y_val, type_convert<YDataType>(v_acc_tuple.template at<i>()));
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f, total_kept_elements)(std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
} // namespace ck_tile
|
||||
|
||||
Reference in New Issue
Block a user