[CK_TILE] Support for elementwise kernel (#2246)

* Elementwise kernel implementation Co-authored-by: Sami Aario <samaario@amd.com> Co-authored-by: Mohsen Saffari <mohsen.saffari@amd.com> Co-authored-by: yashagar <yashagar@amd.com> * Elementwise with generalized nDims * Adding the n-ary input tensor feature * Generalize dimensions on top of inputs * Add TFLOPS + remove std usage for tuples * 1D basecase optimization * Cleanup code + refactoring to a common interface * Generalize to unary and add an example * Cleanup, refactoring and commenting * Suggestions for LWPCK-3170: elementwise kernel improvements * Clang-format: remod.py * Replace InputTensorType with XDataType as the type of input_tensors * Add Tuple::apply and use it in ElementWiseKernel::operator to call operation with the exact number of arguments in xs * Move examples to folder 19_elementwise * Add missing copyright headers and fix some existing ones * Replace an assert with throw std::runtime_error in elementwise example * Avoid reading the output by using make_static_distributed_tensor for y_tile * Removed two unused includes * No need to move windows to the next block when each workgroup processes a single tile * Only copy input tensors to the device * Use get_warp_size to obtain warp size, and use ceiling division for grid size also for the unary example * Adding output strides to the kernel, transposition example and update the other examples * Changes made by remod.py * Use default template parameter values for memory operation and coherence in a call to make_naive_tensor_view * Move binary operations to include/ck_tile/ops/elementwise/binary_elementwise_operation.hpp * Reuse generic reference binary/unary operation in examples + refactoring the transpose reference * Fix comments in elementwise_example.cpp - Refer to AMD terminology except when suggesting NVIDIA alternatives in parentheses - ElementWiseTraits was renamed to ElementWiseShape - Adopt suggestions made by Copilot when prompted to check for factual or typographical errors * Simplify CMakeLists.txt and remove the unused variables this uncovers * Rename a file and fix some copyright statements * Changes made by script/clang-format-overwrite.sh * Add basic unit test for ElementWiseKernel * Remove left-over uninformative comment in apply unit test * Changes made by clang-format-overwrite.sh * fixup! Use default template parameter values for memory operation and coherence in a call to make_naive_tensor_view * Clean up test_tuple_apply.cpp and test_elementwise_1d.cpp * Use make_uniform_array_with_factory to define h_xs and d_xs_mems_owner as type std::array * Use a DeviceMem constructor that calls get_element_space_size_in_bytes internally * Move examples to folder 20_elementwise * Reduced register pressure on the CK tile elementwise kernel + add 4d input example to be able benchmark against old CK * Fix CLang formating * Bump up the elementwise example folder number * Elementwise: add padding + minor cleanup * Add Vector Size inference + fix issue with wrong vectorization due to missing GuaranteedLastDimensionVectorStride setting in make_naive_tensor_view * Add isSupportedArg to Elementwise kernel + addapt example and unit tests * Fix clang-format on the unit test file --------- Co-authored-by: Damien Lejeune <damien.lejeune@amd.com> Co-authored-by: Sami Aario <samaario@amd.com> Co-authored-by: Mohsen Saffari <mohsen.saffari@amd.com> Co-authored-by: Aviral Goel <aviral.goel@amd.com>
2026-04-20 14:59:17 +00:00 · 2025-07-24 12:21:45 +03:00
parent 6681593864
commit 606b0cc947
23 changed files with 1509 additions and 6 deletions
--- a/include/ck_tile/ops/elementwise/kernel/elementwise_kernel.hpp
+++ b/include/ck_tile/ops/elementwise/kernel/elementwise_kernel.hpp
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/common.hpp"
+#include "ck_tile/ops/elementwise/pipeline/elementwise_pipeline_problem.hpp"
+#include "ck_tile/ops/elementwise/pipeline/elementwise_pipeline_default_policy.hpp"
+namespace ck_tile {
+
+template <typename Problem_, typename Policy_>
+struct ElementWiseKernel
+{
+    using Problem = ck_tile::remove_cvref_t<Problem_>;
+    using Policy  = ck_tile::remove_cvref_t<Policy_>;
+
+    using XDataType            = ck_tile::remove_cvref_t<typename Problem::XDataType>;
+    using ComputeDataType      = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
+    using YDataType            = ck_tile::remove_cvref_t<typename Problem::YDataType>;
+    using ElementWiseOperation = ck_tile::remove_cvref_t<typename Problem::ElementWiseOperation>;
+
+    template <typename... XDataType, typename Dims>
+    CK_TILE_DEVICE void operator()(Dims lens,
+                                   Dims input_strides,
+                                   Dims output_strides,
+                                   const tuple<XDataType...>& input_tensors,
+                                   YDataType* p_y) const
+    {
+        using S = typename Problem::BlockShape;
+
+        // Setup block-level coordinates and transforms
+        const index_t iM           = get_block_id() * S::kBlockM;
+        const auto merge_transform = make_merge_transform(lens);
+
+        // Load all input tiles into registers.
+        // The lambda structure here is intended to minimize the lifetime
+        // of intermediate objects (views, windows) used for loading.
+        const auto x_tiles = ck_tile::generate_tuple(
+            [&](auto i) {
+                const auto tensor_view = make_naive_tensor_view<address_space_enum::global>(
+                    input_tensors.get(i), lens, input_strides, number<S::kVectorM>{}, number<1>{});
+
+                const auto transformed_tensor = pad_tensor_view(
+                    transform_tensor_view(tensor_view,
+                                          ck_tile::make_tuple(merge_transform),
+                                          ck_tile::make_tuple(make_index_sequence<Dims::size()>{}),
+                                          ck_tile::make_tuple(sequence<0>{})),
+                    ck_tile::make_tuple(number<S::kBlockM>{}),
+                    sequence<Problem::kPad>{});
+
+                const auto x_window =
+                    make_tile_window(transformed_tensor,
+                                     ck_tile::make_tuple(number<S::kBlockM>{}),
+                                     {iM},
+                                     Policy::template MakeXBlockTileDistribution<Problem>());
+
+                return load_tile(x_window);
+            },
+            number<sizeof...(XDataType)>{});
+
+        // Setup output tile in registers.
+        const auto& x_tile0 = x_tiles.get(number<0>{});
+        auto y_tile = make_static_distributed_tensor<YDataType>(x_tile0.get_tile_distribution());
+
+        // Perform element-wise computation.
+        const auto spans = x_tile0.get_distributed_spans();
+        sweep_tile_span(spans[number<0>{}], [&](auto idx) {
+            const auto tile_idx = make_tuple(idx);
+            apply(
+                [&](auto&&... tiles) {
+                    ElementWiseOperation{}(y_tile(tile_idx),
+                                           type_convert<ComputeDataType>(tiles[tile_idx])...);
+                },
+                x_tiles);
+        });
+
+        // Setup output window and store the result tile.
+        const auto y_m_n = make_naive_tensor_view<address_space_enum::global>(
+            p_y, lens, output_strides, number<S::kVectorM>{});
+
+        const auto transformed_y_m_n = pad_tensor_view(
+            transform_tensor_view(y_m_n,
+                                  ck_tile::make_tuple(merge_transform),
+                                  ck_tile::make_tuple(make_index_sequence<Dims::size()>{}),
+                                  ck_tile::make_tuple(sequence<0>{})),
+            ck_tile::make_tuple(number<S::kBlockM>{}),
+            sequence<Problem::kPad>{});
+
+        auto y_window = make_tile_window(transformed_y_m_n,
+                                         make_tuple(number<S::kBlockM>{}),
+                                         {iM},
+                                         y_tile.get_tile_distribution());
+
+        store_tile(y_window, cast_tile<YDataType>(y_tile));
+    }
+
+    template <typename... Ints>
+    CK_TILE_HOST static bool IsSupportedArgument(const ck_tile::tuple<Ints...>& input_sizes)
+    {
+        int total_elements  = 1;
+        const auto kVectorM = Problem_::BlockShape::kVectorM;
+
+        apply([&](auto&&... args) { ((total_elements *= args), ...); }, input_sizes);
+
+        if((total_elements % kVectorM) != 0)
+        {
+            if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+            {
+                CK_TILE_ERROR("Conditions not met: total number of input elements (",
+                              total_elements,
+                              ") should be multiple of the vectorization size (",
+                              kVectorM,
+                              ")");
+            }
+            return false;
+        }
+
+        return true;
+    }
+};
+
+} // namespace ck_tile