[CK_TILE] Support for elementwise kernel (#2246)

* Elementwise kernel implementation Co-authored-by: Sami Aario <samaario@amd.com> Co-authored-by: Mohsen Saffari <mohsen.saffari@amd.com> Co-authored-by: yashagar <yashagar@amd.com> * Elementwise with generalized nDims * Adding the n-ary input tensor feature * Generalize dimensions on top of inputs * Add TFLOPS + remove std usage for tuples * 1D basecase optimization * Cleanup code + refactoring to a common interface * Generalize to unary and add an example * Cleanup, refactoring and commenting * Suggestions for LWPCK-3170: elementwise kernel improvements * Clang-format: remod.py * Replace InputTensorType with XDataType as the type of input_tensors * Add Tuple::apply and use it in ElementWiseKernel::operator to call operation with the exact number of arguments in xs * Move examples to folder 19_elementwise * Add missing copyright headers and fix some existing ones * Replace an assert with throw std::runtime_error in elementwise example * Avoid reading the output by using make_static_distributed_tensor for y_tile * Removed two unused includes * No need to move windows to the next block when each workgroup processes a single tile * Only copy input tensors to the device * Use get_warp_size to obtain warp size, and use ceiling division for grid size also for the unary example * Adding output strides to the kernel, transposition example and update the other examples * Changes made by remod.py * Use default template parameter values for memory operation and coherence in a call to make_naive_tensor_view * Move binary operations to include/ck_tile/ops/elementwise/binary_elementwise_operation.hpp * Reuse generic reference binary/unary operation in examples + refactoring the transpose reference * Fix comments in elementwise_example.cpp - Refer to AMD terminology except when suggesting NVIDIA alternatives in parentheses - ElementWiseTraits was renamed to ElementWiseShape - Adopt suggestions made by Copilot when prompted to check for factual or typographical errors * Simplify CMakeLists.txt and remove the unused variables this uncovers * Rename a file and fix some copyright statements * Changes made by script/clang-format-overwrite.sh * Add basic unit test for ElementWiseKernel * Remove left-over uninformative comment in apply unit test * Changes made by clang-format-overwrite.sh * fixup! Use default template parameter values for memory operation and coherence in a call to make_naive_tensor_view * Clean up test_tuple_apply.cpp and test_elementwise_1d.cpp * Use make_uniform_array_with_factory to define h_xs and d_xs_mems_owner as type std::array * Use a DeviceMem constructor that calls get_element_space_size_in_bytes internally * Move examples to folder 20_elementwise * Reduced register pressure on the CK tile elementwise kernel + add 4d input example to be able benchmark against old CK * Fix CLang formating * Bump up the elementwise example folder number * Elementwise: add padding + minor cleanup * Add Vector Size inference + fix issue with wrong vectorization due to missing GuaranteedLastDimensionVectorStride setting in make_naive_tensor_view * Add isSupportedArg to Elementwise kernel + addapt example and unit tests * Fix clang-format on the unit test file --------- Co-authored-by: Damien Lejeune <damien.lejeune@amd.com> Co-authored-by: Sami Aario <samaario@amd.com> Co-authored-by: Mohsen Saffari <mohsen.saffari@amd.com> Co-authored-by: Aviral Goel <aviral.goel@amd.com>
2026-05-04 13:41:24 +00:00 · 2025-07-24 12:21:45 +03:00
parent 6681593864
commit 606b0cc947
23 changed files with 1509 additions and 6 deletions
--- a/include/ck_tile/core/container/tuple.hpp
+++ b/include/ck_tile/core/container/tuple.hpp
@@ -264,10 +264,14 @@ struct tuple : impl::tuple_base<make_index_sequence<sizeof...(T)>, T...>

 #define TP_COM_() static_assert(I < size(), "wrong! out of range")
    // clang-format off
-    template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) get() const          { TP_COM_(); return impl::getv<I>(*this); }
-    template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) get(number<I>) const { TP_COM_(); return get<I>(); }
-    template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) get()                      { TP_COM_(); return impl::getv<I>(*this); }
-    template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) get(number<I>)             { TP_COM_(); return get<I>(); }
+    template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) get() const &          { TP_COM_(); return impl::getv<I>(*this); }
+    template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) get(number<I>) const & { TP_COM_(); return get<I>(); }
+    template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) get() &                { TP_COM_(); return impl::getv<I>(*this); }
+    template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) get(number<I>) &       { TP_COM_(); return get<I>(); }
+    template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) get() &&               { TP_COM_(); return impl::getv<I>(std::move(*this)); }
+    template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) get(number<I>) &&      { TP_COM_(); return std::move(*this).template get<I>(); }
+    template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) get() const &&         { TP_COM_(); return impl::getv<I>(std::move(*this)); }
+    template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) get(number<I>) const &&{ TP_COM_(); return std::move(*this).template get<I>(); }

    template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) at() const          { TP_COM_(); return impl::getv<I>(*this); }
    template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) at(number<I>) const { TP_COM_(); return get<I>(); }
@@ -470,6 +474,12 @@ transform_tuples_impl(F f, const X& x, const Y& y, const Z& z, sequence<Is...>)
    return make_tuple(f(x.at(number<Is>{}), y.at(number<Is>{}), z.at(number<Is>{}))...);
 }

+template <typename F, typename Tuple, index_t... Is>
+constexpr decltype(auto) apply_impl(F&& f, Tuple&& t, sequence<Is...>)
+{
+    return std::forward<F>(f)(std::forward<Tuple>(t).get(number<Is>{})...);
+}
+
 } // namespace detail

 template <typename F, typename X>
@@ -493,6 +503,13 @@ CK_TILE_HOST_DEVICE constexpr auto transform_tuples(F f, const X& x, const Y& y,
        f, x, y, z, typename arithmetic_sequence_gen<0, X::size(), 1>::type{});
 }

+template <typename F, typename Tuple>
+constexpr decltype(auto) apply(F&& f, Tuple&& t)
+{
+    constexpr index_t N = std::decay_t<Tuple>::size();
+    return detail::apply_impl(std::forward<F>(f), std::forward<Tuple>(t), make_index_sequence<N>{});
+}
+
 namespace detail {

 template <typename F, typename X, index_t... Is>