[CK_TILE] Support for elementwise kernel (#2246)

* Elementwise kernel implementation

Co-authored-by: Sami Aario <samaario@amd.com>
Co-authored-by: Mohsen Saffari <mohsen.saffari@amd.com>
Co-authored-by: yashagar <yashagar@amd.com>

* Elementwise with generalized nDims

* Adding the n-ary input tensor feature

* Generalize dimensions on top of inputs

* Add TFLOPS + remove std usage for tuples

* 1D basecase optimization

* Cleanup code + refactoring to a common interface

* Generalize to unary and add an example

* Cleanup, refactoring and commenting

* Suggestions for LWPCK-3170: elementwise kernel improvements

* Clang-format: remod.py

* Replace InputTensorType with XDataType as the type of input_tensors

* Add Tuple::apply and use it in ElementWiseKernel::operator to call operation with the exact number of arguments in xs

* Move examples to folder 19_elementwise

* Add missing copyright headers and fix some existing ones

* Replace an assert with throw std::runtime_error in elementwise example

* Avoid reading the output by using make_static_distributed_tensor for y_tile

* Removed two unused includes

* No need to move windows to the next block when each workgroup processes a single tile

* Only copy input tensors to the device

* Use get_warp_size to obtain warp size, and use ceiling division for grid size also for the unary example

* Adding output strides to the kernel, transposition example and update the other examples

* Changes made by remod.py

* Use default template parameter values for memory operation and coherence in a call to make_naive_tensor_view

* Move binary operations to include/ck_tile/ops/elementwise/binary_elementwise_operation.hpp

* Reuse generic reference binary/unary operation in examples + refactoring the transpose reference

* Fix comments in elementwise_example.cpp

- Refer to AMD terminology except when suggesting NVIDIA alternatives in parentheses
- ElementWiseTraits was renamed to ElementWiseShape
- Adopt suggestions made by Copilot when prompted to check for factual or typographical errors

* Simplify CMakeLists.txt and remove the unused variables this uncovers

* Rename a file and fix some copyright statements

* Changes made by script/clang-format-overwrite.sh

* Add basic unit test for ElementWiseKernel

* Remove left-over uninformative comment in apply unit test

* Changes made by clang-format-overwrite.sh

* fixup! Use default template parameter values for memory operation and coherence in a call to make_naive_tensor_view

* Clean up test_tuple_apply.cpp and test_elementwise_1d.cpp

* Use make_uniform_array_with_factory to define h_xs and d_xs_mems_owner as type std::array

* Use a DeviceMem constructor that calls get_element_space_size_in_bytes internally

* Move examples to folder 20_elementwise

* Reduced register pressure on the CK tile elementwise kernel + add 4d input example to be able benchmark against old CK

* Fix CLang formating

* Bump up the elementwise example folder number

* Elementwise: add padding + minor cleanup

* Add Vector Size inference + fix issue with wrong vectorization due to missing GuaranteedLastDimensionVectorStride setting in make_naive_tensor_view

* Add isSupportedArg to Elementwise kernel + addapt example and unit tests

* Fix clang-format on the unit test file

---------

Co-authored-by: Damien Lejeune <damien.lejeune@amd.com>
Co-authored-by: Sami Aario <samaario@amd.com>
Co-authored-by: Mohsen Saffari <mohsen.saffari@amd.com>
Co-authored-by: Aviral Goel <aviral.goel@amd.com>
This commit is contained in:
Yashvardhan Agarwal
2025-07-24 12:21:45 +03:00
committed by GitHub
parent 6681593864
commit 606b0cc947
23 changed files with 1509 additions and 6 deletions

View File

@@ -0,0 +1,123 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/common.hpp"
#include "ck_tile/ops/elementwise/pipeline/elementwise_pipeline_problem.hpp"
#include "ck_tile/ops/elementwise/pipeline/elementwise_pipeline_default_policy.hpp"
namespace ck_tile {
template <typename Problem_, typename Policy_>
struct ElementWiseKernel
{
using Problem = ck_tile::remove_cvref_t<Problem_>;
using Policy = ck_tile::remove_cvref_t<Policy_>;
using XDataType = ck_tile::remove_cvref_t<typename Problem::XDataType>;
using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
using YDataType = ck_tile::remove_cvref_t<typename Problem::YDataType>;
using ElementWiseOperation = ck_tile::remove_cvref_t<typename Problem::ElementWiseOperation>;
template <typename... XDataType, typename Dims>
CK_TILE_DEVICE void operator()(Dims lens,
Dims input_strides,
Dims output_strides,
const tuple<XDataType...>& input_tensors,
YDataType* p_y) const
{
using S = typename Problem::BlockShape;
// Setup block-level coordinates and transforms
const index_t iM = get_block_id() * S::kBlockM;
const auto merge_transform = make_merge_transform(lens);
// Load all input tiles into registers.
// The lambda structure here is intended to minimize the lifetime
// of intermediate objects (views, windows) used for loading.
const auto x_tiles = ck_tile::generate_tuple(
[&](auto i) {
const auto tensor_view = make_naive_tensor_view<address_space_enum::global>(
input_tensors.get(i), lens, input_strides, number<S::kVectorM>{}, number<1>{});
const auto transformed_tensor = pad_tensor_view(
transform_tensor_view(tensor_view,
ck_tile::make_tuple(merge_transform),
ck_tile::make_tuple(make_index_sequence<Dims::size()>{}),
ck_tile::make_tuple(sequence<0>{})),
ck_tile::make_tuple(number<S::kBlockM>{}),
sequence<Problem::kPad>{});
const auto x_window =
make_tile_window(transformed_tensor,
ck_tile::make_tuple(number<S::kBlockM>{}),
{iM},
Policy::template MakeXBlockTileDistribution<Problem>());
return load_tile(x_window);
},
number<sizeof...(XDataType)>{});
// Setup output tile in registers.
const auto& x_tile0 = x_tiles.get(number<0>{});
auto y_tile = make_static_distributed_tensor<YDataType>(x_tile0.get_tile_distribution());
// Perform element-wise computation.
const auto spans = x_tile0.get_distributed_spans();
sweep_tile_span(spans[number<0>{}], [&](auto idx) {
const auto tile_idx = make_tuple(idx);
apply(
[&](auto&&... tiles) {
ElementWiseOperation{}(y_tile(tile_idx),
type_convert<ComputeDataType>(tiles[tile_idx])...);
},
x_tiles);
});
// Setup output window and store the result tile.
const auto y_m_n = make_naive_tensor_view<address_space_enum::global>(
p_y, lens, output_strides, number<S::kVectorM>{});
const auto transformed_y_m_n = pad_tensor_view(
transform_tensor_view(y_m_n,
ck_tile::make_tuple(merge_transform),
ck_tile::make_tuple(make_index_sequence<Dims::size()>{}),
ck_tile::make_tuple(sequence<0>{})),
ck_tile::make_tuple(number<S::kBlockM>{}),
sequence<Problem::kPad>{});
auto y_window = make_tile_window(transformed_y_m_n,
make_tuple(number<S::kBlockM>{}),
{iM},
y_tile.get_tile_distribution());
store_tile(y_window, cast_tile<YDataType>(y_tile));
}
template <typename... Ints>
CK_TILE_HOST static bool IsSupportedArgument(const ck_tile::tuple<Ints...>& input_sizes)
{
int total_elements = 1;
const auto kVectorM = Problem_::BlockShape::kVectorM;
apply([&](auto&&... args) { ((total_elements *= args), ...); }, input_sizes);
if((total_elements % kVectorM) != 0)
{
if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
{
CK_TILE_ERROR("Conditions not met: total number of input elements (",
total_elements,
") should be multiple of the vectorization size (",
kVectorM,
")");
}
return false;
}
return true;
}
};
} // namespace ck_tile