Mirror of https://github.com/ROCm/composable_kernel.git, synced 2026-05-01 12:11:19 +00:00
[CK_TILE] Support for elementwise kernel (#2246)
* Elementwise kernel implementation

  Co-authored-by: Sami Aario <samaario@amd.com>
  Co-authored-by: Mohsen Saffari <mohsen.saffari@amd.com>
  Co-authored-by: yashagar <yashagar@amd.com>

* Elementwise with generalized nDims
* Adding the n-ary input tensor feature
* Generalize dimensions on top of inputs
* Add TFLOPS + remove std usage for tuples
* 1D base-case optimization
* Cleanup code + refactoring to a common interface
* Generalize to unary and add an example
* Cleanup, refactoring and commenting
* Suggestions for LWPCK-3170: elementwise kernel improvements
* Clang-format: remod.py
* Replace InputTensorType with XDataType as the type of input_tensors
* Add Tuple::apply and use it in ElementWiseKernel::operator() to call the operation with the exact number of arguments in xs
* Move examples to folder 19_elementwise
* Add missing copyright headers and fix some existing ones
* Replace an assert with throw std::runtime_error in the elementwise example
* Avoid reading the output by using make_static_distributed_tensor for y_tile
* Removed two unused includes
* No need to move windows to the next block when each workgroup processes a single tile
* Only copy input tensors to the device
* Use get_warp_size to obtain the warp size, and use ceiling division for the grid size also in the unary example
* Adding output strides to the kernel, a transposition example, and updates to the other examples
* Changes made by remod.py
* Use default template parameter values for memory operation and coherence in a call to make_naive_tensor_view
* Move binary operations to include/ck_tile/ops/elementwise/binary_elementwise_operation.hpp
* Reuse generic reference binary/unary operation in examples + refactoring the transpose reference
* Fix comments in elementwise_example.cpp:
  - Refer to AMD terminology except when suggesting NVIDIA alternatives in parentheses
  - ElementWiseTraits was renamed to ElementWiseShape
  - Adopt suggestions made by Copilot when prompted to check for factual or typographical errors
* Simplify CMakeLists.txt and remove the unused variables this uncovers
* Rename a file and fix some copyright statements
* Changes made by script/clang-format-overwrite.sh
* Add basic unit test for ElementWiseKernel
* Remove left-over uninformative comment in apply unit test
* Changes made by clang-format-overwrite.sh
* fixup! Use default template parameter values for memory operation and coherence in a call to make_naive_tensor_view
* Clean up test_tuple_apply.cpp and test_elementwise_1d.cpp
* Use make_uniform_array_with_factory to define h_xs and d_xs_mems_owner as type std::array
* Use a DeviceMem constructor that calls get_element_space_size_in_bytes internally
* Move examples to folder 20_elementwise
* Reduced register pressure in the CK tile elementwise kernel + add a 4D input example to be able to benchmark against old CK
* Fix clang formatting
* Bump up the elementwise example folder number
* Elementwise: add padding + minor cleanup
* Add vector size inference + fix issue with wrong vectorization due to missing GuaranteedLastDimensionVectorStride setting in make_naive_tensor_view
* Add isSupportedArg to the elementwise kernel + adapt the example and unit tests
* Fix clang-format on the unit test file

--------

Co-authored-by: Damien Lejeune <damien.lejeune@amd.com>
Co-authored-by: Sami Aario <samaario@amd.com>
Co-authored-by: Mohsen Saffari <mohsen.saffari@amd.com>
Co-authored-by: Aviral Goel <aviral.goel@amd.com>
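For orientation, here is a condensed sketch (not part of the commit) of how the new kernel is wired up in the simplest case, a 1-D PassThrough copy with a single input. The type names and the launch-argument order are taken from the transpose example added below; run_copy, n, x_dev and y_dev are placeholders, and the 1-D stride layout is an assumption extrapolated from that example.

// Hedged sketch of a 1-D PassThrough elementwise launch, condensed from the example in this diff.
#include "ck_tile/host.hpp"
#include "ck_tile/ops/elementwise.hpp"

float run_copy(ck_tile::index_t n, ck_tile::half_t* x_dev, ck_tile::half_t* y_dev)
{
    using Shape   = ck_tile::ElementWiseShape<ck_tile::sequence<8>,    // wavefronts per workgroup
                                              ck_tile::sequence<1024>, // elements per block tile
                                              ck_tile::sequence<64>,   // elements per warp tile
                                              float>;                  // compute type
    using Problem = ck_tile::ElementWisePipelineProblem<ck_tile::half_t, float, ck_tile::half_t,
                                                        Shape, ck_tile::element_wise::PassThrough>;
    using Kernel  = ck_tile::ElementWiseKernel<Problem, ck_tile::ElementWiseDefaultPolicy>;

    const ck_tile::index_t grid = (n + 1023) / 1024; // one 1024-element block tile per workgroup

    // Same argument shape as the example: stream config, then
    // make_kernel<BlockSize, BlocksPerCu>(kernel, grid, block, lds, kernel arguments...).
    return launch_kernel(ck_tile::stream_config{nullptr, true, 0, 10, 50},
                         ck_tile::make_kernel<512, 1>(Kernel{},
                                                      grid,
                                                      512, // 8 wavefronts x 64 lanes
                                                      0,   // shared memory
                                                      ck_tile::make_tuple(n), // logical lengths
                                                      ck_tile::make_tuple(1), // input strides
                                                      ck_tile::make_tuple(1), // output strides
                                                      ck_tile::make_tuple(x_dev),
                                                      y_dev));
}

The transpose example below follows the same pattern, just with two logical dimensions and non-trivial output strides.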
committed by GitHub · parent 6681593864 · commit 606b0cc947
example/ck_tile/21_elementwise/elementwise_example_transpose.cpp (new file, 156 lines)
@@ -0,0 +1,156 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.

#include "ck_tile/host.hpp"
#include "ck_tile/ops/elementwise.hpp"
#include "ck_tile/host/reference/reference_transpose.hpp"

auto create_args(int argc, char* argv[])
{
    ck_tile::ArgParser arg_parser;
    arg_parser.insert("m", "1024", "m dimension of input")
        .insert("n", "1024", "n dimension of input")
        .insert("stride_in", "-1", "stride for input M dim, if -1 then equal to n")
        .insert("v", "1", "cpu validation or not")
        .insert("prec", "fp16", "precision")
        .insert("warmup", "10", "cold iter")
        .insert("repeat", "50", "hot iter");

    bool result = arg_parser.parse(argc, argv);
    return std::make_tuple(result, arg_parser);
}

template <typename DataType>
bool run(const ck_tile::ArgParser& arg_parser)
{
    ck_tile::index_t M         = arg_parser.get_int("m");
    ck_tile::index_t N         = arg_parser.get_int("n");
    ck_tile::index_t stride_in = arg_parser.get_int("stride_in");

    if(stride_in < 0)
        stride_in = N; // Dense input: stride for M dim is N
    std::string data_type = arg_parser.get_str("prec");
    int do_validation     = arg_parser.get_int("v");
    int warmup            = arg_parser.get_int("warmup");
    int repeat            = arg_parser.get_int("repeat");

    if(stride_in < N)
    {
        throw std::runtime_error("stride_in must be >= N");
    }

    using XDataType       = DataType;
    using ComputeDataType = float;
    using YDataType       = DataType;
    // Use PassThrough operation for transposition (data is moved, not changed)
    using XElementwiseOperation = ck_tile::element_wise::PassThrough;
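    // PassThrough leaves each element's value unchanged; the transposition itself is expressed
    // purely through the output strides passed to the kernel further below.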

    // 1. Initialize the input data on the host (CPU).
    // Input x_host_a: M x N
    // Output y_host: N x M (transposed)
    ck_tile::HostTensor<XDataType> x_host_a({M, N}, {stride_in, 1});
    // Output tensor y_host will have dimensions N x M.
    // Assuming dense output, its stride for the N dimension will be M.
    ck_tile::index_t stride_out_dim0 = M;
    ck_tile::HostTensor<YDataType> y_host({N, M}, {stride_out_dim0, 1});
    ck_tile::HostTensor<YDataType> y_validation({N, M}, {stride_out_dim0, 1});

    // The logical shape for the element-wise operation kernel is based on the input tensor's
    // elements.
    std::vector<ck_tile::index_t> op_shape_vec = {M, N};
    auto op_lengths = ck_tile::make_tuple(M, N); // Lens for the kernel

    ck_tile::FillUniformDistribution<XDataType>{0.f, 5.f}(x_host_a);

    // 2. Create device memory buffers
    ck_tile::DeviceMem x_buf_a(x_host_a.get_element_space_size_in_bytes());
    ck_tile::DeviceMem y_buf(y_host.get_element_space_size_in_bytes()); // y_host is N x M

    x_buf_a.ToDevice(x_host_a.data());

    // 3. Configure the kernel execution parameters.
    using BlockTile  = ck_tile::sequence<1024>;
    using BlockWarps = ck_tile::sequence<8>;
    using WarpTile   = ck_tile::sequence<64>;

    using Shape = ck_tile::ElementWiseShape<BlockWarps, BlockTile, WarpTile, ComputeDataType>;

    // Problem definition for a single input tensor
    using Problem = ck_tile::ElementWisePipelineProblem<XDataType,
                                                        ComputeDataType,
                                                        YDataType,
                                                        Shape,
                                                        XElementwiseOperation>;

    using Kernel = ck_tile::ElementWiseKernel<Problem, ck_tile::ElementWiseDefaultPolicy>;
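    // The Problem type bundles the element types, the tile Shape and the elementwise operation;
    // ElementWiseKernel pairs it with the default policy.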

    ck_tile::index_t total_elements = M * N;

    constexpr ck_tile::index_t kBlockSize         = 64 * BlockWarps::at(ck_tile::number<0>{});
    constexpr ck_tile::index_t kBlockPerCu        = 1;
    constexpr ck_tile::index_t elements_per_block = BlockTile::at(ck_tile::number<0>{});
    ck_tile::index_t kGridSize = (total_elements + elements_per_block - 1) / elements_per_block;
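    // Sizing, for the default M = N = 1024: kBlockSize = 8 wavefronts * 64 lanes = 512 threads,
    // each workgroup handles one 1024-element block tile, and
    // kGridSize = ceil(1024 * 1024 / 1024) = 1024 workgroups.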

    std::cout << "Input M=" << M << ", N=" << N << ", StrideIn=" << stride_in << std::endl;
    std::cout << "Output N=" << N << ", M=" << M << ", StrideOut=" << stride_out_dim0 << std::endl;
    std::cout << "Grid size = " << kGridSize << ", BlockSize = " << kBlockSize << std::endl;
    std::cout << "Total elements = " << total_elements << std::endl;

    // Input tensors tuple (single input)
    auto input_tensors = ck_tile::make_tuple(static_cast<XDataType*>(x_buf_a.GetDeviceBuffer()));
    // Input strides, one entry per logical (M, N) dimension of the input
    auto input_strides = ck_tile::make_tuple(stride_in, 1);
    // Output strides (for N x M tensor, dense)
    auto output_strides = ck_tile::make_tuple(1, stride_out_dim0);
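    // Why (1, stride_out_dim0): logical element (m, n) of the input is written to y[n][m],
    // i.e. linear offset n * M + m in the dense N x M output, so the stride along the M
    // dimension is 1 and along the N dimension is stride_out_dim0 = M.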

    // Check if the kernel configuration is supported
    if(!Kernel::IsSupportedArgument(op_lengths))
    {
        throw std::runtime_error(
            "The kernel configuration is not supported for the given input size.");
    }

    // 4. Run the kernel
    float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
                                   ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
                                       Kernel{},
                                       kGridSize,
                                       kBlockSize,
                                       0,              // Shared memory
                                       op_lengths,     // Logical dimensions for the operation (M, N)
                                       input_strides,  // Strides for input tensor(s)
                                       output_strides, // Strides for output tensor (N, M)
                                       input_tensors,
                                       static_cast<YDataType*>(y_buf.GetDeviceBuffer())));

    std::cout << "Average time: " << ave_time << " ms" << std::endl;

    // 5. Verify the output
    bool pass = true;
    if(do_validation)
    {
        y_buf.FromDevice(y_validation.data()); // Copy result from device to y_validation
        ck_tile::reference_transpose_elementwise<XDataType, YDataType>(
            x_host_a, y_host); // Compute reference on host
        pass = ck_tile::check_err(
            y_validation, y_host, "Transpose Error: Incorrect results!", 0.01, 0.01);
    }

    return pass;
}

int main(int argc, char* argv[])
{
    auto [result, arg_parser] = create_args(argc, argv);
    if(!result)
        return -1;

    const std::string data_type = arg_parser.get_str("prec");
    if(data_type == "fp16")
    {
        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
    }

    std::cerr << "Unsupported data type: " << data_type << std::endl;
    return -3;
}