composable_kernel/example/ck_tile/21_elementwise/elementwise_example.cpp

// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.

#include "ck_tile/core/arch/arch.hpp"
#include "ck_tile/host.hpp"
#include "ck_tile/ops/elementwise.hpp"
#include "ck_tile/host/reference/reference_elementwise.hpp"

auto create_args(int argc, char* argv[])
{
    ck_tile::ArgParser arg_parser;
    arg_parser.insert("m", "1024", "m dimension")
        .insert("n", "1024", "n dimension")
        .insert("stride", "-1", "stride per row, if -1 then equal to n")
        .insert("v", "1", "cpu validation or not")
        .insert("prec", "fp16", "precision")
        .insert("warmup", "10", "cold iter")
        .insert("repeat", "50", "hot iter");

    bool result = arg_parser.parse(argc, argv);
    return std::make_tuple(result, arg_parser);
}

template <typename DataType>
bool run(const ck_tile::ArgParser& arg_parser)
{
    ck_tile::index_t M      = arg_parser.get_int("m");
    ck_tile::index_t N      = arg_parser.get_int("n");
    ck_tile::index_t stride = arg_parser.get_int("stride");

    // If stride is negative (default -1), set it to N, assuming a dense row-major layout.
    if(stride < 0)
        stride = N;
    std::string data_type = arg_parser.get_str("prec");
    int do_validation     = arg_parser.get_int("v");
    int warmup            = arg_parser.get_int("warmup");
    int repeat            = arg_parser.get_int("repeat");

    if(stride < N)
    {
        throw std::runtime_error("stride must be >= N");
    }

    // Define type aliases for clarity.
    // XDataType: Data type of the input tensors.
    // ComputeDataType: Data type used for intermediate computations (often float for precision).
    // YDataType: Data type of the output tensor.
    // XElementwiseOperation: The specific elementwise operation to perform (e.g., Add, Mul).
    using XDataType = DataType;
    using ComputeDataType =
        float; // Using float for intermediate calculations can improve numerical stability.
    using YDataType             = DataType;
    using XElementwiseOperation = ck_tile::element_wise::Add;

    // 1. Initialize the input data on the host (CPU).
    // HostTensor is a utility to manage tensor data on the CPU.
    // The first argument is the shape (dimensions) of the tensor {M, N}.
    // The second argument is the strides {stride, 1} for row-major layout.
    // 'x_host_a' and 'x_host_b' are the two input tensors for the elementwise operation.
    ck_tile::HostTensor<XDataType> x_host_a({M, N}, {stride, 1});
    ck_tile::HostTensor<XDataType> x_host_b({M, N}, {stride, 1});
    ck_tile::HostTensor<YDataType> y_host({M, N}, {stride, 1});
    ck_tile::HostTensor<YDataType> y_validation({M, N}, {stride, 1});

    std::vector<ck_tile::index_t> shape = {M, N};

    // Fill the host tensors with random data.
    // FillUniformDistribution populates the tensor with values from a uniform distribution,
    // within an interval.
    ck_tile::FillUniformDistribution<XDataType>{0.f, 5.f}(x_host_a);
    ck_tile::FillUniformDistribution<XDataType>{0.f, 5.f}(x_host_b);

    // 2. Create device memory buffers
    // DeviceMem allocates memory on the GPU.
    // The size is determined by the total number of elements and the size of DataType.
    ck_tile::DeviceMem x_buf_a(x_host_a.get_element_space_size_in_bytes());
    ck_tile::DeviceMem x_buf_b(x_host_b.get_element_space_size_in_bytes());
    ck_tile::DeviceMem y_buf(y_host.get_element_space_size_in_bytes());

    // Copy data from host input tensors to device buffers.
    x_buf_a.ToDevice(x_host_a.data());
    x_buf_b.ToDevice(x_host_b.data());

    // 3. Configure the kernel execution parameters.
    // Dividing the problem into blocktile, blockwarp and warptile
    // The blocktile is the size of the tile processed by a single work group (also called thread
    // block). The warptile is the size of the tile processed by a single wavefront (also called
    // warp). The vector is the size of the tile processed by a single work item (also called
    // thread). The problem is divided into blocks of size BlockTile. Each block is further divided
    // into wavefronts of size WarpTile. Each wavefront is composed of 64 work items (on AMD; 32
    // threads on NVIDIA). Each work item in a wavefront processes one vector's worth of elements.
    // Note that WarpTile/Vector should be 64 for CDNA (because there are 64 work items per
    // wavefront). Vector size is set to be 16 / sizeof(ComputeDataType), to maximize vectorization.
    using BlockTile = ck_tile::sequence<2048>; // How many elements are handled by a block tile (the
                                               // tensor is divided into blocks of this size)
    using BlockWarps = ck_tile::sequence<8>; // How many concurrent wavefronts are in a block (each
                                             // wavefront will cover some part of the block tile)

    // WarpTile: Defines the size of the data sub-tile processed by a single wavefront.
    // This should be consistent with BlockTile and BlockWarps.
    // If BlockTile is 2048 and BlockWarps is 8, then WarpTile could be 2048/8 = 256.
    // However, this example uses 64, meaning each wavefront processes 64 elements, and multiple
    // such wavefront operations might be needed to cover the BlockTile, or the BlockTile is
    // distributed differently.
    // The current configuration (BlockTile=2048, BlockWarps=8, WarpTile=64) implies that
    // each wavefront processes 64 elements, and 8 wavefronts process 8*64 = 512 elements
    // concurrently. Since 512 is not equal to 2048, it means that warptile(s) will need to iterate
    // over multiple times over different set of elements to cover the entire BlockTile.
    using WarpTile = ck_tile::sequence<64>;

    // 4. Create the kernel

    // ElementWiseShape bundles these tiling parameters.
    // It calculates derived properties like threads per wavefront, repeats, vectorization and total
    // block size.
    using Shape = ck_tile::ElementWiseShape<BlockWarps, BlockTile, WarpTile, XDataType>;

    // ElementWisePipelineProblem encapsulates all necessary information for the elementwise kernel:
    // - Data types (input, compute, output).
    // - Shape traits (tiling configuration).
    // - The specific elementwise operation (e.g., Add).
    using Problem = ck_tile::ElementWisePipelineProblem<XDataType,
                                                        ComputeDataType,
                                                        YDataType,
                                                        Shape,
                                                        XElementwiseOperation>;

    // ElementWiseKernel refers to the GPU kernel class
    using Kernel = ck_tile::ElementWiseKernel<Problem, ck_tile::ElementWiseDefaultPolicy>;

    // Compute flattened size
    ck_tile::index_t total_elements = 1;
    for(auto d : shape)
        total_elements *= d;

    // kBlockSize: The number of work items in a GPU workgroup (thread block).
    // This is often a multiple of the wavefront size, 64 on CDNA.
    // Here, it's explicitly set to 512. This should be consistent with Shape::kBlockSize.
    // Shape::kBlockSize would be BlockWarps * warpSize (e.g., 8 * 64 = 512).
    constexpr ck_tile::index_t kBlockSize =
        ck_tile::get_warp_size() * BlockWarps::at(ck_tile::number<0>{});

    // kBlockPerCu: Hint for how many workgroups can be scheduled per Compute Unit (CU).
    // This can influence occupancy and performance.
    constexpr ck_tile::index_t kBlockPerCu = 1;

    // kGridSize: Calculates the total number of workgroups required to process all elements.
    // Each workgroup is responsible for 'elements_per_block' elements.
    // To ensure all elements are covered, especially when 'total_elements' is not perfectly
    // divisible by 'elements_per_block', using ceiling division.
    constexpr ck_tile::index_t elements_per_block = BlockTile::at(ck_tile::number<0>{});
    ck_tile::index_t kGridSize = (total_elements + elements_per_block - 1) / elements_per_block;

    std::cout << "grid size = " << kGridSize << std::endl;
    std::cout << "Total elements = " << total_elements << std::endl;

    auto input_tensors = ck_tile::make_tuple(static_cast<XDataType*>(x_buf_a.GetDeviceBuffer()),
                                             static_cast<XDataType*>(x_buf_b.GetDeviceBuffer()));

    auto input_size = ck_tile::make_tuple(M, N);

    // Check if the kernel configuration is supported
    if(!Kernel::IsSupportedArgument(input_size))
    {
        throw std::runtime_error(
            "The kernel configuration is not supported for the given input size.");
    }

    // 4. Run the kernel
    float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
                                   ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
                                       Kernel{},
                                       kGridSize,
                                       kBlockSize,
                                       0,
                                       input_size,
                                       ck_tile::make_tuple(N, 1), // Input Stride
                                       ck_tile::make_tuple(N, 1), // Output Stride
                                       input_tensors,
                                       static_cast<YDataType*>(y_buf.GetDeviceBuffer())));

    std::cout << "Average time: " << ave_time << " ms" << std::endl;

    // 5. Verify the output
    bool pass = true;
    if(do_validation)
    {
        y_buf.FromDevice(y_validation.data());
        auto op = [](const auto& v0, const auto& v1) { return v0 + v1; };

        ck_tile::reference_binary_elementwise<XDataType, XDataType, YDataType, ComputeDataType>(
            x_host_a, x_host_b, y_host, op);

        pass = ck_tile::check_err(
            y_validation, y_host, "Elementwise Add Error: Incorrect results!", 0.01, 0.01);
    }

    return pass;
}

int main(int argc, char* argv[])
{
    auto [result, arg_parser] = create_args(argc, argv);
    if(!result)
        return -1;

    const std::string data_type = arg_parser.get_str("prec");
    if(data_type == "fp16")
    {
        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
    }

    return -3;
}