[CK_TILE] Support for elementwise kernel (#2246)

* Elementwise kernel implementation

Co-authored-by: Sami Aario <samaario@amd.com>
Co-authored-by: Mohsen Saffari <mohsen.saffari@amd.com>
Co-authored-by: yashagar <yashagar@amd.com>

* Elementwise with generalized nDims

* Adding the n-ary input tensor feature

* Generalize dimensions on top of inputs

* Add TFLOPS + remove std usage for tuples

* 1D basecase optimization

* Cleanup code + refactoring to a common interface

* Generalize to unary and add an example

* Cleanup, refactoring and commenting

* Suggestions for LWPCK-3170: elementwise kernel improvements

* Clang-format: remod.py

* Replace InputTensorType with XDataType as the type of input_tensors

* Add Tuple::apply and use it in ElementWiseKernel::operator to call operation with the exact number of arguments in xs

* Move examples to folder 19_elementwise

* Add missing copyright headers and fix some existing ones

* Replace an assert with throw std::runtime_error in elementwise example

* Avoid reading the output by using make_static_distributed_tensor for y_tile

* Removed two unused includes

* No need to move windows to the next block when each workgroup processes a single tile

* Only copy input tensors to the device

* Use get_warp_size to obtain warp size, and use ceiling division for grid size also for the unary example

* Adding output strides to the kernel, transposition example and update the other examples

* Changes made by remod.py

* Use default template parameter values for memory operation and coherence in a call to make_naive_tensor_view

* Move binary operations to include/ck_tile/ops/elementwise/binary_elementwise_operation.hpp

* Reuse generic reference binary/unary operation in examples + refactoring the transpose reference

* Fix comments in elementwise_example.cpp

- Refer to AMD terminology except when suggesting NVIDIA alternatives in parentheses
- ElementWiseTraits was renamed to ElementWiseShape
- Adopt suggestions made by Copilot when prompted to check for factual or typographical errors

* Simplify CMakeLists.txt and remove the unused variables this uncovers

* Rename a file and fix some copyright statements

* Changes made by script/clang-format-overwrite.sh

* Add basic unit test for ElementWiseKernel

* Remove left-over uninformative comment in apply unit test

* Changes made by clang-format-overwrite.sh

* fixup! Use default template parameter values for memory operation and coherence in a call to make_naive_tensor_view

* Clean up test_tuple_apply.cpp and test_elementwise_1d.cpp

* Use make_uniform_array_with_factory to define h_xs and d_xs_mems_owner as type std::array

* Use a DeviceMem constructor that calls get_element_space_size_in_bytes internally

* Move examples to folder 20_elementwise

* Reduced register pressure on the CK tile elementwise kernel + add 4d input example to be able benchmark against old CK

* Fix CLang formating

* Bump up the elementwise example folder number

* Elementwise: add padding + minor cleanup

* Add Vector Size inference + fix issue with wrong vectorization due to missing GuaranteedLastDimensionVectorStride setting in make_naive_tensor_view

* Add isSupportedArg to Elementwise kernel + addapt example and unit tests

* Fix clang-format on the unit test file

---------

Co-authored-by: Damien Lejeune <damien.lejeune@amd.com>
Co-authored-by: Sami Aario <samaario@amd.com>
Co-authored-by: Mohsen Saffari <mohsen.saffari@amd.com>
Co-authored-by: Aviral Goel <aviral.goel@amd.com>
This commit is contained in:
Yashvardhan Agarwal
2025-07-24 12:21:45 +03:00
committed by GitHub
parent 6681593864
commit 606b0cc947
23 changed files with 1509 additions and 6 deletions

View File

@@ -0,0 +1,15 @@
# Elementwise example targets 2D inputs
set(TARGET_NAME_2D_INPUT tile_example_elementwise)
add_executable(${TARGET_NAME_2D_INPUT} elementwise_example.cpp)
# Elementwise unary example targets 2D inputs
set(TARGET_NAME_2D_INPUT_UNARY tile_example_elementwise_unary)
add_executable(${TARGET_NAME_2D_INPUT_UNARY} elementwise_example_unary.cpp)
# Elementwise transpose example targets 2D inputs
set(TARGET_NAME_2D_INPUT_TRANSPOSE tile_example_elementwise_transpose)
add_executable(${TARGET_NAME_2D_INPUT_TRANSPOSE} elementwise_example_transpose.cpp)
# Elementwise example targets 4D inputs
set(TARGET_NAME_4D_INPUT tile_example_elementwise_add_4d)
add_executable(${TARGET_NAME_4D_INPUT} elementwise_example_add_4d.cpp)

View File

@@ -0,0 +1,214 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include "ck_tile/core/arch/arch.hpp"
#include "ck_tile/host.hpp"
#include "ck_tile/ops/elementwise.hpp"
#include "ck_tile/host/reference/reference_elementwise.hpp"
auto create_args(int argc, char* argv[])
{
ck_tile::ArgParser arg_parser;
arg_parser.insert("m", "1024", "m dimension")
.insert("n", "1024", "n dimension")
.insert("stride", "-1", "stride per row, if -1 then equal to n")
.insert("v", "1", "cpu validation or not")
.insert("prec", "fp16", "precision")
.insert("warmup", "10", "cold iter")
.insert("repeat", "50", "hot iter");
bool result = arg_parser.parse(argc, argv);
return std::make_tuple(result, arg_parser);
}
template <typename DataType>
bool run(const ck_tile::ArgParser& arg_parser)
{
ck_tile::index_t M = arg_parser.get_int("m");
ck_tile::index_t N = arg_parser.get_int("n");
ck_tile::index_t stride = arg_parser.get_int("stride");
// If stride is negative (default -1), set it to N, assuming a dense row-major layout.
if(stride < 0)
stride = N;
std::string data_type = arg_parser.get_str("prec");
int do_validation = arg_parser.get_int("v");
int warmup = arg_parser.get_int("warmup");
int repeat = arg_parser.get_int("repeat");
if(stride < N)
{
throw std::runtime_error("stride must be >= N");
}
// Define type aliases for clarity.
// XDataType: Data type of the input tensors.
// ComputeDataType: Data type used for intermediate computations (often float for precision).
// YDataType: Data type of the output tensor.
// XElementwiseOperation: The specific elementwise operation to perform (e.g., Add, Mul).
using XDataType = DataType;
using ComputeDataType =
float; // Using float for intermediate calculations can improve numerical stability.
using YDataType = DataType;
using XElementwiseOperation = ck_tile::element_wise::Add;
// 1. Initialize the input data on the host (CPU).
// HostTensor is a utility to manage tensor data on the CPU.
// The first argument is the shape (dimensions) of the tensor {M, N}.
// The second argument is the strides {stride, 1} for row-major layout.
// 'x_host_a' and 'x_host_b' are the two input tensors for the elementwise operation.
ck_tile::HostTensor<XDataType> x_host_a({M, N}, {stride, 1});
ck_tile::HostTensor<XDataType> x_host_b({M, N}, {stride, 1});
ck_tile::HostTensor<YDataType> y_host({M, N}, {stride, 1});
ck_tile::HostTensor<YDataType> y_validation({M, N}, {stride, 1});
std::vector<ck_tile::index_t> shape = {M, N};
// Fill the host tensors with random data.
// FillUniformDistribution populates the tensor with values from a uniform distribution,
// within an interval.
ck_tile::FillUniformDistribution<XDataType>{0.f, 5.f}(x_host_a);
ck_tile::FillUniformDistribution<XDataType>{0.f, 5.f}(x_host_b);
// 2. Create device memory buffers
// DeviceMem allocates memory on the GPU.
// The size is determined by the total number of elements and the size of DataType.
ck_tile::DeviceMem x_buf_a(x_host_a.get_element_space_size_in_bytes());
ck_tile::DeviceMem x_buf_b(x_host_b.get_element_space_size_in_bytes());
ck_tile::DeviceMem y_buf(y_host.get_element_space_size_in_bytes());
// Copy data from host input tensors to device buffers.
x_buf_a.ToDevice(x_host_a.data());
x_buf_b.ToDevice(x_host_b.data());
// 3. Configure the kernel execution parameters.
// Dividing the problem into blocktile, blockwarp and warptile
// The blocktile is the size of the tile processed by a single work group (also called thread
// block). The warptile is the size of the tile processed by a single wavefront (also called
// warp). The vector is the size of the tile processed by a single work item (also called
// thread). The problem is divided into blocks of size BlockTile. Each block is further divided
// into wavefronts of size WarpTile. Each wavefront is composed of 64 work items (on AMD; 32
// threads on NVIDIA). Each work item in a wavefront processes one vector's worth of elements.
// Note that WarpTile/Vector should be 64 for CDNA (because there are 64 work items per
// wavefront). Vector size is set to be 16 / sizeof(ComputeDataType), to maximize vectorization.
using BlockTile = ck_tile::sequence<2048>; // How many elements are handled by a block tile (the
// tensor is divided into blocks of this size)
using BlockWarps = ck_tile::sequence<8>; // How many concurrent wavefronts are in a block (each
// wavefront will cover some part of the block tile)
// WarpTile: Defines the size of the data sub-tile processed by a single wavefront.
// This should be consistent with BlockTile and BlockWarps.
// If BlockTile is 2048 and BlockWarps is 8, then WarpTile could be 2048/8 = 256.
// However, this example uses 64, meaning each wavefront processes 64 elements, and multiple
// such wavefront operations might be needed to cover the BlockTile, or the BlockTile is
// distributed differently.
// The current configuration (BlockTile=2048, BlockWarps=8, WarpTile=64) implies that
// each wavefront processes 64 elements, and 8 wavefronts process 8*64 = 512 elements
// concurrently. Since 512 is not equal to 2048, it means that warptile(s) will need to iterate
// over multiple times over different set of elements to cover the entire BlockTile.
using WarpTile = ck_tile::sequence<64>;
// 4. Create the kernel
// ElementWiseShape bundles these tiling parameters.
// It calculates derived properties like threads per wavefront, repeats, vectorization and total
// block size.
using Shape = ck_tile::ElementWiseShape<BlockWarps, BlockTile, WarpTile, ComputeDataType>;
// ElementWisePipelineProblem encapsulates all necessary information for the elementwise kernel:
// - Data types (input, compute, output).
// - Shape traits (tiling configuration).
// - The specific elementwise operation (e.g., Add).
using Problem = ck_tile::ElementWisePipelineProblem<XDataType,
ComputeDataType,
YDataType,
Shape,
XElementwiseOperation>;
// ElementWiseKernel refers to the GPU kernel class
using Kernel = ck_tile::ElementWiseKernel<Problem, ck_tile::ElementWiseDefaultPolicy>;
// Compute flattened size
ck_tile::index_t total_elements = 1;
for(auto d : shape)
total_elements *= d;
// kBlockSize: The number of work items in a GPU workgroup (thread block).
// This is often a multiple of the wavefront size, 64 on CDNA.
// Here, it's explicitly set to 512. This should be consistent with Shape::kBlockSize.
// Shape::kBlockSize would be BlockWarps * warpSize (e.g., 8 * 64 = 512).
constexpr ck_tile::index_t kBlockSize =
ck_tile::get_warp_size() * BlockWarps::at(ck_tile::number<0>{});
// kBlockPerCu: Hint for how many workgroups can be scheduled per Compute Unit (CU).
// This can influence occupancy and performance.
constexpr ck_tile::index_t kBlockPerCu = 1;
// kGridSize: Calculates the total number of workgroups required to process all elements.
// Each workgroup is responsible for 'elements_per_block' elements.
// To ensure all elements are covered, especially when 'total_elements' is not perfectly
// divisible by 'elements_per_block', using ceiling division.
constexpr ck_tile::index_t elements_per_block = BlockTile::at(ck_tile::number<0>{});
ck_tile::index_t kGridSize = (total_elements + elements_per_block - 1) / elements_per_block;
std::cout << "grid size = " << kGridSize << std::endl;
std::cout << "Total elements = " << total_elements << std::endl;
auto input_tensors = ck_tile::make_tuple(static_cast<XDataType*>(x_buf_a.GetDeviceBuffer()),
static_cast<XDataType*>(x_buf_b.GetDeviceBuffer()));
auto input_size = ck_tile::make_tuple(M, N);
// Check if the kernel configuration is supported
if(!Kernel::IsSupportedArgument(input_size))
{
throw std::runtime_error(
"The kernel configuration is not supported for the given input size.");
}
// 4. Run the kernel
float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
Kernel{},
kGridSize,
kBlockSize,
0,
input_size,
ck_tile::make_tuple(N, 1), // Input Stride
ck_tile::make_tuple(N, 1), // Output Stride
input_tensors,
static_cast<YDataType*>(y_buf.GetDeviceBuffer())));
std::cout << "Average time: " << ave_time << " ms" << std::endl;
// 5. Verify the output
bool pass = true;
if(do_validation)
{
y_buf.FromDevice(y_validation.data());
auto op = [](const auto& v0, const auto& v1) { return v0 + v1; };
ck_tile::reference_binary_elementwise<XDataType, XDataType, YDataType, ComputeDataType>(
x_host_a, x_host_b, y_host, op);
pass = ck_tile::check_err(
y_validation, y_host, "Elementwise Add Error: Incorrect results!", 0.01, 0.01);
}
return pass;
}
int main(int argc, char* argv[])
{
auto [result, arg_parser] = create_args(argc, argv);
if(!result)
return -1;
const std::string data_type = arg_parser.get_str("prec");
if(data_type == "fp16")
{
return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
}
return -3;
}

View File

@@ -0,0 +1,159 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include "ck_tile/core/arch/arch.hpp"
#include "ck_tile/host.hpp"
#include "ck_tile/ops/elementwise.hpp"
#include "ck_tile/host/reference/reference_elementwise.hpp"
auto create_args(int argc, char* argv[])
{
ck_tile::ArgParser arg_parser;
arg_parser.insert("dim0", "4", "dimension 0")
.insert("dim1", "16", "dimension 1")
.insert("dim2", "32", "dimension 2")
.insert("dim3", "32", "dimension 3")
.insert("v", "1", "cpu validation or not")
.insert("prec", "fp16", "precision")
.insert("warmup", "10", "cold iter")
.insert("repeat", "50", "hot iter");
bool result = arg_parser.parse(argc, argv);
return std::make_tuple(result, arg_parser);
}
template <typename DataType>
bool run(const ck_tile::ArgParser& arg_parser)
{
ck_tile::index_t D0 = arg_parser.get_int("dim0");
ck_tile::index_t D1 = arg_parser.get_int("dim1");
ck_tile::index_t D2 = arg_parser.get_int("dim2");
ck_tile::index_t D3 = arg_parser.get_int("dim3");
std::string data_type = arg_parser.get_str("prec");
int do_validation = arg_parser.get_int("v");
int warmup = arg_parser.get_int("warmup");
int repeat = arg_parser.get_int("repeat");
using XDataType = DataType;
using ComputeDataType =
float; // Using float for intermediate calculations can improve numerical stability.
using YDataType = DataType;
using XElementwiseOperation = ck_tile::element_wise::Add;
// Initialize the input data on the host (CPU).
std::vector<ck_tile::index_t> problem_shape = {D0, D1, D2, D3};
std::vector<ck_tile::index_t> host_strides(4);
host_strides[3] = 1;
host_strides[2] = problem_shape[3];
host_strides[1] = problem_shape[2] * problem_shape[3];
host_strides[0] = problem_shape[1] * problem_shape[2] * problem_shape[3];
ck_tile::HostTensor<XDataType> x_host_a(problem_shape, host_strides);
ck_tile::HostTensor<XDataType> x_host_b(problem_shape, host_strides);
ck_tile::HostTensor<YDataType> y_host(problem_shape, host_strides);
ck_tile::HostTensor<YDataType> y_validation(problem_shape, host_strides);
ck_tile::FillUniformDistribution<XDataType>{0.f, 5.f}(x_host_a);
ck_tile::FillUniformDistribution<XDataType>{2.f, 10.f}(x_host_b);
ck_tile::DeviceMem x_buf_a(x_host_a.get_element_space_size_in_bytes());
ck_tile::DeviceMem x_buf_b(x_host_b.get_element_space_size_in_bytes());
ck_tile::DeviceMem y_buf(y_host.get_element_space_size_in_bytes());
x_buf_a.ToDevice(x_host_a.data());
x_buf_b.ToDevice(x_host_b.data());
using BlockTile = ck_tile::sequence<256>;
using BlockWarps = ck_tile::sequence<1>;
using WarpTile = ck_tile::sequence<256>;
using Shape = ck_tile::ElementWiseShape<BlockWarps, BlockTile, WarpTile, ComputeDataType>;
using Problem = ck_tile::ElementWisePipelineProblem<XDataType,
ComputeDataType,
YDataType,
Shape,
XElementwiseOperation>;
using Kernel = ck_tile::ElementWiseKernel<Problem, ck_tile::ElementWiseDefaultPolicy>;
ck_tile::index_t total_elements = 1;
for(auto d : problem_shape)
total_elements *= d;
constexpr ck_tile::index_t kBlockSize =
ck_tile::get_warp_size() * BlockWarps::at(ck_tile::number<0>{});
constexpr ck_tile::index_t kBlockPerCu = 2;
constexpr ck_tile::index_t elements_per_block = BlockTile::at(ck_tile::number<0>{});
ck_tile::index_t kGridSize = (total_elements + elements_per_block - 1) / elements_per_block;
std::cout << "grid size = " << kGridSize << std::endl;
std::cout << "Total elements = " << total_elements << std::endl;
auto input_tensors = ck_tile::make_tuple(static_cast<XDataType*>(x_buf_a.GetDeviceBuffer()),
static_cast<XDataType*>(x_buf_b.GetDeviceBuffer()));
auto problem_shape_tuple =
ck_tile::make_tuple(problem_shape[0], problem_shape[1], problem_shape[2], problem_shape[3]);
auto strides_tuple =
ck_tile::make_tuple(host_strides[0], host_strides[1], host_strides[2], host_strides[3]);
// Check if the kernel configuration is supported
if(!Kernel::IsSupportedArgument(problem_shape_tuple))
{
throw std::runtime_error(
"The kernel configuration is not supported for the given input size.");
}
// Run the kernel
float ave_time = launch_kernel(
ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
Kernel{},
kGridSize,
kBlockSize,
0,
problem_shape_tuple, // ck_tile::tuple<index_t, index_t, index_t, index_t>
strides_tuple, // ck_tile::tuple<index_t, index_t, index_t, index_t> for input strides
strides_tuple, // ck_tile::tuple<index_t, index_t, index_t, index_t> for output strides
input_tensors,
static_cast<YDataType*>(y_buf.GetDeviceBuffer())));
std::cout << "Average time: " << ave_time << " ms" << std::endl;
// Verify the output
bool pass = true;
if(do_validation)
{
y_buf.FromDevice(y_validation.data());
auto op = [](const auto& v0, const auto& v1) { return v0 + v1; };
ck_tile::reference_binary_elementwise<XDataType, XDataType, YDataType, ComputeDataType>(
x_host_a, x_host_b, y_host, op);
pass = ck_tile::check_err(
y_validation, y_host, "Elementwise Add Error: Incorrect results!", 0.01, 0.01);
}
return pass;
}
int main(int argc, char* argv[])
{
auto [result, arg_parser] = create_args(argc, argv);
if(!result)
return -1;
const std::string data_type = arg_parser.get_str("prec");
if(data_type == "fp16")
{
return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
}
return -3;
}

View File

@@ -0,0 +1,156 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include "ck_tile/host.hpp"
#include "ck_tile/ops/elementwise.hpp"
#include "ck_tile/host/reference/reference_transpose.hpp"
auto create_args(int argc, char* argv[])
{
ck_tile::ArgParser arg_parser;
arg_parser.insert("m", "1024", "m dimension of input")
.insert("n", "1024", "n dimension of input")
.insert("stride_in", "-1", "stride for input M dim, if -1 then equal to n")
.insert("v", "1", "cpu validation or not")
.insert("prec", "fp16", "precision")
.insert("warmup", "10", "cold iter")
.insert("repeat", "50", "hot iter");
bool result = arg_parser.parse(argc, argv);
return std::make_tuple(result, arg_parser);
}
template <typename DataType>
bool run(const ck_tile::ArgParser& arg_parser)
{
ck_tile::index_t M = arg_parser.get_int("m");
ck_tile::index_t N = arg_parser.get_int("n");
ck_tile::index_t stride_in = arg_parser.get_int("stride_in");
if(stride_in < 0)
stride_in = N; // Dense input: stride for M dim is N
std::string data_type = arg_parser.get_str("prec");
int do_validation = arg_parser.get_int("v");
int warmup = arg_parser.get_int("warmup");
int repeat = arg_parser.get_int("repeat");
if(stride_in < N)
{
throw std::runtime_error("stride_in must be >= N");
}
using XDataType = DataType;
using ComputeDataType = float;
using YDataType = DataType;
// Use PassThrough operation for transposition (data is moved, not changed)
using XElementwiseOperation = ck_tile::element_wise::PassThrough;
// 1. Initialize the input data on the host (CPU).
// Input x_host_a: M x N
// Output y_host: N x M (transposed)
ck_tile::HostTensor<XDataType> x_host_a({M, N}, {stride_in, 1});
// Output tensor y_host will have dimensions N x M.
// Assuming dense output, its stride for the N dimension will be M.
ck_tile::index_t stride_out_dim0 = M;
ck_tile::HostTensor<YDataType> y_host({N, M}, {stride_out_dim0, 1});
ck_tile::HostTensor<YDataType> y_validation({N, M}, {stride_out_dim0, 1});
// The logical shape for the element-wise operation kernel is based on the input tensor's
// elements.
std::vector<ck_tile::index_t> op_shape_vec = {M, N};
auto op_lengths = ck_tile::make_tuple(M, N); // Lens for the kernel
ck_tile::FillUniformDistribution<XDataType>{0.f, 5.f}(x_host_a);
// 2. Create device memory buffers
ck_tile::DeviceMem x_buf_a(x_host_a.get_element_space_size_in_bytes());
ck_tile::DeviceMem y_buf(y_host.get_element_space_size_in_bytes()); // y_host is N x M
x_buf_a.ToDevice(x_host_a.data());
// 3. Configure the kernel execution parameters.
using BlockTile = ck_tile::sequence<1024>;
using BlockWarps = ck_tile::sequence<8>;
using WarpTile = ck_tile::sequence<64>;
using Shape = ck_tile::ElementWiseShape<BlockWarps, BlockTile, WarpTile, ComputeDataType>;
// Problem definition for a single input tensor
using Problem = ck_tile::ElementWisePipelineProblem<XDataType,
ComputeDataType,
YDataType,
Shape,
XElementwiseOperation>;
using Kernel = ck_tile::ElementWiseKernel<Problem, ck_tile::ElementWiseDefaultPolicy>;
ck_tile::index_t total_elements = M * N;
constexpr ck_tile::index_t kBlockSize = 64 * BlockWarps::at(ck_tile::number<0>{});
constexpr ck_tile::index_t kBlockPerCu = 1;
constexpr ck_tile::index_t elements_per_block = BlockTile::at(ck_tile::number<0>{});
ck_tile::index_t kGridSize = (total_elements + elements_per_block - 1) / elements_per_block;
std::cout << "Input M=" << M << ", N=" << N << ", StrideIn=" << stride_in << std::endl;
std::cout << "Output N=" << N << ", M=" << M << ", StrideOut=" << stride_out_dim0 << std::endl;
std::cout << "Grid size = " << kGridSize << ", BlockSize = " << kBlockSize << std::endl;
std::cout << "Total elements = " << total_elements << std::endl;
// Input tensors tuple (single input)
auto input_tensors = ck_tile::make_tuple(static_cast<XDataType*>(x_buf_a.GetDeviceBuffer()));
// Input strides tuple (tuple of tuples, one for each input)
auto input_strides = ck_tile::make_tuple(stride_in, 1);
// Output strides (for N x M tensor, dense)
auto output_strides = ck_tile::make_tuple(1, stride_out_dim0);
// Check if the kernel configuration is supported
if(!Kernel::IsSupportedArgument(op_lengths))
{
throw std::runtime_error(
"The kernel configuration is not supported for the given input size.");
}
// 4. Run the kernel
float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
Kernel{},
kGridSize,
kBlockSize,
0, // Shared memory
op_lengths, // Logical dimensions for the operation (M, N)
input_strides, // Strides for input tensor(s)
output_strides, // Strides for output tensor (N, M)
input_tensors,
static_cast<YDataType*>(y_buf.GetDeviceBuffer())));
std::cout << "Average time: " << ave_time << " ms" << std::endl;
// 5. Verify the output
bool pass = true;
if(do_validation)
{
y_buf.FromDevice(y_validation.data()); // Copy result from device to y_validation
ck_tile::reference_transpose_elementwise<XDataType, YDataType>(
x_host_a, y_host); // Compute reference on host
pass = ck_tile::check_err(
y_validation, y_host, "Transpose Error: Incorrect results!", 0.01, 0.01);
}
return pass;
}
int main(int argc, char* argv[])
{
auto [result, arg_parser] = create_args(argc, argv);
if(!result)
return -1;
const std::string data_type = arg_parser.get_str("prec");
if(data_type == "fp16")
{
return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
}
std::cerr << "Unsupported data type: " << data_type << std::endl;
return -3;
}

View File

@@ -0,0 +1,147 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include "ck_tile/core/arch/arch.hpp"
#include "ck_tile/host.hpp"
#include "ck_tile/ops/elementwise.hpp"
#include "ck_tile/host/reference/reference_elementwise.hpp"
auto create_args(int argc, char* argv[])
{
ck_tile::ArgParser arg_parser;
arg_parser.insert("m", "1024", "m dimension")
.insert("n", "1024", "n dimension")
.insert("stride", "-1", "stride per row, if -1 then equal to n")
.insert("v", "1", "cpu validation or not")
.insert("prec", "fp16", "precision")
.insert("warmup", "10", "cold iter")
.insert("repeat", "50", "hot iter");
bool result = arg_parser.parse(argc, argv);
return std::make_tuple(result, arg_parser);
}
template <typename DataType>
bool run(const ck_tile::ArgParser& arg_parser)
{
ck_tile::index_t M = arg_parser.get_int("m");
ck_tile::index_t N = arg_parser.get_int("n");
ck_tile::index_t stride = arg_parser.get_int("stride");
if(stride < 0)
stride = N;
std::string data_type = arg_parser.get_str("prec");
int do_validation = arg_parser.get_int("v");
int warmup = arg_parser.get_int("warmup");
int repeat = arg_parser.get_int("repeat");
assert(stride >= N);
using XDataType = DataType;
using YDataType = DataType;
using ComputeDataType = float;
using XElementwiseOperation = ck_tile::element_wise::UnarySquare;
// 1. Initialize the input data on the host
ck_tile::HostTensor<XDataType> x_host_a({M, N}, {stride, 1});
ck_tile::HostTensor<YDataType> y_host({M, N}, {stride, 1});
ck_tile::HostTensor<YDataType> y_validation({M, N}, {stride, 1});
std::vector<ck_tile::index_t> shape = {M, N};
ck_tile::FillUniformDistribution<XDataType>{0.f, 5.f}(x_host_a);
// 2. Create device memory buffers and copy input data from host to device
ck_tile::DeviceMem x_buf_a(x_host_a.get_element_space_size_in_bytes());
ck_tile::DeviceMem y_buf(y_host.get_element_space_size_in_bytes());
x_buf_a.ToDevice(x_host_a.data());
// 3. Create the kernel
// Dividing the problem into blocktile, warptile, and vector
using BlockTile = ck_tile::sequence<2048>; // Size of the block tile (Entire problem is divided
// into blocks of this size)
using BlockWarps = ck_tile::sequence<8>; // How many concurrent warps are in a block (Each warp
// will cover some part of blockTile)
using WarpTile = ck_tile::sequence<64>; // How many elements are covered by a warp
using Shape = ck_tile::ElementWiseShape<BlockWarps, BlockTile, WarpTile, ComputeDataType>;
using Problem = ck_tile::ElementWisePipelineProblem<XDataType,
XDataType, // ComputeDataType is same as
// XDataType in the unary case
YDataType,
Shape,
XElementwiseOperation>;
using Kernel = ck_tile::ElementWiseKernel<Problem, ck_tile::ElementWiseDefaultPolicy>;
// Compute flattened size
ck_tile::index_t total_elements = 1;
for(auto d : shape)
total_elements *= d;
constexpr ck_tile::index_t kBlockSize =
ck_tile::get_warp_size() * BlockWarps::at(ck_tile::number<0>{});
constexpr ck_tile::index_t kBlockPerCu = 1;
constexpr ck_tile::index_t elements_per_block = BlockTile::at(ck_tile::number<0>{});
ck_tile::index_t kGridSize = (total_elements + elements_per_block - 1) / elements_per_block;
std::cout << "grid size = " << kGridSize << std::endl;
std::cout << "Total elements = " << total_elements << std::endl;
auto input_tensors = ck_tile::make_tuple(static_cast<XDataType*>(x_buf_a.GetDeviceBuffer()));
auto input_size = ck_tile::make_tuple(M, N);
// Check if the kernel configuration is supported
if(!Kernel::IsSupportedArgument(input_size))
{
throw std::runtime_error(
"The kernel configuration is not supported for the given input size.");
}
// 4. Run the kernel
float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
Kernel{},
kGridSize,
kBlockSize,
0,
input_size,
ck_tile::make_tuple(N, 1), // Input Stride
ck_tile::make_tuple(N, 1), // Output Stride
input_tensors,
static_cast<YDataType*>(y_buf.GetDeviceBuffer())));
std::cout << "Average time: " << ave_time << " ms" << std::endl;
// 5. Verify the output
bool pass = true;
if(do_validation)
{
y_buf.FromDevice(y_validation.data());
auto op = [](const auto& v0) { return v0 * v0; };
ck_tile::reference_unary_elementwise<XDataType, YDataType, YDataType>(x_host_a, y_host, op);
pass = ck_tile::check_err(
y_validation, y_host, "Elementwise Add Error: Incorrect results!", 0.01, 0.01);
}
return pass;
}
int main(int argc, char* argv[])
{
auto [result, arg_parser] = create_args(argc, argv);
if(!result)
return -1;
const std::string data_type = arg_parser.get_str("prec");
if(data_type == "fp16")
{
return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
}
return -3;
}