mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-05 22:22:27 +00:00
This commit is contained in:
18
example/ck_tile/21_elementwise/CMakeLists.txt
Normal file
18
example/ck_tile/21_elementwise/CMakeLists.txt
Normal file
@@ -0,0 +1,18 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
# Each example below is a standalone executable built from a single source file.
# The target name is kept in a variable so it is spelled out only once per example.

# 2D inputs: elementwise example.
set(TARGET_NAME_2D_INPUT tile_example_elementwise)
add_executable(
    ${TARGET_NAME_2D_INPUT}
    elementwise_example.cpp
)

# 2D inputs: unary elementwise example.
set(TARGET_NAME_2D_INPUT_UNARY tile_example_elementwise_unary)
add_executable(
    ${TARGET_NAME_2D_INPUT_UNARY}
    elementwise_example_unary.cpp
)

# 2D inputs: transpose example.
set(TARGET_NAME_2D_INPUT_TRANSPOSE tile_example_elementwise_transpose)
add_executable(
    ${TARGET_NAME_2D_INPUT_TRANSPOSE}
    elementwise_example_transpose.cpp
)

# 4D inputs: elementwise add example.
set(TARGET_NAME_4D_INPUT tile_example_elementwise_add_4d)
add_executable(
    ${TARGET_NAME_4D_INPUT}
    elementwise_example_add_4d.cpp
)
|
||||
27
example/ck_tile/21_elementwise/elementwise_common.hpp
Normal file
27
example/ck_tile/21_elementwise/elementwise_common.hpp
Normal file
@@ -0,0 +1,27 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include <variant>
|
||||
#include "ck_tile/core/arch/arch.hpp"
|
||||
|
||||
auto string_to_datatype(const std::string& datatype)
|
||||
{
|
||||
using PrecVariant = std::variant<ck_tile::half_t, ck_tile::bf16_t, float>;
|
||||
|
||||
if(datatype == "fp16")
|
||||
{
|
||||
return PrecVariant{ck_tile::half_t{}};
|
||||
}
|
||||
else if(datatype == "bf16")
|
||||
{
|
||||
return PrecVariant{ck_tile::bf16_t{}};
|
||||
}
|
||||
else if(datatype == "fp32")
|
||||
{
|
||||
return PrecVariant{float{}};
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error("Unsupported data type: " + datatype);
|
||||
}
|
||||
};
|
||||
238
example/ck_tile/21_elementwise/elementwise_example.cpp
Normal file
238
example/ck_tile/21_elementwise/elementwise_example.cpp
Normal file
@@ -0,0 +1,238 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include "ck_tile/host.hpp"
|
||||
#include "ck_tile/ops/elementwise.hpp"
|
||||
#include "ck_tile/host/reference/reference_elementwise.hpp"
|
||||
#include "ck_tile/utility/json_dump.hpp"
|
||||
#include "elementwise_common.hpp"
|
||||
|
||||
auto create_args(int argc, char* argv[])
|
||||
{
|
||||
ck_tile::ArgParser arg_parser;
|
||||
arg_parser.insert("m", "1024", "m dimension")
|
||||
.insert("n", "1024", "n dimension")
|
||||
.insert("stride", "-1", "stride per row, if -1 then equal to n")
|
||||
.insert("v", "1", "cpu validation or not")
|
||||
.insert("x_prec", "fp16", "input precision, fp16/bf16/fp32")
|
||||
.insert("y_prec", "fp16", "output precision, fp16/bf16/fp32")
|
||||
.insert("warmup", "10", "cold iter")
|
||||
.insert("repeat", "50", "hot iter")
|
||||
.insert("json", "0", "0: No Json, 1: Dump Results in Json format")
|
||||
.insert("jsonfile", "elementwise.json", "json file name to dump results");
|
||||
|
||||
bool result = arg_parser.parse(argc, argv);
|
||||
return std::make_tuple(result, arg_parser);
|
||||
}
|
||||
|
||||
// XDataType: Data type of the input tensors.
|
||||
// ComputeDataType: Data type used for intermediate computations (often float for precision).
|
||||
// YDataType: Data type of the output tensor.
|
||||
template <typename XDataType, typename YDataType>
|
||||
bool run(const ck_tile::ArgParser& arg_parser)
|
||||
{
|
||||
ck_tile::index_t M = arg_parser.get_int("m");
|
||||
ck_tile::index_t N = arg_parser.get_int("n");
|
||||
ck_tile::index_t stride = arg_parser.get_int("stride");
|
||||
|
||||
// If stride is negative (default -1), set it to N, assuming a dense row-major layout.
|
||||
if(stride < 0)
|
||||
stride = N;
|
||||
int do_validation = arg_parser.get_int("v");
|
||||
int warmup = arg_parser.get_int("warmup");
|
||||
int repeat = arg_parser.get_int("repeat");
|
||||
|
||||
if(stride < N)
|
||||
{
|
||||
throw std::runtime_error("stride must be >= N");
|
||||
}
|
||||
|
||||
// XElementwiseOperation: The specific elementwise operation to perform (e.g., Add, Mul).
|
||||
using ComputeDataType =
|
||||
float; // Using float for intermediate calculations can improve numerical stability.
|
||||
using XElementwiseOperation = ck_tile::element_wise::Add;
|
||||
|
||||
// 1. Initialize the input data on the host (CPU).
|
||||
// HostTensor is a utility to manage tensor data on the CPU.
|
||||
// The first argument is the shape (dimensions) of the tensor {M, N}.
|
||||
// The second argument is the strides {stride, 1} for row-major layout.
|
||||
// 'x_host_a' and 'x_host_b' are the two input tensors for the elementwise operation.
|
||||
ck_tile::HostTensor<XDataType> x_host_a({M, N}, {stride, 1});
|
||||
ck_tile::HostTensor<XDataType> x_host_b({M, N}, {stride, 1});
|
||||
ck_tile::HostTensor<YDataType> y_host({M, N}, {stride, 1});
|
||||
ck_tile::HostTensor<YDataType> y_validation({M, N}, {stride, 1});
|
||||
|
||||
std::vector<ck_tile::index_t> shape = {M, N};
|
||||
|
||||
// Fill the host tensors with random data.
|
||||
// FillUniformDistribution populates the tensor with values from a uniform distribution,
|
||||
// within an interval.
|
||||
ck_tile::FillUniformDistribution<XDataType>{0.f, 5.f}(x_host_a);
|
||||
ck_tile::FillUniformDistribution<XDataType>{0.f, 5.f}(x_host_b);
|
||||
|
||||
// 2. Create device memory buffers
|
||||
// DeviceMem allocates memory on the GPU.
|
||||
// The size is determined by the total number of elements and the size of DataType.
|
||||
ck_tile::DeviceMem x_buf_a(x_host_a.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem x_buf_b(x_host_b.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem y_buf(y_host.get_element_space_size_in_bytes());
|
||||
|
||||
// Copy data from host input tensors to device buffers.
|
||||
x_buf_a.ToDevice(x_host_a.data());
|
||||
x_buf_b.ToDevice(x_host_b.data());
|
||||
|
||||
// 3. Configure the kernel execution parameters.
|
||||
// Dividing the problem into blocktile, blockwarp and warptile
|
||||
// The blocktile is the size of the tile processed by a single work group (also called thread
|
||||
// block). The warptile is the size of the tile processed by a single wavefront (also called
|
||||
// warp). The vector is the size of the tile processed by a single work item (also called
|
||||
// thread). The problem is divided into blocks of size BlockTile. Each block is further divided
|
||||
// into wavefronts of size WarpTile. Each wavefront is composed of 64 work items (on AMD; 32
|
||||
// threads on NVIDIA). Each work item in a wavefront processes one vector's worth of elements.
|
||||
// Note that WarpTile/Vector should be 64 for CDNA (because there are 64 work items per
|
||||
// wavefront). Vector size is set to be 16 / sizeof(ComputeDataType), to maximize vectorization.
|
||||
using BlockTile = ck_tile::sequence<2048>; // How many elements are handled by a block tile (the
|
||||
// tensor is divided into blocks of this size)
|
||||
using BlockWarps = ck_tile::sequence<8>; // How many concurrent wavefronts are in a block (each
|
||||
// wavefront will cover some part of the block tile)
|
||||
|
||||
// WarpTile: Defines the size of the data sub-tile processed by a single wavefront.
|
||||
// This should be consistent with BlockTile and BlockWarps.
|
||||
// If BlockTile is 2048 and BlockWarps is 8, then WarpTile could be 2048/8 = 256.
|
||||
// However, this example uses 64, meaning each wavefront processes 64 elements, and multiple
|
||||
// such wavefront operations might be needed to cover the BlockTile, or the BlockTile is
|
||||
// distributed differently.
|
||||
// The current configuration (BlockTile=2048, BlockWarps=8, WarpTile=64) implies that
|
||||
// each wavefront processes 64 elements, and 8 wavefronts process 8*64 = 512 elements
|
||||
// concurrently. Since 512 is not equal to 2048, it means that warptile(s) will need to iterate
|
||||
// over multiple times over different set of elements to cover the entire BlockTile.
|
||||
using WarpTile = ck_tile::sequence<64>;
|
||||
|
||||
// 4. Create the kernel
|
||||
|
||||
// ElementWiseShape bundles these tiling parameters.
|
||||
// It calculates derived properties like threads per wavefront, repeats, vectorization and total
|
||||
// block size.
|
||||
using Shape = ck_tile::ElementWiseShape<BlockWarps, BlockTile, WarpTile, XDataType>;
|
||||
|
||||
// ElementWisePipelineProblem encapsulates all necessary information for the elementwise kernel:
|
||||
// - Data types (input, compute, output).
|
||||
// - Shape traits (tiling configuration).
|
||||
// - The specific elementwise operation (e.g., Add).
|
||||
using Problem = ck_tile::ElementWisePipelineProblem<XDataType,
|
||||
ComputeDataType,
|
||||
YDataType,
|
||||
Shape,
|
||||
XElementwiseOperation>;
|
||||
|
||||
// ElementWiseKernel refers to the GPU kernel class
|
||||
using Kernel = ck_tile::ElementWiseKernel<Problem, ck_tile::ElementWiseDefaultPolicy>;
|
||||
|
||||
// Compute flattened size
|
||||
ck_tile::index_t total_elements = 1;
|
||||
for(auto d : shape)
|
||||
total_elements *= d;
|
||||
|
||||
// kBlockSize: The number of work items in a GPU workgroup (thread block).
|
||||
// This is often a multiple of the wavefront size, 64 on CDNA.
|
||||
// Here, it's explicitly set to 512. This should be consistent with Shape::kBlockSize.
|
||||
// Shape::kBlockSize would be BlockWarps * warpSize (e.g., 8 * 64 = 512).
|
||||
const ck_tile::index_t kBlockSize = Kernel::BlockSize();
|
||||
|
||||
// kBlockPerCu: Hint for how many workgroups can be scheduled per Compute Unit (CU).
|
||||
// This can influence occupancy and performance.
|
||||
constexpr ck_tile::index_t kBlockPerCu = 1;
|
||||
|
||||
// kGridSize: Calculates the total number of workgroups required to process all elements.
|
||||
// Each workgroup is responsible for 'elements_per_block' elements.
|
||||
// To ensure all elements are covered, especially when 'total_elements' is not perfectly
|
||||
// divisible by 'elements_per_block', using ceiling division.
|
||||
constexpr ck_tile::index_t elements_per_block = BlockTile::at(ck_tile::number<0>{});
|
||||
ck_tile::index_t kGridSize = (total_elements + elements_per_block - 1) / elements_per_block;
|
||||
|
||||
std::cout << "grid size = " << kGridSize << std::endl;
|
||||
std::cout << "Total elements = " << total_elements << std::endl;
|
||||
|
||||
auto input_tensors = ck_tile::make_tuple(static_cast<XDataType*>(x_buf_a.GetDeviceBuffer()),
|
||||
static_cast<XDataType*>(x_buf_b.GetDeviceBuffer()));
|
||||
|
||||
auto input_size = ck_tile::make_tuple(M, N);
|
||||
|
||||
// Check if the kernel configuration is supported
|
||||
if(!Kernel::IsSupportedArgument(input_size))
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"The kernel configuration is not supported for the given input size.");
|
||||
}
|
||||
|
||||
// 4. Run the kernel
|
||||
float ave_time = launch_kernel(
|
||||
ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
|
||||
ck_tile::make_kernel<kBlockPerCu>(Kernel{},
|
||||
kGridSize,
|
||||
kBlockSize,
|
||||
0,
|
||||
input_size,
|
||||
ck_tile::make_tuple(N, 1), // Input Stride
|
||||
ck_tile::make_tuple(N, 1), // Output Stride
|
||||
input_tensors,
|
||||
static_cast<YDataType*>(y_buf.GetDeviceBuffer())));
|
||||
|
||||
std::cout << "Average time: " << ave_time << " ms" << std::endl;
|
||||
|
||||
// 5. Verify the output
|
||||
bool pass = true;
|
||||
if(do_validation)
|
||||
{
|
||||
y_buf.FromDevice(y_validation.data());
|
||||
auto op = [](const auto& v0, const auto& v1) { return v0 + v1; };
|
||||
|
||||
ck_tile::reference_binary_elementwise<XDataType, XDataType, YDataType, ComputeDataType>(
|
||||
x_host_a, x_host_b, y_host, op);
|
||||
|
||||
pass = ck_tile::check_err(
|
||||
y_validation, y_host, "Elementwise Add Error: Incorrect results!", 0.01, 0.01);
|
||||
}
|
||||
|
||||
if(arg_parser.get_int("json") == 1)
|
||||
{
|
||||
dump_elementwise_json_results(arg_parser.get_str("jsonfile"),
|
||||
arg_parser.get_str("prec"),
|
||||
kGridSize,
|
||||
kBlockSize,
|
||||
ave_time,
|
||||
0,
|
||||
0,
|
||||
"elementwise_add");
|
||||
}
|
||||
|
||||
return pass;
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool result = true;
|
||||
ck_tile::ArgParser arg_parser;
|
||||
std::tie(result, arg_parser) = create_args(argc, argv);
|
||||
if(!result)
|
||||
return -1;
|
||||
|
||||
try
|
||||
{
|
||||
const auto x_prec_variant = string_to_datatype(arg_parser.get_str("x_prec"));
|
||||
const auto y_prec_variant = string_to_datatype(arg_parser.get_str("y_prec"));
|
||||
return std::visit(
|
||||
[&](auto&& x_dt, auto&& y_dt) -> int {
|
||||
using XDataType = std::decay_t<decltype(x_dt)>;
|
||||
using YDataType = std::decay_t<decltype(y_dt)>;
|
||||
return run<XDataType, YDataType>(arg_parser);
|
||||
},
|
||||
x_prec_variant,
|
||||
y_prec_variant);
|
||||
}
|
||||
catch(const std::exception& e)
|
||||
{
|
||||
std::cerr << "Error: " << e.what() << std::endl;
|
||||
return -3;
|
||||
}
|
||||
}
|
||||
184
example/ck_tile/21_elementwise/elementwise_example_add_4d.cpp
Normal file
184
example/ck_tile/21_elementwise/elementwise_example_add_4d.cpp
Normal file
@@ -0,0 +1,184 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include "ck_tile/host.hpp"
|
||||
#include "ck_tile/ops/elementwise.hpp"
|
||||
#include "ck_tile/host/reference/reference_elementwise.hpp"
|
||||
#include "ck_tile/utility/json_dump.hpp"
|
||||
#include "elementwise_common.hpp"
|
||||
|
||||
auto create_args(int argc, char* argv[])
|
||||
{
|
||||
ck_tile::ArgParser arg_parser;
|
||||
arg_parser.insert("dim0", "4", "dimension 0")
|
||||
.insert("dim1", "16", "dimension 1")
|
||||
.insert("dim2", "32", "dimension 2")
|
||||
.insert("dim3", "32", "dimension 3")
|
||||
.insert("v", "1", "cpu validation or not")
|
||||
.insert("x_prec", "fp16", "input precision")
|
||||
.insert("y_prec", "fp16", "output precision")
|
||||
.insert("warmup", "10", "cold iter")
|
||||
.insert("repeat", "50", "hot iter")
|
||||
.insert("json", "0", "0: No Json, 1: Dump Results in Json format")
|
||||
.insert("jsonfile", "elementwise_add_4d.json", "json file name to dump results");
|
||||
|
||||
bool result = arg_parser.parse(argc, argv);
|
||||
return std::make_tuple(result, arg_parser);
|
||||
}
|
||||
|
||||
template <typename XDataType, typename YDataType>
|
||||
bool run(const ck_tile::ArgParser& arg_parser)
|
||||
{
|
||||
ck_tile::index_t D0 = arg_parser.get_int("dim0");
|
||||
ck_tile::index_t D1 = arg_parser.get_int("dim1");
|
||||
ck_tile::index_t D2 = arg_parser.get_int("dim2");
|
||||
ck_tile::index_t D3 = arg_parser.get_int("dim3");
|
||||
|
||||
int do_validation = arg_parser.get_int("v");
|
||||
int warmup = arg_parser.get_int("warmup");
|
||||
int repeat = arg_parser.get_int("repeat");
|
||||
|
||||
using ComputeDataType =
|
||||
float; // Using float for intermediate calculations can improve numerical stability.
|
||||
using XElementwiseOperation = ck_tile::element_wise::Add;
|
||||
|
||||
// Initialize the input data on the host (CPU).
|
||||
std::vector<ck_tile::index_t> problem_shape = {D0, D1, D2, D3};
|
||||
|
||||
std::vector<ck_tile::index_t> host_strides(4);
|
||||
host_strides[3] = 1;
|
||||
host_strides[2] = problem_shape[3];
|
||||
host_strides[1] = problem_shape[2] * problem_shape[3];
|
||||
host_strides[0] = problem_shape[1] * problem_shape[2] * problem_shape[3];
|
||||
|
||||
ck_tile::HostTensor<XDataType> x_host_a(problem_shape, host_strides);
|
||||
ck_tile::HostTensor<XDataType> x_host_b(problem_shape, host_strides);
|
||||
ck_tile::HostTensor<YDataType> y_host(problem_shape, host_strides);
|
||||
ck_tile::HostTensor<YDataType> y_validation(problem_shape, host_strides);
|
||||
|
||||
ck_tile::FillUniformDistribution<XDataType>{0.f, 5.f}(x_host_a);
|
||||
ck_tile::FillUniformDistribution<XDataType>{2.f, 10.f}(x_host_b);
|
||||
|
||||
ck_tile::DeviceMem x_buf_a(x_host_a.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem x_buf_b(x_host_b.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem y_buf(y_host.get_element_space_size_in_bytes());
|
||||
|
||||
x_buf_a.ToDevice(x_host_a.data());
|
||||
x_buf_b.ToDevice(x_host_b.data());
|
||||
|
||||
using BlockTile = ck_tile::sequence<256>;
|
||||
using BlockWarps = ck_tile::sequence<1>;
|
||||
using WarpTile = ck_tile::sequence<256>;
|
||||
|
||||
using Shape = ck_tile::ElementWiseShape<BlockWarps, BlockTile, WarpTile, XDataType>;
|
||||
|
||||
using Problem = ck_tile::ElementWisePipelineProblem<XDataType,
|
||||
ComputeDataType,
|
||||
YDataType,
|
||||
Shape,
|
||||
XElementwiseOperation>;
|
||||
|
||||
using Kernel = ck_tile::ElementWiseKernel<Problem, ck_tile::ElementWiseDefaultPolicy>;
|
||||
|
||||
ck_tile::index_t total_elements = 1;
|
||||
for(auto d : problem_shape)
|
||||
total_elements *= d;
|
||||
|
||||
const ck_tile::index_t kBlockSize = Kernel::BlockSize();
|
||||
|
||||
constexpr ck_tile::index_t kBlockPerCu = 2;
|
||||
|
||||
constexpr ck_tile::index_t elements_per_block = BlockTile::at(ck_tile::number<0>{});
|
||||
ck_tile::index_t kGridSize = (total_elements + elements_per_block - 1) / elements_per_block;
|
||||
|
||||
std::cout << "grid size = " << kGridSize << std::endl;
|
||||
std::cout << "Total elements = " << total_elements << std::endl;
|
||||
|
||||
auto input_tensors = ck_tile::make_tuple(static_cast<XDataType*>(x_buf_a.GetDeviceBuffer()),
|
||||
static_cast<XDataType*>(x_buf_b.GetDeviceBuffer()));
|
||||
|
||||
auto problem_shape_tuple =
|
||||
ck_tile::make_tuple(problem_shape[0], problem_shape[1], problem_shape[2], problem_shape[3]);
|
||||
|
||||
auto strides_tuple =
|
||||
ck_tile::make_tuple(host_strides[0], host_strides[1], host_strides[2], host_strides[3]);
|
||||
|
||||
// Check if the kernel configuration is supported
|
||||
if(!Kernel::IsSupportedArgument(problem_shape_tuple))
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"The kernel configuration is not supported for the given input size.");
|
||||
}
|
||||
|
||||
// Run the kernel
|
||||
float ave_time = launch_kernel(
|
||||
ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
|
||||
ck_tile::make_kernel<kBlockPerCu>(
|
||||
Kernel{},
|
||||
kGridSize,
|
||||
kBlockSize,
|
||||
0,
|
||||
problem_shape_tuple, // ck_tile::tuple<index_t, index_t, index_t, index_t>
|
||||
strides_tuple, // ck_tile::tuple<index_t, index_t, index_t, index_t> for input strides
|
||||
strides_tuple, // ck_tile::tuple<index_t, index_t, index_t, index_t> for output strides
|
||||
input_tensors,
|
||||
static_cast<YDataType*>(y_buf.GetDeviceBuffer())));
|
||||
|
||||
std::cout << "Average time: " << ave_time << " ms" << std::endl;
|
||||
|
||||
// Verify the output
|
||||
bool pass = true;
|
||||
if(do_validation)
|
||||
{
|
||||
y_buf.FromDevice(y_validation.data());
|
||||
auto op = [](const auto& v0, const auto& v1) { return v0 + v1; };
|
||||
|
||||
ck_tile::reference_binary_elementwise<XDataType, XDataType, YDataType, ComputeDataType>(
|
||||
x_host_a, x_host_b, y_host, op);
|
||||
|
||||
pass = ck_tile::check_err(
|
||||
y_validation, y_host, "Elementwise Add Error: Incorrect results!", 0.01, 0.01);
|
||||
}
|
||||
|
||||
if(arg_parser.get_int("json") == 1)
|
||||
{
|
||||
dump_elementwise_json_results(arg_parser.get_str("jsonfile"),
|
||||
arg_parser.get_str("prec"),
|
||||
kGridSize,
|
||||
kBlockSize,
|
||||
ave_time,
|
||||
0,
|
||||
0,
|
||||
"elementwise_add_4d");
|
||||
}
|
||||
|
||||
return pass;
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool result = true;
|
||||
ck_tile::ArgParser arg_parser;
|
||||
std::tie(result, arg_parser) = create_args(argc, argv);
|
||||
if(!result)
|
||||
return -1;
|
||||
|
||||
try
|
||||
{
|
||||
const auto x_prec_variant = string_to_datatype(arg_parser.get_str("x_prec"));
|
||||
const auto y_prec_variant = string_to_datatype(arg_parser.get_str("y_prec"));
|
||||
return std::visit(
|
||||
[&](auto&& x_dt, auto&& y_dt) -> int {
|
||||
using XDataType = std::decay_t<decltype(x_dt)>;
|
||||
using YDataType = std::decay_t<decltype(y_dt)>;
|
||||
return run<XDataType, YDataType>(arg_parser);
|
||||
},
|
||||
x_prec_variant,
|
||||
y_prec_variant);
|
||||
}
|
||||
catch(const std::exception& e)
|
||||
{
|
||||
std::cerr << "Error: " << e.what() << std::endl;
|
||||
return -3;
|
||||
}
|
||||
}
|
||||
180
example/ck_tile/21_elementwise/elementwise_example_transpose.cpp
Normal file
180
example/ck_tile/21_elementwise/elementwise_example_transpose.cpp
Normal file
@@ -0,0 +1,180 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include "ck_tile/host.hpp"
|
||||
#include "ck_tile/ops/elementwise.hpp"
|
||||
#include "ck_tile/host/reference/reference_transpose.hpp"
|
||||
#include "ck_tile/utility/json_dump.hpp"
|
||||
#include "elementwise_common.hpp"
|
||||
|
||||
auto create_args(int argc, char* argv[])
|
||||
{
|
||||
ck_tile::ArgParser arg_parser;
|
||||
arg_parser.insert("m", "1024", "m dimension of input")
|
||||
.insert("n", "1024", "n dimension of input")
|
||||
.insert("stride_in", "-1", "stride for input M dim, if -1 then equal to n")
|
||||
.insert("v", "1", "cpu validation or not")
|
||||
.insert("prec", "fp16", "precision")
|
||||
.insert("warmup", "10", "cold iter")
|
||||
.insert("repeat", "50", "hot iter")
|
||||
.insert("json", "0", "0: No Json, 1: Dump Results in Json format")
|
||||
.insert("jsonfile", "elementwise_transpose.json", "json file name to dump results");
|
||||
|
||||
bool result = arg_parser.parse(argc, argv);
|
||||
return std::make_tuple(result, arg_parser);
|
||||
}
|
||||
|
||||
template <typename DataType>
|
||||
bool run(const ck_tile::ArgParser& arg_parser)
|
||||
{
|
||||
ck_tile::index_t M = arg_parser.get_int("m");
|
||||
ck_tile::index_t N = arg_parser.get_int("n");
|
||||
ck_tile::index_t stride_in = arg_parser.get_int("stride_in");
|
||||
|
||||
if(stride_in < 0)
|
||||
stride_in = N; // Dense input: stride for M dim is N
|
||||
int do_validation = arg_parser.get_int("v");
|
||||
int warmup = arg_parser.get_int("warmup");
|
||||
int repeat = arg_parser.get_int("repeat");
|
||||
|
||||
if(stride_in < N)
|
||||
{
|
||||
throw std::runtime_error("stride_in must be >= N");
|
||||
}
|
||||
|
||||
using XDataType = DataType;
|
||||
using ComputeDataType = float;
|
||||
using YDataType = DataType;
|
||||
// Use PassThrough operation for transposition (data is moved, not changed)
|
||||
using XElementwiseOperation = ck_tile::element_wise::PassThrough;
|
||||
|
||||
// 1. Initialize the input data on the host (CPU).
|
||||
// Input x_host_a: M x N
|
||||
// Output y_host: N x M (transposed)
|
||||
ck_tile::HostTensor<XDataType> x_host_a({M, N}, {stride_in, 1});
|
||||
// Output tensor y_host will have dimensions N x M.
|
||||
// Assuming dense output, its stride for the N dimension will be M.
|
||||
ck_tile::index_t stride_out_dim0 = M;
|
||||
ck_tile::HostTensor<YDataType> y_host({N, M}, {stride_out_dim0, 1});
|
||||
ck_tile::HostTensor<YDataType> y_validation({N, M}, {stride_out_dim0, 1});
|
||||
|
||||
// The logical shape for the element-wise operation kernel is based on the input tensor's
|
||||
// elements.
|
||||
std::vector<ck_tile::index_t> op_shape_vec = {M, N};
|
||||
auto op_lengths = ck_tile::make_tuple(M, N); // Lens for the kernel
|
||||
|
||||
ck_tile::FillUniformDistribution<XDataType>{0.f, 5.f}(x_host_a);
|
||||
|
||||
// 2. Create device memory buffers
|
||||
ck_tile::DeviceMem x_buf_a(x_host_a.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem y_buf(y_host.get_element_space_size_in_bytes()); // y_host is N x M
|
||||
|
||||
x_buf_a.ToDevice(x_host_a.data());
|
||||
|
||||
// 3. Configure the kernel execution parameters.
|
||||
using BlockTile = ck_tile::sequence<1024>;
|
||||
using BlockWarps = ck_tile::sequence<8>;
|
||||
using WarpTile = ck_tile::sequence<64>;
|
||||
|
||||
using Shape = ck_tile::ElementWiseShape<BlockWarps, BlockTile, WarpTile, XDataType>;
|
||||
|
||||
// Problem definition for a single input tensor
|
||||
using Problem = ck_tile::ElementWisePipelineProblem<XDataType,
|
||||
ComputeDataType,
|
||||
YDataType,
|
||||
Shape,
|
||||
XElementwiseOperation>;
|
||||
|
||||
using Kernel = ck_tile::ElementWiseKernel<Problem, ck_tile::ElementWiseDefaultPolicy>;
|
||||
|
||||
ck_tile::index_t total_elements = M * N;
|
||||
|
||||
const ck_tile::index_t kBlockSize = Kernel::BlockSize();
|
||||
constexpr ck_tile::index_t kBlockPerCu = 1;
|
||||
constexpr ck_tile::index_t elements_per_block = BlockTile::at(ck_tile::number<0>{});
|
||||
ck_tile::index_t kGridSize = (total_elements + elements_per_block - 1) / elements_per_block;
|
||||
|
||||
std::cout << "Input M=" << M << ", N=" << N << ", StrideIn=" << stride_in << std::endl;
|
||||
std::cout << "Output N=" << N << ", M=" << M << ", StrideOut=" << stride_out_dim0 << std::endl;
|
||||
std::cout << "Grid size = " << kGridSize << ", BlockSize = " << kBlockSize << std::endl;
|
||||
std::cout << "Total elements = " << total_elements << std::endl;
|
||||
|
||||
// Input tensors tuple (single input)
|
||||
auto input_tensors = ck_tile::make_tuple(static_cast<XDataType*>(x_buf_a.GetDeviceBuffer()));
|
||||
// Input strides tuple (tuple of tuples, one for each input)
|
||||
auto input_strides = ck_tile::make_tuple(stride_in, 1);
|
||||
// Output strides (for N x M tensor, dense)
|
||||
auto output_strides = ck_tile::make_tuple(1, stride_out_dim0);
|
||||
|
||||
// Check if the kernel configuration is supported
|
||||
if(!Kernel::IsSupportedArgument(op_lengths))
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"The kernel configuration is not supported for the given input size.");
|
||||
}
|
||||
|
||||
// 4. Run the kernel
|
||||
float ave_time = launch_kernel(
|
||||
ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
|
||||
ck_tile::make_kernel<kBlockPerCu>(Kernel{},
|
||||
kGridSize,
|
||||
kBlockSize,
|
||||
0, // Shared memory
|
||||
op_lengths, // Logical dimensions for the operation (M, N)
|
||||
input_strides, // Strides for input tensor(s)
|
||||
output_strides, // Strides for output tensor (N, M)
|
||||
input_tensors,
|
||||
static_cast<YDataType*>(y_buf.GetDeviceBuffer())));
|
||||
|
||||
std::cout << "Average time: " << ave_time << " ms" << std::endl;
|
||||
|
||||
// 5. Verify the output
|
||||
bool pass = true;
|
||||
if(do_validation)
|
||||
{
|
||||
y_buf.FromDevice(y_validation.data()); // Copy result from device to y_validation
|
||||
ck_tile::reference_transpose_elementwise<XDataType, YDataType>(
|
||||
x_host_a, y_host); // Compute reference on host
|
||||
pass = ck_tile::check_err(
|
||||
y_validation, y_host, "Transpose Error: Incorrect results!", 0.01, 0.01);
|
||||
}
|
||||
|
||||
if(arg_parser.get_int("json") == 1)
|
||||
{
|
||||
dump_elementwise_json_results(arg_parser.get_str("jsonfile"),
|
||||
arg_parser.get_str("prec"),
|
||||
kGridSize,
|
||||
kBlockSize,
|
||||
ave_time,
|
||||
0,
|
||||
0,
|
||||
"elementwise_transpose");
|
||||
}
|
||||
|
||||
return pass;
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool result = true;
|
||||
ck_tile::ArgParser arg_parser;
|
||||
std::tie(result, arg_parser) = create_args(argc, argv);
|
||||
if(!result)
|
||||
return -1;
|
||||
|
||||
try
|
||||
{
|
||||
const auto prec_variant = string_to_datatype(arg_parser.get_str("prec"));
|
||||
return std::visit(
|
||||
[&](auto&& dt) -> int {
|
||||
using DataType = std::decay_t<decltype(dt)>;
|
||||
return run<DataType>(arg_parser);
|
||||
},
|
||||
prec_variant);
|
||||
}
|
||||
catch(const std::exception& e)
|
||||
{
|
||||
std::cerr << "Error: " << e.what() << std::endl;
|
||||
return -3;
|
||||
}
|
||||
}
|
||||
223
example/ck_tile/21_elementwise/elementwise_example_unary.cpp
Normal file
223
example/ck_tile/21_elementwise/elementwise_example_unary.cpp
Normal file
@@ -0,0 +1,223 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include "ck_tile/host.hpp"
|
||||
#include "ck_tile/ops/elementwise.hpp"
|
||||
#include "ck_tile/host/reference/reference_elementwise.hpp"
|
||||
#include "ck_tile/utility/json_dump.hpp"
|
||||
#include "elementwise_common.hpp"
|
||||
|
||||
auto create_args(int argc, char* argv[])
|
||||
{
|
||||
ck_tile::ArgParser arg_parser;
|
||||
arg_parser.insert("m", "1024", "m dimension")
|
||||
.insert("n", "1024", "n dimension")
|
||||
.insert("stride", "-1", "stride per row, if -1 then equal to n")
|
||||
.insert("v", "1", "cpu validation or not")
|
||||
.insert("op", "1", "unary operation, 1: square, 2: convert")
|
||||
.insert("x_prec", "fp16", "input precision")
|
||||
.insert("y_prec", "fp16", "output precision")
|
||||
.insert("warmup", "10", "cold iter")
|
||||
.insert("repeat", "50", "hot iter")
|
||||
.insert("json", "0", "0: No Json, 1: Dump Results in Json format")
|
||||
.insert("jsonfile", "elementwise_unary.json", "json file name to dump results");
|
||||
|
||||
bool result = arg_parser.parse(argc, argv);
|
||||
return std::make_tuple(result, arg_parser);
|
||||
}
|
||||
|
||||
template <typename XElementwiseOperation, typename XDataType, typename YDataType>
|
||||
bool run(const ck_tile::ArgParser& arg_parser)
|
||||
{
|
||||
ck_tile::index_t M = arg_parser.get_int("m");
|
||||
ck_tile::index_t N = arg_parser.get_int("n");
|
||||
ck_tile::index_t stride = arg_parser.get_int("stride");
|
||||
if(stride < 0)
|
||||
stride = N;
|
||||
int do_validation = arg_parser.get_int("v");
|
||||
int warmup = arg_parser.get_int("warmup");
|
||||
int repeat = arg_parser.get_int("repeat");
|
||||
|
||||
assert(stride >= N);
|
||||
|
||||
// 1. Initialize the input data on the host
|
||||
ck_tile::HostTensor<XDataType> x_host_a({M, N}, {stride, 1});
|
||||
ck_tile::HostTensor<YDataType> y_host({M, N}, {stride, 1});
|
||||
ck_tile::HostTensor<YDataType> y_validation({M, N}, {stride, 1});
|
||||
|
||||
std::vector<ck_tile::index_t> shape = {M, N};
|
||||
|
||||
ck_tile::FillUniformDistribution<XDataType>{0.f, 5.f}(x_host_a);
|
||||
|
||||
// 2. Create device memory buffers and copy input data from host to device
|
||||
ck_tile::DeviceMem x_buf_a(x_host_a.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem y_buf(y_host.get_element_space_size_in_bytes());
|
||||
x_buf_a.ToDevice(x_host_a.data());
|
||||
|
||||
// 3. Create the kernel
|
||||
|
||||
// Dividing the problem into blocktile, warptile, and vector
|
||||
using BlockTile = ck_tile::sequence<2048>; // Size of the block tile (Entire problem is divided
|
||||
// into blocks of this size)
|
||||
using BlockWarps = ck_tile::sequence<8>; // How many concurrent warps are in a block (Each warp
|
||||
// will cover some part of blockTile)
|
||||
using WarpTile = ck_tile::sequence<64>; // How many elements are covered by a warp
|
||||
|
||||
using Shape = ck_tile::ElementWiseShape<BlockWarps, BlockTile, WarpTile, XDataType>;
|
||||
using Problem = ck_tile::ElementWisePipelineProblem<XDataType,
|
||||
XDataType, // ComputeDataType is same as
|
||||
// XDataType in the unary case
|
||||
YDataType,
|
||||
Shape,
|
||||
XElementwiseOperation>;
|
||||
|
||||
using Kernel = ck_tile::ElementWiseKernel<Problem, ck_tile::ElementWiseDefaultPolicy>;
|
||||
|
||||
// Compute flattened size
|
||||
ck_tile::index_t total_elements = 1;
|
||||
for(auto d : shape)
|
||||
total_elements *= d;
|
||||
|
||||
const ck_tile::index_t kBlockSize = Kernel::BlockSize();
|
||||
constexpr ck_tile::index_t kBlockPerCu = 1;
|
||||
|
||||
constexpr ck_tile::index_t elements_per_block = BlockTile::at(ck_tile::number<0>{});
|
||||
ck_tile::index_t kGridSize = (total_elements + elements_per_block - 1) / elements_per_block;
|
||||
|
||||
std::cout << "grid size = " << kGridSize << std::endl;
|
||||
std::cout << "Total elements = " << total_elements << std::endl;
|
||||
|
||||
auto input_tensors = ck_tile::make_tuple(static_cast<XDataType*>(x_buf_a.GetDeviceBuffer()));
|
||||
auto input_size = ck_tile::make_tuple(M, N);
|
||||
|
||||
// Check if the kernel configuration is supported
|
||||
if(!Kernel::IsSupportedArgument(input_size))
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"The kernel configuration is not supported for the given input size.");
|
||||
}
|
||||
|
||||
// 4. Run the kernel
|
||||
float ave_time = launch_kernel(
|
||||
ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
|
||||
ck_tile::make_kernel<kBlockPerCu>(Kernel{},
|
||||
kGridSize,
|
||||
kBlockSize,
|
||||
0,
|
||||
input_size,
|
||||
ck_tile::make_tuple(N, 1), // Input Stride
|
||||
ck_tile::make_tuple(N, 1), // Output Stride
|
||||
input_tensors,
|
||||
static_cast<YDataType*>(y_buf.GetDeviceBuffer())));
|
||||
|
||||
std::cout << "Average time: " << ave_time << " ms" << std::endl;
|
||||
|
||||
// 5. Verify the output
|
||||
bool pass = true;
|
||||
if(do_validation)
|
||||
{
|
||||
y_buf.FromDevice(y_validation.data());
|
||||
|
||||
auto op = [](const XDataType& v0) -> YDataType {
|
||||
XElementwiseOperation element_op{};
|
||||
YDataType result;
|
||||
element_op(result, v0);
|
||||
return result;
|
||||
};
|
||||
|
||||
ck_tile::reference_unary_elementwise<XDataType, YDataType, YDataType>(x_host_a, y_host, op);
|
||||
|
||||
pass = ck_tile::check_err(
|
||||
y_validation, y_host, "Elementwise unary op: Incorrect results!", 0.01, 0.01);
|
||||
}
|
||||
|
||||
if(arg_parser.get_int("json") == 1)
|
||||
{
|
||||
dump_elementwise_json_results(arg_parser.get_str("jsonfile"),
|
||||
arg_parser.get_str("prec"),
|
||||
kGridSize,
|
||||
kBlockSize,
|
||||
ave_time,
|
||||
0,
|
||||
0,
|
||||
"elementwise_unary");
|
||||
}
|
||||
|
||||
return pass;
|
||||
}
|
||||
|
||||
template <typename XElementwiseOperation, typename XDataType, typename YDataType>
|
||||
bool filter_then_run(const ck_tile::ArgParser& arg_parser)
|
||||
{
|
||||
auto throw_unsupported = [&]() {
|
||||
const auto x_prec = arg_parser.get_str("x_prec");
|
||||
const auto op = arg_parser.get_str("op");
|
||||
throw std::runtime_error("Unsupported! x_prec: " + x_prec + ", op: " + op);
|
||||
};
|
||||
bool pass = true;
|
||||
|
||||
if constexpr(std::is_same_v<XElementwiseOperation, ck_tile::element_wise::UnarySquare> &&
|
||||
(std::is_same_v<XDataType, ck_tile::bf16_t> ||
|
||||
std::is_same_v<YDataType, ck_tile::bf16_t>))
|
||||
{
|
||||
throw_unsupported();
|
||||
}
|
||||
else if constexpr(std::is_same_v<XElementwiseOperation, ck_tile::element_wise::UnaryConvert> &&
|
||||
(std::is_same_v<XDataType, ck_tile::bf16_t> ||
|
||||
std::is_same_v<YDataType, ck_tile::bf16_t>))
|
||||
{
|
||||
throw_unsupported();
|
||||
}
|
||||
else
|
||||
{
|
||||
pass = run<XElementwiseOperation, XDataType, YDataType>(arg_parser);
|
||||
}
|
||||
|
||||
return pass;
|
||||
}
|
||||
|
||||
auto string_to_op(const std::string& op)
|
||||
{
|
||||
using OpVariant =
|
||||
std::variant<ck_tile::element_wise::UnarySquare, ck_tile::element_wise::UnaryConvert>;
|
||||
|
||||
if(op == "1")
|
||||
return OpVariant{ck_tile::element_wise::UnarySquare{}};
|
||||
else if(op == "2")
|
||||
return OpVariant{ck_tile::element_wise::UnaryConvert{}};
|
||||
else
|
||||
{
|
||||
throw std::runtime_error("Unsupported unary operation: " + op);
|
||||
}
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool result = true;
|
||||
ck_tile::ArgParser arg_parser;
|
||||
std::tie(result, arg_parser) = create_args(argc, argv);
|
||||
if(!result)
|
||||
return -1;
|
||||
|
||||
try
|
||||
{
|
||||
const auto x_prec_variant = string_to_datatype(arg_parser.get_str("x_prec"));
|
||||
const auto y_prec_variant = string_to_datatype(arg_parser.get_str("y_prec"));
|
||||
const auto op_variant = string_to_op(arg_parser.get_str("op"));
|
||||
return std::visit(
|
||||
[&](auto&& op, auto&& x_dt, auto&& y_dt) -> int {
|
||||
using XElementwiseOperation = std::decay_t<decltype(op)>;
|
||||
using XDataType = std::decay_t<decltype(x_dt)>;
|
||||
using YDataType = std::decay_t<decltype(y_dt)>;
|
||||
return filter_then_run<XElementwiseOperation, XDataType, YDataType>(arg_parser);
|
||||
},
|
||||
op_variant,
|
||||
x_prec_variant,
|
||||
y_prec_variant);
|
||||
}
|
||||
catch(const std::exception& e)
|
||||
{
|
||||
std::cerr << "Error: " << e.what() << std::endl;
|
||||
return -3;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user