mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-19 22:39:03 +00:00
[CK Tile] gemm splitk two stage (#2697)
* Fix a typo * Use std::variant to call run_gemm_example_with_layouts with the available layout variant combinations * Use a unified run_gemm_example_prec_type for basic gemm and universal gemm * Factor out run_gemm_example_prec_type * Refactor argument parsing in gemm_splitk_two_stage_reduce.cpp * Parse arguments outside of create_args * Move the gemm operators to separate structs to facilitate their reuse * Move the invokers to separate files to facilitate their reuse * Rename the invoker files for consistency with the examples that use them * Add fp32 support to the elementwise examples, and produce an error message for unsupported types * Get rid of four unused variables * Make two variables const * Add support for different input-output type combinations in elementwise examples * Test support for different input and output types in elementwise examples * Add support for different operations in the elementwise unary tests * Add support for UnaryConvert in the elementwise unary tests * Add support for bf16 in elementwise examples, excluding unsupported type combinations * Make some operator parameters const in ElementWiseKernel * Remove some unnecessary include statements * Implement a two-stage GEMM that does a type conversion in the second stage using the elementwise kernel * Clear workspace instead of output when flushing the cache in SplitKTwoStageInvoker::gemm * Fix formatting issues reported by clang * Add back CK_TILE_USE_WMMA related changes * Use the right prec type for bf16 in the universal GEMM and two stage split K examples * Add some brackets * Add some brackets * Separate the clearing of the GEMM output memory from the cache flushing in the universal GEMM example * Separate the clearing of the GEMM output memory from the cache flushing in the split K two stage example * Fix formatting * No need to call SetZero on ws_m_n_dev_buf here, as clear_gemm_output now does this as part of the kernel preprocessing * Add fp16 data type to splitk two stage example * Add preprocessing with optional cache flushing and clearing of output for k_batch > 1 to the basic GEMM example
This commit is contained in:
@@ -1,11 +1,11 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck_tile/core/arch/arch.hpp"
|
||||
#include "ck_tile/host.hpp"
|
||||
#include "ck_tile/ops/elementwise.hpp"
|
||||
#include "ck_tile/host/reference/reference_elementwise.hpp"
|
||||
#include "json_dump.hpp"
|
||||
#include "elementwise_common.hpp"
|
||||
|
||||
auto create_args(int argc, char* argv[])
|
||||
{
|
||||
@@ -14,7 +14,9 @@ auto create_args(int argc, char* argv[])
|
||||
.insert("n", "1024", "n dimension")
|
||||
.insert("stride", "-1", "stride per row, if -1 then equal to n")
|
||||
.insert("v", "1", "cpu validation or not")
|
||||
.insert("prec", "fp16", "precision")
|
||||
.insert("op", "1", "unary operation, 1: square, 2: convert")
|
||||
.insert("x_prec", "fp16", "input precision")
|
||||
.insert("y_prec", "fp16", "output precision")
|
||||
.insert("warmup", "10", "cold iter")
|
||||
.insert("repeat", "50", "hot iter")
|
||||
.insert("json", "0", "0: No Json, 1: Dump Results in Json format")
|
||||
@@ -24,7 +26,7 @@ auto create_args(int argc, char* argv[])
|
||||
return std::make_tuple(result, arg_parser);
|
||||
}
|
||||
|
||||
template <typename DataType>
|
||||
template <typename XElementwiseOperation, typename XDataType, typename YDataType>
|
||||
bool run(const ck_tile::ArgParser& arg_parser)
|
||||
{
|
||||
ck_tile::index_t M = arg_parser.get_int("m");
|
||||
@@ -32,17 +34,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
|
||||
ck_tile::index_t stride = arg_parser.get_int("stride");
|
||||
if(stride < 0)
|
||||
stride = N;
|
||||
std::string data_type = arg_parser.get_str("prec");
|
||||
int do_validation = arg_parser.get_int("v");
|
||||
int warmup = arg_parser.get_int("warmup");
|
||||
int repeat = arg_parser.get_int("repeat");
|
||||
int do_validation = arg_parser.get_int("v");
|
||||
int warmup = arg_parser.get_int("warmup");
|
||||
int repeat = arg_parser.get_int("repeat");
|
||||
|
||||
assert(stride >= N);
|
||||
|
||||
using XDataType = DataType;
|
||||
using YDataType = DataType;
|
||||
using XElementwiseOperation = ck_tile::element_wise::UnarySquare;
|
||||
|
||||
// 1. Initialize the input data on the host
|
||||
ck_tile::HostTensor<XDataType> x_host_a({M, N}, {stride, 1});
|
||||
ck_tile::HostTensor<YDataType> y_host({M, N}, {stride, 1});
|
||||
@@ -122,12 +119,17 @@ bool run(const ck_tile::ArgParser& arg_parser)
|
||||
{
|
||||
y_buf.FromDevice(y_validation.data());
|
||||
|
||||
auto op = [](const auto& v0) { return v0 * v0; };
|
||||
auto op = [](const XDataType& v0) -> YDataType {
|
||||
XElementwiseOperation element_op{};
|
||||
YDataType result;
|
||||
element_op(result, v0);
|
||||
return result;
|
||||
};
|
||||
|
||||
ck_tile::reference_unary_elementwise<XDataType, YDataType, YDataType>(x_host_a, y_host, op);
|
||||
|
||||
pass = ck_tile::check_err(
|
||||
y_validation, y_host, "Elementwise Add Error: Incorrect results!", 0.01, 0.01);
|
||||
y_validation, y_host, "Elementwise unary op: Incorrect results!", 0.01, 0.01);
|
||||
}
|
||||
|
||||
if(arg_parser.get_int("json") == 1)
|
||||
@@ -145,17 +147,69 @@ bool run(const ck_tile::ArgParser& arg_parser)
|
||||
return pass;
|
||||
}
|
||||
|
||||
template <typename XElementwiseOperation, typename XDataType, typename YDataType>
|
||||
bool filter_then_run(const ck_tile::ArgParser& arg_parser)
|
||||
{
|
||||
auto throw_unsupported = [&]() {
|
||||
const auto x_prec = arg_parser.get_str("x_prec");
|
||||
const auto op = arg_parser.get_str("op");
|
||||
throw std::runtime_error("Unsupported! x_prec: " + x_prec + ", op: " + op);
|
||||
};
|
||||
bool pass = true;
|
||||
|
||||
if constexpr(std::is_same_v<XElementwiseOperation, ck_tile::element_wise::UnarySquare> &&
|
||||
std::is_same_v<XDataType, ck_tile::bf16_t>)
|
||||
{
|
||||
throw_unsupported();
|
||||
}
|
||||
else
|
||||
{
|
||||
pass = run<XElementwiseOperation, XDataType, YDataType>(arg_parser);
|
||||
}
|
||||
|
||||
return pass;
|
||||
}
|
||||
|
||||
auto string_to_op(const std::string& op)
|
||||
{
|
||||
using OpVariant =
|
||||
std::variant<ck_tile::element_wise::UnarySquare, ck_tile::element_wise::UnaryConvert>;
|
||||
|
||||
if(op == "1")
|
||||
return OpVariant{ck_tile::element_wise::UnarySquare{}};
|
||||
else if(op == "2")
|
||||
return OpVariant{ck_tile::element_wise::UnaryConvert{}};
|
||||
else
|
||||
{
|
||||
throw std::runtime_error("Unsupported unary operation: " + op);
|
||||
}
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
auto [result, arg_parser] = create_args(argc, argv);
|
||||
if(!result)
|
||||
return -1;
|
||||
|
||||
const std::string data_type = arg_parser.get_str("prec");
|
||||
if(data_type == "fp16")
|
||||
try
|
||||
{
|
||||
return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
|
||||
const auto x_prec_variant = string_to_datatype(arg_parser.get_str("x_prec"));
|
||||
const auto y_prec_variant = string_to_datatype(arg_parser.get_str("y_prec"));
|
||||
const auto op_variant = string_to_op(arg_parser.get_str("op"));
|
||||
return std::visit(
|
||||
[&](auto&& op, auto&& x_dt, auto&& y_dt) -> int {
|
||||
using XElementwiseOperation = std::decay_t<decltype(op)>;
|
||||
using XDataType = std::decay_t<decltype(x_dt)>;
|
||||
using YDataType = std::decay_t<decltype(y_dt)>;
|
||||
return filter_then_run<XElementwiseOperation, XDataType, YDataType>(arg_parser);
|
||||
},
|
||||
op_variant,
|
||||
x_prec_variant,
|
||||
y_prec_variant);
|
||||
}
|
||||
catch(const std::exception& e)
|
||||
{
|
||||
std::cerr << "Error: " << e.what() << std::endl;
|
||||
return -3;
|
||||
}
|
||||
|
||||
return -3;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user