[CK Tile] gemm splitk two stage (#2697)

* Fix a typo

* Use std::variant to call run_gemm_example_with_layouts with the available layout variant combinations

* Use a unified run_gemm_example_prec_type for basic gemm and universal gemm

* Factor out run_gemm_example_prec_type

* Refactor argument parsing in gemm_splitk_two_stage_reduce.cpp

* Parse arguments outside of create_args

* Move the gemm operators to separate structs to facilitate their reuse

* Move the invokers to separate files to facilitate their reuse

* Rename the invoker files for consistency with the examples that use them

* Add fp32 support to the elementwise examples, and produce an error message for unsupported types

* Get rid of four unused variables

* Make two variables const

* Add support for different input-output type combinations in elementwise examples

* Test support for different input and output types in elementwise examples

* Add support for different operations in the elementwise unary tests

* Add support for UnaryConvert in the elementwise unary tests

* Add support for bf16 in elementwise examples, excluding unsupported type combinations

* Make some operator parameters const in ElementWiseKernel

* Remove some unnecessary include statements

* Implement a two-stage GEMM that does a type conversion in the second stage using the elementwise kernel

* Clear workspace instead of output when flushing the cache in SplitKTwoStageInvoker::gemm

* Fix formatting issues reported by clang

* Add back CK_TILE_USE_WMMA related changes

* Use the right prec type for bf16 in the universal GEMM and two stage split K examples

* Add some brackets

* Add some brackets

* Separate the clearing of the GEMM output memory from the cache flushing in the universal GEMM example

* Separate the clearing of the GEMM output memory from the cache flushing in the split K two stage example

* Fix formatting

* No need to call SetZero on ws_m_n_dev_buf here, as clear_gemm_output now does this as part of the kernel preprocessing

* Add fp16 data type to splitk two stage example

* Add preprocessing with optional cache flushing and clearing of output for k_batch > 1 to the basic GEMM example
This commit is contained in:
SamiAario-AMD
2025-09-04 14:33:44 +03:00
committed by GitHub
parent e2d28a92af
commit 1acd8e041c
21 changed files with 1245 additions and 782 deletions

View File

@@ -1,11 +1,11 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include "ck_tile/core/arch/arch.hpp"
#include "ck_tile/host.hpp"
#include "ck_tile/ops/elementwise.hpp"
#include "ck_tile/host/reference/reference_elementwise.hpp"
#include "json_dump.hpp"
#include "elementwise_common.hpp"
auto create_args(int argc, char* argv[])
{
@@ -14,7 +14,8 @@ auto create_args(int argc, char* argv[])
.insert("n", "1024", "n dimension")
.insert("stride", "-1", "stride per row, if -1 then equal to n")
.insert("v", "1", "cpu validation or not")
.insert("prec", "fp16", "precision")
.insert("x_prec", "fp16", "input precision, fp16/bf16/fp32")
.insert("y_prec", "fp16", "output precision, fp16/bf16/fp32")
.insert("warmup", "10", "cold iter")
.insert("repeat", "50", "hot iter")
.insert("json", "0", "0: No Json, 1: Dump Results in Json format")
@@ -24,7 +25,10 @@ auto create_args(int argc, char* argv[])
return std::make_tuple(result, arg_parser);
}
template <typename DataType>
// XDataType: Data type of the input tensors.
// ComputeDataType: Data type used for intermediate computations (often float for precision).
// YDataType: Data type of the output tensor.
template <typename XDataType, typename YDataType>
bool run(const ck_tile::ArgParser& arg_parser)
{
ck_tile::index_t M = arg_parser.get_int("m");
@@ -34,25 +38,18 @@ bool run(const ck_tile::ArgParser& arg_parser)
// If stride is negative (default -1), set it to N, assuming a dense row-major layout.
if(stride < 0)
stride = N;
std::string data_type = arg_parser.get_str("prec");
int do_validation = arg_parser.get_int("v");
int warmup = arg_parser.get_int("warmup");
int repeat = arg_parser.get_int("repeat");
int do_validation = arg_parser.get_int("v");
int warmup = arg_parser.get_int("warmup");
int repeat = arg_parser.get_int("repeat");
if(stride < N)
{
throw std::runtime_error("stride must be >= N");
}
// Define type aliases for clarity.
// XDataType: Data type of the input tensors.
// ComputeDataType: Data type used for intermediate computations (often float for precision).
// YDataType: Data type of the output tensor.
// XElementwiseOperation: The specific elementwise operation to perform (e.g., Add, Mul).
using XDataType = DataType;
using ComputeDataType =
float; // Using float for intermediate calculations can improve numerical stability.
using YDataType = DataType;
using XElementwiseOperation = ck_tile::element_wise::Add;
// 1. Initialize the input data on the host (CPU).
@@ -219,11 +216,22 @@ int main(int argc, char* argv[])
if(!result)
return -1;
const std::string data_type = arg_parser.get_str("prec");
if(data_type == "fp16")
try
{
return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
const auto x_prec_variant = string_to_datatype(arg_parser.get_str("x_prec"));
const auto y_prec_variant = string_to_datatype(arg_parser.get_str("y_prec"));
return std::visit(
[&](auto&& x_dt, auto&& y_dt) -> int {
using XDataType = std::decay_t<decltype(x_dt)>;
using YDataType = std::decay_t<decltype(y_dt)>;
return run<XDataType, YDataType>(arg_parser);
},
x_prec_variant,
y_prec_variant);
}
catch(const std::exception& e)
{
std::cerr << "Error: " << e.what() << std::endl;
return -3;
}
return -3;
}