Files
composable_kernel/example/ck_tile/03_gemm/gemm_weight_preshuffle.cpp
SamiAario-AMD 1acd8e041c [CK Tile] gemm splitk two stage (#2697)
* Fix a typo

* Use std::variant to call run_gemm_example_with_layouts with the available layout variant combinations

* Use a unified run_gemm_example_prec_type for basic gemm and universal gemm

* Factor out run_gemm_example_prec_type

* Refactor argument parsing in gemm_splitk_two_stage_reduce.cpp

* Parse arguments outside of create_args

* Move the gemm operators to separate structs to facilitate their reuse

* Move the invokers to separate files to facilitate their reuse

* Rename the invoker files for consistency with the examples that use them

* Add fp32 support to the elementwise examples, and produce an error message for unsupported types

* Get rid of four unused variables

* Make two variables const

* Add support for different input-output type combinations in elementwise examples

* Test support for different input and output types in elementwise examples

* Add support for different operations in the elementwise unary tests

* Add support for UnaryConvert in the elementwise unary tests

* Add support for bf16 in elementwise examples, excluding unsupported type combinations

* Make some operator parameters const in ElementWiseKernel

* Remove some unnecessary include statements

* Implement a two-stage GEMM that does a type conversion in the second stage using the elementwise kernel

* Clear workspace instead of output when flushing the cache in SplitKTwoStageInvoker::gemm

* Fix formatting issues reported by clang

* Add back CK_TILE_USE_WMMA related changes

* Use the right prec type for bf16 in the universal GEMM and two stage split K examples

* Add some brackets

* Add some brackets

* Separate the clearing of the GEMM output memory from the cache flushing in the universal GEMM example

* Separate the clearing of the GEMM output memory from the cache flushing in the split K two stage example

* Fix formatting

* No need to call SetZero on ws_m_n_dev_buf here, as clear_gemm_output now does this as part of the kernel preprocessing

* Add fp16 data type to splitk two stage example

* Add preprocessing with optional cache flushing and clearing of output for k_batch > 1 to the basic GEMM example
2025-09-04 14:33:44 +03:00

102 lines
3.1 KiB
C++

// SPDX-License-Identifier: MIT
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
#include <hip/hip_runtime.h>
#include <cstring>
#include <iostream>
#include <sstream>
#include <string>
#include <tuple>
#include "ck_tile/host.hpp"
#include "gemm_utils.hpp"
#include "run_gemm_example.inc"
#include "gemm_weight_preshuffle_invoker.hpp"
template <typename GemmConfig,
typename APrecType,
typename BPrecType = APrecType,
typename CPrecType = APrecType>
int run_gemm_example_prec_type(std::string a_layout,
std::string b_layout,
ck_tile::ArgParser& arg_parser)
{
using Row = ck_tile::tensor_layout::gemm::RowMajor;
using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
bool preshuffle = GemmConfig::Preshuffle;
using Invoker = WeightPreshuffleInvoker;
if(preshuffle && (a_layout != "R" || b_layout != "C"))
{
throw std::runtime_error(
"Preshuffle is supported only for A(Row major), B(column major) input matrices!");
}
if(a_layout == "R" && b_layout == "C")
{
return run_gemm_example_with_layouts<GemmConfig, Invoker, APrecType, BPrecType, CPrecType>(
arg_parser, Row{}, Col{}, Row{});
}
else
{
throw std::runtime_error("Unsupported memory layout for the input matrices!");
}
}
/**
 * @brief Selects the precision types for the GEMM example from the "prec" argument.
 *
 * Reads the "prec", "a_layout" and "b_layout" string arguments from the parser and
 * forwards to run_gemm_example_prec_type with the matching type arguments:
 *   - "fp16": GemmConfig<half_t>, A = half_t (B and C default to A's type)
 *   - "bf16": GemmConfig<bf16_t>, A = bf16_t (B and C default to A's type)
 *   - "fp8" : GemmConfig<fp8_t>,  A = B = fp8_t, C = half_t
 *   - "bf8" : GemmConfig<bf8_t>,  A = B = bf8_t, C = half_t
 *
 * @tparam GemmConfig Configuration template, instantiated with the selected precision type.
 * @param arg_parser  Argument parser providing "prec", "a_layout" and "b_layout".
 * @return The value returned by run_gemm_example_prec_type.
 * @throws std::runtime_error if "prec" is not one of the values listed above; exceptions
 *         thrown by run_gemm_example_prec_type propagate unchanged.
 */
template <template <typename PreType> typename GemmConfig>
int run_gemm_example(ck_tile::ArgParser& arg_parser)
{
    const std::string data_type = arg_parser.get_str("prec");
    const std::string a_layout  = arg_parser.get_str("a_layout");
    const std::string b_layout  = arg_parser.get_str("b_layout");

    if(data_type == "fp16")
    {
        return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t>, ck_tile::half_t>(
            a_layout, b_layout, arg_parser);
    }
    else if(data_type == "bf16")
    {
        // Instantiate the configuration with bf16_t (not half_t) so that its precision
        // parameter agrees with the data type actually used, as in every other branch.
        return run_gemm_example_prec_type<GemmConfig<ck_tile::bf16_t>, ck_tile::bf16_t>(
            a_layout, b_layout, arg_parser);
    }
    else if(data_type == "fp8")
    {
        return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>,
                                          ck_tile::fp8_t,
                                          ck_tile::fp8_t,
                                          ck_tile::half_t>(a_layout, b_layout, arg_parser);
    }
    else if(data_type == "bf8")
    {
        return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>,
                                          ck_tile::bf8_t,
                                          ck_tile::bf8_t,
                                          ck_tile::half_t>(a_layout, b_layout, arg_parser);
    }
    else
    {
        throw std::runtime_error("Unsupported data type for this operation !!!");
    }
}
/**
 * @brief Entry point of the weight-preshuffle GEMM example.
 *
 * Builds the argument parser via create_args, parses the command line and runs
 * run_gemm_example with GemmConfigPreshuffleDecode.
 *
 * @return -1 if argument parsing fails; EXIT_FAILURE if a std::runtime_error is caught;
 *         otherwise the logical negation of run_gemm_example's result (a nonzero result
 *         yields exit code 0, a zero result yields exit code 1).
 */
int main(int argc, char* argv[])
{
    auto arg_parser = create_args();

    if(auto parse_ok = arg_parser.parse(argc, argv); !parse_ok)
    {
        return -1;
    }

    try
    {
        // The runner's result is inverted to form the process exit code.
        const int example_status = run_gemm_example<GemmConfigPreshuffleDecode>(arg_parser);
        return (example_status == 0) ? 1 : 0;
    }
    catch(const std::runtime_error& e)
    {
        std::cerr << "Caught runtime error: " << e.what() << '\n';
        return EXIT_FAILURE;
    }
}