mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-16 10:59:55 +00:00
[CK Tile] gemm splitk two stage (#2697)
* Fix a typo
* Use std::variant to call run_gemm_example_with_layouts with the available layout variant combinations
* Use a unified run_gemm_example_prec_type for basic gemm and universal gemm
* Factor out run_gemm_example_prec_type
* Refactor argument parsing in gemm_splitk_two_stage_reduce.cpp
* Parse arguments outside of create_args
* Move the gemm operators to separate structs to facilitate their reuse
* Move the invokers to separate files to facilitate their reuse
* Rename the invoker files for consistency with the examples that use them
* Add fp32 support to the elementwise examples, and produce an error message for unsupported types
* Get rid of four unused variables
* Make two variables const
* Add support for different input-output type combinations in elementwise examples
* Test support for different input and output types in elementwise examples
* Add support for different operations in the elementwise unary tests
* Add support for UnaryConvert in the elementwise unary tests
* Add support for bf16 in elementwise examples, excluding unsupported type combinations
* Make some operator parameters const in ElementWiseKernel
* Remove some unnecessary include statements
* Implement a two-stage GEMM that does a type conversion in the second stage using the elementwise kernel
* Clear workspace instead of output when flushing the cache in SplitKTwoStageInvoker::gemm
* Fix formatting issues reported by clang
* Add back CK_TILE_USE_WMMA related changes
* Use the right prec type for bf16 in the universal GEMM and two stage split K examples
* Add some brackets
* Add some brackets
* Separate the clearing of the GEMM output memory from the cache flushing in the universal GEMM example
* Separate the clearing of the GEMM output memory from the cache flushing in the split K two stage example
* Fix formatting
* No need to call SetZero on ws_m_n_dev_buf here, as clear_gemm_output now does this as part of the kernel preprocessing
* Add fp16 data type to splitk two stage example
* Add preprocessing with optional cache flushing and clearing of output for k_batch > 1 to the basic GEMM example
[ROCm/composable_kernel commit: 1acd8e041c]
This commit is contained in:
@@ -82,6 +82,14 @@ struct Add
|
||||
y = type_convert<bf16_t>(y_tmp);
|
||||
}
|
||||
|
||||
// Explicit specialization of operator() for a bf16_t output with two float
// inputs (the hunk header places it inside `struct Add`). The sum is formed
// in float and then narrowed to bf16_t with type_convert, so the addition
// itself is not carried out in bf16.
//   y  - output; receives the converted sum
//   x0 - first float addend
//   x1 - second float addend
template <>
|
||||
__host__ __device__ constexpr void
|
||||
operator()<bf16_t>(bf16_t& y, const float& x0, const float& x1) const
|
||||
{
|
||||
const float y_tmp = x0 + x1;
|
||||
y = type_convert<bf16_t>(y_tmp);
|
||||
}
|
||||
|
||||
template <>
|
||||
__host__ __device__ constexpr void
|
||||
operator()<int8_t>(int8_t& y, const int8_t& x0, const int8_t& x1) const
|
||||
|
||||
@@ -23,9 +23,9 @@ struct ElementWiseKernel
|
||||
static constexpr index_t kBlockSize = Problem::BlockShape::kBlockSize;
|
||||
|
||||
template <typename... XDataType, typename Dims>
|
||||
CK_TILE_DEVICE void operator()(Dims lens,
|
||||
Dims input_strides,
|
||||
Dims output_strides,
|
||||
CK_TILE_DEVICE void operator()(const Dims lens,
|
||||
const Dims input_strides,
|
||||
const Dims output_strides,
|
||||
const tuple<XDataType...>& input_tensors,
|
||||
YDataType* p_y) const
|
||||
{
|
||||
|
||||
@@ -326,7 +326,6 @@ struct MultiDAdd
|
||||
}
|
||||
};
|
||||
|
||||
#if 0
|
||||
struct UnaryConvert
|
||||
{
|
||||
template <typename Y, typename X>
|
||||
@@ -336,6 +335,7 @@ struct UnaryConvert
|
||||
}
|
||||
};
|
||||
|
||||
#if 0
|
||||
struct ConvertBF16RTN
|
||||
{
|
||||
// convert to bf16 using round to nearest (rtn)
|
||||
@@ -472,14 +472,14 @@ struct UnaryDivide
|
||||
|
||||
struct UnarySquare
|
||||
{
|
||||
template <typename T>
|
||||
CK_TILE_HOST_DEVICE void operator()(T& y, const T& x) const
|
||||
template <typename Y, typename X>
|
||||
CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const
|
||||
{
|
||||
static_assert(std::is_same_v<T, float> || std::is_same_v<T, ck_tile::fp16_t> ||
|
||||
std::is_same_v<T, double> || std::is_same_v<T, int32_t> ||
|
||||
std::is_same_v<T, int8_t>
|
||||
static_assert(std::is_same_v<X, float> || std::is_same_v<X, ck_tile::fp16_t> ||
|
||||
std::is_same_v<X, double> || std::is_same_v<X, int32_t> ||
|
||||
std::is_same_v<X, int8_t>
|
||||
#ifdef CK_TILE_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
|
||||
|| std::is_same_v<T, int4_t>
|
||||
|| std::is_same_v<X, int4_t>
|
||||
#endif
|
||||
,
|
||||
"Data type is not supported by this operation!");
|
||||
|
||||
Reference in New Issue
Block a user