Files
composable_kernel/include/ck_tile/ops/elementwise/binary_elementwise_operation.hpp
SamiAario-AMD 1acd8e041c [CK Tile] gemm splitk two stage (#2697)
* Fix a typo

* Use std::variant to call run_gemm_example_with_layouts with the available layout variant combinations

* Use a unified run_gemm_example_prec_type for basic gemm and universal gemm

* Factor out run_gemm_example_prec_type

* Refactor argument parsing in gemm_splitk_two_stage_reduce.cpp

* Parse arguments outside of create_args

* Move the gemm operators to separate structs to facilitate their reuse

* Move the invokers to separate files to facilitate their reuse

* Rename the invoker files for consistency with the examples that use them

* Add fp32 support to the elementwise examples, and produce an error message for unsupported types

* Get rid of four unused variables

* Make two variables const

* Add support for different input-output type combinations in elementwise examples

* Test support for different input and output types in elementwise examples

* Add support for different operations in the elementwise unary tests

* Add support for UnaryConvert in the elementwise unary tests

* Add support for bf16 in elementwise examples, excluding unsupported type combinations

* Make some operator parameters const in ElementWiseKernel

* Remove some unnecessary include statements

* Implement a two-stage GEMM that does a type conversion in the second stage using the elementwise kernel

* Clear workspace instead of output when flushing the cache in SplitKTwoStageInvoker::gemm

* Fix formatting issues reported by clang

* Add back CK_TILE_USE_WMMA related changes

* Use the right prec type for bf16 in the universal GEMM and two stage split K examples

* Add some brackets

* Add some brackets

* Separate the clearing of the GEMM output memory from the cache flushing in the universal GEMM example

* Separate the clearing of the GEMM output memory from the cache flushing in the split K two stage example

* Fix formatting

* No need to call SetZero on ws_m_n_dev_buf here, as clear_gemm_output now does this as part of the kernel preprocessing

* Add fp16 data type to splitk two stage example

* Add preprocessing with optional cache flushing and clearing of output for k_batch > 1 to the basic GEMM example
2025-09-04 14:33:44 +03:00

103 lines
2.7 KiB
C++

// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
namespace ck_tile {
namespace element_wise {
struct Add
{
template <typename Y, typename X0, typename X1>
__host__ __device__ constexpr void operator()(Y& y, const X0& x0, const X1& x1) const;
template <>
__host__ __device__ constexpr void
operator()<float>(float& y, const float& x0, const float& x1) const
{
y = x0 + x1;
};
template <>
__host__ __device__ constexpr void
operator()<double>(double& y, const double& x0, const double& x1) const
{
y = x0 + x1;
};
template <>
__host__ __device__ constexpr void
operator()<float>(float& y, const float& x0, const half_t& x1) const
{
y = x0 + type_convert<half_t>(x1);
};
template <>
__host__ __device__ constexpr void
operator()<half_t>(half_t& y, const float& x0, const float& x1) const
{
y = type_convert<half_t>(x0 + x1);
};
template <>
__host__ __device__ constexpr void
operator()<half_t>(half_t& y, const float& x0, const half_t& x1) const
{
y = type_convert<half_t>(x0) + x1;
};
template <>
__host__ __device__ constexpr void
operator()<half_t>(half_t& y, const half_t& x0, const half_t& x1) const
{
y = x0 + x1;
};
template <>
__host__ __device__ constexpr void
operator()<float>(float& y, const float& x0, const bf16_t& x1) const
{
const float x1_tmp = type_convert<float>(x1);
y = x0 + x1_tmp;
}
template <>
__host__ __device__ constexpr void
operator()<bf16_t>(bf16_t& y, const bf16_t& x0, const bf16_t& x1) const
{
const float x1_tmp = type_convert<float>(x0);
const float x2_tmp = type_convert<float>(x1);
const float y_tmp = x1_tmp + x2_tmp;
y = type_convert<bf16_t>(y_tmp);
}
template <>
__host__ __device__ constexpr void
operator()<bf16_t>(bf16_t& y, const float& x0, const bf16_t& x1) const
{
const float x2_tmp = type_convert<float>(x1);
const float y_tmp = x0 + x2_tmp;
y = type_convert<bf16_t>(y_tmp);
}
template <>
__host__ __device__ constexpr void
operator()<bf16_t>(bf16_t& y, const float& x0, const float& x1) const
{
const float y_tmp = x0 + x1;
y = type_convert<bf16_t>(y_tmp);
}
template <>
__host__ __device__ constexpr void
operator()<int8_t>(int8_t& y, const int8_t& x0, const int8_t& x1) const
{
y = x0 + x1;
};
};
} // namespace element_wise
} // namespace ck_tile