[CK_TILE] Multiple-D GEMM example (#2219)

* Multiple d, initial commit

* Check Ds Layout

* Readme and clang format

* Update branch & conflicts

* Multiple D - fix clang-formatter

* Rename elemetwise_op

* Fix CI

* Code review part1

* Remove printf

* Remove unnecessary comment

* Add new tests with Col layout

* Review part 2

* Added support for Multiple D GEMM

* Update comment

* Remove maybe_unused

* Clang-format

* Review part 3

* Add comment to function

* Add comment to function: another

* Take number of params for a refrence function

* Remove additional d param for 0 tensor

* Change name of function

* Fix CI fails
This commit is contained in:
Mateusz Ozga
2025-06-13 19:39:11 +02:00
committed by GitHub
parent 3a0cb27966
commit bd96ac9742
34 changed files with 2267 additions and 285 deletions

View File

@@ -71,6 +71,58 @@ CK_TILE_HOST void reference_gemm(const HostTensor<ADataType>& a_m_k,
make_ParallelTensorFunctor(f_mn, M, N)(std::thread::hardware_concurrency());
}
template <typename ADataType,
typename BDataType,
typename DsDataType,
typename AccDataType,
typename CDataType,
typename ACCElementOp,
typename DDataType = remove_cvref_t<std::tuple_element_t<0, DsDataType>>>
CK_TILE_HOST void
reference_gemm_multiple_d(const HostTensor<ADataType>& a_m_k,
const HostTensor<BDataType>& b_k_n,
const std::array<HostTensor<DDataType>, DsDataType::size()>& ds_m_n,
HostTensor<CDataType>& c_m_n,
const ACCElementOp& acc_element_op = {})
{
const std::size_t M = a_m_k.get_length(0);
const std::size_t N = b_k_n.get_length(1);
const std::size_t K = a_m_k.get_length(1);
auto f_mk_kn_mn = [&](auto m, auto n) {
AccDataType v_acc = 0;
for(std::size_t k = 0; k < K; ++k)
{
ADataType v_a = a_m_k(m, k);
BDataType v_b = b_k_n(k, n);
v_acc +=
ck_tile::type_convert<AccDataType>(v_a) * ck_tile::type_convert<AccDataType>(v_b);
}
CDataType v_c = 0;
if constexpr(DsDataType::size() == 0)
{
acc_element_op(v_c, ck_tile::type_convert<float>(v_acc));
}
else if constexpr(DsDataType::size() == 1)
{
acc_element_op(v_c,
ck_tile::type_convert<float>(v_acc),
ck_tile::type_convert<float>(ds_m_n[0](m, n)));
}
else if constexpr(DsDataType::size() == 2)
{
acc_element_op(v_c,
ck_tile::type_convert<float>(v_acc),
ck_tile::type_convert<float>(ds_m_n[0](m, n)),
ck_tile::type_convert<float>(ds_m_n[1](m, n)));
}
c_m_n(m, n) = ck_tile::type_convert<CDataType>(v_c);
};
make_ParallelTensorFunctor(f_mk_kn_mn, M, N)(std::thread::hardware_concurrency());
}
template <typename ADataType,
typename BDataType,
typename AccDataType,