Wmma support for multiple ABD GEMM (#2803)

* multi_abd wmma support:

 - Add multiple A and B support to multiple D implementation (gridwise level)
 - Add multi_abd GEMM (device level)
 - Add instances (xdl parity)
 - Add tests (both xdl and wmma)
 - Add examples
 - Add ckProfiler support (both xdl and wmma)

* Fix bug in device print function

* Fix unused template parameter

* Fix batched gemm for multiABD gridwise implementation

* Fix gemm_universal_reduce with multiABDs gridwise implementation

---------

Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com>
This commit is contained in:
Enrico Degregori
2025-09-23 03:49:06 +02:00
committed by GitHub
parent de47ae2fdf
commit 3d29bff2f0
38 changed files with 5343 additions and 312 deletions

View File

@@ -0,0 +1,9 @@
# Register one gtest executable per matrix-core backend. Both variants are built
# from a source file named after the target and link the same libraries.
foreach(backend wmma xdl)
    set(test_target test_gemm_multi_abd_${backend})
    add_gtest_executable(${test_target} ${test_target}.cpp)
    # NOTE(review): `result` is presumably set by add_gtest_executable, with 0
    # meaning the target was actually created - confirm against its definition.
    if(result EQUAL 0)
        target_link_libraries(${test_target} PRIVATE utility device_gemm_multi_abd_instance)
    endif()
endforeach()

View File

@@ -0,0 +1,73 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "ck/ck.hpp"
namespace ck {
namespace test {

using Row = ck::tensor_layout::gemm::RowMajor;
using F32 = float;

/// Typed-test fixture shared by the multi-ABD GEMM tests.
///
/// `Tuple` is a 10-element `std::tuple` of types, read positionally:
///   0: AsLayout, 1: BsLayout, 2: DsLayout,
///   3: AsDataType, 4: BsDataType, 5: DsDataType, 6: EDataType,
///   7: AElementOp, 8: BElementOp, 9: CDEElementOp.
/// The E layout is always row-major.
///
/// This header does not include the profiler itself; the including translation
/// unit must make `ck::profiler::profile_gemm_multi_abd_impl` visible first.
template <typename Tuple>
class TestGemmCommon : public ::testing::Test
{
    protected:
    using AsLayout     = std::tuple_element_t<0, Tuple>;
    using BsLayout     = std::tuple_element_t<1, Tuple>;
    using DsLayout     = std::tuple_element_t<2, Tuple>;
    using ELayout      = Row;
    using AsDataType   = std::tuple_element_t<3, Tuple>;
    using BsDataType   = std::tuple_element_t<4, Tuple>;
    using DsDataType   = std::tuple_element_t<5, Tuple>;
    using EDataType    = std::tuple_element_t<6, Tuple>;
    using AElementOp   = std::tuple_element_t<7, Tuple>;
    using BElementOp   = std::tuple_element_t<8, Tuple>;
    using CDEElementOp = std::tuple_element_t<9, Tuple>;

    /// Runs the profiler once per {M, N, K} problem size and records a
    /// non-fatal gtest failure, tagged with the offending sizes, for every run
    /// that reports failure. All sizes are always executed.
    void Run()
    {
        // Each entry is {M, N, K}.
        const std::vector<std::vector<ck::index_t>> lengths = {
            {16, 32, 64}, {512, 1024, 2048}, {1024, 512, 32}};

        // Iterate by const reference: the elements are vectors and were
        // previously copied on every iteration.
        for(const auto& length : lengths)
        {
            const int M = length[0];
            const int N = length[1];
            const int K = length[2];

            // Assuming same layout for all A matrices (same applies for Bs and Ds)
            const int StrideA =
                ck::is_same_v<remove_cvref_t<tuple_element_t<0, AsLayout>>, Row> ? K : M;
            const int StrideB =
                ck::is_same_v<remove_cvref_t<tuple_element_t<0, BsLayout>>, Row> ? N : K;

            // In case no D matrices are provided, set stride to 0
            int StrideD = 0;
            if constexpr(DsDataType::Size() > 0)
            {
                StrideD = ck::is_same_v<remove_cvref_t<tuple_element_t<0, DsLayout>>, Row> ? N : M;
            }

            const int StrideE = ck::is_same_v<ELayout, Row> ? N : M;

            // NOTE(review): the leading arguments (1, 2, false, false) are
            // presumably do_verification, init_method, do_log and time_kernel,
            // and F32 presumably the accumulation type - confirm against
            // profile_gemm_multi_abd_impl.
            const bool passed =
                ck::profiler::profile_gemm_multi_abd_impl<AsDataType,
                                                          BsDataType,
                                                          F32,
                                                          DsDataType,
                                                          EDataType,
                                                          AsLayout,
                                                          BsLayout,
                                                          DsLayout,
                                                          ELayout,
                                                          AElementOp,
                                                          BElementOp,
                                                          CDEElementOp>(
                    1, 2, false, false, M, N, K, StrideA, StrideB, StrideD, StrideE);

            // Non-fatal check per problem size so the log identifies the
            // failing configuration while the remaining sizes still run.
            EXPECT_TRUE(passed) << "profile_gemm_multi_abd_impl failed for M=" << M
                                << ", N=" << N << ", K=" << K << ", StrideA=" << StrideA
                                << ", StrideB=" << StrideB << ", StrideD=" << StrideD
                                << ", StrideE=" << StrideE;
        }
    }
};

} // namespace test
} // namespace ck

View File

@@ -0,0 +1,154 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include <tuple>
#include "gtest/gtest.h"
#include "ck/ck.hpp"
#include "profiler/profile_gemm_multi_abd_impl.hpp"
#include "test_gemm_common.hpp"
namespace ck {
namespace test {
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using I8 = int8_t;
using BF16 = ck::bhalf_t;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Multiply = ck::tensor_operation::element_wise::Multiply;
using Add = ck::tensor_operation::element_wise::Add;
using MultiplyAdd = ck::tensor_operation::element_wise::MultiplyAdd;
using FastGelu = ck::tensor_operation::element_wise::FastGelu;
using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;
using MultiplyAddFastGelu = ck::tensor_operation::element_wise::MultiplyAddFastGelu;
using MultiplyFastGelu = ck::tensor_operation::element_wise::MultiplyFastGelu;
using KernelTypesABD = ::testing::Types<std::tuple<ck::Tuple<Row>,
ck::Tuple<Row, Row>,
ck::Tuple<Row>,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<BF16>,
BF16,
PassThrough,
Multiply,
Add>,
std::tuple<ck::Tuple<Row>,
ck::Tuple<Col, Col>,
ck::Tuple<Row>,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<BF16>,
BF16,
PassThrough,
Multiply,
Add>,
std::tuple<ck::Tuple<Row>,
ck::Tuple<Row, Row>,
ck::Tuple<Row>,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<BF16>,
BF16,
PassThrough,
Multiply,
AddFastGelu>,
std::tuple<ck::Tuple<Row>,
ck::Tuple<Col, Col>,
ck::Tuple<Row>,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<BF16>,
BF16,
PassThrough,
Multiply,
AddFastGelu>,
std::tuple<ck::Tuple<Row>,
ck::Tuple<Row, Row>,
ck::Tuple<>,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<>,
BF16,
PassThrough,
Multiply,
FastGelu>,
std::tuple<ck::Tuple<Row>,
ck::Tuple<Col, Col>,
ck::Tuple<>,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<>,
BF16,
PassThrough,
Multiply,
FastGelu>,
std::tuple<ck::Tuple<Row>,
ck::Tuple<Row, Row>,
ck::Tuple<>,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<>,
BF16,
PassThrough,
Multiply,
PassThrough>,
std::tuple<ck::Tuple<Row>,
ck::Tuple<Col, Col>,
ck::Tuple<>,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<>,
BF16,
PassThrough,
Multiply,
PassThrough>,
std::tuple<ck::Tuple<Row>,
ck::Tuple<Row>,
ck::Tuple<Row, Row>,
ck::Tuple<BF16>,
ck::Tuple<I8>,
ck::Tuple<BF16, BF16>,
BF16,
PassThrough,
PassThrough,
MultiplyAddFastGelu>,
std::tuple<ck::Tuple<Row>,
ck::Tuple<Row>,
ck::Tuple<Row, Row>,
ck::Tuple<BF16>,
ck::Tuple<I8>,
ck::Tuple<BF16, BF16>,
BF16,
PassThrough,
PassThrough,
MultiplyAdd>,
std::tuple<ck::Tuple<Row>,
ck::Tuple<Row>,
ck::Tuple<Row>,
ck::Tuple<BF16>,
ck::Tuple<I8>,
ck::Tuple<BF16>,
BF16,
PassThrough,
PassThrough,
MultiplyFastGelu>,
std::tuple<ck::Tuple<Row>,
ck::Tuple<Row>,
ck::Tuple<Row>,
ck::Tuple<BF16>,
ck::Tuple<I8>,
ck::Tuple<BF16>,
BF16,
PassThrough,
PassThrough,
Multiply>>;
TYPED_TEST_SUITE(TestGemmCommon, KernelTypesABD);
TYPED_TEST(TestGemmCommon, Test_BF16I8BF16) { this->Run(); }
} // namespace test
} // namespace ck

View File

@@ -0,0 +1,154 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include <tuple>
#include "gtest/gtest.h"
#include "ck/ck.hpp"
#include "profiler/profile_gemm_multi_abd_impl.hpp"
#include "test_gemm_common.hpp"
namespace ck {
namespace test {
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using I8 = int8_t;
using BF16 = ck::bhalf_t;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Multiply = ck::tensor_operation::element_wise::Multiply;
using Add = ck::tensor_operation::element_wise::Add;
using MultiplyAdd = ck::tensor_operation::element_wise::MultiplyAdd;
using FastGelu = ck::tensor_operation::element_wise::FastGelu;
using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;
using MultiplyAddFastGelu = ck::tensor_operation::element_wise::MultiplyAddFastGelu;
using MultiplyFastGelu = ck::tensor_operation::element_wise::MultiplyFastGelu;
using KernelTypesABD = ::testing::Types<std::tuple<ck::Tuple<Row>,
ck::Tuple<Row, Row>,
ck::Tuple<Row>,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<BF16>,
BF16,
PassThrough,
Multiply,
Add>,
std::tuple<ck::Tuple<Row>,
ck::Tuple<Col, Col>,
ck::Tuple<Row>,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<BF16>,
BF16,
PassThrough,
Multiply,
Add>,
std::tuple<ck::Tuple<Row>,
ck::Tuple<Row, Row>,
ck::Tuple<Row>,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<BF16>,
BF16,
PassThrough,
Multiply,
AddFastGelu>,
std::tuple<ck::Tuple<Row>,
ck::Tuple<Col, Col>,
ck::Tuple<Row>,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<BF16>,
BF16,
PassThrough,
Multiply,
AddFastGelu>,
std::tuple<ck::Tuple<Row>,
ck::Tuple<Row, Row>,
ck::Tuple<>,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<>,
BF16,
PassThrough,
Multiply,
FastGelu>,
std::tuple<ck::Tuple<Row>,
ck::Tuple<Col, Col>,
ck::Tuple<>,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<>,
BF16,
PassThrough,
Multiply,
FastGelu>,
std::tuple<ck::Tuple<Row>,
ck::Tuple<Row, Row>,
ck::Tuple<>,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<>,
BF16,
PassThrough,
Multiply,
PassThrough>,
std::tuple<ck::Tuple<Row>,
ck::Tuple<Col, Col>,
ck::Tuple<>,
ck::Tuple<BF16>,
ck::Tuple<I8, BF16>,
ck::Tuple<>,
BF16,
PassThrough,
Multiply,
PassThrough>,
std::tuple<ck::Tuple<Row>,
ck::Tuple<Row>,
ck::Tuple<Row, Row>,
ck::Tuple<BF16>,
ck::Tuple<I8>,
ck::Tuple<BF16, BF16>,
BF16,
PassThrough,
PassThrough,
MultiplyAddFastGelu>,
std::tuple<ck::Tuple<Row>,
ck::Tuple<Row>,
ck::Tuple<Row, Row>,
ck::Tuple<BF16>,
ck::Tuple<I8>,
ck::Tuple<BF16, BF16>,
BF16,
PassThrough,
PassThrough,
MultiplyAdd>,
std::tuple<ck::Tuple<Row>,
ck::Tuple<Row>,
ck::Tuple<Row>,
ck::Tuple<BF16>,
ck::Tuple<I8>,
ck::Tuple<BF16>,
BF16,
PassThrough,
PassThrough,
MultiplyFastGelu>,
std::tuple<ck::Tuple<Row>,
ck::Tuple<Row>,
ck::Tuple<Row>,
ck::Tuple<BF16>,
ck::Tuple<I8>,
ck::Tuple<BF16>,
BF16,
PassThrough,
PassThrough,
Multiply>>;
TYPED_TEST_SUITE(TestGemmCommon, KernelTypesABD);
TYPED_TEST(TestGemmCommon, Test_BF16I8BF16) { this->Run(); }
} // namespace test
} // namespace ck