Add a new gemm pipeline based on ComputeV4 which utilizes async copy API (#2949)

* check in pipeline and policy

for async  load in mi350, need to make sure TileAccessPattern is warp_raked or block_raked

solve merge conflicts

* fix cmakelists

* make it build

* fix? buffer async fence

* relax fences; it appears it only is needed between pairs of ping-pongs

* remove fences

* remove fences

* cleanup and reformat

* add steps annotations

* comment all pipeline steps / remove unexplainable syncs

* clang-format

* add comment

* cleanup kernel types for test

* fix comment

* fix hardcoded warp size

* faithfully copy block gemm from compute v4 policy to async policy

* make async test gfx950 only

* fix cmake logic

* set separate compile options for async

* refine comment in policy

* try update hotloop scheduler

* cleanup comments

* test more K block sizes

* unhardcode Ks, sort of

* add large odd test case

* fix build for quant

* add comment to hot loop scheduler and rename enum

* reformat

* reword the pipeline description

* reformat

* address review / add static asserts / typo fix

* update changelog

[ROCm/composable_kernel commit: a7da3c68b9]
This commit is contained in:
Max Podkorytov
2025-10-01 15:38:07 -07:00
committed by GitHub
parent db83ff21e8
commit de41af5d2e
13 changed files with 803 additions and 62 deletions

View File

@@ -11,6 +11,7 @@ list(APPEND EXAMPLE_GEMM_COMPILE_COMPUTE_V4_OPTIONS
-mllvm
-enable-noalias-to-md-conversion=0
)
set(EXAMPLE_GEMM_COMPILE_COMPUTE_ASYNC_OPTIONS ${EXAMPLE_GEMM_COMPILE_COMPUTE_V4_OPTIONS})
if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx11|gfx12")
add_test_executable(test_ck_tile_gemm_pipeline_universal_int8 test_gemm_pipeline_universal_int8.cpp)
@@ -60,6 +61,11 @@ if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a|gfx11|gfx12")
target_compile_options(test_ck_tile_gemm_pipeline_persistent PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
endif()
if(GPU_TARGETS MATCHES "gfx95")
add_gtest_executable(test_ck_tile_gemm_pipeline_comp_async test_gemm_pipeline_comp_async.cpp)
target_compile_options(test_ck_tile_gemm_pipeline_comp_async PRIVATE ${EXAMPLE_GEMM_COMPILE_COMPUTE_ASYNC_OPTIONS})
endif()
if(GPU_TARGETS MATCHES "gfx11|gfx12")
# On Radeon devices, build the WMMA version instead
add_gtest_executable(test_ck_tile_gemm_pipeline_mem_wmma test_gemm_pipeline_mem_wmma.cpp)

View File

@@ -0,0 +1,17 @@
#include "test_gemm_pipeline_kernel_types.hpp"
#include "test_gemm_pipeline_util.hpp"
#include "gtest/gtest.h"
template <typename T>
class TestCkTileGemmPipelineCompAsync
: public TestCkTileGemmPipeline<T, class TestCkTileGemmPipelineCompAsync<T>>
{
};
#define TEST_SUITE_NAME TestCkTileGemmPipelineCompAsync
TYPED_TEST_SUITE(TestCkTileGemmPipelineCompAsync, KernelTypesCompAsync);
#include "test_gemm_pipeline_ut_cases.inc"
#undef TEST_SUITE_NAME

View File

@@ -26,9 +26,10 @@ using Intrawave = ck_tile::integral_constant<ck_tile::GemmPipelineScheduler,
using Interwave = ck_tile::integral_constant<ck_tile::GemmPipelineScheduler,
ck_tile::GemmPipelineScheduler::Interwave>;
using Mem = ck_tile::integral_constant<GemmPipelineType, GemmPipelineType::Mem>;
using CompV3 = ck_tile::integral_constant<GemmPipelineType, GemmPipelineType::CompV3>;
using CompV4 = ck_tile::integral_constant<GemmPipelineType, GemmPipelineType::CompV4>;
using Mem = ck_tile::integral_constant<GemmPipelineType, GemmPipelineType::Mem>;
using CompV3 = ck_tile::integral_constant<GemmPipelineType, GemmPipelineType::CompV3>;
using CompV4 = ck_tile::integral_constant<GemmPipelineType, GemmPipelineType::CompV4>;
using CompAsync = ck_tile::integral_constant<GemmPipelineType, GemmPipelineType::CompAsync>;
using Persistent = std::true_type;
using NonPersistent = std::false_type;
@@ -129,6 +130,10 @@ using KernelTypesCompV4 = ::testing::Types<
std::tuple< Col, Col, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, Intrawave, CompV4>
>;
using KernelTypesCompAsync = ::testing::Types<
std::tuple< Row, Col, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, Intrawave, CompAsync>
>;
using KernelTypesCompV4Wmma = ::testing::Types<
std::tuple< Row, Row, Row, F16, F16, F32, F16, I64, I64, I32, I16, I16, I16, Intrawave, CompV4>,
std::tuple< Row, Col, Row, F16, F16, F32, F16, I64, I64, I32, I16, I16, I16, Intrawave, CompV4>,

View File

@@ -10,18 +10,25 @@ TYPED_TEST(TEST_SUITE_NAME, SmallM)
{
std::vector<int> Ms{1, 2, 3, 4, 5, 6};
constexpr int N = 1024;
constexpr int K = 320;
std::vector<int> Ks;
for (auto K_count: {2, 3, 4, 10, 11})
{
Ks.push_back(K_count * TestFixture::K_Tile);
}
for(int M : Ms)
{
if constexpr(std::is_same_v<typename TestFixture::ALayout,
ck_tile::tensor_layout::gemm::ColumnMajor>)
for(int K : Ks)
{
EXPECT_THROW((this->Run(M, N, K)), std::runtime_error);
}
else
{
this->Run(M, N, K);
if constexpr(std::is_same_v<typename TestFixture::ALayout,
ck_tile::tensor_layout::gemm::ColumnMajor>)
{
EXPECT_THROW((this->Run(M, N, K)), std::runtime_error);
}
else
{
this->Run(M, N, K);
}
}
}
}
@@ -30,7 +37,12 @@ TYPED_TEST(TEST_SUITE_NAME, MidLargeM)
{
std::vector<int> Ms{127, 255, 312, 799, 1573};
constexpr int N = 1024;
constexpr int K = 320;
std::vector<int> Ks;
for (auto K_count: {2, 3, 4, 10, 11})
{
Ks.push_back(K_count * TestFixture::K_Tile);
}
constexpr int VecLoadSize = (std::is_same_v<typename TestFixture::ADataType, ck_tile::fp8_t> ||
std::is_same_v<typename TestFixture::ADataType, ck_tile::bf8_t> ||
std::is_same_v<typename TestFixture::ADataType, ck_tile::int8_t>)
@@ -39,22 +51,25 @@ TYPED_TEST(TEST_SUITE_NAME, MidLargeM)
for(int M : Ms)
{
if constexpr(std::is_same_v<typename TestFixture::ALayout,
ck_tile::tensor_layout::gemm::ColumnMajor>)
for (int K: Ks)
{
if(M % VecLoadSize == 0)
if constexpr(std::is_same_v<typename TestFixture::ALayout,
ck_tile::tensor_layout::gemm::ColumnMajor>)
{
this->Run(M, N, K);
if(M % VecLoadSize == 0)
{
this->Run(M, N, K);
}
else
{
EXPECT_THROW((this->Run(M, N, K)), std::runtime_error);
}
}
else
{
EXPECT_THROW((this->Run(M, N, K)), std::runtime_error);
this->Run(M, N, K);
}
}
else
{
this->Run(M, N, K);
}
}
}

View File

@@ -37,7 +37,8 @@ enum struct GemmPipelineType
{
Mem,
CompV3,
CompV4
CompV4,
CompAsync
};
template <GemmPipelineType PT, typename Problem>
@@ -70,6 +71,15 @@ struct GemmPipelineTypeSelector<GemmPipelineType::CompV4, Problem>
static constexpr auto GetName() { return "GemmPipelineAgBgCrCompV4"; }
};
template <typename Problem>
struct GemmPipelineTypeSelector<GemmPipelineType::CompAsync, Problem>
{
using base_pipeline = ck_tile::BaseGemmPipelineAgBgCrCompAsync<Problem>;
using pipeline = ck_tile::GemmPipelineAgBgCrCompAsync<Problem>;
static constexpr auto GetName() { return "GemmPipelineAgBgCrCompAsync"; }
};
template <typename Tuple, typename Derived>
class TestCkTileGemmPipeline : public ::testing::Test
{
@@ -110,7 +120,8 @@ class TestCkTileGemmPipeline : public ::testing::Test
constexpr bool kPadK = PadK;
constexpr bool preshuffle = Preshuffle;
constexpr bool DoubleSmemBuffer = (PipelineType == GemmPipelineType::CompV4) ? true : false;
constexpr bool DoubleSmemBuffer = (PipelineType == GemmPipelineType::CompV4 ||
PipelineType == GemmPipelineType::CompAsync);
// TODO: For now - but this should also be a test parameter
constexpr bool TransposeC = false;