Merge commit 'a7da3c68b979bd46c315da09208271d26f5e2900' into develop

This commit is contained in:
assistant-librarian[bot]
2025-10-01 23:11:22 +00:00
parent 24ac4febf4
commit ba74f76cbc
17 changed files with 952 additions and 76 deletions

View File

@@ -11,6 +11,7 @@ list(APPEND EXAMPLE_GEMM_COMPILE_COMPUTE_V4_OPTIONS
-mllvm
-enable-noalias-to-md-conversion=0
)
set(EXAMPLE_GEMM_COMPILE_COMPUTE_ASYNC_OPTIONS ${EXAMPLE_GEMM_COMPILE_COMPUTE_V4_OPTIONS})
if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx11|gfx12")
add_test_executable(test_ck_tile_gemm_pipeline_universal_int8 test_gemm_pipeline_universal_int8.cpp)
@@ -60,6 +61,11 @@ if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a|gfx11|gfx12")
target_compile_options(test_ck_tile_gemm_pipeline_persistent PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
endif()
if(GPU_TARGETS MATCHES "gfx95")
add_gtest_executable(test_ck_tile_gemm_pipeline_comp_async test_gemm_pipeline_comp_async.cpp)
target_compile_options(test_ck_tile_gemm_pipeline_comp_async PRIVATE ${EXAMPLE_GEMM_COMPILE_COMPUTE_ASYNC_OPTIONS})
endif()
if(GPU_TARGETS MATCHES "gfx11|gfx12")
# On Radeon devices, build the WMMA version instead
add_gtest_executable(test_ck_tile_gemm_pipeline_mem_wmma test_gemm_pipeline_mem_wmma.cpp)

View File

@@ -0,0 +1,17 @@
#include "test_gemm_pipeline_kernel_types.hpp"
#include "test_gemm_pipeline_util.hpp"
#include "gtest/gtest.h"
template <typename T>
class TestCkTileGemmPipelineCompAsync
: public TestCkTileGemmPipeline<T, class TestCkTileGemmPipelineCompAsync<T>>
{
};
#define TEST_SUITE_NAME TestCkTileGemmPipelineCompAsync
TYPED_TEST_SUITE(TestCkTileGemmPipelineCompAsync, KernelTypesCompAsync);
#include "test_gemm_pipeline_ut_cases.inc"
#undef TEST_SUITE_NAME

View File

@@ -26,9 +26,10 @@ using Intrawave = ck_tile::integral_constant<ck_tile::GemmPipelineScheduler,
using Interwave = ck_tile::integral_constant<ck_tile::GemmPipelineScheduler,
ck_tile::GemmPipelineScheduler::Interwave>;
using Mem = ck_tile::integral_constant<GemmPipelineType, GemmPipelineType::Mem>;
using CompV3 = ck_tile::integral_constant<GemmPipelineType, GemmPipelineType::CompV3>;
using CompV4 = ck_tile::integral_constant<GemmPipelineType, GemmPipelineType::CompV4>;
using Mem = ck_tile::integral_constant<GemmPipelineType, GemmPipelineType::Mem>;
using CompV3 = ck_tile::integral_constant<GemmPipelineType, GemmPipelineType::CompV3>;
using CompV4 = ck_tile::integral_constant<GemmPipelineType, GemmPipelineType::CompV4>;
using CompAsync = ck_tile::integral_constant<GemmPipelineType, GemmPipelineType::CompAsync>;
using Persistent = std::true_type;
using NonPersistent = std::false_type;
@@ -129,6 +130,10 @@ using KernelTypesCompV4 = ::testing::Types<
std::tuple< Col, Col, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, Intrawave, CompV4>
>;
using KernelTypesCompAsync = ::testing::Types<
std::tuple< Row, Col, Row, F16, F16, F32, F16, I256, I256, I32, I32, I32, I16, Intrawave, CompAsync>
>;
using KernelTypesCompV4Wmma = ::testing::Types<
std::tuple< Row, Row, Row, F16, F16, F32, F16, I64, I64, I32, I16, I16, I16, Intrawave, CompV4>,
std::tuple< Row, Col, Row, F16, F16, F32, F16, I64, I64, I32, I16, I16, I16, Intrawave, CompV4>,

View File

@@ -10,18 +10,25 @@ TYPED_TEST(TEST_SUITE_NAME, SmallM)
{
std::vector<int> Ms{1, 2, 3, 4, 5, 6};
constexpr int N = 1024;
constexpr int K = 320;
std::vector<int> Ks;
for (auto K_count: {2, 3, 4, 10, 11})
{
Ks.push_back(K_count * TestFixture::K_Tile);
}
for(int M : Ms)
{
if constexpr(std::is_same_v<typename TestFixture::ALayout,
ck_tile::tensor_layout::gemm::ColumnMajor>)
for(int K : Ks)
{
EXPECT_THROW((this->Run(M, N, K)), std::runtime_error);
}
else
{
this->Run(M, N, K);
if constexpr(std::is_same_v<typename TestFixture::ALayout,
ck_tile::tensor_layout::gemm::ColumnMajor>)
{
EXPECT_THROW((this->Run(M, N, K)), std::runtime_error);
}
else
{
this->Run(M, N, K);
}
}
}
}
@@ -30,7 +37,12 @@ TYPED_TEST(TEST_SUITE_NAME, MidLargeM)
{
std::vector<int> Ms{127, 255, 312, 799, 1573};
constexpr int N = 1024;
constexpr int K = 320;
std::vector<int> Ks;
for (auto K_count: {2, 3, 4, 10, 11})
{
Ks.push_back(K_count * TestFixture::K_Tile);
}
constexpr int VecLoadSize = (std::is_same_v<typename TestFixture::ADataType, ck_tile::fp8_t> ||
std::is_same_v<typename TestFixture::ADataType, ck_tile::bf8_t> ||
std::is_same_v<typename TestFixture::ADataType, ck_tile::int8_t>)
@@ -39,22 +51,25 @@ TYPED_TEST(TEST_SUITE_NAME, MidLargeM)
for(int M : Ms)
{
if constexpr(std::is_same_v<typename TestFixture::ALayout,
ck_tile::tensor_layout::gemm::ColumnMajor>)
for (int K: Ks)
{
if(M % VecLoadSize == 0)
if constexpr(std::is_same_v<typename TestFixture::ALayout,
ck_tile::tensor_layout::gemm::ColumnMajor>)
{
this->Run(M, N, K);
if(M % VecLoadSize == 0)
{
this->Run(M, N, K);
}
else
{
EXPECT_THROW((this->Run(M, N, K)), std::runtime_error);
}
}
else
{
EXPECT_THROW((this->Run(M, N, K)), std::runtime_error);
this->Run(M, N, K);
}
}
else
{
this->Run(M, N, K);
}
}
}

View File

@@ -37,7 +37,8 @@ enum struct GemmPipelineType
{
Mem,
CompV3,
CompV4
CompV4,
CompAsync
};
template <GemmPipelineType PT, typename Problem>
@@ -70,6 +71,15 @@ struct GemmPipelineTypeSelector<GemmPipelineType::CompV4, Problem>
static constexpr auto GetName() { return "GemmPipelineAgBgCrCompV4"; }
};
template <typename Problem>
struct GemmPipelineTypeSelector<GemmPipelineType::CompAsync, Problem>
{
using base_pipeline = ck_tile::BaseGemmPipelineAgBgCrCompAsync<Problem>;
using pipeline = ck_tile::GemmPipelineAgBgCrCompAsync<Problem>;
static constexpr auto GetName() { return "GemmPipelineAgBgCrCompAsync"; }
};
template <typename Tuple, typename Derived>
class TestCkTileGemmPipeline : public ::testing::Test
{
@@ -110,7 +120,8 @@ class TestCkTileGemmPipeline : public ::testing::Test
constexpr bool kPadK = PadK;
constexpr bool preshuffle = Preshuffle;
constexpr bool DoubleSmemBuffer = (PipelineType == GemmPipelineType::CompV4) ? true : false;
constexpr bool DoubleSmemBuffer = (PipelineType == GemmPipelineType::CompV4 ||
PipelineType == GemmPipelineType::CompAsync);
// TODO: For now - but this should also be a test parameter
constexpr bool TransposeC = false;

View File

@@ -31,7 +31,8 @@ template <typename ALayout_,
int K_Warp_Tile_val_,
bool DoubleSmemBuffer_val_,
ck_tile::GemmPipelineScheduler Scheduler_val_,
PipelineType Pipeline_val_>
PipelineType Pipeline_val_,
bool Persistent_val_>
struct KernelConfig
{
using ALayoutType = ALayout_;
@@ -56,15 +57,19 @@ struct KernelConfig
static constexpr bool DoubleSmemBuffer_ = DoubleSmemBuffer_val_;
static constexpr auto Scheduler_ = Scheduler_val_;
static constexpr PipelineType Pipeline_ = Pipeline_val_;
static constexpr bool Persistent_ = Persistent_val_;
static constexpr int BlockPerCu_ = 1;
};
// clang-format off
using KernelTypes = ::testing::Types<
// ALayout, BLayout, ELayout, ADataType, BDataType, AccDataType, EDataType, M_N_KTiles, M_N_K_Warps, M_N_K_Warp_Tile, DoubleSmemBuffer, Scheduler, Pipeline
KernelConfig< Row, Col, Row, F16, F16, F32, F16, 128, 32, 64, 4, 1, 1, 32, 32, 8, false, ck_tile::GemmPipelineScheduler::Interwave, PipelineType::Memory>, // memory
KernelConfig< Row, Col, Row, F16, F16, F32, F16, 256, 256, 64, 2, 2, 1, 32, 32, 16, false, ck_tile::GemmPipelineScheduler::Intrawave, PipelineType::CompV3>, // v3
KernelConfig< Row, Col, Row, F16, F16, F32, F16, 256, 256, 32, 2, 2, 1, 32, 32, 16, true, ck_tile::GemmPipelineScheduler::Intrawave, PipelineType::CompV4> // v4
// ALayout, BLayout, ELayout, ADataType, BDataType, AccDataType, EDataType, M_N_KTiles, M_N_K_Warps, M_N_K_Warp_Tile, DoubleSmemBuffer, Scheduler, Pipeline, Persistent
KernelConfig< Row, Col, Row, F16, F16, F32, F16, 128, 32, 64, 4, 1, 1, 32, 32, 8, false, ck_tile::GemmPipelineScheduler::Interwave, PipelineType::Memory, false>, // memory
KernelConfig< Row, Col, Row, F16, F16, F32, F16, 128, 32, 64, 4, 1, 1, 32, 32, 8, false, ck_tile::GemmPipelineScheduler::Interwave, PipelineType::Memory, true>, // memory
KernelConfig< Row, Col, Row, F16, F16, F32, F16, 256, 256, 64, 2, 2, 1, 32, 32, 16, false, ck_tile::GemmPipelineScheduler::Intrawave, PipelineType::CompV3, false>, // v3
KernelConfig< Row, Col, Row, F16, F16, F32, F16, 256, 256, 64, 2, 2, 1, 32, 32, 16, false, ck_tile::GemmPipelineScheduler::Intrawave, PipelineType::CompV3, true>, // v3
KernelConfig< Row, Col, Row, F16, F16, F32, F16, 256, 256, 32, 2, 2, 1, 32, 32, 16, true, ck_tile::GemmPipelineScheduler::Intrawave, PipelineType::CompV4, false>, // v4
KernelConfig< Row, Col, Row, F16, F16, F32, F16, 256, 256, 32, 2, 2, 1, 32, 32, 16, true, ck_tile::GemmPipelineScheduler::Intrawave, PipelineType::CompV4, true> // v4
>;
// clang-format on

View File

@@ -93,7 +93,6 @@ class TestCkTileGroupedGemmMultiD : public ::testing::Test
return gemm_descs.size() * sizeof(ck_tile::GemmTransKernelArg<DsDataType::size()>);
}
template <typename ALayout, typename BLayout, typename ELayout>
void invoke_grouped_gemm(const std::vector<grouped_gemm_kargs>& gemm_descs,
const ck_tile::stream_config& s,
void* kargs_ptr)
@@ -229,6 +228,100 @@ class TestCkTileGroupedGemmMultiD : public ::testing::Test
BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
}
void invoke_grouped_gemm_persistent(const ck_tile::stream_config& s,
const ck_tile::index_t num_groups,
void* kargs_ptr,
bool splitk)
{
using GemmShape = ck_tile::TileGemmShape<
ck_tile::sequence<Config::M_Tile_, Config::N_Tile_, Config::K_Tile_>,
ck_tile::sequence<Config::M_Warp_, Config::N_Warp_, Config::K_Warp_>,
ck_tile::sequence<Config::M_Warp_Tile_, Config::N_Warp_Tile_, Config::K_Warp_Tile_>>;
using TilePartitioner = ck_tile::
GemmSpatiallyLocalTilePartitioner<GemmShape, TileParitionerGroupNum, TileParitionerM01>;
using GemmUniversalTraits =
ck_tile::PersistentTileGemmUniversalTraits<kPadM,
kPadN,
kPadK,
Config::DoubleSmemBuffer_,
ALayout,
BLayout,
ELayout>;
float ave_time{0};
const auto Run = [&](const auto memory_operation_) {
constexpr auto memory_operation = memory_operation_.value;
// We create the GEMM pipeline without specifying hotloop or tailnumber.
// These are automatically run inside the kernel based on the given input data.
using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
BDataType,
AccDataType,
GemmShape,
GemmUniversalTraits,
Config::Scheduler_>;
using GemmPipeline = std::conditional_t<
Config::Pipeline_ == (PipelineType::Memory),
ck_tile::GemmPipelineAgBgCrMem<UniversalGemmProblem>,
std::conditional_t<Config::Pipeline_ == (PipelineType::CompV3),
ck_tile::GemmPipelineAgBgCrCompV3<UniversalGemmProblem>,
ck_tile::GemmPipelineAgBgCrCompV4<UniversalGemmProblem>>>;
using GemmEpilogue = ck_tile::CShuffleEpilogue<
ck_tile::CShuffleEpilogueProblem<ADataType,
BDataType,
DsDataType,
AccDataType,
EDataType,
DsLayout,
ELayout,
MultiplyMultiply,
TilePartitioner::MPerBlock,
TilePartitioner::NPerBlock,
Config::M_Warp_,
Config::N_Warp_,
Config::M_Warp_Tile_,
Config::N_Warp_Tile_,
Config::K_Warp_Tile_,
UniversalGemmProblem::TransposeC,
memory_operation>>;
using Kernel = ck_tile::GroupedGemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
const dim3 blocks = Kernel::BlockSize();
const dim3 grids = Kernel::MaxOccupancyGridSize(s);
if(s.log_level_ > 0)
{
std::cout << "Launching kernel: " << Kernel::GetName()
<< " with args:" << " grid: {" << grids.x << ", " << grids.y << ", "
<< grids.z << "}" << ", blocks: {" << blocks.x << ", " << blocks.y << ", "
<< blocks.z << "}" << std::endl;
}
ave_time = ck_tile::launch_kernel(
s,
ck_tile::make_kernel<Config::BlockPerCu_>(
Kernel{},
grids,
blocks,
0,
ck_tile::cast_pointer_to_constant_address_space(kargs_ptr),
num_groups));
return ave_time;
};
if(!splitk)
{
Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
ck_tile::memory_operation_enum::set>{});
}
else
{
Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
ck_tile::memory_operation_enum::atomic_add>{});
}
}
public:
void Run(const std::vector<int>& Ms,
const std::vector<int>& Ns,
@@ -379,9 +472,43 @@ class TestCkTileGroupedGemmMultiD : public ::testing::Test
ck_tile::DeviceMem gemm_workspace;
gemm_workspace.Realloc(get_workspace_size(gemm_descs));
invoke_grouped_gemm<ALayout, BLayout, ELayout>(gemm_descs,
ck_tile::stream_config{nullptr, false, 1},
gemm_workspace.GetDeviceBuffer());
if constexpr(Config::Persistent_)
{
std::vector<ck_tile::GemmTransKernelArg<DsDataType::size()>> kargs;
void* kargs_ptr = gemm_workspace.GetDeviceBuffer();
const bool splitk = gemm_descs[0].k_batch > 1;
for(const auto& arg : gemm_descs)
{
kargs.emplace_back(
ck_tile::UniversalGemmKernelArgs<1, 1, DsDataType::size()>{{arg.a_ptr},
{arg.b_ptr},
arg.ds_ptr,
arg.e_ptr,
arg.M,
arg.N,
arg.K,
{arg.stride_A},
{arg.stride_B},
arg.stride_Ds,
arg.stride_E,
arg.k_batch});
}
const auto stream = ck_tile::stream_config{nullptr, false, 1};
ck_tile::hip_check_error(hipMemcpyWithStream(
kargs_ptr,
kargs.data(),
kargs.size() * sizeof(ck_tile::GemmTransKernelArg<DsDataType::size()>),
hipMemcpyHostToDevice,
stream.stream_id_));
invoke_grouped_gemm_persistent(stream, group_count, kargs_ptr, splitk);
}
else
{
invoke_grouped_gemm(gemm_descs,
ck_tile::stream_config{nullptr, false, 1},
gemm_workspace.GetDeviceBuffer());
}
// Copy results back to host for validation
for(int i = 0; i < group_count; i++)