From 7a12ecc762980d35371827f9ff60816429f51f69 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Thu, 12 Mar 2026 14:28:24 +0100 Subject: [PATCH] [CK TILE] Skip work if any of Grouped GEMM groups M/N/K are zero. (#5050) ## Motivation It's common in MoE workloads that some experts receive zero tokens, which would result in some of the dimensions being equal to zero. Currently we handle such a case only for non-persistent kernels where we have all GEMMs' information beforehand on host - we validate this during creation of kernel arguments. However for the "dynamic" input path (persistent kernel) this information is not available before kernel launch. Thus we have to validate this during kernel execution. The goal is to add this validation. ## Technical Details Skip work if any of Grouped GEMM groups M/N/K are zero for the persistent kernel path. ## Test Plan Add unit-tests which cover "dynamic" inputs with zero dims for the persistent kernel execution path. ## Test Result All tests pass. ## Submission Checklist - [x] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests. 
--------- Co-authored-by: Claude Opus 4.6 --- .../ops/gemm/kernel/grouped_gemm_kernel.hpp | 22 ++++++ test/ck_tile/grouped_gemm/CMakeLists.txt | 11 ++- .../grouped_gemm/test_grouped_gemm_bf16.cpp | 41 ++++++++++ ...ped_gemm.cpp => test_grouped_gemm_f16.cpp} | 22 +++--- .../test_grouped_gemm_ut_cases.inc | 78 +++++++++++++++---- .../grouped_gemm/test_grouped_gemm_util.hpp | 64 +++++++++------ 6 files changed, 186 insertions(+), 52 deletions(-) create mode 100644 test/ck_tile/grouped_gemm/test_grouped_gemm_bf16.cpp rename test/ck_tile/grouped_gemm/{test_grouped_gemm.cpp => test_grouped_gemm_f16.cpp} (60%) diff --git a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp index 3f028ead2b..bb054eeaa3 100644 --- a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp @@ -507,6 +507,12 @@ struct GroupedGemmKernel const index_t group_id = FindGroupId(gemm_desc_ptr, block_id, group_count); const auto& kargs = gemm_desc_ptr[group_id]; + // Early exit if no work to do. + if(kargs.group_karg.M == 0 || kargs.group_karg.N == 0 || kargs.group_karg.K == 0) + { + return; + } + const auto grid_size_2d = TilePartitioner::GridSize(kargs.group_karg.M, kargs.group_karg.N); const auto block_idx_2d = OffsetTile1DPartitioner::GetOffsetedTileIndex( 0, @@ -534,6 +540,22 @@ struct GroupedGemmKernel const auto& k_batch = kargs.k_batch; const auto block_start = cum_grid_size; cum_grid_size += TilePartitioner::GridSize(kargs.M, kargs.N) * k_batch; + + // Early exit if no work to do. + // If M or N is zero, TilePartitioner::GridSize(kargs.M, kargs.N) returns zero, + // so this group contributes no blocks and cum_grid_size is unchanged. The group + // is naturally skipped by the block_id < cum_grid_size check below. + if(kargs.K == 0) + { + // Advance only if this workgroup was assigned to this group's range, + // matching the pattern of the normal while loop below. 
+ while(block_id < cum_grid_size) + { + block_id += grid_size; + } + continue; + } + while(block_id < cum_grid_size) { const auto grid_size_2d = TilePartitioner::GridSize(kargs.M, kargs.N); diff --git a/test/ck_tile/grouped_gemm/CMakeLists.txt b/test/ck_tile/grouped_gemm/CMakeLists.txt index b30dc2a867..27ff74a15b 100644 --- a/test/ck_tile/grouped_gemm/CMakeLists.txt +++ b/test/ck_tile/grouped_gemm/CMakeLists.txt @@ -1,7 +1,14 @@ # Copyright (c) Advanced Micro Devices, Inc., or its affiliates. # SPDX-License-Identifier: MIT -# Currently ck_tile is only built on gfx9 if(GPU_TARGETS MATCHES "gfx9|gfx11|gfx12") - add_gtest_executable(test_ck_tile_grouped_gemm test_grouped_gemm.cpp) + + add_custom_target(test_ck_tile_grouped_gemm) + + add_gtest_executable(test_ck_tile_grouped_gemm_f16 test_grouped_gemm_f16.cpp) + add_gtest_executable(test_ck_tile_grouped_gemm_bf16 test_grouped_gemm_bf16.cpp) + + add_dependencies(test_ck_tile_grouped_gemm + test_ck_tile_grouped_gemm_f16 + test_ck_tile_grouped_gemm_bf16) endif() diff --git a/test/ck_tile/grouped_gemm/test_grouped_gemm_bf16.cpp b/test/ck_tile/grouped_gemm/test_grouped_gemm_bf16.cpp new file mode 100644 index 0000000000..0ae056c5b3 --- /dev/null +++ b/test/ck_tile/grouped_gemm/test_grouped_gemm_bf16.cpp @@ -0,0 +1,41 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT + +#include + +#include "gtest/gtest.h" + +#include "ck_tile/host.hpp" +#include "test_grouped_gemm_util.hpp" + +using F32 = float; +using BF16 = ck_tile::bf16_t; +using Row = ck_tile::tensor_layout::gemm::RowMajor; +using Col = ck_tile::tensor_layout::gemm::ColumnMajor; +using True = ck_tile::bool_constant; +using False = ck_tile::bool_constant; + +// clang-format off +using KernelTypes = ::testing::Types< + // ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType, Persistent + std::tuple< Row, Col, Row, BF16, BF16, F32, BF16, True>, + std::tuple< Row, Col, Row, BF16, BF16, F32, BF16, False>, + std::tuple< Col, Col, Row, BF16, BF16, F32, BF16, True>, + std::tuple< Col, Col, Row, BF16, BF16, F32, BF16, False>, + std::tuple< Row, Row, Row, BF16, BF16, F32, BF16, True>, + std::tuple< Row, Row, Row, BF16, BF16, F32, BF16, False>, + std::tuple< Col, Row, Row, BF16, BF16, F32, BF16, True>, + std::tuple< Col, Row, Row, BF16, BF16, F32, BF16, False> + >; +// clang-format on + +template +class TestCkTileGroupedGemmBF16 : public TestCkTileGroupedGemm +{ +}; + +TYPED_TEST_SUITE(TestCkTileGroupedGemmBF16, KernelTypes); + +#define TEST_CKTILE_GGEMM_SUITE_NAME TestCkTileGroupedGemmBF16 + +#include "test_grouped_gemm_ut_cases.inc" diff --git a/test/ck_tile/grouped_gemm/test_grouped_gemm.cpp b/test/ck_tile/grouped_gemm/test_grouped_gemm_f16.cpp similarity index 60% rename from test/ck_tile/grouped_gemm/test_grouped_gemm.cpp rename to test/ck_tile/grouped_gemm/test_grouped_gemm_f16.cpp index 0f8e25e03b..2d3c30a5fb 100644 --- a/test/ck_tile/grouped_gemm/test_grouped_gemm.cpp +++ b/test/ck_tile/grouped_gemm/test_grouped_gemm_f16.cpp @@ -10,7 +10,6 @@ using F16 = ck_tile::half_t; using F32 = float; -using BF16 = ck_tile::bf16_t; using Row = ck_tile::tensor_layout::gemm::RowMajor; using Col = ck_tile::tensor_layout::gemm::ColumnMajor; using True = ck_tile::bool_constant; @@ -21,25 +20,22 @@ using KernelTypes = ::testing::Types< // 
ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType, Persistent std::tuple< Row, Col, Row, F16, F16, F32, F16, True>, std::tuple< Row, Col, Row, F16, F16, F32, F16, False>, - std::tuple< Row, Col, Row, F16, F16, F32, F16, True>, - std::tuple< Row, Col, Row, F16, F16, F32, F16, False>, - std::tuple< Col, Col, Row, F16, F16, F32, F16, True>, std::tuple< Col, Col, Row, F16, F16, F32, F16, False>, std::tuple< Row, Row, Row, F16, F16, F32, F16, True>, std::tuple< Row, Row, Row, F16, F16, F32, F16, False>, std::tuple< Col, Row, Row, F16, F16, F32, F16, True>, - std::tuple< Col, Row, Row, F16, F16, F32, F16, False>, - - std::tuple< Col, Col, Row, BF16, BF16, F32, BF16, True>, - std::tuple< Col, Col, Row, BF16, BF16, F32, BF16, False>, - std::tuple< Row, Row, Row, BF16, BF16, F32, BF16, True>, - std::tuple< Row, Row, Row, BF16, BF16, F32, BF16, False>, - std::tuple< Col, Row, Row, BF16, BF16, F32, BF16, True>, - std::tuple< Col, Row, Row, BF16, BF16, F32, BF16, False> + std::tuple< Col, Row, Row, F16, F16, F32, F16, False> >; // clang-format on -TYPED_TEST_SUITE(TestCkTileGroupedGemm, KernelTypes); +template +class TestCkTileGroupedGemmF16 : public TestCkTileGroupedGemm +{ +}; + +TYPED_TEST_SUITE(TestCkTileGroupedGemmF16, KernelTypes); + +#define TEST_CKTILE_GGEMM_SUITE_NAME TestCkTileGroupedGemmF16 #include "test_grouped_gemm_ut_cases.inc" diff --git a/test/ck_tile/grouped_gemm/test_grouped_gemm_ut_cases.inc b/test/ck_tile/grouped_gemm/test_grouped_gemm_ut_cases.inc index 4d6d565423..94a7596763 100644 --- a/test/ck_tile/grouped_gemm/test_grouped_gemm_ut_cases.inc +++ b/test/ck_tile/grouped_gemm/test_grouped_gemm_ut_cases.inc @@ -3,7 +3,7 @@ #pragma once -TYPED_TEST(TestCkTileGroupedGemm, Basic) +TYPED_TEST(TEST_CKTILE_GGEMM_SUITE_NAME, Basic) { const int group_count = 8; const int kbatch = 1; @@ -16,19 +16,19 @@ TYPED_TEST(TestCkTileGroupedGemm, Basic) for(int i = 0; i < group_count; i++) { - Ms.push_back(256 + 256 * i); - Ns.push_back(256 + 512 * i); 
- Ks.push_back(512 + 128 * i); + Ms.push_back(64 + 64 * i); + Ns.push_back(128 + 64 * i); + Ks.push_back(64 + 32 * i); - stride_As.push_back(Ks[i]); - stride_Bs.push_back(Ks[i]); - stride_Cs.push_back(Ns[i]); + stride_As.push_back(0); + stride_Bs.push_back(0); + stride_Cs.push_back(0); } this->Run(Ms, Ns, Ks, stride_As, stride_Bs, stride_Cs, kbatch, group_count); } -TYPED_TEST(TestCkTileGroupedGemm, SplitK) +TYPED_TEST(TEST_CKTILE_GGEMM_SUITE_NAME, SplitK) { const int group_count = 8; const int kbatch = 2; @@ -41,14 +41,64 @@ TYPED_TEST(TestCkTileGroupedGemm, SplitK) for(int i = 0; i < group_count; i++) { - Ms.push_back(256 + 256 * i); - Ns.push_back(256 + 512 * i); - Ks.push_back(512 + 128 * i); + Ms.push_back(64 + 64 * i); + Ns.push_back(128 + 64 * i); + Ks.push_back(64 + 32 * i); - stride_As.push_back(Ks[i]); - stride_Bs.push_back(Ks[i]); - stride_Cs.push_back(Ns[i]); + stride_As.push_back(0); + stride_Bs.push_back(0); + stride_Cs.push_back(0); } this->Run(Ms, Ns, Ks, stride_As, stride_Bs, stride_Cs, kbatch, group_count); } + +// Verify that groups with M=0 are silently skipped (primary MoE scenario where some +// experts receive zero tokens) and that non-zero groups produce correct results. +TYPED_TEST(TEST_CKTILE_GGEMM_SUITE_NAME, ZeroM) +{ + const int group_count = 8; + const int kbatch = 1; + + const std::vector Ms = {256, 0, 256, 0, 256, 256, 0, 256}; + const std::vector Ns = {256, 256, 256, 256, 256, 256, 256, 256}; + const std::vector Ks = {512, 512, 512, 512, 512, 512, 512, 512}; + std::vector stride_As(group_count, 0); + std::vector stride_Bs(group_count, 0); + std::vector stride_Cs(group_count, 0); + + this->Run(Ms, Ns, Ks, stride_As, stride_Bs, stride_Cs, kbatch, group_count); +} + +// Verify that groups with K=0 produce all-zero output and that surrounding groups +// with non-zero K are unaffected. 
+TYPED_TEST(TEST_CKTILE_GGEMM_SUITE_NAME, ZeroK) +{ + const int group_count = 8; + const int kbatch = 1; + + const std::vector Ms = {256, 256, 256, 256, 256, 256, 256, 256}; + const std::vector Ns = {256, 256, 256, 256, 256, 256, 256, 256}; + const std::vector Ks = {512, 512, 512, 0, 0, 512, 512, 512}; + std::vector stride_As(group_count, 0); + std::vector stride_Bs(group_count, 0); + std::vector stride_Cs(group_count, 0); + + this->Run(Ms, Ns, Ks, stride_As, stride_Bs, stride_Cs, kbatch, group_count); +} + +// Verify that a mix of M=0, N=0, and K=0 groups all behave correctly together. +TYPED_TEST(TEST_CKTILE_GGEMM_SUITE_NAME, ZeroMixed) +{ + const int group_count = 8; + const int kbatch = 1; + + const std::vector Ms = {256, 0, 256, 256, 512, 256, 0, 256}; + const std::vector Ns = {256, 256, 512, 0, 256, 256, 256, 256}; + const std::vector Ks = {512, 512, 512, 512, 512, 0, 512, 512}; + std::vector stride_As(group_count, 0); + std::vector stride_Bs(group_count, 0); + std::vector stride_Cs(group_count, 0); + + this->Run(Ms, Ns, Ks, stride_As, stride_Bs, stride_Cs, kbatch, group_count); +} diff --git a/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp b/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp index 58e9168c6a..75f072dd85 100644 --- a/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp +++ b/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp @@ -32,14 +32,14 @@ class TestCkTileGroupedGemm : public ::testing::Test struct GroupedGemKernelParam_Mfma { - static const bool kPadM = false; - static const bool kPadN = false; - static const bool kPadK = false; + static const bool kPadM = true; + static const bool kPadN = true; + static const bool kPadK = true; static const int kBlockPerCu = 1; - static const ck_tile::index_t M_Tile = 256; - static const ck_tile::index_t N_Tile = 256; - static const ck_tile::index_t K_Tile = 64; + static const ck_tile::index_t M_Tile = 64; + static const ck_tile::index_t N_Tile = 64; + static const ck_tile::index_t K_Tile = 
32; static const ck_tile::index_t M_Warp = 2; static const ck_tile::index_t N_Warp = 2; @@ -52,9 +52,9 @@ class TestCkTileGroupedGemm : public ::testing::Test struct GroupedGemKernelParam_Wmma : public GroupedGemKernelParam_Mfma { - static const ck_tile::index_t M_Tile = 128; - static const ck_tile::index_t N_Tile = 128; - static const ck_tile::index_t K_Tile = 64; + static const ck_tile::index_t M_Tile = 64; + static const ck_tile::index_t N_Tile = 64; + static const ck_tile::index_t K_Tile = 32; static const ck_tile::index_t M_Warp_Tile = 16; static const ck_tile::index_t N_Warp_Tile = 16; @@ -131,14 +131,20 @@ class TestCkTileGroupedGemm : public ::testing::Test auto kargs = Kernel::MakeKargs(gemm_descs); EXPECT_TRUE(Kernel::IsSupportedArgument(kargs)); - const dim3 grids = Kernel::GridSize(gemm_descs); + // Use the filtered kargs (zero-dim groups are excluded by MakeKargs) to derive + // the correct grid size and group count — not the raw gemm_descs vector. const dim3 blocks = Kernel::BlockSize(); + if(kargs.empty()) + return; - ck_tile::hip_check_error(hipMemcpyWithStream(kargs_ptr, - kargs.data(), - get_workspace_size(gemm_descs), - hipMemcpyHostToDevice, - s.stream_id_)); + const dim3 grids = dim3(kargs.back().block_end, 1, 1); + + ck_tile::hip_check_error( + hipMemcpyWithStream(kargs_ptr, + kargs.data(), + kargs.size() * sizeof(ck_tile::GemmTransKernelArg<>), + hipMemcpyHostToDevice, + s.stream_id_)); if(s.log_level_ > 0) { @@ -155,7 +161,7 @@ class TestCkTileGroupedGemm : public ::testing::Test blocks, 0, ck_tile::cast_pointer_to_constant_address_space(kargs_ptr), - gemm_descs.size())); + kargs.size())); } template @@ -296,11 +302,13 @@ class TestCkTileGroupedGemm : public ::testing::Test if constexpr(std::is_same_v) { - return col; + // Use stride 1, in case the dim equals to zero + return std::max(col, std::size_t{1}); } else { - return row; + // Use stride 1, in case the dim equals to zero + return std::max(row, std::size_t{1}); } } else @@ -332,7 
+340,7 @@ class TestCkTileGroupedGemm : public ::testing::Test const ck_tile::index_t N = Ns[i]; const ck_tile::index_t K = Ks[i]; - stride_As[i] = f_get_default_stride(M, N, stride_As[i], ALayout{}); + stride_As[i] = f_get_default_stride(M, K, stride_As[i], ALayout{}); stride_Bs[i] = f_get_default_stride(K, N, stride_Bs[i], BLayout{}); stride_Cs[i] = f_get_default_stride(M, N, stride_Cs[i], CLayout{}); @@ -442,17 +450,27 @@ class TestCkTileGroupedGemm : public ::testing::Test bool pass{true}; for(int i = 0; i < group_count; ++i) { + // Groups with M=0 or N=0 produce no output — skip validation. + // K=0 groups do produce output (all zeros) and are validated normally. + if(Ms[i] == 0 || Ns[i] == 0) + continue; + ck_tile::HostTensor c_m_n_host_ref( f_host_tensor_descriptor(Ms[i], Ns[i], stride_Cs[i], CLayout{})); c_m_n_host_ref.SetZero(); ck_tile::reference_gemm( a_m_k_tensors[i], b_k_n_tensors[i], c_m_n_host_ref); - const float max_accumulated_value = std::abs(static_cast(*std::max_element( + // Use max absolute value (not algebraic max) to calibrate atol. + // The absolute threshold in calculate_rtol_atol scales with this value, + // so using the algebraic max (which may be a small positive number when + // most outputs are negative) would produce a near-zero atol. Near-zero + // reference elements then have no tolerance headroom for the ~1 ULP + // error introduced by SplitK atomicAdd accumulation. + const float max_accumulated_value = std::accumulate( c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end(), - [](CDataType a, CDataType b) { - return std::abs(static_cast(a)) < std::abs(static_cast(b)); - }))); + 0.0f, + [](float acc, auto v) { return std::max(acc, std::abs(static_cast(v))); }); const auto rtol_atol = calculate_rtol_atol(Ks[i], kbatch, max_accumulated_value); pass &= ck_tile::check_err(c_m_n_tensors[i], c_m_n_host_ref,