[CK_TILE] Grouped GEMM tile loop (#2146)

* Add trait to use a persistent kernel and split the entrypoints in grouped gemm
* Some helper functions for persistent kernel case
* Get max occupancy grid using device properties
* Implement tile loop in main entry point to grouped gemm
* Enable GridSize() on device
* Handle offset tile index using real current block index
* Add persistent kernel choice to grouped gemm example
* Use a for-loop for iterating over the group
* Reduce VGPR spills by early-exit
* Enable persistent kernel choice in grouped_gemm example
* Add persistent kernel option to grouped_gemm test
* Fix formatting with remod.py
* Remove GridUpdateBlocks as blocks are now iteratively computed
* Add comment about VGPR spilling
* Fix formatting
* Use CK_TILE_HOST instead of __host__
* Enable all Row/Col combinations in grouped gemm unit test
* Add some KBatch=2 cases to grouped gemm tests
* Fix SplitK for grouped gemm
* Enable pipeline hotloop/tailnumber selection in-kernel for grouped gemm
* Add type traits
* Split examples to regular and tileloop
* Formatting
* Use hipExtStreamGetCUMask to get current active CUs for the given stream
* Align test and example kernel config, and disable validation for splitk repeats
* Remove debug options from CMakeLists.txt
* Separate the code paths for persistent/non-persistent in test
* Fix formatting
* Address review comments

---------

Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com>
2026-05-04 05:31:24 +00:00 · 2025-05-20 17:18:57 +03:00
parent c4929225f6
commit d1e6f0982d
15 changed files with 908 additions and 146 deletions
--- a/test/ck_tile/grouped_gemm/test_grouped_gemm.cpp
+++ b/test/ck_tile/grouped_gemm/test_grouped_gemm.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include <tuple>

@@ -8,19 +8,27 @@
 #include "ck_tile/host.hpp"
 #include "test_grouped_gemm_util.hpp"

-using F16 = ck_tile::half_t;
-using F32 = float;
-
-using Row = ck_tile::tensor_layout::gemm::RowMajor;
-using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
+using F16   = ck_tile::half_t; // shorthand used in the A/B/C data-type columns below
+using F32   = float; // shorthand used in the AccDataType column below
+using Row   = ck_tile::tensor_layout::gemm::RowMajor; // layout tag for the *Layout columns
+using Col   = ck_tile::tensor_layout::gemm::ColumnMajor; // layout tag for the *Layout columns
+using True  = ck_tile::bool_constant<true>; // Persistent column: flag set
+using False = ck_tile::bool_constant<false>; // Persistent column: flag not set

 // clang-format off
 using KernelTypes = ::testing::Types<
-    //         ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType
-    // std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,      F16>,
-    //std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,      F16>,
-    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,      F16>//,
-    //std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,      F16>
+    // One tuple per test configuration; the header row below names each column.
+    // Each Row/Col pairing of ALayout and BLayout appears once with Persistent = True and once
+    // with Persistent = False; a repeated tuple would add no new configuration.
+    //         ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType, Persistent
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,       True>,
+    std::tuple<    Row,     Col,     Row,       F16,       F16,         F32,       F16,      False>,
+    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,       True>,
+    std::tuple<    Col,     Col,     Row,       F16,       F16,         F32,       F16,      False>,
+    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,       True>,
+    std::tuple<    Row,     Row,     Row,       F16,       F16,         F32,       F16,      False>,
+    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,       True>,
+    std::tuple<    Col,     Row,     Row,       F16,       F16,         F32,       F16,      False>
    >;
 // clang-format on