feat(grouped_gemm): add preshuffle v2 support to grouped gemm example (#2721)

* docs(README): update readme with new build instructions * feat(grouped_gemm): add support back for non persistent kernel * refactor(grouped_gemm): simplify tensor creation * refactor(grouped_gemm): Persistance is now GemmConfig value for easier management * chore(grouped_gemm): add print statements to ease debugging * WIP(grouped_gemm): add grouped_gemm_preshuffle example and update CMake configuration * fix(tile_gemm_traits): change default value of Preshuffle_ from 0 to false for clarity * WIP(grouped_gemm): add dummy variables to compile the preshuffle pipelines * chore(grouped_gemm): add print statements and variables to debug numerical error with preshuffle * style: clang format work so far * BUG!(grouped_gemm_kernel.hpp): figured out a potential bug in for numerical errors in preshuffle pipeline * fix(grouped_gemm_kernel): add function in the kernel code to dynamically calculate tail_number resolving numerical errors * refactor(gemm_presuffle): make preshuffle pipeline v2 compatible with operator () calls from grouped gemm * chore(grouped_gemm): add/remove debug comments and debug print statements * feat(grouped_gemm): integrate preshuffle pipeline v2 into grouped gemm for all supported shapes * chore(gemm_profile): add new argument combinations * fix: branch cleanup, formatting, refactoring * fix: branch cleanup, formatting, refactoring * chore(changelog): update changelog to reflect new featuer * address review comments & nit
2026-04-20 06:49:15 +00:00 · 2025-09-07 17:18:35 -04:00
parent 5224d2ead3
commit e279e9420e
13 changed files with 808 additions and 178 deletions
--- a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp
@@ -266,6 +266,10 @@ struct GroupedGemmKernel
                            const tuple<index_t, index_t>& block_idx_2d,
                            const index_t block_idx_z) const
    {
+
+        static_assert(GemmPipeline::DoubleSmemBuffer || !GemmPipeline::Preshuffle,
+                      "SingleSmemBuffer and Preshuffle cannot both be enabled simultaneously!");
+
        const auto [iM, iN] = block_idx_2d;

        const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
@@ -282,11 +286,15 @@ struct GroupedGemmKernel
        // allocate LDS
        __shared__ char smem_ptr_0[GetSmemSize()];

+        // TO DO:
+        // Can we simplify this branching logic?
        if constexpr(GemmPipeline::DoubleSmemBuffer == true)
        {
+
            __shared__ char smem_ptr_1[GetSmemSize()];
-            if constexpr(UsePersistentKernel)
+            if constexpr(UsePersistentKernel || GemmPipeline::Preshuffle)
            {
+
                RunGemmWithPipelineSelection2LDS(a_ptr,
                                                 b_ptr,
                                                 c_ptr,
@@ -296,9 +304,11 @@ struct GroupedGemmKernel
                                                 splitk_batch_offset,
                                                 i_m,
                                                 i_n);
+                return;
            }
            else
            {
+
                Base::RunGemm2LDS({a_ptr},
                                  {b_ptr},
                                  {/*ds_ptr*/},
@@ -311,14 +321,14 @@ struct GroupedGemmKernel
                                  i_n);
            }
        }
-        else
+        else // SingleSmemBuffer
        {
            if constexpr(UsePersistentKernel)
            {
                RunGemmWithPipelineSelection(
                    a_ptr, b_ptr, c_ptr, smem_ptr_0, kargs, splitk_batch_offset, i_m, i_n);
            }
-            else
+            else // Non-persistent kernel
            {
                Base::RunGemm({a_ptr},
                              {b_ptr},
@@ -438,17 +448,34 @@ struct GroupedGemmKernel
        // Get hot-loop and tail configuration
        const index_t num_loop = __builtin_amdgcn_readfirstlane(
            TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
-        const bool has_hot_loop   = GemmPipeline::BlockHasHotloop(num_loop);
        const TailNumber tail_num = GemmPipeline::GetBlockLoopTailNum(num_loop);

-        // Run GEMM pipeline
-        const auto& c_block_tile = GemmPipeline{}.template operator()(a_block_window[Base::I0],
-                                                                      b_block_window[Base::I0],
-                                                                      num_loop,
-                                                                      has_hot_loop,
-                                                                      tail_num,
-                                                                      smem_ptr_0,
-                                                                      smem_ptr_1);
+        // Run GEMM pipeline with compile-time branching
+        const auto& c_block_tile = [&]() {
+            if constexpr(GemmPipeline::Preshuffle)
+            {
+                // Preshuffle version - without has_hot_loop parameter
+                return GemmPipeline{}.template operator()(a_block_window[Base::I0],
+                                                          b_block_window[Base::I0],
+                                                          num_loop,
+                                                          tail_num,
+                                                          smem_ptr_0,
+                                                          smem_ptr_1);
+            }
+            else
+            {
+                // Regular version - with has_hot_loop parameter
+                const bool has_hot_loop = GemmPipeline::BlockHasHotloop(num_loop);
+                return GemmPipeline{}.template operator()(a_block_window[Base::I0],
+                                                          b_block_window[Base::I0],
+                                                          num_loop,
+                                                          has_hot_loop,
+                                                          tail_num,
+                                                          smem_ptr_0,
+                                                          smem_ptr_1);
+            }
+        }();
+
        // Run Epilogue Pipeline
        auto& c_block_window = gemm_tile_windows.at(Base::I3);
        EpiloguePipeline{}.template
@@ -491,8 +518,9 @@ struct GroupedGemmKernel
        const auto gemm_desc_ptr = reinterpret_cast<const GemmTransKernelArg*>(
            cast_pointer_to_generic_address_space(gemm_descs_const));

-        const index_t group_id  = FindGroupId(gemm_desc_ptr, block_id, group_count);
-        const auto& kargs       = gemm_desc_ptr[group_id];
+        const index_t group_id = FindGroupId(gemm_desc_ptr, block_id, group_count);
+        const auto& kargs      = gemm_desc_ptr[group_id];
+
        const auto grid_size_2d = TilePartitioner::GridSize(kargs.group_karg.M, kargs.group_karg.N);
        const auto block_idx_2d = OffsetTile1DPartitioner::GetOffsetedTileIndex(
            0,