[CK TILE] Implement CShuffle algorithm (#1842)

* [CK TILE] Implement CShuffle algorithm * Rebase * Vector store size fixes * fixes * Fixes * fixes * fmha fix * fixes * fixes of fixes [ROCm/composable_kernel commit: 25e2e0f04a]
2026-05-17 19:40:04 +00:00 · 2025-01-30 11:57:39 +01:00
parent e4d8548dc5
commit 4f2c699f90
18 changed files with 408 additions and 371 deletions
--- a/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp
+++ b/test/ck_tile/batched_gemm/test_batched_gemm_util.hpp
@@ -29,12 +29,9 @@ class TestCkTileBatchedGemm : public ::testing::Test
                             const ck_tile::stream_config& s)
    {
        // The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part.
-        constexpr bool kPadM        = false;
-        constexpr bool kPadN        = false;
-        constexpr bool kPadK        = false;
-        constexpr bool kTilePermute = false;
-        // The rank and permutation will also be generate out by the CodeGen part.
-        constexpr ck_tile::index_t kOutputRank = 2;
+        constexpr bool kPadM = false;
+        constexpr bool kPadN = false;
+        constexpr bool kPadK = false;

        constexpr int kBlockPerCu = 1;

@@ -51,11 +48,6 @@ class TestCkTileBatchedGemm : public ::testing::Test
        constexpr ck_tile::index_t N_Warp_Tile = 32;
        constexpr ck_tile::index_t K_Warp_Tile = 8;

-        // Whether doing the CShuffle (transpose before the global memory), depending on the output
-        // layout.
-        constexpr bool CShuffleEpilogue =
-            std::is_same_v<CLayout, ck_tile::tensor_layout::gemm::ColumnMajor>;
-
        using CodegenGemmShape =
            ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
                                   ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
@@ -63,21 +55,6 @@ class TestCkTileBatchedGemm : public ::testing::Test

        using TilePartitioner = ck_tile::GemmTile2DPartitioner<CodegenGemmShape>;

-        using GemmEpilogue = std::conditional_t<
-            CShuffleEpilogue,
-            ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<AccDataType,
-                                                                       CDataType,
-                                                                       kPadM,
-                                                                       kPadN,
-                                                                       kTilePermute,
-                                                                       kOutputRank,
-                                                                       1,
-                                                                       0,
-                                                                       TilePartitioner::MPerBlock,
-                                                                       TilePartitioner::NPerBlock>>,
-            ck_tile::Default2DEpilogue<
-                ck_tile::Default2DEpilogueProblem<AccDataType, CDataType, kPadM, kPadN>>>;
-
        using CodegenGemmTraits =
            ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;

@@ -88,6 +65,20 @@ class TestCkTileBatchedGemm : public ::testing::Test
                                                                    CodegenGemmTraits>;

        using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
+
+        using GemmEpilogue = ck_tile::CShuffleEpilogue<
+            ck_tile::CShuffleEpilogueProblem<AccDataType,
+                                             CDataType,
+                                             CLayout,
+                                             CodegenGemmPipeline::BlockSize,
+                                             TilePartitioner::MPerBlock,
+                                             TilePartitioner::NPerBlock,
+                                             M_Warp,
+                                             N_Warp,
+                                             M_Warp_Tile,
+                                             N_Warp_Tile,
+                                             K_Warp_Tile,
+                                             CodegenPipelineProblem::TransposeC>>;
        using Kernel =
            ck_tile::BatchedGemmKernel<TilePartitioner, CodegenGemmPipeline, GemmEpilogue>;

--- a/test/ck_tile/gemm/test_gemm_pipeline_util.hpp
+++ b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

 #include <sstream>
@@ -65,9 +65,6 @@ class TestCkTileGemmPipeline : public ::testing::Test
                                   ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
        using TilePartitioner = ck_tile::GemmTile2DPartitioner<GemmShape>;

-        using GemmEpilogue = ck_tile::Default2DEpilogue<
-            ck_tile::Default2DEpilogueProblem<AccDataType, CDataType, kPadM, kPadN>>;
-
        using Traits = ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
        using GemmUniversalTraits = ck_tile::
            TileGemmUniversalTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout, TransposeC>;
@@ -106,6 +103,20 @@ class TestCkTileGemmPipeline : public ::testing::Test
                ck_tile::GemmPipelineAgBgCrCompV3<UniversalGemmProblem,
                                                  ck_tile::UniversalGemmPipelineAgBgCrPolicy>>;

+            using GemmEpilogue = ck_tile::CShuffleEpilogue<
+                ck_tile::CShuffleEpilogueProblem<AccDataType,
+                                                 CDataType,
+                                                 CLayout,
+                                                 GemmPipeline::BlockSize,
+                                                 TilePartitioner::MPerBlock,
+                                                 TilePartitioner::NPerBlock,
+                                                 M_Warp,
+                                                 N_Warp,
+                                                 M_Warp_Tile,
+                                                 N_Warp_Tile,
+                                                 K_Warp_Tile,
+                                                 UniversalGemmProblem::TransposeC>>;
+
            using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
            auto kargs   = Kernel::MakeKernelArgs(args);

@@ -244,7 +255,7 @@ class TestCkTileGemmPipeline : public ::testing::Test
    public:
    std::vector<int> k_batches_;

-    void SetUp() override { k_batches_ = {1}; }
+    void SetUp() override { k_batches_ = {1, 2}; }

    template <bool PadM = true, bool PadN = true, bool PadK = true>
    void Run(const int M,
--- a/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp
+++ b/test/ck_tile/grouped_gemm/test_grouped_gemm_util.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

 #include <sstream>
@@ -26,12 +26,9 @@ class TestCkTileGroupedGemm : public ::testing::Test

    struct GroupedGemKernelParam
    {
-        static const bool kPadM        = false;
-        static const bool kPadN        = false;
-        static const bool kPadK        = false;
-        static const bool kTilePermute = false;
-
-        static const ck_tile::index_t kOutputRank = 2;
+        static const bool kPadM = false;
+        static const bool kPadN = false;
+        static const bool kPadK = false;

        static const int kBlockPerCu         = 1;
        static const ck_tile::index_t M_Tile = 128;
@@ -60,26 +57,6 @@ class TestCkTileGroupedGemm : public ::testing::Test

    using TilePartitioner = ck_tile::GemmTile1DPartitioner<CodegenGemmShape>;

-    template <typename CLayout>
-    using GemmEpilogue =
-        std::conditional_t<std::is_same_v<CLayout, ck_tile::tensor_layout::gemm::ColumnMajor>,
-                           ck_tile::CShuffleEpilogue<
-                               ck_tile::CShuffleEpilogueProblem<AccDataType,
-                                                                CDataType,
-                                                                GroupedGemKernelParam::kPadM,
-                                                                GroupedGemKernelParam::kPadN,
-                                                                GroupedGemKernelParam::kTilePermute,
-                                                                GroupedGemKernelParam::kOutputRank,
-                                                                1,
-                                                                0,
-                                                                TilePartitioner::MPerBlock,
-                                                                TilePartitioner::NPerBlock>>,
-                           ck_tile::Default2DEpilogue<
-                               ck_tile::Default2DEpilogueProblem<AccDataType,
-                                                                 CDataType,
-                                                                 GroupedGemKernelParam::kPadM,
-                                                                 GroupedGemKernelParam::kPadN>>>;
-
    template <typename ALayout, typename BLayout, typename CLayout>
    using CodegenGemmTraits = ck_tile::TileGemmTraits<GroupedGemKernelParam::kPadM,
                                                      GroupedGemKernelParam::kPadN,
@@ -100,10 +77,25 @@ class TestCkTileGroupedGemm : public ::testing::Test
    using CodegenGemmPipeline =
        ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem<ALayout, BLayout, CLayout>>;

+    template <typename ALayout, typename BLayout, typename CLayout>
+    using GemmEpilogue = ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<
+        AccDataType,
+        CDataType,
+        CLayout,
+        CodegenGemmPipeline<ALayout, BLayout, CLayout>::BlockSize,
+        TilePartitioner::MPerBlock,
+        TilePartitioner::NPerBlock,
+        GroupedGemKernelParam::M_Warp,
+        GroupedGemKernelParam::N_Warp,
+        GroupedGemKernelParam::M_Warp_Tile,
+        GroupedGemKernelParam::N_Warp_Tile,
+        GroupedGemKernelParam::K_Warp_Tile,
+        CodegenPipelineProblem<ALayout, BLayout, CLayout>::TransposeC>>;
+
    template <typename ALayout, typename BLayout, typename CLayout>
    using Kernel = ck_tile::GroupedGemmKernel<TilePartitioner,
                                              CodegenGemmPipeline<ALayout, BLayout, CLayout>,
-                                              GemmEpilogue<CLayout>>;
+                                              GemmEpilogue<ALayout, BLayout, CLayout>>;

    using grouped_gemm_kargs = ck_tile::GroupedGemmHostArgs;
    std::size_t GetWorkspaceSize(const std::vector<grouped_gemm_kargs>& gemm_descs)