[rocm-libraries] ROCm/rocm-libraries#4299 (commit 668cd49)

173 implement device grouped gemm fixed nk for rdna4
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Proposed changes

This PR adds an RDNA4 implementation of the device_grouped_gemm_fixed_nk
instance library using WMMA.

The implementation is based on the existing
DeviceGroupedGemm_Xdl_Fixed_NK design and reuses the same high-level
structure, but replaces the XDL kernel with a WMMA-based one. It uses
the GridwiseGemm_wmma_cshuffle_v3 kernel.

At this stage, the focus is functional correctness and compatibility,
not performance tuning.

## Technical Details

- Device struct for grouped gemm fixed NK
- Example code for the WMMA version
- Unit tests for both the new WMMA implementation and the reference XDL
code (previously missing)
- A generic ck profiler interface used to invoke the unit tests.

## Checklist

Please put an `x` into the boxes that apply. You can also fill these out
after creating the PR. If you're not sure, please don't hesitate to ask.

- [x] I have added tests relevant to the introduced functionality, and
the unit tests are passing locally
- [x] I have added the test to REGRESSION_TESTS list defined at the top
of CMakeLists.txt in tests/CMakeLists.txt, **IF** the test takes more
than 30 seconds to run.
- [ ] I have added inline documentation which enables the maintainers to
understand the motivation
- [ ] I have removed the stale documentation which is no longer relevant
after this pull request
- [x] (If this change is user-facing) I have added release notes which
provide the end users with a brief summary of the improvement from this
pull request
- [x] I have run `clang-format` on all changed files
- [x] Any dependent changes have been merged

## Discussion

If this is a relatively large or complex change, feel free to start a
discussion by explaining why you chose the solution you did and what
alternatives you considered
This commit is contained in:
Márton Bidlek
2026-02-19 08:13:46 +00:00
committed by assistant-librarian[bot]
parent c5ce5eee5b
commit 7b97e197ef
32 changed files with 2819 additions and 163 deletions

View File

@@ -18,6 +18,12 @@ if (CK_USE_XDL OR CK_USE_WMMA)
target_link_libraries(test_grouped_gemm_fastgelu PRIVATE utility device_grouped_gemm_fastgelu_instance)
add_dependencies(test_grouped_gemm test_grouped_gemm_fastgelu)
endif()
# Build the fixed-NK grouped GEMM unit test.
add_gtest_executable(test_grouped_gemm_fixed_nk test_grouped_gemm_fixed_nk.cpp)
# `result` is presumably set by add_gtest_executable (0 == success) — confirm
# against the helper's definition; matches the pattern used for the other tests.
if(result EQUAL 0)
  # Link the fixed-NK instance library and register with the aggregate target.
  target_link_libraries(test_grouped_gemm_fixed_nk PRIVATE utility device_grouped_gemm_fixed_nk_instance)
  add_dependencies(test_grouped_gemm test_grouped_gemm_fixed_nk)
endif()
endif()
add_gtest_executable(test_grouped_gemm_interface test_grouped_gemm_interface_xdl.cpp)

View File

@@ -0,0 +1,84 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <cstdlib>
#include <iostream>
#include <tuple>
#include <vector>

#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"

#include "gtest/gtest.h"

#include "test_grouped_gemm_util.hpp"
// Runtime test filters, overridable from the command line in main() below.
// param_mask: bitmask selecting which problem parameters are exercised
//             (default 0xffffff enables everything).
ck::index_t param_mask = 0xffffff;
// instance_index: run only the instance with this index; -1 means run all.
ck::index_t instance_index = -1;

// Shorthand aliases for the element data types under test.
using F16  = ck::half_t;
using BF16 = ck::bhalf_t;
using F8   = ck::f8_t;
using I8   = int8_t;

// Matrix layout tags.
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
/// Test fixture for the fixed-NK grouped GEMM instances: the shared grouped
/// GEMM harness specialized with the fixed-NK profiler backend.
template <typename Tuple>
class TestGroupedGemm
    : public ck::test::TestGroupedGemm<Tuple, false, ck::test::FixedNKGroupedGemmProfiler>
{
    using Base = ck::test::TestGroupedGemm<Tuple, false, ck::test::FixedNKGroupedGemmProfiler>;

    public:
    void SetUp() override
    {
        Base::SetUp();
#if defined(CK_USE_WMMA)
        // The legacy XDL tests tolerated configurations with no supported
        // instances; that behaviour is kept by default. When WMMA instances are
        // compiled and the device actually supports WMMA (gfx11/gfx12), an
        // unsupported case becomes a real failure.
        this->fail_if_no_supported_instances_ =
            ck::is_gfx11_supported() || ck::is_gfx12_supported();
#endif
    }
};
// clang-format off
// Layout/type tuples instantiating the typed test suite. Each tuple is
// presumably <ALayout, BLayout, ELayout, ADataType, BDataType, EDataType> —
// confirm against the harness's tuple unpacking. FP8 B-operand cases are only
// compiled when some FP8 flavour is available.
using KernelTypes = ::testing::Types<
#if CK_USE_OCP_FP8 || CK_USE_FNUZ_FP8 || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || \
    defined(CK_USE_WMMA_FP8)
    ck::Tuple<Row, Row, Row, F16, F8, F16>,
    ck::Tuple<Row, Col, Row, F16, F8, F16>,
#endif
    ck::Tuple<Row, Row, Row, F16, F16, F16>,
    ck::Tuple<Row, Col, Row, F16, F16, F16>,
    ck::Tuple<Row, Row, Row, BF16, BF16, BF16>,
    ck::Tuple<Row, Col, Row, BF16, BF16, BF16>,
    ck::Tuple<Row, Row, Row, BF16, I8, BF16>,
    ck::Tuple<Row, Col, Row, BF16, I8, BF16>,
    ck::Tuple<Row, Row, Row, F16, I8, F16>,
    ck::Tuple<Row, Col, Row, F16, I8, F16>
>;
// clang-format on

TYPED_TEST_SUITE(TestGroupedGemm, KernelTypes);
#include "test_grouped_gemm_fixed_nk_cases.inc"
/// Test entry point. Accepts two optional positional arguments:
///   argv[1] = param_mask (any strtol base prefix accepted, e.g. 0x...)
///   argv[2] = instance_index (-1 runs every instance)
/// Any other non-empty argument list prints usage, but the suite still runs —
/// matching the original behaviour.
int main(int argc, char** argv)
{
    testing::InitGoogleTest(&argc, argv);
    if(argc == 3)
    {
        // Base 0 lets the mask be given as decimal, octal, or hex.
        param_mask     = std::strtol(argv[1], nullptr, 0);
        instance_index = std::atoi(argv[2]);
    }
    else if(argc != 1)
    {
        std::cout << "Usage of " << argv[0] << std::endl;
        std::cout << "Arg1,2: param_mask instance_index(-1 means all)" << std::endl;
    }
    return RUN_ALL_TESTS();
}

View File

@@ -0,0 +1,84 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
// Smallest smoke test: two groups (M = 2 and M = 1) sharing N = 768, K = 544.
TYPED_TEST(TestGroupedGemm, TinyCases)
{
    const std::vector<int> Ms{2, 1};
    const auto groups = Ms.size();
    this->Run(Ms, std::vector<int>(groups, 768), std::vector<int>(groups, 544));
}
// Several small groups (M <= 5) sharing N = 768, K = 544.
TYPED_TEST(TestGroupedGemm, SmallCases)
{
    const std::vector<int> Ms{2, 1, 3, 4, 5};
    const auto groups = Ms.size();
    this->Run(Ms, std::vector<int>(groups, 768), std::vector<int>(groups, 544));
}
// Medium, irregular M values per group; common N = 768, K = 544.
TYPED_TEST(TestGroupedGemm, MidCases)
{
    const std::vector<int> Ms{167, 183, 177, 153, 139, 204};
    const auto groups = Ms.size();
    this->Run(Ms, std::vector<int>(groups, 768), std::vector<int>(groups, 544));
}
// Power-of-two M values; common N = 768, K = 320.
TYPED_TEST(TestGroupedGemm, Regular)
{
    const std::vector<int> Ms{64, 128, 256};
    const auto groups = Ms.size();
    this->Run(Ms, std::vector<int>(groups, 768), std::vector<int>(groups, 320));
}
// Sizes chosen so M, N, and K all require padding; common N = 136, K = 280.
TYPED_TEST(TestGroupedGemm, MNKPadded)
{
    const std::vector<int> Ms{127, 150, 188, 210};
    const auto groups = Ms.size();
    this->Run(Ms, std::vector<int>(groups, 136), std::vector<int>(groups, 280));
}
// Large-K problems run with big split-K factors (k_batches_ = {32, 64}).
TYPED_TEST(TestGroupedGemm, TestLargeKBatch)
{
    // In some cases Split K is not supported. Running this test would fail
    // since no instance will be supported, so we skip the test instead.
    if(!this->IsSplitKSupported())
    {
        GTEST_SKIP() << "Split-K not supported for the current configuration (FP16/BF16 on "
                        "GFX11, or using CDE element-wise operation)";
    }
    const std::vector<int> Ms{188, 210};
    constexpr int N = 768;
    constexpr int K = 4096;
    const std::vector<int> Ns(Ms.size(), N);
    const std::vector<int> Ks(Ms.size(), K);
    this->k_batches_ = {32, 64};
    this->Run(Ms, Ns, Ks);
}

View File

@@ -16,6 +16,7 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
#include "profiler/profile_grouped_gemm_impl.hpp"
#include "profiler/profile_grouped_gemm_fixed_nk_impl.hpp"
extern ck::index_t param_mask;
extern ck::index_t instance_index;
@@ -23,7 +24,124 @@ extern ck::index_t instance_index;
namespace ck {
namespace test {
/// Default profiler backend for the grouped GEMM test harness: a thin adapter
/// that forwards every argument unchanged to
/// ck::profiler::profile_grouped_gemm_impl and reports its pass/fail result.
/// The Tuple / FailIfNoSupportedInstances template parameters mirror the
/// harness's signature and are not used by this backend.
template <typename Tuple, bool FailIfNoSupportedInstances = false>
struct DefaultGroupedGemmProfiler
{
    template <typename ADataType,
              typename BDataType,
              typename EDataType,
              typename AccDataType,
              typename ALayout,
              typename BLayout,
              typename ELayout,
              typename AElementOp,
              typename BElementOp,
              typename CDEElementOp>
    static bool Run(bool verify,
                    int init_method,
                    bool log,
                    bool bench,
                    const std::vector<int>& Ms,
                    const std::vector<int>& Ns,
                    const std::vector<int>& Ks,
                    const std::vector<int>& StrideAs,
                    const std::vector<int>& StrideBs,
                    const std::vector<int>& StrideCs,
                    const std::vector<int>& kbatches,
                    int n_warmup,
                    int n_iter,
                    int instance_index,
                    bool fail_if_no_supported_instances)
    {
        const bool pass =
            ck::profiler::profile_grouped_gemm_impl<ADataType,
                                                    BDataType,
                                                    EDataType,
                                                    AccDataType,
                                                    ALayout,
                                                    BLayout,
                                                    ELayout,
                                                    AElementOp,
                                                    BElementOp,
                                                    CDEElementOp>(verify,
                                                                  init_method,
                                                                  log,
                                                                  bench,
                                                                  Ms,
                                                                  Ns,
                                                                  Ks,
                                                                  StrideAs,
                                                                  StrideBs,
                                                                  StrideCs,
                                                                  kbatches,
                                                                  n_warmup,
                                                                  n_iter,
                                                                  instance_index,
                                                                  fail_if_no_supported_instances);
        return pass;
    }
};
/// Profiler backend for the fixed-NK grouped GEMM path. The underlying
/// profile_grouped_gemm_fixed_nk_impl takes a single split-K factor per call,
/// so this backend loops over the requested k-batches itself.
struct FixedNKGroupedGemmProfiler
{
    template <typename ADataType,
              typename BDataType,
              typename EDataType,
              typename AccDataType,
              typename ALayout,
              typename BLayout,
              typename CLayout>
    static bool Run(bool verify,
                    int init_method,
                    bool log,
                    bool bench,
                    const std::vector<int>& Ms,
                    const std::vector<int>& Ns,
                    const std::vector<int>& Ks,
                    const std::vector<int>& StrideAs,
                    const std::vector<int>& StrideBs,
                    const std::vector<int>& StrideCs,
                    const std::vector<int>& kbatches,
                    int n_warmup,
                    int n_iter,
                    int /*instance_index*/,                 // ignored by this backend
                    bool /*fail_if_no_supported_instances*/) // ignored by this backend
    {
        bool pass = true;
        for(int kbatch : kbatches)
        {
            try
            {
                pass &= ck::profiler::profile_grouped_gemm_fixed_nk_impl<ADataType,
                                                                         BDataType,
                                                                         EDataType,
                                                                         AccDataType,
                                                                         ALayout,
                                                                         BLayout,
                                                                         CLayout>(verify,
                                                                                  init_method,
                                                                                  log,
                                                                                  bench,
                                                                                  Ms,
                                                                                  Ns,
                                                                                  Ks,
                                                                                  StrideAs,
                                                                                  StrideBs,
                                                                                  StrideCs,
                                                                                  kbatch,
                                                                                  n_warmup,
                                                                                  n_iter);
            }
            catch(const std::exception& e)
            {
                // NOTE(review): a throwing k-batch is logged but does NOT set
                // `pass` to false — the failure is treated as best-effort and
                // swallowed. Confirm this tolerance is intentional.
                std::cerr << e.what() << std::endl;
            }
        }
        return pass;
    }
};
template <typename Tuple,
bool FailIfNoSupportedInstances = false,
typename Profiler = ck::test::DefaultGroupedGemmProfiler>
class TestGroupedGemm : public testing::Test
{
protected:
@@ -76,7 +194,7 @@ class TestGroupedGemm : public testing::Test
}
else
{
k_batches_ = {1, 2, 3, 5, 8};
k_batches_ = {1, 2, 3, 4, 8};
}
}
@@ -146,31 +264,61 @@ class TestGroupedGemm : public testing::Test
const std::vector<int>& StrideCs,
const std::vector<int>& kbatches)
{
bool pass =
ck::profiler::profile_grouped_gemm_impl<ADataType,
BDataType,
EDataType,
float,
ALayout,
BLayout,
ELayout,
AElementOp,
BElementOp,
CDEElementOp>(verify_,
init_method_,
log_,
bench_,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs,
kbatches,
n_warmup_,
n_iter_,
instance_index,
fail_if_no_supported_instances_);
bool pass = false;
using AccDataType = float;
if constexpr(std::is_same_v<Profiler, FixedNKGroupedGemmProfiler>)
{
pass = Profiler::template Run<ADataType,
BDataType,
EDataType,
AccDataType,
ALayout,
BLayout,
ELayout>(verify_,
init_method_,
log_,
bench_,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs,
kbatches,
n_warmup_,
n_iter_,
instance_index,
fail_if_no_supported_instances_);
}
else
{
pass = Profiler::template Run<ADataType,
BDataType,
EDataType,
AccDataType,
ALayout,
BLayout,
ELayout,
AElementOp,
BElementOp,
CDEElementOp>(verify_,
init_method_,
log_,
bench_,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs,
kbatches,
n_warmup_,
n_iter_,
instance_index,
fail_if_no_supported_instances_);
}
EXPECT_TRUE(pass);
}
};