implement device batched gemm b scale for wmma (#2825)

* rebased on top of develop * fixed missing shuffeling and wrong indexing * added tests for batched_b_scale * added missing files * fixed wrong stride computation and removed k batching (for now) due to precision issues * reinstated k-batching with PRNG constrained to -1..1 * added specialization of GeneratorTensor_3 for int4 and fixed internal overflow * added k-batching to reference and increased tolerances for test * changed gemm_b_scale and gemm_universal tests to use correct parameters * adressed review commentsd * ported fixes back to non-batched version of b_scale * adressed review comments * run clang-format on older commits * add type-conversion to AccDataType and then to CDataType to exactly mimic GPU's behavior * added newline at end of file * reflected changes from muitl-abd branch in batched b_scale * fixed gfx11 issue * changed range for pki4 to -1...1 (-0.5...0.5 never really made sense for i4 anyway and always should have caused compiler errors, but since there was no int4 specialization of GeneratorTensor3 until now, this passed * run clang format * set range of i4 generation to 0...1 for upstream tests to pass. This replicated previous behavior, which however means that it is NOT properly tested. * reduced range for pk_i4 even further to 0..0 * removed failing xld instances. Failure now uncovered now that tests were fixed * removed generation of int4 values entierly * divide B buffer by BPackedSize --------- Co-authored-by: Kevin Abraham <kevin.abraham@streamhpc.com> [ROCm/composable_kernel commit: c4b2da9cbd]
2026-05-17 11:30:02 +00:00 · 2025-10-16 20:00:42 +02:00
parent 62afd9eb14
commit 06d76b160e
22 changed files with 1352 additions and 97 deletions
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -24,6 +24,7 @@ set(REGRESSION_TESTS
    test_batched_gemm_softmax_gemm_permute_bf16
    test_batched_gemm_bias_softmax_gemm_permute_bf16
    test_grouped_gemm_splitk
+    test_batched_gemm_b_scale_wmma
    test_reduce_no_index
    test_reduce_with_index
    test_convnd_fwd
@@ -257,6 +258,7 @@ add_subdirectory(batched_gemm_reduce)
 add_subdirectory(batched_gemm_gemm)
 add_subdirectory(batched_gemm_softmax_gemm)
 add_subdirectory(batched_gemm_softmax_gemm_permute)
+add_subdirectory(batched_gemm_b_scale)
 add_subdirectory(grouped_gemm)
 add_subdirectory(reduce)
 add_subdirectory(convnd_fwd)
--- a/test/batched_gemm_b_scale/CMakeLists.txt
+++ b/test/batched_gemm_b_scale/CMakeLists.txt
@@ -0,0 +1,5 @@
+
+add_gtest_executable(test_batched_gemm_b_scale_wmma test_batched_gemm_b_scale_wmma.cpp)
+if(result EQUAL 0)
+    target_link_libraries(test_batched_gemm_b_scale_wmma PRIVATE utility device_batched_gemm_b_scale_instance)
+endif()
--- a/test/batched_gemm_b_scale/test_batched_gemm_b_scale_ut_cases.inc
+++ b/test/batched_gemm_b_scale/test_batched_gemm_b_scale_ut_cases.inc
@@ -0,0 +1,49 @@
+#pragma once
+
+TYPED_TEST(TestBatchedGemmBScale_MK_NK, SmallM)
+{
+    std::vector<int> Ms{1, 2, 3, 4, 5, 6};
+    constexpr int N = 256;
+    constexpr int K = 1024;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = K;
+    constexpr int StrideC = N;
+
+    constexpr int NBatches = 10;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC, NBatches);
+}
+
+TYPED_TEST(TestBatchedGemmBScale_MK_NK, MidLargeM)
+{
+    std::vector<int> Ms{127, 255, 312, 799, 1573};
+    constexpr int N = 512;
+    constexpr int K = 768;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = K;
+    constexpr int StrideC = N;
+
+    constexpr int NBatches = 7;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC, NBatches);
+}
+
+TYPED_TEST(TestBatchedGemmBScale_MK_NK, Regular)
+{
+    std::vector<int> Ms{512, 1024};
+    constexpr int N = 512;
+    constexpr int K = 1024;
+
+    constexpr int StrideA = K;
+    constexpr int StrideB = K;
+    constexpr int StrideC = N;
+
+    constexpr int NBatches = 3;
+
+    for(int M : Ms)
+        this->Run(M, N, K, StrideA, StrideB, StrideC, NBatches);
+}
--- a/test/batched_gemm_b_scale/test_batched_gemm_b_scale_util.hpp
+++ b/test/batched_gemm_b_scale/test_batched_gemm_b_scale_util.hpp
@@ -0,0 +1,108 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <string>
+#include <sstream>
+#include <tuple>
+#include <vector>
+#include <gtest/gtest.h>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "include/ck/utility/data_type.hpp"
+#include "profiler/profile_batched_gemm_b_scale_impl.hpp"
+
+namespace ck {
+namespace test {
+
+template <typename Tuple>
+class TestBatchedGemmBScale : public testing::Test
+{
+    using Row = ck::tensor_layout::gemm::RowMajor;
+    using F32 = float;
+
+    protected:
+    using ALayout         = std::tuple_element_t<0, Tuple>;
+    using BLayout         = std::tuple_element_t<1, Tuple>;
+    using CLayout         = Row;
+    using ADataType       = std::tuple_element_t<2, Tuple>;
+    using BDataType       = std::tuple_element_t<3, Tuple>;
+    using BScaleDataType  = std::tuple_element_t<4, Tuple>;
+    using ComputeDataType = std::tuple_element_t<5, Tuple>;
+    using CDataType       = std::tuple_element_t<6, Tuple>;
+
+    public:
+    static constexpr ck::index_t ScaleBlockK = 128; // all instances
+    static constexpr bool verify_            = true;
+    static constexpr int init_method_        = 2;
+    static constexpr bool log_               = false;
+    static constexpr bool bench_             = false; // measure kernel performance
+    std::vector<int> k_batches_;
+
+    void SetUp() override { k_batches_ = {1, 2}; }
+
+    void Run(const int M,
+             const int N,
+             const int K,
+             const int StrideA,
+             const int StrideB,
+             const int StrideC,
+             const int NBatch)
+    {
+        for(auto kb : k_batches_)
+        {
+            RunSingle(M, N, K, StrideA, StrideB, StrideC, NBatch, kb);
+        }
+    }
+
+    void RunSingle(const int M,
+                   const int N,
+                   const int K,
+                   const int StrideA,
+                   const int StrideB,
+                   const int StrideC,
+                   const int Nbatch,
+                   int kbatch   = 1,
+                   int n_warmup = 1,
+                   int n_iter   = 10)
+    {
+        const int BatchStrideA      = StrideA * M;
+        const int BatchStrideB      = StrideB * K;
+        const int BatchStrideC      = StrideC * M;
+        const int BatchStrideScaleB = StrideB * K;
+        bool pass                   = ck::profiler::profile_batched_gemm_b_scale_impl<ADataType,
+                                                                                      BDataType,
+                                                                                      BScaleDataType,
+                                                                                      ComputeDataType,
+                                                                                      F32,
+                                                                                      CDataType,
+                                                                                      ScaleBlockK,
+                                                                                      ALayout,
+                                                                                      BLayout,
+                                                                                      CLayout>(verify_,
+                                                                             init_method_,
+                                                                             log_,
+                                                                             bench_,
+                                                                             M,
+                                                                             N,
+                                                                             K,
+                                                                             StrideA,
+                                                                             StrideB,
+                                                                             StrideC,
+                                                                             BatchStrideA,
+                                                                             BatchStrideB,
+                                                                             BatchStrideC,
+                                                                             BatchStrideScaleB,
+                                                                             Nbatch,
+                                                                             kbatch,
+                                                                             n_warmup,
+                                                                             n_iter);
+        EXPECT_TRUE(pass);
+    }
+};
+
+} // namespace test
+} // namespace ck
--- a/test/batched_gemm_b_scale/test_batched_gemm_b_scale_wmma.cpp
+++ b/test/batched_gemm_b_scale/test_batched_gemm_b_scale_wmma.cpp
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <tuple>
+
+#include "gtest/gtest.h"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "test_batched_gemm_b_scale_util.hpp"
+
+using I4  = ck::pk_i4_t;
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+namespace {
+
+template <typename X, typename Y>
+struct tuple_concat;
+
+template <typename... Xs, typename... Ys>
+struct tuple_concat<std::tuple<Xs...>, std::tuple<Ys...>>
+{
+    using type = std::tuple<Xs..., Ys...>;
+};
+
+} // namespace
+
+template <typename Tuple>
+class TestBatchedGemmBScale_MK_NK : public ck::test::TestBatchedGemmBScale<
+                                        typename tuple_concat<std::tuple<Row, Col>, Tuple>::type>
+{
+};
+
+// clang-format off
+using KernelTypes_MK_NK = ::testing::Types<
+    //         ADataType, BDataType, BScaleDataType, ComputeDataType, CDataType
+    std::tuple<      F16,        I4,            F16,             F16,       F16>
+    >;
+// clang-format on
+
+TYPED_TEST_SUITE(TestBatchedGemmBScale_MK_NK, KernelTypes_MK_NK);
+
+#include "test_batched_gemm_b_scale_ut_cases.inc"