Add Grouped Gemm Multiple D SplitK TwoStage (#1212)

* Support A/B/C elementwise ops. * First part of GGEMM multiD splitk two stage. * WIP - changes for debugging. * tmp save * working version * added bf16@int8 version * fixes * add reviewers suggestions * pre-committed missing files * switched to ifs from elseifs --------- Co-authored-by: Adam Osewski <Adam.Osewski@amd.com> [ROCm/composable_kernel commit: c701071666]
2026-05-27 08:25:46 +00:00 · 2024-04-04 11:01:33 +02:00
parent c1d4988b4c
commit e07dbbecc7
13 changed files with 2490 additions and 16 deletions
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_multiple_d.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_multiple_d.hpp
@@ -0,0 +1,220 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <array>
+#include <iostream>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <thread>
+
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace host {
+
+// Host-side reference for a GEMM whose result is combined with extra D tensors.
+//
+// For every (m, n) the products a_element_op(A(m, k)) * b_element_op(B(k, n)) are
+// accumulated over k in AccDataType; the accumulator and the matching D(m, n) values
+// are then handed to cde_element_op, whose result is stored in C(m, n).
+//
+// assumption: every D matrix has the same layout and the same datatype
+// (DDataType is the first element type of DsDataType and is used for all of them).
+// NOTE(review): because DDataType reads element 0 of DsDataType, an empty DsDataType
+// may not instantiate at all, which would make the Size() == 0 branch below
+// unreachable - confirm against ck::tuple_element_t.
+template <typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename CDataType,
+          typename AccDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation,
+          typename ComputeTypeA = ADataType,
+          typename ComputeTypeB = ComputeTypeA>
+struct ReferenceGemmMultipleD : public device::BaseOperator
+{
+    // Invoker::Run only dispatches on 0, 1 or 2 D tensors. Reject anything larger at
+    // compile time instead of silently writing the zero-initialised v_c to C.
+    static_assert(DsDataType::Size() <= 2,
+                  "ReferenceGemmMultipleD supports at most two D tensors");
+
+    using DDataType = remove_cvref_t<tuple_element_t<0, DsDataType>>;
+
+    // Argument
+    // Holds references to the caller's tensors (no copies), so the tensors must
+    // outlive the Argument; the element-wise ops are stored by value.
+    struct Argument : public device::BaseArgument
+    {
+        Argument(const Tensor<ADataType>& a_m_k,
+                 const Tensor<BDataType>& b_k_n,
+                 const std::array<Tensor<DDataType>, DsDataType::Size()>& ds_m_n,
+                 Tensor<CDataType>& c_m_n,
+                 AElementwiseOperation a_element_op,
+                 BElementwiseOperation b_element_op,
+                 CDEElementwiseOperation cde_element_op)
+            : a_m_k_{a_m_k},
+              b_k_n_{b_k_n},
+              ds_m_n_{ds_m_n},
+              c_m_n_{c_m_n},
+              a_element_op_{a_element_op},
+              b_element_op_{b_element_op},
+              cde_element_op_{cde_element_op}
+        {
+        }
+
+        const Tensor<ADataType>& a_m_k_;
+        const Tensor<BDataType>& b_k_n_;
+        const std::array<Tensor<DDataType>, DsDataType::Size()>& ds_m_n_;
+        Tensor<CDataType>& c_m_n_;
+
+        AElementwiseOperation a_element_op_;
+        BElementwiseOperation b_element_op_;
+        CDEElementwiseOperation cde_element_op_;
+    };
+
+    // Invoker
+    struct Invoker : public device::BaseInvoker
+    {
+        using Argument = ReferenceGemmMultipleD::Argument;
+
+        // Computes C(m, n) over the index range spanned by the first two lengths of
+        // arg.c_m_n_. The returned value is always 0.
+        float Run(const Argument& arg)
+        {
+            // Reduction length: second dimension of A. It is identical for every output
+            // element, so it is read once instead of once per (m, n).
+            const int K = arg.a_m_k_.mDesc.GetLengths()[1];
+
+            auto f_mk_kn_mn = [&](auto m, auto n) {
+                AccDataType v_acc = 0;
+                ComputeTypeA v_a  = 0;
+                ComputeTypeB v_b  = 0;
+
+                for(int k = 0; k < K; ++k)
+                {
+                    // use PassThrough instead of ConvertBF16RTN for reference calculation
+                    if constexpr(is_same_v<AElementwiseOperation,
+                                           ck::tensor_operation::element_wise::ConvertBF16RTN>)
+                    {
+                        ck::tensor_operation::element_wise::PassThrough{}(v_a, arg.a_m_k_(m, k));
+                    }
+                    else
+                    {
+                        arg.a_element_op_(v_a, arg.a_m_k_(m, k));
+                    }
+                    // same for B matrix
+                    if constexpr(is_same_v<BElementwiseOperation,
+                                           ck::tensor_operation::element_wise::ConvertBF16RTN>)
+                    {
+                        ck::tensor_operation::element_wise::PassThrough{}(v_b, arg.b_k_n_(k, n));
+                    }
+                    else
+                    {
+                        arg.b_element_op_(v_b, arg.b_k_n_(k, n));
+                    }
+
+                    v_acc +=
+                        ck::type_convert<AccDataType>(v_a) * ck::type_convert<AccDataType>(v_b);
+                }
+
+                CDataType v_c = 0;
+
+                // The element-wise op takes the D values as separate arguments, so each
+                // supported D count needs its own call.
+                if constexpr(DsDataType::Size() == 0)
+                {
+                    arg.cde_element_op_(v_c, v_acc);
+                }
+                else if constexpr(DsDataType::Size() == 1)
+                {
+                    arg.cde_element_op_(v_c, v_acc, arg.ds_m_n_[0](m, n));
+                }
+                else if constexpr(DsDataType::Size() == 2)
+                {
+                    arg.cde_element_op_(v_c, v_acc, arg.ds_m_n_[0](m, n), arg.ds_m_n_[1](m, n));
+                }
+
+                arg.c_m_n_(m, n) = v_c;
+            };
+
+            // hardware_concurrency() may return 0 when the value is not computable;
+            // fall back to a single worker in that case.
+            const unsigned int hw_threads  = std::thread::hardware_concurrency();
+            const unsigned int num_threads = hw_threads == 0 ? 1u : hw_threads;
+
+            make_ParallelTensorFunctor(
+                f_mk_kn_mn, arg.c_m_n_.mDesc.GetLengths()[0], arg.c_m_n_.mDesc.GetLengths()[1])(
+                num_threads);
+
+            return 0;
+        }
+
+        // Entry point taking the base-class argument pointer. Throws std::runtime_error
+        // when p_arg is null or is not a ReferenceGemmMultipleD::Argument, instead of
+        // dereferencing a null pointer.
+        float Run(const device::BaseArgument* p_arg,
+                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
+        {
+            const auto* p_typed_arg = dynamic_cast<const Argument*>(p_arg);
+
+            if(p_typed_arg == nullptr)
+            {
+                throw std::runtime_error("ReferenceGemmMultipleD::Invoker::Run: argument is "
+                                         "null or has the wrong type");
+            }
+
+            return Run(*p_typed_arg);
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true;
+    }
+
+    // No argument validation is performed; every argument is reported as supported.
+    bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
+
+    static auto MakeArgument(const Tensor<ADataType>& a_m_k,
+                             const Tensor<BDataType>& b_k_n,
+                             const std::array<Tensor<DDataType>, DsDataType::Size()>& ds_m_n,
+                             Tensor<CDataType>& c_m_n,
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op,
+                             CDEElementwiseOperation cde_element_op)
+    {
+        return Argument{a_m_k, b_k_n, ds_m_n, c_m_n, a_element_op, b_element_op, cde_element_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
+    {
+        // Construct in place; no temporary Invoker needs to be copied.
+        return std::make_unique<Invoker>();
+    }
+
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        // clang-format off
+        str << "ReferenceGemmMultipleD"
+            << std::endl;
+        // clang-format on
+
+        return str.str();
+    }
+};
+
+} // namespace host
+} // namespace tensor_operation
+} // namespace ck
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -146,6 +146,32 @@ void add_device_grouped_gemm_xdl_splitk_f8_f16_f16_mk_kn_mn_irregular_instances(
                                                  PassThrough,
                                                  PassThrough>>>& instances);

+void add_device_grouped_gemm_multiple_d_xdl_two_stage_f16_f16_f16_mk_kn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  F16,
+                                                  F16,
+                                                  Empty_Tuple,
+                                                  F16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
+void add_device_grouped_gemm_multiple_d_xdl_two_stage_bf16_i8_bf16_mk_kn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemm<Row,
+                                                  Row,
+                                                  Empty_Tuple,
+                                                  Row,
+                                                  BF16,
+                                                  I8,
+                                                  Empty_Tuple,
+                                                  BF16,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>>>& instances);
+
 template <typename ALayout,
          typename BLayout,
          typename ELayout,
@@ -180,6 +206,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;

+#if defined(CK_ENABLE_FP16)
        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
                     is_same_v<EDataType, half_t>)
        {
@@ -190,6 +217,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
                add_device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances(
                    op_ptrs);
+                add_device_grouped_gemm_multiple_d_xdl_two_stage_f16_f16_f16_mk_kn_mn_instances(
+                    op_ptrs);
            }
            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
                              is_same_v<ELayout, Row>)
@@ -210,8 +239,10 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                add_device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances(op_ptrs);
            }
        }
-        else if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, f8_t> &&
-                          is_same_v<EDataType, half_t>)
+#endif
+#if defined(CK_ENABLE_FP16) && defined(CK_ENABLE_FP8)
+        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, f8_t> &&
+                     is_same_v<EDataType, half_t>)
        {
            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
                         is_same_v<ELayout, Row>)
@@ -228,6 +259,19 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                add_device_grouped_gemm_xdl_splitk_f8_f16_f16_mk_kn_mn_irregular_instances(op_ptrs);
            }
        }
+#endif
+#if defined(CK_ENABLE_BF16) && defined(CK_ENABLE_INT8)
+        if constexpr(is_same_v<ADataType, bhalf_t> && is_same_v<BDataType, int8_t> &&
+                     is_same_v<EDataType, bhalf_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<ELayout, Row>)
+            {
+                add_device_grouped_gemm_multiple_d_xdl_two_stage_bf16_i8_bf16_mk_kn_mn_instances(
+                    op_ptrs);
+            }
+        }
+#endif
        return op_ptrs;
    }
 };