GEMM+Bias+ReLU+Add (#76)

* tweak conv for odd C
* update script
* clean up elementwise op
* fix build
* clean up
* added example for gemm+bias+relu+add
* added example for gemm+bias+relu
* add profiler for gemm_s_shuffle; re-org files
* add profiler
* fix build
* clean up
* clean up
* clean up
* fix build

[ROCm/composable_kernel commit: 823657ed12]
2026-05-19 12:30:16 +00:00 · 2022-02-06 22:32:47 -06:00
parent 8890cc207d
commit 8efcb80fa5
77 changed files with 3865 additions and 932 deletions
--- a/host/host_tensor/include/host_gemm.hpp
+++ b/host/host_tensor/include/host_gemm.hpp
@@ -17,15 +17,24 @@ void host_gemm_mk_kn_mn(const Tensor<AType>& a_m_k,
    auto f_mk_kn_mn = [&](auto m, auto n) {
        const int K = a_m_k.mDesc.GetLengths()[1];

-        double v = 0;
+        float v_acc = 0;

        for(int k = 0; k < K; ++k)
        {
-            v += static_cast<const double>(a_element_op(a_m_k(m, k))) *
-                 static_cast<const double>(b_element_op(b_k_n(k, n)));
+            float v_a;
+            float v_b;
+
+            a_element_op(v_a, static_cast<const float>(a_m_k(m, k)));
+            b_element_op(v_b, static_cast<const float>(b_k_n(k, n)));
+
+            v_acc += v_a * v_b;
        }

-        c_m_n(m, n) = c_element_op(v);
+        float v_c;
+
+        c_element_op(v_c, v_acc);
+
+        c_m_n(m, n) = v_c;
    };

    make_ParallelTensorFunctor(f_mk_kn_mn,
--- a/host/include/reference_conv_fwd.hpp
+++ b/host/include/reference_conv_fwd.hpp
@@ -1,166 +0,0 @@
-#ifndef REFERENCE_CONV_FWD_HPP
-#define REFERENCE_CONV_FWD_HPP
-
-#include <iostream>
-#include <sstream>
-#include "device_base.hpp"
-#include "host_tensor.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace host {
-
-// out[N, K, Ho, Wo] = in[N, C, Hi, Wi] * wei[K, C, Y, X]
-template <typename InDataType,
-          typename WeiDataType,
-          typename OutDataType,
-          typename AccDataType,
-          typename InElementwiseOperation,
-          typename WeiElementwiseOperation,
-          typename OutElementwiseOperation>
-struct ReferenceConvFwd : public device::BaseOperator
-{
-    // Argument
-    struct Argument : public device::BaseArgument
-    {
-        Argument(const Tensor<InDataType>& in_n_c_hi_wi,
-                 const Tensor<WeiDataType>& wei_k_c_y_x,
-                 Tensor<OutDataType>& out_n_k_ho_wo,
-                 std::vector<ck::index_t> conv_filter_strides,
-                 std::vector<ck::index_t> conv_filter_dilations,
-                 std::vector<ck::index_t> input_left_pads,
-                 std::vector<ck::index_t> input_right_pads,
-                 InElementwiseOperation in_element_op,
-                 WeiElementwiseOperation wei_element_op,
-                 OutElementwiseOperation out_element_op)
-            : in_n_c_hi_wi_{in_n_c_hi_wi},
-              wei_k_c_y_x_{wei_k_c_y_x},
-              out_n_k_ho_wo_{out_n_k_ho_wo},
-              conv_strides_{conv_filter_strides},
-              conv_dilations_{conv_filter_dilations},
-              in_left_pads_{input_left_pads},
-              in_right_pads_{input_right_pads},
-              in_element_op_{in_element_op},
-              wei_element_op_{wei_element_op},
-              out_element_op_{out_element_op}
-        {
-        }
-
-        const Tensor<InDataType>& in_n_c_hi_wi_;
-        const Tensor<WeiDataType>& wei_k_c_y_x_;
-        Tensor<OutDataType>& out_n_k_ho_wo_;
-
-        std::vector<index_t> conv_strides_;
-        std::vector<index_t> conv_dilations_;
-        std::vector<index_t> in_left_pads_;
-        std::vector<index_t> in_right_pads_;
-
-        InElementwiseOperation in_element_op_;
-        WeiElementwiseOperation wei_element_op_;
-        OutElementwiseOperation out_element_op_;
-    };
-
-    // Invoker
-    struct Invoker : public device::BaseInvoker
-    {
-        using Argument = ReferenceConvFwd::Argument;
-
-        float Run(const Argument& arg)
-        {
-            auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
-                float v = 0;
-                for(int c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c)
-                {
-                    for(int y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y)
-                    {
-                        int hi = ho * arg.conv_strides_[0] + y * arg.conv_dilations_[0] -
-                                 arg.in_left_pads_[0];
-                        for(int x = 0; x < arg.wei_k_c_y_x_.mDesc.GetLengths()[3]; ++x)
-                        {
-                            int wi = wo * arg.conv_strides_[1] + x * arg.conv_dilations_[1] -
-                                     arg.in_left_pads_[1];
-                            if(hi >= 0 && hi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && wi >= 0 &&
-                               wi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[3])
-                            {
-                                v += arg.in_element_op_(
-                                         ck::type_convert<float>(arg.in_n_c_hi_wi_(n, c, hi, wi))) *
-                                     arg.wei_element_op_(
-                                         ck::type_convert<float>(arg.wei_k_c_y_x_(k, c, y, x)));
-                            }
-                        }
-                    }
-                }
-
-                arg.out_n_k_ho_wo_(n, k, ho, wo) =
-                    ck::type_convert<OutDataType>(arg.out_element_op_(v));
-            };
-
-            make_ParallelTensorFunctor(f_nchw,
-                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[0],
-                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[1],
-                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[2],
-                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[3])(
-                std::thread::hardware_concurrency());
-            return 0;
-        }
-
-        float Run(const device::BaseArgument* p_arg, int) override
-        {
-            return Run(*dynamic_cast<const Argument*>(p_arg));
-        }
-    };
-
-    static constexpr bool IsValidCompilationParameter()
-    {
-        // TODO: properly implement this check
-        return true;
-    }
-
-    bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
-
-    static auto MakeArgument(const Tensor<InDataType>& in_n_c_hi_wi,
-                             const Tensor<WeiDataType>& wei_k_c_y_x,
-                             Tensor<OutDataType>& out_n_k_ho_wo,
-                             std::vector<ck::index_t> conv_filter_strides,
-                             std::vector<ck::index_t> conv_filter_dilations,
-                             std::vector<ck::index_t> input_left_pads,
-                             std::vector<ck::index_t> input_right_pads,
-                             InElementwiseOperation in_element_op,
-                             WeiElementwiseOperation wei_element_op,
-                             OutElementwiseOperation out_element_op)
-    {
-        return Argument{in_n_c_hi_wi,
-                        wei_k_c_y_x,
-                        out_n_k_ho_wo,
-                        conv_filter_strides,
-                        conv_filter_dilations,
-                        input_left_pads,
-                        input_right_pads,
-                        in_element_op,
-                        wei_element_op,
-                        out_element_op};
-    }
-
-    static auto MakeInvoker() { return Invoker{}; }
-
-    virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
-    {
-        return std::make_unique<Invoker>(Invoker{});
-    }
-
-    std::string GetTypeString() const override
-    {
-        auto str = std::stringstream();
-
-        // clang-format off
-        str << "ReferenceConvFwd"
-            << std::endl;
-        // clang-format on
-
-        return str.str();
-    }
-};
-} // namespace host
-} // namespace tensor_operation
-} // namespace ck
-#endif
--- a/host/include/reference_conv_fwd_bias_activation.hpp
+++ b/host/include/reference_conv_fwd_bias_activation.hpp
@@ -1,172 +0,0 @@
-#ifndef REFERENCE_CONV_FWD_BIAS_ACTIVATION_HPP
-#define REFERENCE_CONV_FWD_BIAS_ACTIVATION_HPP
-
-#include <iostream>
-#include <sstream>
-#include "device_base.hpp"
-#include "host_tensor.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace host {
-
-// out[N, Ho, Wo, K] =
-//     activate(in[N, Hi, Wi, C] * wei[K, Y, X, C] + bias[K])
-template <typename InDataType,
-          typename WeiDataType,
-          typename OutDataType,
-          typename AccDataType,
-          typename InElementwiseOperation,
-          typename WeiElementwiseOperation,
-          typename OutElementwiseOperation>
-struct ReferenceConvFwd_Bias_Activation : public device::BaseOperator
-{
-    // Argument
-    struct Argument : public device::BaseArgument
-    {
-        Argument(const Tensor<InDataType>& in_n_c_hi_wi,
-                 const Tensor<WeiDataType>& wei_k_c_y_x,
-                 Tensor<OutDataType>& out_n_k_ho_wo,
-                 const Tensor<OutDataType>& bias_k,
-                 std::vector<ck::index_t> conv_filter_strides,
-                 std::vector<ck::index_t> conv_filter_dilations,
-                 std::vector<ck::index_t> input_left_pads,
-                 std::vector<ck::index_t> input_right_pads,
-                 InElementwiseOperation in_element_op,
-                 WeiElementwiseOperation wei_element_op,
-                 OutElementwiseOperation out_element_op)
-            : in_n_c_hi_wi_{in_n_c_hi_wi},
-              wei_k_c_y_x_{wei_k_c_y_x},
-              out_n_k_ho_wo_{out_n_k_ho_wo},
-              bias_k_{bias_k},
-              conv_strides_{conv_filter_strides},
-              conv_dilations_{conv_filter_dilations},
-              in_left_pads_{input_left_pads},
-              in_right_pads_{input_right_pads},
-              in_element_op_{in_element_op},
-              wei_element_op_{wei_element_op},
-              out_element_op_{out_element_op}
-        {
-        }
-
-        const Tensor<InDataType>& in_n_c_hi_wi_;
-        const Tensor<WeiDataType>& wei_k_c_y_x_;
-        Tensor<OutDataType>& out_n_k_ho_wo_;
-        const Tensor<OutDataType>& bias_k_;
-
-        std::vector<index_t> conv_strides_;
-        std::vector<index_t> conv_dilations_;
-        std::vector<index_t> in_left_pads_;
-        std::vector<index_t> in_right_pads_;
-
-        InElementwiseOperation in_element_op_;
-        WeiElementwiseOperation wei_element_op_;
-        OutElementwiseOperation out_element_op_;
-    };
-
-    // Invoker
-    struct Invoker : public device::BaseInvoker
-    {
-        using Argument = ReferenceConvFwd_Bias_Activation::Argument;
-
-        float Run(const Argument& arg)
-        {
-            auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
-                float v = 0;
-                for(int c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c)
-                {
-                    for(int y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y)
-                    {
-                        int hi = ho * arg.conv_strides_[0] + y * arg.conv_dilations_[0] -
-                                 arg.in_left_pads_[0];
-                        for(int x = 0; x < arg.wei_k_c_y_x_.mDesc.GetLengths()[3]; ++x)
-                        {
-                            int wi = wo * arg.conv_strides_[1] + x * arg.conv_dilations_[1] -
-                                     arg.in_left_pads_[1];
-                            if(hi >= 0 && hi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && wi >= 0 &&
-                               wi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[3])
-                            {
-                                v += arg.in_element_op_(
-                                         ck::type_convert<float>(arg.in_n_c_hi_wi_(n, c, hi, wi))) *
-                                     arg.wei_element_op_(
-                                         ck::type_convert<float>(arg.wei_k_c_y_x_(k, c, y, x)));
-                            }
-                        }
-                    }
-                }
-
-                arg.out_n_k_ho_wo_(n, k, ho, wo) =
-                    ck::type_convert<OutDataType>(arg.out_element_op_(v, arg.bias_k_(k)));
-            };
-
-            make_ParallelTensorFunctor(f_nchw,
-                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[0],
-                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[1],
-                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[2],
-                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[3])(
-                std::thread::hardware_concurrency());
-            return 0;
-        }
-
-        float Run(const device::BaseArgument* p_arg, int) override
-        {
-            return Run(*dynamic_cast<const Argument*>(p_arg));
-        }
-    };
-
-    static constexpr bool IsValidCompilationParameter()
-    {
-        // TODO: properly implement this check
-        return true;
-    }
-
-    bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
-
-    static auto MakeArgument(const Tensor<InDataType>& in_n_c_hi_wi,
-                             const Tensor<WeiDataType>& wei_k_c_y_x,
-                             Tensor<OutDataType>& out_n_k_ho_wo,
-                             const Tensor<OutDataType>& bias_k,
-                             std::vector<ck::index_t> conv_filter_strides,
-                             std::vector<ck::index_t> conv_filter_dilations,
-                             std::vector<ck::index_t> input_left_pads,
-                             std::vector<ck::index_t> input_right_pads,
-                             InElementwiseOperation in_element_op,
-                             WeiElementwiseOperation wei_element_op,
-                             OutElementwiseOperation out_element_op)
-    {
-        return Argument{in_n_c_hi_wi,
-                        wei_k_c_y_x,
-                        out_n_k_ho_wo,
-                        bias_k,
-                        conv_filter_strides,
-                        conv_filter_dilations,
-                        input_left_pads,
-                        input_right_pads,
-                        in_element_op,
-                        wei_element_op,
-                        out_element_op};
-    }
-
-    static auto MakeInvoker() { return Invoker{}; }
-
-    virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
-    {
-        return std::make_unique<Invoker>(Invoker{});
-    }
-
-    std::string GetTypeString() const override
-    {
-        auto str = std::stringstream();
-
-        // clang-format off
-        str << "ReferenceConvFwd_Bias_Activation"
-            << std::endl;
-        // clang-format on
-
-        return str.str();
-    }
-};
-} // namespace host
-} // namespace tensor_operation
-} // namespace ck
-#endif
--- a/host/include/reference_conv_fwd_bias_activation_add.hpp
+++ b/host/include/reference_conv_fwd_bias_activation_add.hpp
@@ -1,183 +0,0 @@
-#ifndef REFERENCE_CONV2D_FWD_BIAS_ACTIVATION_ADD_HPP
-#define REFERENCE_CONV2D_FWD_BIAS_ACTIVATION_ADD_HPP
-
-#include <iostream>
-#include <sstream>
-#include "device_base.hpp"
-#include "host_tensor.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace host {
-
-// out[N, Ho, Wo, K] =
-//     activate(in[N, Hi, Wi, C] * wei[K, Y, X, C] + bias[K]) + residual[N, Ho, Wo, K]
-template <typename InDataType,
-          typename WeiDataType,
-          typename OutDataType,
-          typename AccDataType,
-          typename InElementwiseOperation,
-          typename WeiElementwiseOperation,
-          typename OutElementwiseOperation>
-struct ReferenceConvFwd_Bias_Activation_Add : public device::BaseOperator
-{
-    // Argument
-    struct Argument : public device::BaseArgument
-    {
-        Argument(const Tensor<InDataType>& in_n_c_hi_wi,
-                 const Tensor<WeiDataType>& wei_k_c_y_x,
-                 Tensor<OutDataType>& out_n_k_ho_wo,
-                 const Tensor<OutDataType>& bias_k,
-                 const Tensor<OutDataType>& resi_n_k_ho_wo,
-                 std::vector<ck::index_t> conv_filter_strides,
-                 std::vector<ck::index_t> conv_filter_dilations,
-                 std::vector<ck::index_t> input_left_pads,
-                 std::vector<ck::index_t> input_right_pads,
-                 InElementwiseOperation in_element_op,
-                 WeiElementwiseOperation wei_element_op,
-                 OutElementwiseOperation out_element_op)
-            : in_n_c_hi_wi_{in_n_c_hi_wi},
-              wei_k_c_y_x_{wei_k_c_y_x},
-              out_n_k_ho_wo_{out_n_k_ho_wo},
-              bias_k_{bias_k},
-              resi_n_k_ho_wo_{resi_n_k_ho_wo},
-              conv_strides_{conv_filter_strides},
-              conv_dilations_{conv_filter_dilations},
-              in_left_pads_{input_left_pads},
-              in_right_pads_{input_right_pads},
-              in_element_op_{in_element_op},
-              wei_element_op_{wei_element_op},
-              out_element_op_{out_element_op}
-        {
-        }
-
-        const Tensor<InDataType>& in_n_c_hi_wi_;
-        const Tensor<WeiDataType>& wei_k_c_y_x_;
-        Tensor<OutDataType>& out_n_k_ho_wo_;
-        const Tensor<OutDataType>& bias_k_;
-        const Tensor<OutDataType>& resi_n_k_ho_wo_;
-
-        std::vector<index_t> conv_strides_;
-        std::vector<index_t> conv_dilations_;
-        std::vector<index_t> in_left_pads_;
-        std::vector<index_t> in_right_pads_;
-
-        InElementwiseOperation in_element_op_;
-        WeiElementwiseOperation wei_element_op_;
-        OutElementwiseOperation out_element_op_;
-    };
-
-    // Invoker
-    struct Invoker : public device::BaseInvoker
-    {
-        using Argument = ReferenceConvFwd_Bias_Activation_Add::Argument;
-
-        float Run(const Argument& arg)
-        {
-            auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
-                float v = 0;
-                for(int c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c)
-                {
-                    for(int y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y)
-                    {
-                        int hi = ho * arg.conv_strides_[0] + y * arg.conv_dilations_[0] -
-                                 arg.in_left_pads_[0];
-                        for(int x = 0; x < arg.wei_k_c_y_x_.mDesc.GetLengths()[3]; ++x)
-                        {
-                            int wi = wo * arg.conv_strides_[1] + x * arg.conv_dilations_[1] -
-                                     arg.in_left_pads_[1];
-                            if(hi >= 0 && hi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && wi >= 0 &&
-                               wi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[3])
-                            {
-                                v += arg.in_element_op_(
-                                         ck::type_convert<float>(arg.in_n_c_hi_wi_(n, c, hi, wi))) *
-                                     arg.wei_element_op_(
-                                         ck::type_convert<float>(arg.wei_k_c_y_x_(k, c, y, x)));
-                            }
-                        }
-                    }
-                }
-
-                float v2 = ck::type_convert<float>(arg.out_n_k_ho_wo_(n, k, ho, wo));
-
-                arg.out_element_op_(v2,
-                                    v,
-                                    ck::type_convert<float>(arg.bias_k_(k)),
-                                    ck::type_convert<float>(arg.resi_n_k_ho_wo_(n, k, ho, wo)));
-
-                arg.out_n_k_ho_wo_(n, k, ho, wo) = ck::type_convert<OutDataType>(v2);
-            };
-
-            make_ParallelTensorFunctor(f_nchw,
-                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[0],
-                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[1],
-                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[2],
-                                       arg.out_n_k_ho_wo_.mDesc.GetLengths()[3])(
-                std::thread::hardware_concurrency());
-            return 0;
-        }
-
-        float Run(const device::BaseArgument* p_arg, int) override
-        {
-            return Run(*dynamic_cast<const Argument*>(p_arg));
-        }
-    };
-
-    static constexpr bool IsValidCompilationParameter()
-    {
-        // TODO: properly implement this check
-        return true;
-    }
-
-    bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
-
-    static auto MakeArgument(const Tensor<InDataType>& in_n_c_hi_wi,
-                             const Tensor<WeiDataType>& wei_k_c_y_x,
-                             Tensor<OutDataType>& out_n_k_ho_wo,
-                             const Tensor<OutDataType>& bias_k,
-                             const Tensor<OutDataType>& resi_n_k_ho_wo,
-                             std::vector<ck::index_t> conv_filter_strides,
-                             std::vector<ck::index_t> conv_filter_dilations,
-                             std::vector<ck::index_t> input_left_pads,
-                             std::vector<ck::index_t> input_right_pads,
-                             InElementwiseOperation in_element_op,
-                             WeiElementwiseOperation wei_element_op,
-                             OutElementwiseOperation out_element_op)
-    {
-        return Argument{in_n_c_hi_wi,
-                        wei_k_c_y_x,
-                        out_n_k_ho_wo,
-                        bias_k,
-                        resi_n_k_ho_wo,
-                        conv_filter_strides,
-                        conv_filter_dilations,
-                        input_left_pads,
-                        input_right_pads,
-                        in_element_op,
-                        wei_element_op,
-                        out_element_op};
-    }
-
-    static auto MakeInvoker() { return Invoker{}; }
-
-    virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
-    {
-        return std::make_unique<Invoker>(Invoker{});
-    }
-
-    std::string GetTypeString() const override
-    {
-        auto str = std::stringstream();
-
-        // clang-format off
-        str << "ReferenceConvFwd_Bias_Activation_Add"
-            << std::endl;
-        // clang-format on
-
-        return str.str();
-    }
-};
-} // namespace host
-} // namespace tensor_operation
-} // namespace ck
-#endif