Layernorm4d (#1022)

* Rename folder * Add layernorm 4d fwd example * Rename original layernorm example * Add layernorm 4d f16 test * Add layernorm4d_fwd client example * Support layernorm4D in ckProfiler * Rename groupnorm to groupnorm fwd in example * Rename layernorm and group fwd in test * Rename normalization to normalization_fwd (instances) * Add fwd to DeviceNormalization * Rename external api header * Rename folder, because we can also add bwd in this folder * Add fwd in layernorm and groupnorm (profiler) * Fix compile error --------- Co-authored-by: Po Yen Chen <PoYen.Chen@amd.com>
2026-04-20 14:59:17 +00:00 · 2023-11-09 08:34:51 +08:00
parent ce52621123
commit a3d9a2cd42
59 changed files with 1271 additions and 675 deletions
--- a/example/42_groupnorm_fwd/CMakeLists.txt
+++ b/example/42_groupnorm_fwd/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_example_executable(example_groupnorm_fwd_sigmoid_mul_fp16 groupnorm_fwd_sigmoid_mul_fp16.cpp) # DeviceNormalizationFwdImpl with a custom x * Sigmoid(x) output op
+add_example_executable(example_groupnorm_fwd_splitk_fp16 groupnorm_fwd_splitk_fp16.cpp) # DeviceNormalizationFwdSplitKImpl with the Swish output op
+add_example_executable(example_groupnorm_fwd_swish_fp16 groupnorm_fwd_swish_fp16.cpp) # DeviceNormalizationFwdImpl with the Swish output op
--- a/example/42_groupnorm_fwd/common.hpp
+++ b/example/42_groupnorm_fwd/common.hpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <getopt.h>
+
+#include "ck/ck.hpp"
+#include "ck/utility/reduction_enums.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_normalization_fwd_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_normalization_fwd_splitk_impl.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
+
+#include "ck/library/utility/fill.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_common_util.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp"
--- a/example/42_groupnorm_fwd/groupnorm_fwd_sigmoid_mul_fp16.cpp
+++ b/example/42_groupnorm_fwd/groupnorm_fwd_sigmoid_mul_fp16.cpp
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+constexpr int Rank         = 5; // tensors are indexed as {N, H, W, G, C} by the shared runner
+constexpr int NumReduceDim = 3; // the runner reduces over dimensions {1, 2, 4}, i.e. H, W, C
+
+using XDataType              = ck::half_t;
+using GammaDataType          = ck::half_t;
+using BetaDataType           = ck::half_t;
+using YDataType              = ck::half_t;
+using SaveMeanInvStdDataType = float; // element type of the saved mean / inverse-std tensors
+using ComputeDataType        = float;
+
+#define SAVE_MEAN_INV_STD // the runner then allocates, passes and verifies mean / inv-std buffers
+
+struct YElementOp
+{
+    // y = x * Sigmoid(x): the product is formed in X, then converted to Y.
+    template <typename Y, typename X>
+    __host__ __device__ void operator()(Y& y, const X& x) const
+    {
+        constexpr bool x_supported = ck::is_same<X, float>::value ||
+                                     ck::is_same<X, double>::value ||
+                                     ck::is_same<X, ck::half_t>::value;
+        constexpr bool y_supported = ck::is_same<Y, float>::value ||
+                                     ck::is_same<Y, double>::value ||
+                                     ck::is_same<Y, ck::half_t>::value;
+        static_assert(x_supported, "Data type is not supported by this operation!");
+        static_assert(y_supported, "Data type is not supported by this operation!");
+
+        X sigmoid_of_x; // written by the Sigmoid functor on the next line
+        ck::tensor_operation::element_wise::Sigmoid{}(sigmoid_of_x, x);
+        y = ck::type_convert<Y>(x * sigmoid_of_x);
+    }
+};
+
+using DeviceInstance = // kernel type instantiated by run_groupnorm_fwd_example.inc
+    ck::tensor_operation::device::DeviceNormalizationFwdImpl<XDataType,
+                                                             GammaDataType,
+                                                             BetaDataType,
+                                                             ComputeDataType,
+                                                             YDataType,
+                                                             SaveMeanInvStdDataType,
+                                                             YElementOp, // struct defined above
+                                                             Rank,         // 5 (N, H, W, G, C)
+                                                             NumReduceDim, // 3 (H, W, C)
+                                                             1024, // BlockSize
+                                                             1,    // ClusterM
+                                                             1024, // ClusterK
+                                                             1,    // SliceM
+                                                             32,   // SliceK
+                                                             1,    // SrcVecDim (0=M, 1=K)
+                                                             2,    // SrcScalarPerVector
+                                                             1,    // GammaVecDim (0=M, 1=K)
+                                                             2,    // GammaScalarPerVector
+                                                             1,    // BetaVecDim (0=M, 1=K)
+                                                             2,    // BetaScalarPerVector
+                                                             2,    // YScalarPerVector
+                                                             1>;   // SaveMeanInvStdScalarPerVector
+
+#include "run_groupnorm_fwd_example.inc"
+
+int main(int argc, char* argv[]) { return run_groupnorm_fwd_example(argc, argv); }
--- a/example/42_groupnorm_fwd/groupnorm_fwd_splitk_fp16.cpp
+++ b/example/42_groupnorm_fwd/groupnorm_fwd_splitk_fp16.cpp
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+constexpr int Rank         = 5; // tensors are indexed as {N, H, W, G, C} by the shared runner
+constexpr int NumReduceDim = 3; // the runner reduces over dimensions {1, 2, 4}, i.e. H, W, C
+
+using XDataType              = ck::half_t;
+using GammaDataType          = ck::half_t;
+using BetaDataType           = ck::half_t;
+using YDataType              = ck::half_t;
+using SaveMeanInvStdDataType = float; // element type of the saved mean / inverse-std tensors
+using ComputeDataType        = float;
+using YElementOp             = ck::tensor_operation::element_wise::Swish; // output element-wise op
+
+#define SAVE_MEAN_INV_STD // the runner then allocates, passes and verifies mean / inv-std buffers
+
+using DeviceInstance = ck::tensor_operation::device::DeviceNormalizationFwdSplitKImpl<
+    XDataType, // DeviceInstance is the kernel type instantiated by run_groupnorm_fwd_example.inc
+    GammaDataType,
+    BetaDataType,
+    ComputeDataType,
+    YDataType,
+    SaveMeanInvStdDataType,
+    YElementOp,   // element_wise::Swish (aliased above)
+    Rank,         // 5 (N, H, W, G, C)
+    NumReduceDim, // 3 (H, W, C)
+    256, // BlockSize
+    1,   // ClusterM
+    256, // ClusterK
+    1,   // SliceM
+    16,  // SliceK
+    1,   // SrcVecDim (0=M, 1=K)
+    2,   // SrcScalarPerVector
+    1,   // GammaVecDim (0=M, 1=K)
+    2,   // GammaScalarPerVector
+    1,   // BetaVecDim (0=M, 1=K)
+    2,   // BetaScalarPerVector
+    2,   // YScalarPerVector
+    1>;  // SaveMeanInvStdScalarPerVector
+
+#include "run_groupnorm_fwd_example.inc"
+
+int main(int argc, char* argv[]) { return run_groupnorm_fwd_example(argc, argv); }
--- a/example/42_groupnorm_fwd/groupnorm_fwd_swish_fp16.cpp
+++ b/example/42_groupnorm_fwd/groupnorm_fwd_swish_fp16.cpp
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+constexpr int Rank         = 5; // tensors are indexed as {N, H, W, G, C} by the shared runner
+constexpr int NumReduceDim = 3; // the runner reduces over dimensions {1, 2, 4}, i.e. H, W, C
+
+using XDataType              = ck::half_t;
+using GammaDataType          = ck::half_t;
+using BetaDataType           = ck::half_t;
+using YDataType              = ck::half_t;
+using SaveMeanInvStdDataType = float; // element type of the saved mean / inverse-std tensors
+using ComputeDataType        = float;
+using YElementOp             = ck::tensor_operation::element_wise::Swish; // output element-wise op
+
+#define SAVE_MEAN_INV_STD // the runner then allocates, passes and verifies mean / inv-std buffers
+
+using DeviceInstance = // kernel type instantiated by run_groupnorm_fwd_example.inc
+    ck::tensor_operation::device::DeviceNormalizationFwdImpl<XDataType,
+                                                             GammaDataType,
+                                                             BetaDataType,
+                                                             ComputeDataType,
+                                                             YDataType,
+                                                             SaveMeanInvStdDataType,
+                                                             YElementOp, // element_wise::Swish
+                                                             Rank,         // 5 (N, H, W, G, C)
+                                                             NumReduceDim, // 3 (H, W, C)
+                                                             1024, // BlockSize
+                                                             1,    // ClusterM
+                                                             1024, // ClusterK
+                                                             1,    // SliceM
+                                                             32,   // SliceK
+                                                             1,    // SrcVecDim (0=M, 1=K)
+                                                             2,    // SrcScalarPerVector
+                                                             1,    // GammaVecDim (0=M, 1=K)
+                                                             2,    // GammaScalarPerVector
+                                                             1,    // BetaVecDim (0=M, 1=K)
+                                                             2,    // BetaScalarPerVector
+                                                             2,    // YScalarPerVector
+                                                             1>;   // SaveMeanInvStdScalarPerVector
+
+#include "run_groupnorm_fwd_example.inc"
+
+int main(int argc, char* argv[]) { return run_groupnorm_fwd_example(argc, argv); }
--- a/example/42_groupnorm_fwd/run_groupnorm_fwd_example.inc
+++ b/example/42_groupnorm_fwd/run_groupnorm_fwd_example.inc
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+// Shared driver for the groupnorm forward examples; the including .cpp must first define the
+// element types, YElementOp and DeviceInstance (SAVE_MEAN_INV_STD is optional).
+// Usage: no arguments for the default problem, or five arguments: N H W G C.
+// Runs DeviceInstance on random input, prints timing, and checks against ReferenceGroupnorm.
+// Returns 0 if all checks pass; 1 on a mismatch, bad argument count or unsupported problem.
+int run_groupnorm_fwd_example(int argc, char* argv[])
+{
+    ck::index_t N = 32;
+    ck::index_t H = 16;
+    ck::index_t W = 16;
+    ck::index_t G = 64;
+    ck::index_t C = 128;
+
+    if(argc == 6)
+    {
+        N = std::stoi(argv[1]);
+        H = std::stoi(argv[2]);
+        W = std::stoi(argv[3]);
+        G = std::stoi(argv[4]);
+        C = std::stoi(argv[5]);
+    }
+    else if(argc != 1) // argc == 1 keeps the defaults above
+    {
+        std::cerr << "arg1 to 5: N, H, W, G, C" << std::endl;
+        return 1;
+    }
+
+    Tensor<XDataType> x({N, H, W, G, C});
+    Tensor<YDataType> y({N, H, W, G, C});
+    Tensor<GammaDataType> gamma({G, C});
+    Tensor<BetaDataType> beta({G, C});
+    Tensor<SaveMeanInvStdDataType> save_mean({N, G});
+    Tensor<SaveMeanInvStdDataType> save_inv_std({N, G});
+    // Inputs are filled from a uniform distribution parameterised with (0, 1).
+    ck::utils::FillUniformDistribution<XDataType>{0.f, 1.f}(x);
+    ck::utils::FillUniformDistribution<GammaDataType>{0.f, 1.f}(gamma);
+    ck::utils::FillUniformDistribution<BetaDataType>{0.f, 1.f}(beta);
+    // Each device buffer is sized from the matching host tensor's element space.
+    DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
+    DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
+    DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
+    DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
+#ifdef SAVE_MEAN_INV_STD
+    DeviceMem save_mean_dev(sizeof(SaveMeanInvStdDataType) * save_mean.mDesc.GetElementSpaceSize());
+    DeviceMem save_inv_std_dev(sizeof(SaveMeanInvStdDataType) *
+                               save_inv_std.mDesc.GetElementSpaceSize());
+#endif
+
+    x_dev.ToDevice(x.mData.data());
+    gamma_dev.ToDevice(gamma.mData.data());
+    beta_dev.ToDevice(beta.mData.data());
+
+    const auto y_element_op = YElementOp{};
+
+    auto device_instance = DeviceInstance{};
+    auto argument_ptr    = device_instance.MakeArgumentPointer(
+        {N, H, W, G, C},
+        std::vector<ck::index_t>{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()},
+        {0, 0, 0, C, 1}, // gamma is {G, C}: zero stride over N, H, W
+        {0, 0, 0, C, 1}, // beta is {G, C}: zero stride over N, H, W
+        std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
+        std::vector<ck::index_t>{save_mean.mDesc.GetStrides().begin(),
+                                 save_mean.mDesc.GetStrides().end()},
+        std::vector<ck::index_t>{save_inv_std.mDesc.GetStrides().begin(),
+                                 save_inv_std.mDesc.GetStrides().end()},
+        {1, 2, 4}, // reduction dimension: [H, W, C]
+        1e-6,      // epsilon; the host reference below is given the same value
+        x_dev.GetDeviceBuffer(),
+        gamma_dev.GetDeviceBuffer(),
+        beta_dev.GetDeviceBuffer(),
+        y_dev.GetDeviceBuffer(),
+#ifdef SAVE_MEAN_INV_STD
+        save_mean_dev.GetDeviceBuffer(),
+        save_inv_std_dev.GetDeviceBuffer(),
+#else
+        nullptr,
+        nullptr,
+#endif
+        y_element_op);
+
+    if(!device_instance.IsSupportedArgument(argument_ptr.get()))
+    {
+        std::cout << "The runtime parameters are not supported" << std::endl;
+        return 1;
+    }
+    // Allocate the workspace size the instance reports and attach it to the argument.
+    size_t workspace_sz = device_instance.GetWorkSpaceSize(argument_ptr.get());
+    DeviceMem workspace_dev(workspace_sz);
+    device_instance.SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
+
+    auto invoker_ptr = device_instance.MakeInvokerPointer();
+    float ave_time   = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true, true});
+    // Traffic estimate: x, gamma and beta reads plus the y write; saved mean/inv-std not counted.
+    std::size_t num_btype = sizeof(XDataType) * N * H * W * G * C +
+                            sizeof(YDataType) * N * H * W * G * C + sizeof(GammaDataType) * G * C +
+                            sizeof(BetaDataType) * G * C;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time; // bytes / 1e6 / ms == GB/s
+
+    std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s, "
+              << device_instance.GetTypeString() << std::endl;
+
+    bool pass = true;
+    {
+        Tensor<YDataType> host_y({N, H, W, G, C});
+        Tensor<SaveMeanInvStdDataType> host_save_mean(HostTensorDescriptor{N, G});
+        Tensor<SaveMeanInvStdDataType> host_save_inv_std(HostTensorDescriptor{N, G});
+        using ReferenceInstance =
+            ck::tensor_operation::host::ReferenceGroupnorm<XDataType,
+                                                           GammaDataType,
+                                                           BetaDataType,
+                                                           YDataType,
+                                                           SaveMeanInvStdDataType,
+                                                           ComputeDataType,
+                                                           YElementOp>;
+
+        ReferenceInstance ref;
+        auto ref_argument = ref.MakeArgument(x,
+                                             gamma,
+                                             beta,
+                                             host_y,
+                                             host_save_mean,
+                                             host_save_inv_std,
+                                             y_element_op,
+                                             {N, H, W, G, C},
+                                             1e-6);
+        auto ref_invoker  = ref.MakeInvoker();
+        ref_invoker.Run(ref_argument);
+        // Copy device results back and compare them with the host reference.
+        y_dev.FromDevice(y.mData.data());
+        pass &= ck::utils::check_err(y, host_y, "Error: Incorrect results", 1e-3, 1e-3);
+#ifdef SAVE_MEAN_INV_STD
+        save_mean_dev.FromDevice(save_mean.mData.data());
+        save_inv_std_dev.FromDevice(save_inv_std.mData.data());
+        pass &= ck::utils::check_err(
+            save_mean, host_save_mean, "Error: Incorrect results (mean)", 1e-3, 1e-3);
+        pass &= ck::utils::check_err(
+            save_inv_std, host_save_inv_std, "Error: Incorrect results (inv_std)", 1e-3, 1e-3);
+#endif
+    }
+
+    return (pass ? 0 : 1);
+}