From 53477aee72e205e04b8c64d470c51ae5fc4beeea Mon Sep 17 00:00:00 2001
From: rocking5566 <ChunYu.Lai@amd.com>
Date: Thu, 25 Aug 2022 07:43:43 +0800
Subject: [PATCH] layernorm external api (#379)

* Add layernorm client example

* [What] Add default make install dir to gitignore
[Why] client example need to make install

[ROCm/composable_kernel commit: e1a3fff67510be2af023b31587e411230b994631]
---
 .gitignore                                    |   1 +
 client_example/05_layernorm/CMakeLists.txt    |   2 +
 client_example/05_layernorm/layernorm2d.cpp   | 159 ++++++++++++++++++
 client_example/CMakeLists.txt                 |   1 +
 .../gpu/layernorm.hpp                         |  85 ++++++++++
 5 files changed, 248 insertions(+)
 create mode 100644 client_example/05_layernorm/CMakeLists.txt
 create mode 100644 client_example/05_layernorm/layernorm2d.cpp
 create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/layernorm.hpp
diff --git a/.gitignore b/.gitignore
index cdf5b64dec..71059ec4d9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -46,3 +46,4 @@ build*
 
 # GDB temporary files
 .gdb_history
+install.dir*
diff --git a/client_example/05_layernorm/CMakeLists.txt b/client_example/05_layernorm/CMakeLists.txt
new file mode 100644
index 0000000000..b582b485d4
--- /dev/null
+++ b/client_example/05_layernorm/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_executable(client_layernorm2d layernorm2d.cpp)
+target_link_libraries(client_layernorm2d PRIVATE composable_kernel::device_operations)
diff --git a/client_example/05_layernorm/layernorm2d.cpp b/client_example/05_layernorm/layernorm2d.cpp
new file mode 100644
index 0000000000..657f2248f3
--- /dev/null
+++ b/client_example/05_layernorm/layernorm2d.cpp
@@ -0,0 +1,159 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iomanip>
+#include <vector>
+#include <iostream>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_normalization.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/layernorm.hpp"
+
+using XDataType     = ck::half_t;
+using GammaDataType = ck::half_t;
+using BetaDataType  = ck::half_t;
+using YDataType     = ck::half_t;
+using AccDataType   = float;
+using PassThrough   = ck::tensor_operation::element_wise::PassThrough;
+
+constexpr int Rank         = 2;
+constexpr int NumReduceDim = 1;
+
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+    ck::index_t M      = 1024;
+    ck::index_t N      = 1024;
+    ck::index_t Stride = 1024;
+
+    auto xy_size = (M - 1) * Stride + N;
+
+    SimpleDeviceMem x_device_buf(sizeof(XDataType) * xy_size);
+    SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * N);
+    SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * N);
+    SimpleDeviceMem y_device_buf(sizeof(YDataType) * xy_size);
+
+    using DeviceOp = ck::tensor_operation::device::DeviceLayernorm<XDataType,
+                                                                   GammaDataType,
+                                                                   BetaDataType,
+                                                                   AccDataType,
+                                                                   YDataType,
+                                                                   PassThrough,
+                                                                   Rank,
+                                                                   NumReduceDim>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    std::string best_op_name;
+    bool found            = false;
+    int best_op_id        = -1;
+    float best_ave_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    for(int i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr = op_ptrs[i];
+
+        auto argument_ptr = op_ptr->MakeArgumentPointer({M, N},      // lengths
+                                                        {Stride, 1}, // xStrides
+                                                        {1},         // gammaStrides
+                                                        {1},         // betaStrides
+                                                        {Stride, 1}, // yStrides
+                                                        {1},         // reduceDims
+                                                        1e-4,
+                                                        x_device_buf.GetDeviceBuffer(),
+                                                        gamma_device_buf.GetDeviceBuffer(),
+                                                        beta_device_buf.GetDeviceBuffer(),
+                                                        y_device_buf.GetDeviceBuffer(),
+                                                        PassThrough{});
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            std::size_t num_byte = sizeof(XDataType) * M * N + sizeof(GammaDataType) * N +
+                                   sizeof(BetaDataType) * N + sizeof(YDataType) * M * N;
+
+            float gb_per_sec = num_byte / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
+                      << op_name << std::endl;
+
+            if(ave_time < best_ave_time)
+            {
+                found           = true;
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
+              << best_op_name << std::endl;
+
+    // run the best intance
+    {
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+
+        auto argument_ptr = op_ptr->MakeArgumentPointer({M, N},      // lengths
+                                                        {Stride, 1}, // xStrides
+                                                        {1},         // gammaStrides
+                                                        {1},         // betaStrides
+                                                        {Stride, 1}, // yStrides
+                                                        {1},         // reduceDims
+                                                        1e-4,
+                                                        x_device_buf.GetDeviceBuffer(),
+                                                        gamma_device_buf.GetDeviceBuffer(),
+                                                        beta_device_buf.GetDeviceBuffer(),
+                                                        y_device_buf.GetDeviceBuffer(),
+                                                        PassThrough{});
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+
+        std::cout << "Done" << std::endl;
+    }
+
+    return 0;
+}
diff --git a/client_example/CMakeLists.txt b/client_example/CMakeLists.txt
index 3e04a18599..9a0e243570 100644
--- a/client_example/CMakeLists.txt
+++ b/client_example/CMakeLists.txt
@@ -10,3 +10,4 @@ add_subdirectory(01_gemm)
 add_subdirectory(02_gemm_add_add_fastgelu)
 add_subdirectory(03_gemm_layernorm)
 add_subdirectory(04_contraction)
+add_subdirectory(05_layernorm)
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/layernorm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/layernorm.hpp
new file mode 100644
index 0000000000..a73c8c5c43
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/layernorm.hpp
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_normalization.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_layernorm_f16_rank2_instances(
+    std::vector<DeviceLayernormPtr<F16, F16, F16, F32, F16, PassThrough, 2, 1>>&);
+
+void add_device_layernorm_f16_rank4_instances(
+    std::vector<DeviceLayernormPtr<F16, F16, F16, F32, F16, PassThrough, 4, 3>>&);
+
+void add_device_layernorm_f32_rank2_instances(
+    std::vector<DeviceLayernormPtr<F32, F32, F32, F32, F32, PassThrough, 2, 1>>&);
+
+void add_device_layernorm_f32_rank4_instances(
+    std::vector<DeviceLayernormPtr<F32, F32, F32, F32, F32, PassThrough, 4, 3>>&);
+
+template <typename XDataType,
+          typename GammaDataType,
+          typename BetaDataType,
+          typename YDataType,
+          index_t Rank,
+          index_t NumReduceDim>
+struct DeviceOperationInstanceFactory<
+    ck::tensor_operation::device::DeviceLayernorm<XDataType,
+                                                  GammaDataType,
+                                                  BetaDataType,
+                                                  F32,
+                                                  YDataType,
+                                                  ck::tensor_operation::element_wise::PassThrough,
+                                                  Rank,
+                                                  NumReduceDim>>
+{
+    using DeviceOp = DeviceLayernorm<XDataType,
+                                     GammaDataType,
+                                     BetaDataType,
+                                     F32,
+                                     YDataType,
+                                     ck::tensor_operation::element_wise::PassThrough,
+                                     Rank,
+                                     NumReduceDim>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+        if constexpr(is_same_v<XDataType, F16> && is_same_v<GammaDataType, F16> &&
+                     is_same_v<BetaDataType, F16> && is_same_v<YDataType, F16>)
+        {
+            if constexpr(Rank == 2 && NumReduceDim == 1)
+                add_device_layernorm_f16_rank2_instances(op_ptrs);
+            else if constexpr(Rank == 4 && NumReduceDim == 3)
+                add_device_layernorm_f16_rank4_instances(op_ptrs);
+        }
+        else if constexpr(is_same_v<XDataType, F32> && is_same_v<GammaDataType, F32> &&
+                          is_same_v<BetaDataType, F32> && is_same_v<YDataType, F32>)
+        {
+            if constexpr(Rank == 2 && NumReduceDim == 1)
+                add_device_layernorm_f32_rank2_instances(op_ptrs);
+            else if constexpr(Rank == 4 && NumReduceDim == 3)
+                add_device_layernorm_f32_rank4_instances(op_ptrs);
+        }
+
+        return op_ptrs;
+    }
+};
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck