From 53477aee72e205e04b8c64d470c51ae5fc4beeea Mon Sep 17 00:00:00 2001 From: rocking5566 Date: Thu, 25 Aug 2022 07:43:43 +0800 Subject: [PATCH] layernorm external api (#379) * Add layernorm client example * [What] Add default make install dir to gitignore [Why] client example need to make install [ROCm/composable_kernel commit: e1a3fff67510be2af023b31587e411230b994631] --- .gitignore | 1 + client_example/05_layernorm/CMakeLists.txt | 2 + client_example/05_layernorm/layernorm2d.cpp | 159 ++++++++++++++++++ client_example/CMakeLists.txt | 1 + .../gpu/layernorm.hpp | 85 ++++++++++ 5 files changed, 248 insertions(+) create mode 100644 client_example/05_layernorm/CMakeLists.txt create mode 100644 client_example/05_layernorm/layernorm2d.cpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/layernorm.hpp diff --git a/.gitignore b/.gitignore index cdf5b64dec..71059ec4d9 100644 --- a/.gitignore +++ b/.gitignore @@ -46,3 +46,4 @@ build* # GDB temporary files .gdb_history +install.dir* diff --git a/client_example/05_layernorm/CMakeLists.txt b/client_example/05_layernorm/CMakeLists.txt new file mode 100644 index 0000000000..b582b485d4 --- /dev/null +++ b/client_example/05_layernorm/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(client_layernorm2d layernorm2d.cpp) +target_link_libraries(client_layernorm2d PRIVATE composable_kernel::device_operations) diff --git a/client_example/05_layernorm/layernorm2d.cpp b/client_example/05_layernorm/layernorm2d.cpp new file mode 100644 index 0000000000..657f2248f3 --- /dev/null +++ b/client_example/05_layernorm/layernorm2d.cpp @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_normalization.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/layernorm.hpp" + +using XDataType = ck::half_t; +using GammaDataType = ck::half_t; +using BetaDataType = ck::half_t; +using YDataType = ck::half_t; +using AccDataType = float; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +constexpr int Rank = 2; +constexpr int NumReduceDim = 1; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + ck::index_t M = 1024; + ck::index_t N = 1024; + ck::index_t Stride = 1024; + + auto xy_size = (M - 1) * Stride + N; + + SimpleDeviceMem x_device_buf(sizeof(XDataType) * xy_size); + SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * N); + SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * N); + SimpleDeviceMem y_device_buf(sizeof(YDataType) * xy_size); + + using DeviceOp = ck::tensor_operation::device::DeviceLayernorm; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // lengths + {Stride, 1}, // xStrides + {1}, // gammaStrides + {1}, // betaStrides + {Stride, 1}, // yStrides + {1}, // reduceDims + 1e-4, + x_device_buf.GetDeviceBuffer(), + gamma_device_buf.GetDeviceBuffer(), + beta_device_buf.GetDeviceBuffer(), + y_device_buf.GetDeviceBuffer(), + PassThrough{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t num_byte = sizeof(XDataType) * M * N + sizeof(GammaDataType) * N + + sizeof(BetaDataType) * N + sizeof(YDataType) * M * N; + + float gb_per_sec = num_byte / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, " + << op_name << std::endl; + + if(ave_time < best_ave_time) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + + auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // lengths + {Stride, 1}, // xStrides + {1}, // gammaStrides + {1}, // betaStrides + {Stride, 1}, // yStrides + {1}, // reduceDims + 1e-4, + x_device_buf.GetDeviceBuffer(), + gamma_device_buf.GetDeviceBuffer(), + beta_device_buf.GetDeviceBuffer(), + y_device_buf.GetDeviceBuffer(), + PassThrough{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/client_example/CMakeLists.txt b/client_example/CMakeLists.txt index 3e04a18599..9a0e243570 100644 --- a/client_example/CMakeLists.txt +++ b/client_example/CMakeLists.txt @@ -10,3 +10,4 @@ add_subdirectory(01_gemm) add_subdirectory(02_gemm_add_add_fastgelu) add_subdirectory(03_gemm_layernorm) add_subdirectory(04_contraction) +add_subdirectory(05_layernorm) diff --git a/library/include/ck/library/tensor_operation_instance/gpu/layernorm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/layernorm.hpp new file mode 100644 index 0000000000..a73c8c5c43 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/layernorm.hpp @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_normalization.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_layernorm_f16_rank2_instances( + std::vector>&); + +void add_device_layernorm_f16_rank4_instances( + std::vector>&); + +void add_device_layernorm_f32_rank2_instances( + std::vector>&); + +void add_device_layernorm_f32_rank4_instances( + std::vector>&); + +template +struct DeviceOperationInstanceFactory< + ck::tensor_operation::device::DeviceLayernorm> +{ + using DeviceOp = DeviceLayernorm; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(Rank == 2 && NumReduceDim == 1) + add_device_layernorm_f16_rank2_instances(op_ptrs); + else if constexpr(Rank == 4 && NumReduceDim == 3) + add_device_layernorm_f16_rank4_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(Rank == 2 && NumReduceDim == 1) + add_device_layernorm_f32_rank2_instances(op_ptrs); + else if constexpr(Rank == 4 && NumReduceDim == 3) + add_device_layernorm_f32_rank4_instances(op_ptrs); + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck