From 3b685d045277054f453cfd042461e12d87420de4 Mon Sep 17 00:00:00 2001 From: rocking5566 Date: Fri, 12 Aug 2022 06:03:54 +0800 Subject: [PATCH] ckProfiler for layernorm (#330) * Refine parameter * Add base class for layernorm * Add layernorm instance * Add layernorm to ckProfiler * Remove redundant * Add verification * Fix compile error due to merge [ROCm/composable_kernel commit: fdfd7eb597cc557c3ad7c831c8c89a437ec4d948] --- example/27_layernorm/layernorm_blockwise.cpp | 2 +- .../gpu/device/device_layernorm.hpp | 40 +-- .../gpu/device/device_normalization.hpp | 43 ++++ .../gpu/normalization/CMakeLists.txt | 2 + .../device_layernorm_f16_instance.cpp | 53 ++++ .../device_layernorm_f32_instance.cpp | 51 ++++ profiler/CMakeLists.txt | 1 + profiler/include/profile_layernorm_impl.hpp | 238 ++++++++++++++++++ .../include/profile_normalization_impl.hpp | 1 - profiler/src/profile_layernorm.cpp | 123 +++++++++ profiler/src/profile_normalization.cpp | 3 +- profiler/src/profiler.cpp | 8 +- 12 files changed, 544 insertions(+), 21 deletions(-) create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f32_instance.cpp create mode 100644 profiler/include/profile_layernorm_impl.hpp create mode 100644 profiler/src/profile_layernorm.cpp diff --git a/example/27_layernorm/layernorm_blockwise.cpp b/example/27_layernorm/layernorm_blockwise.cpp index e2625a7772..38a2a63663 100644 --- a/example/27_layernorm/layernorm_blockwise.cpp +++ b/example/27_layernorm/layernorm_blockwise.cpp @@ -46,7 +46,7 @@ using DeviceInstance = ck::tensor_operation::device::DeviceLayernorm; // OutScalarPerVector + 8>; // OutScalarPerVector int main() { diff --git a/include/ck/tensor_operation/gpu/device/device_layernorm.hpp b/include/ck/tensor_operation/gpu/device/device_layernorm.hpp index d4c771c007..464ac8c549 100644 --- 
a/include/ck/tensor_operation/gpu/device/device_layernorm.hpp +++ b/include/ck/tensor_operation/gpu/device/device_layernorm.hpp @@ -7,7 +7,7 @@ #include #include "ck/utility/reduction_operator.hpp" -#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/tensor_operation/gpu/device/device_normalization.hpp" #include "ck/tensor_operation/gpu/device/device_reduce.hpp" #include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp" #include "ck/tensor_operation/gpu/device/device_reduce_common.hpp" @@ -39,7 +39,14 @@ template -struct DeviceLayernorm : public BaseOperator +struct DeviceLayernorm : public DeviceNormalization2 { static_assert( (KThreadSliceSize % GammaSrcVectorSize == 0), @@ -297,17 +304,18 @@ struct DeviceLayernorm : public BaseOperator return true; }; - std::unique_ptr MakeArgumentPointer(const std::vector lengths, - const std::vector xStrides, - const std::vector gammaStrides, - const std::vector betaStrides, - const std::vector reduceDims, - AccDataType epsilon, - const void* p_x, - const void* p_gamma, - const void* p_beta, - void* p_y, - AccElementwiseOperation acc_elementwise_op) + std::unique_ptr + MakeArgumentPointer(const std::vector lengths, + const std::vector xStrides, + const std::vector gammaStrides, + const std::vector betaStrides, + const std::vector reduceDims, + AccDataType epsilon, + const void* p_x, + const void* p_gamma, + const void* p_beta, + void* p_y, + AccElementwiseOperation acc_elementwise_op) override { return std::make_unique(lengths, xStrides, @@ -322,7 +330,10 @@ struct DeviceLayernorm : public BaseOperator static_cast(p_y)); }; - std::unique_ptr MakeInvokerPointer() { return std::make_unique(); }; + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(); + }; std::string GetTypeString() const override { @@ -332,7 +343,6 @@ struct DeviceLayernorm : public BaseOperator str << "DeviceLayernorm<" << BlockSize << ","; str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize 
<< ","; str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ","; - str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ","; str << "XYSrcVectorDim_" << XYSrcVectorDim << ","; str << "VectorSize_X" << XSrcVectorSize << "_Gamma" << GammaSrcVectorSize << "_Beta" << BetaSrcVectorSize << "_Y" << YDstVectorSize << ">"; // clang-format on diff --git a/include/ck/tensor_operation/gpu/device/device_normalization.hpp b/include/ck/tensor_operation/gpu/device/device_normalization.hpp index 0e4313f17d..2ca66c5d82 100644 --- a/include/ck/tensor_operation/gpu/device/device_normalization.hpp +++ b/include/ck/tensor_operation/gpu/device/device_normalization.hpp @@ -38,6 +38,49 @@ struct DeviceNormalization : public BaseOperator using DeviceNormalizationPtr = std::unique_ptr; +template +struct DeviceNormalization2 : public BaseOperator +{ + virtual std::unique_ptr + MakeArgumentPointer(const std::vector lengths, + const std::vector xStrides, + const std::vector gammaStrides, + const std::vector betaStrides, + const std::vector reduceDims, + AccDataType epsilon, + const void* p_x, + const void* p_gamma, + const void* p_beta, + void* p_y, + AccElementwiseOperation acc_elementwise_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +template +using DeviceNormalization2Ptr = std::unique_ptr>; + } // namespace device } // namespace tensor_operation } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt index a6ae07bab9..a38539dcb7 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt @@ -1,5 +1,7 @@ # device_normalization_instance set(DEVICE_NORMALIZATION_INSTANCE_SOURCE + device_layernorm_f16_instance.cpp + device_layernorm_f32_instance.cpp device_softmax_f32_f32_instance.cpp device_softmax_f16_f16_instance.cpp ) diff 
--git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f16_instance.cpp new file mode 100644 index 0000000000..b880d648dd --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f16_instance.cpp @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_layernorm.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Pass = ck::tensor_operation::element_wise::PassThrough; + +template +using device_layernorm_f16_instances = std::tuple< + // clang-format off + // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize> + DeviceLayernorm, // fallback kernel + DeviceLayernorm, // fallback kernel + DeviceLayernorm, // fallback kernel + DeviceLayernorm, + DeviceLayernorm, + DeviceLayernorm, + DeviceLayernorm, + DeviceLayernorm, + DeviceLayernorm, + DeviceLayernorm, + DeviceLayernorm + // clang-format on + >; + +void add_device_layernorm_f16_rank2_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_layernorm_f16_instances<2, 1>{}); +} + +void add_device_layernorm_f16_rank4_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_layernorm_f16_instances<4, 3>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git 
a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f32_instance.cpp new file mode 100644 index 0000000000..e30f76b514 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f32_instance.cpp @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_layernorm.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; + +using Pass = ck::tensor_operation::element_wise::PassThrough; + +template +using device_layernorm_f32_instances = std::tuple< + // clang-format off + // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize> + DeviceLayernorm, // fallback kernel + DeviceLayernorm, // fallback kernel + DeviceLayernorm, + DeviceLayernorm, + DeviceLayernorm, + DeviceLayernorm, + DeviceLayernorm, + DeviceLayernorm, + DeviceLayernorm, + DeviceLayernorm + // clang-format on + >; + +void add_device_layernorm_f32_rank2_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_layernorm_f32_instances<2, 1>{}); +} + +void add_device_layernorm_f32_rank4_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_layernorm_f32_instances<4, 3>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/profiler/CMakeLists.txt b/profiler/CMakeLists.txt index 274cfd5f21..449e3fd94f 100644 
--- a/profiler/CMakeLists.txt +++ b/profiler/CMakeLists.txt @@ -21,6 +21,7 @@ set(PROFILER_SOURCE src/profile_conv_bwd_weight.cpp src/profile_grouped_conv_fwd.cpp src/profile_reduce.cpp + src/profile_layernorm.cpp src/profile_normalization.cpp ) diff --git a/profiler/include/profile_layernorm_impl.hpp b/profiler/include/profile_layernorm_impl.hpp new file mode 100644 index 0000000000..0f26050b95 --- /dev/null +++ b/profiler/include/profile_layernorm_impl.hpp @@ -0,0 +1,238 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/ck.hpp" +#include "profiler/include/data_type_enum.hpp" +#include "ck/tensor_operation/gpu/device/device_layernorm.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +void add_device_layernorm_f16_rank2_instances( + std::vector>&); + +void add_device_layernorm_f32_rank2_instances( + std::vector>&); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck + +namespace ck { +namespace profiler { + +template +void profile_layernorm_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + std::vector length, + std::vector strideXY, + std::vector strideGamma, + std::vector strideBeta) +{ + using F16 = ck::half_t; + using F32 = float; + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + + if(length.size() < 2) + return; + + // Assume normalize dimension except for first dimension + std::vector reduce_length{length.begin() + 
1, length.end()}; + std::vector reduce_dim; + for(int i = 1; i < Rank; ++i) + reduce_dim.push_back(i); + + Tensor x(length); + Tensor gamma(reduce_length, strideGamma); + Tensor beta(reduce_length, strideBeta); + Tensor y(length, strideXY); + Tensor host_y(length, strideXY); + + switch(init_method) + { + // case 0: break; + case 0: + x.GenerateTensorValue(GeneratorTensor_1{}); + gamma.GenerateTensorValue(GeneratorTensor_1{}); + beta.GenerateTensorValue(GeneratorTensor_1{}); + y.GenerateTensorValue(GeneratorTensor_1{}); + break; + case 1: + x.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + gamma.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + beta.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + y.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + x.GenerateTensorValue(GeneratorTensor_3{0, 1}); + gamma.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + beta.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + y.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize()); + DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize()); + DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize()); + DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize()); + + x_dev.ToDevice(x.mData.data()); + gamma_dev.ToDevice(gamma.mData.data()); + beta_dev.ToDevice(beta.mData.data()); + + // add device normalization instances + constexpr int NumReduceDim = Rank - 1; + std::vector> + instances; + + if constexpr(is_same::value && is_same::value && + is_same::value && is_same::value && + is_same::value) + { + if(length.size() == 2) + tensor_operation::device::instance::add_device_layernorm_f16_rank2_instances(instances); + } + else if constexpr(is_same::value && is_same::value && + is_same::value && is_same::value && + is_same::value) + { + if(length.size() == 2) + tensor_operation::device::instance::add_device_layernorm_f32_rank2_instances(instances); + 
} + + if(instances.size() <= 0) + { + throw std::runtime_error("wrong! no device normalization instance found"); + } + + std::string best_instance_name; + float best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + if(do_verification) + { + using ReferenceInstance = ck::tensor_operation::host::ReferenceLayernorm; + + ReferenceInstance ref; + auto ref_argument = + ref.MakeArgument(x, gamma, beta, host_y, PassThrough{}, length, reduce_dim, 1e-4); + auto ref_invoker = ref.MakeInvoker(); + ref_invoker.Run(ref_argument); + } + + for(auto& inst_ptr : instances) + { + auto argument_ptr = inst_ptr->MakeArgumentPointer(length, + strideXY, + strideGamma, + strideBeta, + reduce_dim, + 1e-4, + x_dev.GetDeviceBuffer(), + gamma_dev.GetDeviceBuffer(), + beta_dev.GetDeviceBuffer(), + y_dev.GetDeviceBuffer(), + PassThrough{}); + + if(!inst_ptr->IsSupportedArgument(argument_ptr.get())) + { + std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: "; + LogRange(std::cout << "input lengths = [", length, "], ") << std::endl; + + return; + } + + auto invoker_ptr = inst_ptr->MakeInvokerPointer(); + + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t num_bytes = x.mDesc.GetElementSize() * sizeof(XDataType) + + gamma.mDesc.GetElementSize() * sizeof(GammaDataType) + + beta.mDesc.GetElementSize() * sizeof(BetaDataType) + + y.mDesc.GetElementSize() * sizeof(YDataType); + + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, " + << inst_ptr->GetTypeString() << std::endl; + + if(avg_time < best_avg_time) + { + best_instance_name = inst_ptr->GetTypeString(); + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + y_dev.FromDevice(y.mData.data()); + + bool pass = ck::utils::check_err( + y.mData, host_y.mData, "Error: Incorrect results d1", 1e-3, 1e-3); + + if(do_log) + { 
+ LogRangeAsType(std::cout << "x : ", x.mData, ",") << std::endl; + LogRangeAsType(std::cout << "host_y : ", host_y.mData, ",") << std::endl; + LogRangeAsType(std::cout << "y : ", y.mData, ",") << std::endl; + } + + if(!pass) + { + std::cout << inst_ptr->GetTypeString() << " failed verification: "; + LogRange(std::cout << "lengths = [", length, ", ") << "]." << std::endl; + return; + } + else + { + std::cout << "pass" << std::endl; + } + } + } + + LogRange(std::cout << "length = ", length, ",") << ", "; + LogRange(std::cout << "stride = ", strideXY, ",") << ", "; + LogRange(std::cout << "reduce dims ", reduce_dim, ",") << std::endl; + std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_instance_name << std::endl; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/include/profile_normalization_impl.hpp b/profiler/include/profile_normalization_impl.hpp index 77a2c32d18..394d679ce2 100644 --- a/profiler/include/profile_normalization_impl.hpp +++ b/profiler/include/profile_normalization_impl.hpp @@ -36,7 +36,6 @@ namespace profiler { enum struct NormType { - LAYERNORM, BATCHNORM, SOFTMAX, }; diff --git a/profiler/src/profile_layernorm.cpp b/profiler/src/profile_layernorm.cpp new file mode 100644 index 0000000000..f4cffb33d1 --- /dev/null +++ b/profiler/src/profile_layernorm.cpp @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include +#include + +#include "profiler/include/profile_layernorm_impl.hpp" + +using ck::index_t; + +struct LayernormArgParser +{ + std::unordered_map> long_opts = { + {"length", {}}, {"strideXY", {}}, {"strideGamma", {}}, {"strideBeta", {}}}; + + bool parse_opt(int argc, char* argv[], const std::string& key, int i) + { + if(std::string("--") + key == argv[i]) + { + int pos = i; + while(++i < argc && argv[i][0] != '-') {} + int end = i; + for(int j = pos + 1; j < end; j++) + { + long_opts[key].push_back(std::stoi(argv[j])); + } + return true; + } + return false; + } + + void operator()(int argc, char* argv[]) + { + for(auto& kv : long_opts) + { + for(int i = 1; i < argc; i++) + { + if(parse_opt(argc, argv, kv.first, i)) + break; + } + } + } +}; + +void print_help_layernorm() +{ + std::cout << "arg1: data type (0: fp16; 1: fp32)\n" + << "arg2: verification (0: no; 1: yes)\n" + << "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n" + << "arg4: print tensor value (0: no; 1: yes)\n" + << "arg5: time kernel (0=no, 1=yes)\n" + << "--length: tensor extents (e.g., --length 1024 1024) \n" + << "--strideXY: tensor strides (e.g., --strideXY 1024 1)\n" + << "--strideGamma: tensor strides (e.g., --strideGamma 1)\n" + << "--strideBeta: tensor strides (e.g., --strideBeta 1)\n" + << std::endl; +} + +int profile_layernorm(int argc, char* argv[]) +{ + if(argc <= 2) + { + print_help_layernorm(); + return 0; + } + + LayernormArgParser arg_parser; + + // short unnamed options + const ck::DataTypeEnum data_type = static_cast(std::stoi(argv[2])); + const bool do_verification = std::stoi(argv[3]); + const int init_method = std::stoi(argv[4]); + const bool do_log = std::stoi(argv[5]); + const bool time_kernel = std::stoi(argv[6]); + + // parse the long options + arg_parser(argc, argv); + const std::vector length = arg_parser.long_opts["length"]; + const std::vector strideXY = arg_parser.long_opts["strideXY"]; + const std::vector strideGamma = 
arg_parser.long_opts["strideGamma"]; + const std::vector strideBeta = arg_parser.long_opts["strideBeta"]; + + using F16 = ck::half_t; + using F32 = float; + constexpr int rank = 2; + + if(data_type == ck::DataTypeEnum::Half) + { + ck::profiler::profile_layernorm_impl(do_verification, + init_method, + do_log, + time_kernel, + length, + strideXY, + strideGamma, + strideBeta); + } + else if(data_type == ck::DataTypeEnum::Float) + { + ck::profiler::profile_layernorm_impl(do_verification, + init_method, + do_log, + time_kernel, + length, + strideXY, + strideGamma, + strideBeta); + } + else + { + throw std::runtime_error("not implemented yet"); + } + + return 0; +} + +// hijack main() for quick debugging +// int main(int argc, char* argv[]) +// { +// profile_layernorm(argc, argv); +// return 0; +// } diff --git a/profiler/src/profile_normalization.cpp b/profiler/src/profile_normalization.cpp index 277a78a669..5f2913464b 100644 --- a/profiler/src/profile_normalization.cpp +++ b/profiler/src/profile_normalization.cpp @@ -13,8 +13,7 @@ using ck::profiler::NormType; struct ArgParser { - std::unordered_map norm_dict = {{"layernorm", NormType::LAYERNORM}, - {"batchnorm", NormType::BATCHNORM}, + std::unordered_map norm_dict = {{"batchnorm", NormType::BATCHNORM}, {"softmax", NormType::SOFTMAX}}; std::unordered_map> long_opts = { diff --git a/profiler/src/profiler.cpp b/profiler/src/profiler.cpp index 0b1602acc2..c43cc23a9e 100644 --- a/profiler/src/profiler.cpp +++ b/profiler/src/profiler.cpp @@ -19,6 +19,7 @@ int profile_conv_bwd_data(int, char*[]); int profile_conv_bwd_weight(int, char*[]); int profile_grouped_conv_fwd(int, char*[]); int profile_normalization(int, char*[]); +int profile_layernorm(int, char*[]); int profile_reduce(int, char*[]); static void print_helper_message() @@ -115,11 +116,14 @@ int main(int argc, char* argv[]) { return profile_reduce(argc, argv); } - else if(strcmp(argv[1], "batchnorm") == 0 || strcmp(argv[1], "layernorm") == 0 || - strcmp(argv[1], 
"softmax") == 0) + else if(strcmp(argv[1], "batchnorm") == 0 || strcmp(argv[1], "softmax") == 0) { return profile_normalization(argc, argv); } + else if(strcmp(argv[1], "layernorm") == 0) + { + return profile_layernorm(argc, argv); + } else { print_helper_message();