From 356c1cc17bbb7848d72e17c33f8764c631927914 Mon Sep 17 00:00:00 2001 From: rocking5566 Date: Mon, 10 Apr 2023 21:02:17 +0800 Subject: [PATCH] Groupnorm + swish external api (#668) * Rename to proper naming * Add example of groupnorm + swish * Extract duplicate code in example * Add groupnorm + swish instances * Refactor instance generation, split into multiple cpp file * Add external api and client example * Refine profiler message * Use ck math version of exp * Refine problem size in example * Add host version of exp [ROCm/composable_kernel commit: ed3a2e52265e11daa366f47b082141a652b67c58] --- client_example/18_groupnorm/CMakeLists.txt | 2 + .../18_groupnorm/groupnorm_swish.cpp | 169 ++++++++++++++++++ example/42_groupnorm/CMakeLists.txt | 3 +- example/42_groupnorm/common.hpp | 23 +++ .../groupnorm_sigmoid_mul_fp16.cpp | 56 ++++++ example/42_groupnorm/groupnorm_swish_fp16.cpp | 40 +++++ ...oid_fp16.cpp => run_groupnorm_example.inc} | 79 +------- .../element/unary_element_wise_operation.hpp | 19 +- include/ck/utility/math.hpp | 4 + .../device_operation_instance_factory.hpp | 1 + .../gpu/normalization_swish.hpp | 81 +++++++++ .../gpu/normalization/CMakeLists.txt | 10 +- .../device_groupnorm_f16_instance.cpp | 23 +++ .../device_groupnorm_f32_instance.cpp | 23 +++ .../device_groupnorm_swish_f16_instance.cpp | 23 +++ .../device_groupnorm_swish_f32_instance.cpp | 23 +++ .../device_layernorm2d_f16_instance.cpp | 23 +++ .../device_layernorm2d_f32_instance.cpp | 23 +++ .../device_layernorm4d_f16_instance.cpp | 23 +++ .../device_layernorm4d_f32_instance.cpp | 23 +++ .../device_normalization_f16_instance.cpp | 70 -------- ....cpp => normalization_instance_common.hpp} | 52 +++--- .../profiler/profile_groupnorm_impl.hpp | 6 +- 23 files changed, 626 insertions(+), 173 deletions(-) create mode 100644 client_example/18_groupnorm/CMakeLists.txt create mode 100644 client_example/18_groupnorm/groupnorm_swish.cpp create mode 100644 example/42_groupnorm/common.hpp create mode 100644 
example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp create mode 100644 example/42_groupnorm/groupnorm_swish_fp16.cpp rename example/42_groupnorm/{groupnorm_sigmoid_fp16.cpp => run_groupnorm_example.inc} (54%) create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp rename library/src/tensor_operation_instance/gpu/normalization/{device_normalization_f32_instance.cpp => normalization_instance_common.hpp} (53%) diff --git a/client_example/18_groupnorm/CMakeLists.txt b/client_example/18_groupnorm/CMakeLists.txt new file mode 100644 index 0000000000..17c88cb61b --- /dev/null +++ b/client_example/18_groupnorm/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(client_groupnorm_swish groupnorm_swish.cpp) +target_link_libraries(client_groupnorm_swish PRIVATE composable_kernel::device_operations) diff --git a/client_example/18_groupnorm/groupnorm_swish.cpp b/client_example/18_groupnorm/groupnorm_swish.cpp new file mode 100644 index 
0000000000..8a873e6acd --- /dev/null +++ b/client_example/18_groupnorm/groupnorm_swish.cpp @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_normalization.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/normalization_swish.hpp" + +using XDataType = ck::half_t; +using GammaDataType = ck::half_t; +using BetaDataType = ck::half_t; +using YDataType = ck::half_t; +using ComputeDataType = float; +using Swish = ck::tensor_operation::element_wise::Swish; + +constexpr int Rank = 5; +constexpr int NumReduceDim = 3; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + ck::index_t N = 32; + ck::index_t H = 16; + ck::index_t W = 16; + ck::index_t G = 64; + ck::index_t C = 128; + + std::size_t xy_size = N * H * W * G * C; + std::size_t gamma_beta_size = G * C; + + std::vector xy_strides = {H * W * G * C, W * G * C, G * C, C, 1}; + std::vector gamma_beta_strides = {0, 0, 0, C, 1}; + + SimpleDeviceMem x_device_buf(sizeof(XDataType) * xy_size); + SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * gamma_beta_size); + SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * gamma_beta_size); + SimpleDeviceMem y_device_buf(sizeof(YDataType) * xy_size); + + using DeviceOp = ck::tensor_operation::device::DeviceNormalization; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + 
std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C}, // lengths + xy_strides, // xStrides + gamma_beta_strides, // gammaStrides + gamma_beta_strides, // betaStrides + xy_strides, // yStrides + {1, 2, 4}, // reduceDims + 1e-6, + x_device_buf.GetDeviceBuffer(), + gamma_device_buf.GetDeviceBuffer(), + beta_device_buf.GetDeviceBuffer(), + y_device_buf.GetDeviceBuffer(), + nullptr, + nullptr, + Swish{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t num_byte = + sizeof(XDataType) * xy_size + sizeof(GammaDataType) * gamma_beta_size + + sizeof(BetaDataType) * gamma_beta_size + sizeof(YDataType) * xy_size; + + float gb_per_sec = num_byte / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, " + << op_name << std::endl; + + if(ave_time < best_ave_time) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_op_name << std::endl; + + // run the best instance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + + auto argument_ptr = 
op_ptr->MakeArgumentPointer({N, H, W, G, C}, // lengths + xy_strides, // xStrides + gamma_beta_strides, // gammaStrides + gamma_beta_strides, // betaStrides + xy_strides, // yStrides + {1, 2, 4}, // reduceDims + 1e-6, + x_device_buf.GetDeviceBuffer(), + gamma_device_buf.GetDeviceBuffer(), + beta_device_buf.GetDeviceBuffer(), + y_device_buf.GetDeviceBuffer(), + nullptr, + nullptr, + Swish{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/example/42_groupnorm/CMakeLists.txt b/example/42_groupnorm/CMakeLists.txt index c3b7b82592..a9990c5d89 100644 --- a/example/42_groupnorm/CMakeLists.txt +++ b/example/42_groupnorm/CMakeLists.txt @@ -1 +1,2 @@ -add_example_executable(example_groupnorm_sigmoid_fp16 groupnorm_sigmoid_fp16.cpp) +add_example_executable(example_groupnorm_sigmoid_mul_fp16 groupnorm_sigmoid_mul_fp16.cpp) +add_example_executable(example_groupnorm_swish_fp16 groupnorm_swish_fp16.cpp) diff --git a/example/42_groupnorm/common.hpp b/example/42_groupnorm/common.hpp new file mode 100644 index 0000000000..e159abf3e9 --- /dev/null +++ b/example/42_groupnorm/common.hpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_enums.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp" +#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" + +#include "ck/library/utility/fill.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_common_util.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp" diff --git a/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp b/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp new file mode 100644 index 0000000000..b07a26c4c9 --- /dev/null +++ b/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "common.hpp" + +constexpr int Rank = 5; +constexpr int NumReduceDim = 3; + +using XDataType = ck::half_t; +using GammaDataType = ck::half_t; +using BetaDataType = ck::half_t; +using YDataType = ck::half_t; +using ComputeDataType = float; + +struct YElementOp +{ + template + __host__ __device__ void operator()(T& y, const T& x) const + { + static_assert(ck::is_same::value || ck::is_same::value || + ck::is_same::value, + "Data type is not supported by this operation!"); + + T a; + + ck::tensor_operation::element_wise::Sigmoid{}(a, x); + + y = x * a; + }; +}; + +using DeviceInstance = + ck::tensor_operation::device::DeviceNormalizationImpl; // OutScalarPerVector + +#include "run_groupnorm_example.inc" + +int main(int argc, char* argv[]) { run_groupnorm_example(argc, argv); } diff --git a/example/42_groupnorm/groupnorm_swish_fp16.cpp b/example/42_groupnorm/groupnorm_swish_fp16.cpp new file mode 100644 index 0000000000..c52243bfb0 --- /dev/null +++ b/example/42_groupnorm/groupnorm_swish_fp16.cpp @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "common.hpp" + +constexpr int Rank = 5; +constexpr int NumReduceDim = 3; + +using XDataType = ck::half_t; +using GammaDataType = ck::half_t; +using BetaDataType = ck::half_t; +using YDataType = ck::half_t; +using ComputeDataType = float; +using YElementOp = ck::tensor_operation::element_wise::Swish; + +using DeviceInstance = + ck::tensor_operation::device::DeviceNormalizationImpl; // OutScalarPerVector + +#include "run_groupnorm_example.inc" + +int main(int argc, char* argv[]) { run_groupnorm_example(argc, argv); } diff --git a/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp b/example/42_groupnorm/run_groupnorm_example.inc similarity index 54% rename from example/42_groupnorm/groupnorm_sigmoid_fp16.cpp rename to example/42_groupnorm/run_groupnorm_example.inc index 35c7c054e0..bd7eb98ca0 100644 --- a/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp +++ b/example/42_groupnorm/run_groupnorm_example.inc @@ -1,80 +1,15 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
-#include -#include -#include -#include -#include +#pragma once -#include "ck/ck.hpp" -#include "ck/utility/reduction_enums.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp" -#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" - -#include "ck/library/utility/fill.hpp" -#include "ck/library/utility/check_err.hpp" -#include "ck/library/utility/device_memory.hpp" -#include "ck/library/utility/host_common_util.hpp" -#include "ck/library/utility/host_tensor.hpp" -#include "ck/library/utility/host_tensor_generator.hpp" -#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp" - -constexpr int Rank = 5; -constexpr int NumReduceDim = 3; - -using XDataType = ck::half_t; -using GammaDataType = ck::half_t; -using BetaDataType = ck::half_t; -using YDataType = ck::half_t; -using ComputeDataType = float; - -struct YElementOp +int run_groupnorm_example(int argc, char* argv[]) { - template - __host__ __device__ void operator()(T& y, const T& x) const - { - static_assert(ck::is_same::value || ck::is_same::value || - ck::is_same::value, - "Data type is not supported by this operation!"); - - T a; - - ck::tensor_operation::element_wise::Sigmoid{}(a, x); - - y = x * a; - }; -}; - -using DeviceInstance = - ck::tensor_operation::device::DeviceNormalizationImpl; // OutScalarPerVector - -int main(int argc, char* argv[]) -{ - ck::index_t N = 2; - ck::index_t H = 32; - ck::index_t W = 32; - ck::index_t G = 32; - ck::index_t C = 30; + ck::index_t N = 32; + ck::index_t H = 16; + ck::index_t W = 16; + ck::index_t G = 64; + ck::index_t C = 128; if(argc == 1) { diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp index f1f3042ad1..2987def02a 100644 --- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp @@ -316,8 
+316,6 @@ struct Sigmoid y = 1 / (ck::type_convert(1) + exp(-x)); }; - - int32_t divider_ = 1; }; struct TanH @@ -333,6 +331,23 @@ struct TanH }; }; +struct Swish +{ + Swish(float beta = 1.0f) : beta_(beta) {} + + template + __host__ __device__ void operator()(T& y, const T& x) const + { + static_assert(is_same::value || is_same::value || + is_same::value, + "Data type is not supported by this operation!"); + + y = x / (ck::type_convert(1) + ck::math::exp(-beta_ * x)); + }; + + float beta_ = 1.0f; +}; + } // namespace element_wise } // namespace tensor_operation } // namespace ck diff --git a/include/ck/utility/math.hpp b/include/ck/utility/math.hpp index 12203bd7f3..72071992f6 100644 --- a/include/ck/utility/math.hpp +++ b/include/ck/utility/math.hpp @@ -168,6 +168,10 @@ __device__ double exp(double x) return exp(x); } +static inline __host__ float exp(float x) { return std::expf(x); } + +static inline __host__ double exp(double x) { return std::exp(x); } + // greatest common divisor, aka highest common factor __host__ __device__ constexpr index_t gcd(index_t x, index_t y) { diff --git a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp index f176cb91e0..1886439528 100644 --- a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp +++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp @@ -96,6 +96,7 @@ using FastGelu = ck::tensor_operation::element_wise::FastGelu; using AddMultiply = ck::tensor_operation::element_wise::AddMultiply; using ScaleAdd = ck::tensor_operation::element_wise::ScaleAdd; using Gelu = ck::tensor_operation::element_wise::Gelu; +using Swish = ck::tensor_operation::element_wise::Swish; template using Activation_Mul_Clamp = ck::tensor_operation::element_wise::Activation_Mul_Clamp; diff --git 
a/library/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp b/library/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp new file mode 100644 index 0000000000..c04a54455d --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_normalization.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// FP16 +void add_device_normalization_rank_5_3_swish_f16_instances( + std::vector>>&); + +// FP32 +void add_device_normalization_rank_5_3_swish_f32_instances( + std::vector>>&); + +template +struct DeviceOperationInstanceFactory< + ck::tensor_operation::device::DeviceNormalization> +{ + using DeviceOp = DeviceNormalization; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(Rank == 5 && NumReduceDim == 3) + { + add_device_normalization_rank_5_3_swish_f16_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(Rank == 5 && NumReduceDim == 3) + { + add_device_normalization_rank_5_3_swish_f32_instances(op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt index aa0cc11480..6bed36e350 100644 --- 
a/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt @@ -1,4 +1,10 @@ add_instance_library(device_normalization_instance - device_normalization_f16_instance.cpp - device_normalization_f32_instance.cpp + device_layernorm2d_f16_instance.cpp + device_layernorm2d_f32_instance.cpp + device_layernorm4d_f16_instance.cpp + device_layernorm4d_f32_instance.cpp + device_groupnorm_f16_instance.cpp + device_groupnorm_f32_instance.cpp + device_groupnorm_swish_f16_instance.cpp + device_groupnorm_swish_f32_instance.cpp ) diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp new file mode 100644 index 0000000000..e9c2112e16 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "normalization_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Pass = ck::tensor_operation::element_wise::PassThrough; + +void add_device_normalization_rank_5_3_f16_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_normalization_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp new file mode 100644 index 0000000000..79dde38fc9 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "normalization_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Pass = ck::tensor_operation::element_wise::PassThrough; + +void add_device_normalization_rank_5_3_f32_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_normalization_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp new file mode 100644 index 0000000000..6241e03385 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "normalization_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Swish = ck::tensor_operation::element_wise::Swish; + +void add_device_normalization_rank_5_3_swish_f16_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_normalization_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp new file mode 100644 index 0000000000..b64328d5d0 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "normalization_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Swish = ck::tensor_operation::element_wise::Swish; + +void add_device_normalization_rank_5_3_swish_f32_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_normalization_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp new file mode 100644 index 0000000000..d6a2f6f2c1 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "normalization_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Pass = ck::tensor_operation::element_wise::PassThrough; + +void add_device_normalization_rank_2_1_f16_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_normalization_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp new file mode 100644 index 0000000000..73097828e3 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "normalization_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Pass = ck::tensor_operation::element_wise::PassThrough; + +void add_device_normalization_rank_2_1_f32_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_normalization_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp new file mode 100644 index 0000000000..507a683ee7 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "normalization_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Pass = ck::tensor_operation::element_wise::PassThrough; + +void add_device_normalization_rank_4_3_f16_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_normalization_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp new file mode 100644 index 0000000000..ca1aa0c25c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "normalization_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Pass = ck::tensor_operation::element_wise::PassThrough; + +void add_device_normalization_rank_4_3_f32_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_normalization_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp deleted file mode 100644 index beeaa3aa22..0000000000 --- a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp +++ /dev/null @@ -1,70 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp" -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -using F16 = ck::half_t; -using F32 = float; - -using Pass = ck::tensor_operation::element_wise::PassThrough; - -template -// clang-format off -using device_normalization_f16_instances = - std::tuple < - // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize> - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl - >; -// clang-format on - -void add_device_normalization_rank_2_1_f16_instances( - std::vector>>& - instances) -{ - add_device_operation_instances(instances, device_normalization_f16_instances{}); -} - -void add_device_normalization_rank_4_3_f16_instances( - std::vector>>& - instances) -{ - add_device_operation_instances(instances, device_normalization_f16_instances{}); -} - -void add_device_normalization_rank_5_3_f16_instances( - std::vector>>& - instances) -{ - add_device_operation_instances(instances, device_normalization_f16_instances{}); -} - -} // namespace instance -} 
// namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp similarity index 53% rename from library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp rename to library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp index 4d236fb633..a58fb6ca35 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp @@ -1,6 +1,8 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. +#pragma once + #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp" #include "ck/utility/data_type.hpp" @@ -12,12 +14,37 @@ namespace tensor_operation { namespace device { namespace instance { +using F16 = ck::half_t; using F32 = float; -using Pass = ck::tensor_operation::element_wise::PassThrough; +template +using device_normalization_f16_instances = + // clang-format off + std::tuple < + // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize> + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + 
DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl + // clang-format on + >; template -using device_layernorm_f32_instances = std::tuple< +using device_normalization_f32_instances = std::tuple< // clang-format off // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize> DeviceNormalizationImpl, // irregular size @@ -42,27 +69,6 @@ using device_layernorm_f32_instances = std::tuple< // clang-format on >; -void add_device_normalization_rank_2_1_f32_instances( - std::vector>>& - instances) -{ - add_device_operation_instances(instances, device_layernorm_f32_instances{}); -} - -void add_device_normalization_rank_4_3_f32_instances( - std::vector>>& - instances) -{ - add_device_operation_instances(instances, device_layernorm_f32_instances{}); -} - -void add_device_normalization_rank_5_3_f32_instances( - std::vector>>& - instances) -{ - add_device_operation_instances(instances, device_layernorm_f32_instances{}); -} - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/profiler/include/profiler/profile_groupnorm_impl.hpp b/profiler/include/profiler/profile_groupnorm_impl.hpp index 81fec5590a..73343f6bec 100644 --- a/profiler/include/profiler/profile_groupnorm_impl.hpp +++ b/profiler/include/profiler/profile_groupnorm_impl.hpp @@ -190,9 +190,9 @@ bool profile_groupnorm_impl(int do_verification, if(time_kernel) { - LogRange(std::cout << "length = ", length, ",") << ", "; - std::cout << "num_kernel = " << num_kernel << ", best perf = " << best_avg_time << " ms, " - << best_gb_per_sec << " GB/s, " << best_instance_name << std::endl; + LogRange(std::cout << "length = ", length, 
",") << std::endl; + std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_instance_name << std::endl; } if(num_kernel == 0)