From b148acfaaa80daaf8296dd7552eab89b5d92a5fc Mon Sep 17 00:00:00 2001 From: Qianfeng Date: Thu, 26 Jan 2023 07:09:04 +0800 Subject: [PATCH] Batchnorm inference instances, external API, client examples and gtests (#531) * File renaming and class renaming for device element-wise operation * Add batchnorm-infer instances, external API and client example * Add batchnorm-infer profiler module and gtests * Remove file device_elementwise_extension.hpp and move NormalizeInInfer operation to element_wise_operation.hpp * Remove the using of class aliasing for DeviceElementwiseForBatchNormInfer * Rename class and file due to conflict from device_elementwise_2d.hpp * Fix namespace in batcnnorm_infer_nhwc client example [ROCm/composable_kernel commit: a1b2441f8d3fc229629d0c6c18ef5836d1548e12] --- .../gemm_add_add_layernorm.cpp | 2 +- client_example/13_batchnorm/CMakeLists.txt | 2 + .../13_batchnorm/batchnorm_infer_nhwc.cpp | 189 ++++++++++ .../broadcast_add_2d_amn_bn.cpp | 16 +- .../broadcast_add_3d_am_bmnk.cpp | 16 +- .../elementwise_add_1d.cpp | 16 +- .../elementwise_add_4d.cpp | 16 +- ...bias_relu_add_layernorm_xdl_naive_fp16.cpp | 4 +- .../gemm_layernorm_xdl_naive_fp16.cpp | 4 +- example/34_batchnorm/batchnorm_infer_impl.hpp | 4 +- .../elementwise_permute_4D_fp16.cpp | 16 +- .../elementwise_permute_4D_fp16_2d.cpp | 20 +- ...ntwise_base.hpp => device_elementwise.hpp} | 6 +- .../device_elementwise_2d_impl.hpp} | 10 +- ...ntwise.hpp => device_elementwise_impl.hpp} | 6 +- .../gpu/element/element_wise_operation.hpp | 34 ++ .../gpu/batchnorm_infer.hpp | 117 ++++++ .../gpu/device_elementwise_instance.hpp | 9 +- .../gpu/batchnorm/CMakeLists.txt | 4 + .../device_batchnorm_infer_bf16_instance.cpp | 55 +++ .../device_batchnorm_infer_f16_instance.cpp | 54 +++ .../device_batchnorm_infer_f32_instance.cpp | 52 +++ .../device_batchnorm_infer_f64_instance.cpp | 47 +++ .../elementwise/device_normalize_instance.cpp | 12 +- .../profiler/profile_batchnorm_infer_impl.hpp | 335 
++++++++++++++++++ profiler/src/CMakeLists.txt | 1 + profiler/src/profile_batchnorm_infer.cpp | 202 +++++++++++ test/batchnorm/CMakeLists.txt | 2 + test/batchnorm/batchnorm_infer_rank_4.cpp | 89 +++++ 29 files changed, 1260 insertions(+), 80 deletions(-) create mode 100644 client_example/13_batchnorm/batchnorm_infer_nhwc.cpp rename include/ck/tensor_operation/gpu/device/{device_elementwise_base.hpp => device_elementwise.hpp} (87%) rename include/ck/tensor_operation/gpu/device/{device_elementwise_2d.hpp => impl/device_elementwise_2d_impl.hpp} (97%) rename include/ck/tensor_operation/gpu/device/impl/{device_elementwise.hpp => device_elementwise_impl.hpp} (98%) create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp create mode 100644 library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_bf16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f64_instance.cpp create mode 100644 profiler/include/profiler/profile_batchnorm_infer_impl.hpp create mode 100644 profiler/src/profile_batchnorm_infer.cpp create mode 100644 test/batchnorm/batchnorm_infer_rank_4.cpp diff --git a/client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp b/client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp index 6c259407d4..02da5ff6ce 100644 --- a/client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp +++ b/client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp @@ -8,7 +8,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp" +#include 
"ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp" diff --git a/client_example/13_batchnorm/CMakeLists.txt b/client_example/13_batchnorm/CMakeLists.txt index 54669678ae..fc4f9d395c 100644 --- a/client_example/13_batchnorm/CMakeLists.txt +++ b/client_example/13_batchnorm/CMakeLists.txt @@ -1,4 +1,6 @@ add_executable(client_batchnorm_fwd_nhwc batchnorm_fwd_nhwc.cpp) add_executable(client_batchnorm_bwd_nhwc batchnorm_bwd_nhwc.cpp) +add_executable(client_batchnorm_infer_nhwc batchnorm_infer_nhwc.cpp) target_link_libraries(client_batchnorm_fwd_nhwc PRIVATE composable_kernel::device_operations) target_link_libraries(client_batchnorm_bwd_nhwc PRIVATE composable_kernel::device_operations) +target_link_libraries(client_batchnorm_infer_nhwc PRIVATE composable_kernel::device_operations) diff --git a/client_example/13_batchnorm/batchnorm_infer_nhwc.cpp b/client_example/13_batchnorm/batchnorm_infer_nhwc.cpp new file mode 100644 index 0000000000..3117d162db --- /dev/null +++ b/client_example/13_batchnorm/batchnorm_infer_nhwc.cpp @@ -0,0 +1,189 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/utility/tuple.hpp" +#include "ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp" + +using XDataType = float; +using YDataType = float; +using ScaleDataType = float; +using BiasDataType = float; +using MeanVarDataType = float; + +constexpr int Rank = 4; +constexpr int NumBatchNormReduceDim = 3; + +using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer; + +const double epsilon = std::numeric_limits::epsilon(); + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + std::array xyLengths{16, 8, 128, 256}; + std::array xyStrides{8 * 128 * 256, 128 * 256, 256, 1}; + std::array scaleBiasMeanVarLengths{256}; + std::array scaleBiasMeanVarStrides{1}; + std::array reduceDims{0, 1, 2}; + std::array invariantDims{3}; + + ck::index_t numXYElement = + std::accumulate(xyLengths.begin(), xyLengths.end(), 1, std::multiplies()); + + ck::index_t numScaleBiasMeanVarElement = std::accumulate(scaleBiasMeanVarLengths.begin(), + scaleBiasMeanVarLengths.end(), + 1, + std::multiplies()); + + SimpleDeviceMem x(sizeof(XDataType) * numXYElement); + SimpleDeviceMem y(sizeof(YDataType) * numXYElement); + SimpleDeviceMem scale(sizeof(ScaleDataType) * numScaleBiasMeanVarElement); + SimpleDeviceMem bias(sizeof(BiasDataType) * numScaleBiasMeanVarElement); + SimpleDeviceMem mean(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement); + SimpleDeviceMem variance(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement); + + // values in variance need be non-negative + (void)hipMemset( + variance.GetDeviceBuffer(), 0, sizeof(MeanVarDataType) * numScaleBiasMeanVarElement); + + std::array aligned_scaleBiasMeanVarStrides{0}; + 
+ int i = 0; + for(auto dim : invariantDims) + { + assert(xyLengths[dim] == scaleBiasMeanVarLengths[i]); + + aligned_scaleBiasMeanVarStrides[dim] = scaleBiasMeanVarStrides[i]; + i++; + }; + + using DeviceOp = ck::tensor_operation::device::DeviceElementwise< + ck::Tuple, + ck::Tuple, + Normalize, + Rank>; + + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = op_ptr->MakeArgumentPointer(xyLengths, + {xyStrides, + aligned_scaleBiasMeanVarStrides, + aligned_scaleBiasMeanVarStrides, + aligned_scaleBiasMeanVarStrides, + aligned_scaleBiasMeanVarStrides}, + {xyStrides}, + {x.GetDeviceBuffer(), + mean.GetDeviceBuffer(), + variance.GetDeviceBuffer(), + scale.GetDeviceBuffer(), + bias.GetDeviceBuffer()}, + {y.GetDeviceBuffer()}, + Normalize{epsilon}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t num_bytes = + numXYElement * (sizeof(XDataType) + sizeof(YDataType)) + + numScaleBiasMeanVarElement * (sizeof(ScaleDataType) + sizeof(BiasDataType) + + sizeof(MeanVarDataType) + sizeof(MeanVarDataType)); + + float gb_per_sec = num_bytes / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, " + << op_name << std::endl; + + if(ave_time < best_ave_time) + { + found = true; + best_op_id = i; + best_op_name = op_name; + 
best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + if(found) + { + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_op_name << std::endl; + + // run the best intance + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + + auto argument_ptr = op_ptr->MakeArgumentPointer(xyLengths, + {xyStrides, + aligned_scaleBiasMeanVarStrides, + aligned_scaleBiasMeanVarStrides, + aligned_scaleBiasMeanVarStrides, + aligned_scaleBiasMeanVarStrides}, + {xyStrides}, + {x.GetDeviceBuffer(), + mean.GetDeviceBuffer(), + variance.GetDeviceBuffer(), + scale.GetDeviceBuffer(), + bias.GetDeviceBuffer()}, + {y.GetDeviceBuffer()}, + Normalize{epsilon}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp b/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp index 9eae27ca6e..bee5dea546 100644 --- a/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp +++ b/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp @@ -6,7 +6,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" @@ -23,13 +23,13 @@ using CDataType = F16; using Add = ck::tensor_operation::element_wise::Add; using DeviceElementwiseAddInstance = - ck::tensor_operation::device::DeviceElementwise, - ck::Tuple, - Add, - 2, - 8, - 
ck::Sequence<8, 8>, - ck::Sequence<8>>; + ck::tensor_operation::device::DeviceElementwiseImpl, + ck::Tuple, + Add, + 2, + 8, + ck::Sequence<8, 8>, + ck::Sequence<8>>; template , - ck::Tuple, - Add, - 3, - 8, - ck::Sequence<1, 8>, - ck::Sequence<8>>; + ck::tensor_operation::device::DeviceElementwiseImpl, + ck::Tuple, + Add, + 3, + 8, + ck::Sequence<1, 8>, + ck::Sequence<8>>; template void host_broadcast3D_am_bmnk(HostTensorC& C, diff --git a/example/19_binary_elementwise/elementwise_add_1d.cpp b/example/19_binary_elementwise/elementwise_add_1d.cpp index a1ca9378d3..a5a6bc0a8b 100644 --- a/example/19_binary_elementwise/elementwise_add_1d.cpp +++ b/example/19_binary_elementwise/elementwise_add_1d.cpp @@ -5,7 +5,7 @@ #include #include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp" #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" @@ -21,13 +21,13 @@ using CDataType = F16; using Add = ck::tensor_operation::element_wise::Add; using DeviceElementwiseAddInstance = - ck::tensor_operation::device::DeviceElementwise, - ck::Tuple, - Add, - 1, - 8, - ck::Sequence<8, 8>, - ck::Sequence<8>>; + ck::tensor_operation::device::DeviceElementwiseImpl, + ck::Tuple, + Add, + 1, + 8, + ck::Sequence<8, 8>, + ck::Sequence<8>>; template void host_elementwise1D( diff --git a/example/19_binary_elementwise/elementwise_add_4d.cpp b/example/19_binary_elementwise/elementwise_add_4d.cpp index 27e1001481..cc209b12e3 100644 --- a/example/19_binary_elementwise/elementwise_add_4d.cpp +++ b/example/19_binary_elementwise/elementwise_add_4d.cpp @@ -6,7 +6,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp" +#include 
"ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp" #include "ck/library/utility/algorithm.hpp" #include "ck/library/utility/check_err.hpp" @@ -23,13 +23,13 @@ using CDataType = F16; using Add = ck::tensor_operation::element_wise::Add; using DeviceElementwiseAddInstance = - ck::tensor_operation::device::DeviceElementwise, - ck::Tuple, - Add, - 4, - 8, - ck::Sequence<8, 8>, - ck::Sequence<8>>; + ck::tensor_operation::device::DeviceElementwiseImpl, + ck::Tuple, + Add, + 4, + 8, + ck::Sequence<8, 8>, + ck::Sequence<8>>; template void host_elementwise4D(HostTensorC& C, diff --git a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp index e37555e761..83b17699a7 100644 --- a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp +++ b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp @@ -10,7 +10,7 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/utility/device_memory.hpp" @@ -95,7 +95,7 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm, // x, mean, // variance, // scale, diff --git a/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp b/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp index 0bbdbe52b9..2ceda86839 100644 --- a/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp +++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp @@ -3,7 +3,7 @@ #include "ck/ck.hpp" #include 
"ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp" #include "ck/library/utility/algorithm.hpp" #include "ck/library/utility/check_err.hpp" @@ -19,13 +19,13 @@ using BDataType = F16; using PassThrough = ck::tensor_operation::element_wise::PassThrough; using DeviceElementwisePermuteInstance = - ck::tensor_operation::device::DeviceElementwise, - ck::Tuple, - PassThrough, - 4, - 8, - ck::Sequence<8>, - ck::Sequence<1>>; + ck::tensor_operation::device::DeviceElementwiseImpl, + ck::Tuple, + PassThrough, + 4, + 8, + ck::Sequence<8>, + ck::Sequence<1>>; template void host_elementwise4D(HostTensorB& B_nhwc, const HostTensorA& A_nchw, Functor functor) diff --git a/example/44_elementwise_permute/elementwise_permute_4D_fp16_2d.cpp b/example/44_elementwise_permute/elementwise_permute_4D_fp16_2d.cpp index f16ad3b3c5..6b94a5d46f 100644 --- a/example/44_elementwise_permute/elementwise_permute_4D_fp16_2d.cpp +++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16_2d.cpp @@ -3,7 +3,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/device/device_elementwise_2d.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" @@ -17,15 +17,15 @@ using BDataType = F16; using PassThrough = ck::tensor_operation::element_wise::PassThrough; using DeviceElementwisePermuteInstance = - ck::tensor_operation::device::DeviceElementwise, - ck::Tuple, - PassThrough, - 3, // NumDim_M - 1, // NumDim_N - 8, - 8, - ck::Sequence<8>, - ck::Sequence<8>>; + ck::tensor_operation::device::DeviceElementwise2dImpl, + ck::Tuple, + PassThrough, + 3, // NumDim_M + 1, // NumDim_N + 8, + 8, + ck::Sequence<8>, + ck::Sequence<8>>; template void 
host_elementwise4D(HostTensorB& B_nhwc, diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise_base.hpp b/include/ck/tensor_operation/gpu/device/device_elementwise.hpp similarity index 87% rename from include/ck/tensor_operation/gpu/device/device_elementwise_base.hpp rename to include/ck/tensor_operation/gpu/device/device_elementwise.hpp index 728faf543d..f9f913a7c1 100644 --- a/include/ck/tensor_operation/gpu/device/device_elementwise_base.hpp +++ b/include/ck/tensor_operation/gpu/device/device_elementwise.hpp @@ -17,7 +17,7 @@ template -struct DeviceElementwiseBase : public BaseOperator +struct DeviceElementwise : public BaseOperator { static constexpr int NumInput = InDataTypeTuple::Size(); static constexpr int NumOutput = OutDataTypeTuple::Size(); @@ -37,8 +37,8 @@ template -using DeviceElementwiseBasePtr = std::unique_ptr< - DeviceElementwiseBase>; +using DeviceElementwisePtr = std::unique_ptr< + DeviceElementwise>; } // namespace device } // namespace tensor_operation diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp similarity index 97% rename from include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp rename to include/ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp index 23aada0f44..83ed6198bd 100644 --- a/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp @@ -8,7 +8,7 @@ #include "ck/utility/math.hpp" #include "ck/utility/sequence.hpp" -#include "ck/tensor_operation/gpu/device/device_elementwise_base.hpp" +#include "ck/tensor_operation/gpu/device/device_elementwise.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" @@ -26,10 +26,10 @@ template -struct DeviceElementwise : public DeviceElementwiseBase +struct DeviceElementwise2dImpl : 
public DeviceElementwise { static constexpr index_t NumDim = NumDim_m + NumDim_n; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_elementwise.hpp b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp similarity index 98% rename from include/ck/tensor_operation/gpu/device/impl/device_elementwise.hpp rename to include/ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp index 8e62880098..a11b5d0398 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_elementwise.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp @@ -8,7 +8,7 @@ #include "ck/utility/math.hpp" #include "ck/utility/sequence.hpp" -#include "ck/tensor_operation/gpu/device/device_elementwise_base.hpp" +#include "ck/tensor_operation/gpu/device/device_elementwise.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" @@ -25,8 +25,8 @@ template -struct DeviceElementwise - : public DeviceElementwiseBase +struct DeviceElementwiseImpl + : public DeviceElementwise { static constexpr int NumInput = InDataTypeTuple::Size(); static constexpr int NumOutput = OutDataTypeTuple::Size(); diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp index 5d2dd96c5c..7f3d450a39 100644 --- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp @@ -314,6 +314,40 @@ struct Normalize double epsilon_; }; +// used by BatchNorm inference +// y = gamma * (x-mean) / sqrt(epsilon+variance) + beta +// The data type of mean and variance is used as AccDataType +struct NormalizeInInfer +{ + NormalizeInInfer(double epsilon = 1e-4) : epsilon_(epsilon) {} + + template + __host__ __device__ constexpr void operator()(T1& y, + const T1& x, + const T2& mean, + const T2& variance, + const T3& gamma, + const 
T4& beta) const + { + static_assert(std::is_same::value || std::is_same::value, + "Data type is not supported by this operation!"); + + using ck::type_convert; + using ck::math::sqrt; + + T2 tmp_x, tmp_y; + + tmp_x = type_convert(x); + + tmp_y = ((tmp_x - mean) / sqrt(variance + type_convert(epsilon_))) * + type_convert(gamma) + + type_convert(beta); + y = type_convert(tmp_y); + }; + + double epsilon_; +}; + template struct UnaryTypeConvert; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp new file mode 100644 index 0000000000..342ade69cd --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/device_elementwise.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// FP16 +void add_device_batchnorm_infer_rank_4_f16_instances( + std::vector, + ck::Tuple, + ck::tensor_operation::element_wise::NormalizeInInfer, + 4>>>&); + +// FP32 +void add_device_batchnorm_infer_rank_4_f32_instances( + std::vector, + ck::Tuple, + ck::tensor_operation::element_wise::NormalizeInInfer, + 4>>>&); + +// BF16 +void add_device_batchnorm_infer_rank_4_bf16_instances( + std::vector, + ck::Tuple, + ck::tensor_operation::element_wise::NormalizeInInfer, + 4>>>&); + +// FP64 +void add_device_batchnorm_infer_rank_4_f64_instances( + std::vector, + ck::Tuple, + ck::tensor_operation::element_wise::NormalizeInInfer, + 4>>>&); + +template +struct DeviceOperationInstanceFactory, + ck::Tuple, + 
ck::tensor_operation::element_wise::NormalizeInInfer, + Rank>> +{ + using DeviceOp = ck::tensor_operation::device::DeviceElementwise< + ck::Tuple, + ck::Tuple, + ck::tensor_operation::element_wise::NormalizeInInfer, + Rank>; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + if constexpr(Rank == 4) + { + add_device_batchnorm_infer_rank_4_f16_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + if constexpr(Rank == 4) + { + add_device_batchnorm_infer_rank_4_f32_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + if constexpr(Rank == 4) + { + add_device_batchnorm_infer_rank_4_bf16_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + if constexpr(Rank == 4) + { + add_device_batchnorm_infer_rank_4_f64_instances(op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp index 141af55847..381a015eb0 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp @@ -7,7 +7,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" @@ -18,11 +18,8 
@@ namespace device { namespace instance { using Normalize = ck::tensor_operation::element_wise::Normalize; -using DeviceNormalizeFromMeanMeanSquarePtr = ck::tensor_operation::device::DeviceElementwiseBasePtr< - Tuple, - Tuple, - Normalize, - 2>; +using DeviceNormalizeFromMeanMeanSquarePtr = ck::tensor_operation::device:: + DeviceElementwisePtr, Tuple, Normalize, 2>; void add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances( std::vector& instances); diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/batchnorm/CMakeLists.txt index d12a2f244f..19a3cc8cd1 100644 --- a/library/src/tensor_operation_instance/gpu/batchnorm/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/batchnorm/CMakeLists.txt @@ -7,4 +7,8 @@ add_instance_library(device_batchnorm_instance device_batchnorm_backward_f32_instance.cpp device_batchnorm_backward_bf16_instance.cpp device_batchnorm_backward_f64_instance.cpp + device_batchnorm_infer_f16_instance.cpp + device_batchnorm_infer_f32_instance.cpp + device_batchnorm_infer_bf16_instance.cpp + device_batchnorm_infer_f64_instance.cpp ) diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_bf16_instance.cpp new file mode 100644 index 0000000000..2e695afa97 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_bf16_instance.cpp @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/utility/tuple.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = ck::bhalf_t; +using F32 = float; + +using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer; + +// clang-format off +template +using device_batchnorm_infer_bf16_instances = + std::tuple < + // Tuple, Tuple, NormalizeOp, Rank, MPerThread, Sequence, Sequence + DeviceElementwiseImpl, Tuple, Normalize, Rank, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 2, Sequence<1, 1, 1, 1, 1>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 2, Sequence<2, 1, 1, 1, 1>, Sequence<2> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 2, Sequence<1, 2, 2, 2, 2>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 2, Sequence<2, 2, 2, 2, 2>, Sequence<2> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<1, 1, 1, 1, 1>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<2, 1, 1, 1, 1>, Sequence<2> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<1, 2, 2, 2, 2>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<2, 2, 2, 2, 2>, Sequence<2> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<4, 1, 1, 1, 1>, Sequence<4> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<1, 4, 4, 4, 4>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<4, 2, 2, 2, 2>, Sequence<4> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<2, 4, 4, 4, 4>, Sequence<2> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<4, 4, 4, 4, 4>, Sequence<4> > + >; +// 
clang-format on + +void add_device_batchnorm_infer_rank_4_bf16_instances( + std::vector, Tuple, Normalize, 4>>>& + instances) +{ + add_device_operation_instances(instances, device_batchnorm_infer_bf16_instances<4>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f16_instance.cpp new file mode 100644 index 0000000000..9ec761e445 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f16_instance.cpp @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/ck.hpp" +#include "ck/utility/tuple.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer; + +// clang-format off +template +using device_batchnorm_infer_f16_instances = + std::tuple < + // Tuple, Tuple, NormalizeOp, Rank, MPerThread, Sequence, Sequence + DeviceElementwiseImpl, Tuple, Normalize, Rank, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 2, Sequence<1, 1, 1, 1, 1>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 2, Sequence<2, 1, 1, 1, 1>, Sequence<2> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 2, Sequence<1, 2, 2, 2, 2>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 2, Sequence<2, 2, 2, 2, 2>, Sequence<2> >, + DeviceElementwiseImpl, 
Tuple, Normalize, Rank, 4, Sequence<1, 1, 1, 1, 1>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<2, 1, 1, 1, 1>, Sequence<2> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<1, 2, 2, 2, 2>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<2, 2, 2, 2, 2>, Sequence<2> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<4, 1, 1, 1, 1>, Sequence<4> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<1, 4, 4, 4, 4>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<4, 2, 2, 2, 2>, Sequence<4> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<2, 4, 4, 4, 4>, Sequence<2> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<4, 4, 4, 4, 4>, Sequence<4> > + >; +// clang-format on + +void add_device_batchnorm_infer_rank_4_f16_instances( + std::vector, Tuple, Normalize, 4>>>& instances) +{ + add_device_operation_instances(instances, device_batchnorm_infer_f16_instances<4>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f32_instance.cpp new file mode 100644 index 0000000000..f0d26c36be --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f32_instance.cpp @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/utility/tuple.hpp" + +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; + +using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer; + +// clang-format off +template +using device_batchnorm_infer_f32_instances = + std::tuple < + // Tuple, Tuple, NormalizeOp, Rank, MPerThread, Sequence, Sequence + DeviceElementwiseImpl, Tuple, Normalize, Rank, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 2, Sequence<1, 1, 1, 1, 1>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 2, Sequence<2, 1, 1, 1, 1>, Sequence<2> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 2, Sequence<1, 2, 2, 2, 2>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 2, Sequence<2, 2, 2, 2, 2>, Sequence<2> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<1, 1, 1, 1, 1>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<2, 1, 1, 1, 1>, Sequence<2> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<1, 2, 2, 2, 2>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<2, 2, 2, 2, 2>, Sequence<2> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<4, 1, 1, 1, 1>, Sequence<4> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<1, 4, 4, 4, 4>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<4, 2, 2, 2, 2>, Sequence<4> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<2, 4, 4, 4, 4>, Sequence<2> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<4, 4, 4, 4, 4>, Sequence<4> > + >; +// clang-format on + +void 
add_device_batchnorm_infer_rank_4_f32_instances( + std::vector, Tuple, Normalize, 4>>>& instances) +{ + add_device_operation_instances(instances, device_batchnorm_infer_f32_instances<4>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f64_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f64_instance.cpp new file mode 100644 index 0000000000..9e4066bb06 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_infer_f64_instance.cpp @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/ck.hpp" +#include "ck/utility/tuple.hpp" + +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F64 = double; + +using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer; + +// clang-format off +template +using device_batchnorm_infer_f64_instances = + std::tuple < + // Tuple, Tuple, NormalizeOp, Rank, MPerThread, Sequence, Sequence + DeviceElementwiseImpl, Tuple, Normalize, Rank, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 2, Sequence<1, 1, 1, 1, 1>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 2, Sequence<2, 1, 1, 1, 1>, Sequence<2> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 2, Sequence<1, 2, 2, 2, 2>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 2, Sequence<2, 2, 2, 2, 2>, Sequence<2> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<1, 1, 1, 1, 1>, Sequence<1> >, + 
DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<2, 1, 1, 1, 1>, Sequence<2> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<1, 2, 2, 2, 2>, Sequence<1> >, + DeviceElementwiseImpl, Tuple, Normalize, Rank, 4, Sequence<2, 2, 2, 2, 2>, Sequence<2> > + >; +// clang-format on + +void add_device_batchnorm_infer_rank_4_f64_instances( + std::vector, Tuple, Normalize, 4>>>& instances) +{ + add_device_operation_instances(instances, device_batchnorm_infer_f64_instances<4>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp b/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp index baddecf645..182037f15c 100644 --- a/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp @@ -6,7 +6,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp" #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { @@ -28,15 +28,15 @@ using Normalize = ck::tensor_operation::element_wise::Normalize; using device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances = std::tuple< // clang-format off //###################|| | functor| NDim| MPerThread| | | - DeviceElementwise, Tuple, Normalize, 2, 8, Sequence<8, 1, 1, 8, 8>, Sequence<8> >, - DeviceElementwise, Tuple, Normalize, 2, 4, Sequence<4, 1, 1, 4, 4>, Sequence<4> >, - DeviceElementwise, Tuple, Normalize, 2, 2, Sequence<2, 1, 1, 2, 2>, Sequence<2> >, - DeviceElementwise, Tuple, Normalize, 2, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> > + 
DeviceElementwiseImpl, Tuple, Normalize, 2, 8, Sequence<8, 1, 1, 8, 8>, Sequence<8> >, + DeviceElementwiseImpl, Tuple, Normalize, 2, 4, Sequence<4, 1, 1, 4, 4>, Sequence<4> >, + DeviceElementwiseImpl, Tuple, Normalize, 2, 2, Sequence<2, 1, 1, 2, 2>, Sequence<2> >, + DeviceElementwiseImpl, Tuple, Normalize, 2, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> > // clang-format on >; void add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances( - std::vector, Tuple, Normalize, 2>>& + std::vector, Tuple, Normalize, 2>>& instances) { add_device_operation_instances( diff --git a/profiler/include/profiler/profile_batchnorm_infer_impl.hpp b/profiler/include/profiler/profile_batchnorm_infer_impl.hpp new file mode 100644 index 0000000000..ca65339345 --- /dev/null +++ b/profiler/include/profiler/profile_batchnorm_infer_impl.hpp @@ -0,0 +1,335 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer.hpp" + +namespace ck { +namespace profiler { + +template +bool profile_batchnorm_infer_impl(int do_verification, + int init_method, + bool do_dumpout, + bool time_kernel, + const std::vector inOutLengths, + const std::vector reduceDims, + double epsilon) +{ + if(inOutLengths.size() != Rank || reduceDims.size() != NumBatchNormReduceDim) + { + throw std::runtime_error("Invalid tensor lengths or number of reduce dimensions!"); + }; + + std::vector scaleBiasMeanVarLengths; + std::vector invariantDims; + + // used for calculating the effective 
transferred bytes by each operation + size_t total_length; + size_t invariant_length = 1; + + total_length = + std::accumulate(inOutLengths.begin(), inOutLengths.end(), 1, std::multiplies{}); + + if(std::any_of(reduceDims.begin(), reduceDims.end(), [](int d) { return d < 0 || d >= Rank; })) + throw std::runtime_error("Invalid reduce dimensions!"); + + for(int dim = 0; dim < Rank; dim++) + { + if(std::none_of(reduceDims.begin(), reduceDims.end(), [&](int d) { return dim == d; })) + { + invariantDims.push_back(dim); + scaleBiasMeanVarLengths.push_back(inOutLengths[dim]); + invariant_length *= inOutLengths[dim]; + }; + } + + // input data of the batchnorm infer algorithm + Tensor x(inOutLengths); + Tensor scale(scaleBiasMeanVarLengths); + Tensor bias(scaleBiasMeanVarLengths); + Tensor estimatedMean(scaleBiasMeanVarLengths); + Tensor estimatedVariance(scaleBiasMeanVarLengths); + + // output data of the batchnorm infer algorithm + Tensor y_ref(inOutLengths); + Tensor y(inOutLengths); + + auto inOutStrides = x.mDesc.GetStrides(); + auto scaleBiasMeanVarStrides = scale.mDesc.GetStrides(); + + std::size_t num_thread = std::thread::hardware_concurrency(); + + const float x_mean = 0.0f; + const float x_stddev = 1.0f; + const float noise_stddev = 0.04f; + + // input data in normal distribution + x.GenerateTensorValue(GeneratorTensor_4{x_mean, x_stddev}, num_thread); + + // initialize the estimatedMean to be values with tiny variation to the mean of the x + // values + estimatedMean.GenerateTensorValue(GeneratorTensor_4{x_mean, noise_stddev}, + num_thread); + + // initialize the estimatedVariance to be values with tiny variation to the variance of + // the x values + estimatedVariance.GenerateTensorValue( + GeneratorTensor_4{x_stddev * x_stddev, noise_stddev}, num_thread); + + if(do_verification) + { + switch(init_method) + { + case 0: + scale.GenerateTensorValue(GeneratorTensor_0{}, num_thread); + bias.GenerateTensorValue(GeneratorTensor_0{}, num_thread); + break; + case 1: + 
scale.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + bias.GenerateTensorValue(GeneratorTensor_1{0}, num_thread); + break; + case 2: + scale.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + default: + scale.GenerateTensorValue(GeneratorTensor_3{-1.0f, 1.0f}, num_thread); + bias.GenerateTensorValue(GeneratorTensor_3{-1.0f, 1.0f}, num_thread); + } + }; + + // these buffers are usually provided by the user application + DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize()); + DeviceMem y_dev(sizeof(XDataType) * y.mDesc.GetElementSpaceSize()); + DeviceMem scale_dev(sizeof(ScaleDataType) * scale.mDesc.GetElementSpaceSize()); + DeviceMem bias_dev(sizeof(BiasDataType) * bias.mDesc.GetElementSpaceSize()); + + // estimatedMean_dev + DeviceMem estimatedMean_dev(sizeof(MeanVarDataType) * + estimatedMean.mDesc.GetElementSpaceSize()); + // estimatedVariance_dev + DeviceMem estimatedVariance_dev(sizeof(MeanVarDataType) * + estimatedVariance.mDesc.GetElementSpaceSize()); + + x_dev.ToDevice(x.mData.data()); + scale_dev.ToDevice(scale.mData.data()); + bias_dev.ToDevice(bias.mData.data()); + estimatedMean_dev.ToDevice(estimatedMean.mData.data()); + estimatedVariance_dev.ToDevice(estimatedVariance.mData.data()); + + std::array arrInOutLengths; + std::array arrInOutStrides; + std::array arrScaleBiasMeanVarLengths; + std::array arrScaleBiasMeanVarStrides; + std::array arrReduceDims; + + std::copy(inOutLengths.begin(), inOutLengths.end(), arrInOutLengths.begin()); + std::copy(inOutStrides.begin(), inOutStrides.end(), arrInOutStrides.begin()); + std::copy(scaleBiasMeanVarLengths.begin(), + scaleBiasMeanVarLengths.end(), + arrScaleBiasMeanVarLengths.begin()); + std::copy(scaleBiasMeanVarStrides.begin(), + scaleBiasMeanVarStrides.end(), + arrScaleBiasMeanVarStrides.begin()); + + std::copy(reduceDims.begin(), reduceDims.end(), arrReduceDims.begin()); + + std::array 
aligned_scaleBiasMeanVarStrides{0}; + + int i = 0; + for(auto dim : invariantDims) + { + assert(inOutLengths[dim] == scaleBiasMeanVarLengths[i]); + + aligned_scaleBiasMeanVarStrides[dim] = scaleBiasMeanVarStrides[i]; + i++; + }; + + using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer; + + // add device batchnorm-infer instances + using DeviceOp = ck::tensor_operation::device::DeviceElementwise< + ck::Tuple, + ck::Tuple, + Normalize, + Rank>; + + // get device op instances + const auto instance_ptrs = + ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << instance_ptrs.size() << " instances" << std::endl; + + std::string best_instance_name; + float best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + if(do_verification) + { + using PassThroughOp = ck::tensor_operation::element_wise::PassThrough; + + using ReferenceBatchNormInferInstance = + ck::tensor_operation::host::ReferenceBatchNormInfer; + auto batchNormInfer_ref = ReferenceBatchNormInferInstance{}; + + auto argument_ptr_ref = + batchNormInfer_ref.MakeArgumentPointer(arrInOutLengths, + arrInOutStrides, + arrInOutStrides, + arrReduceDims, + arrScaleBiasMeanVarLengths, + arrScaleBiasMeanVarStrides, + arrScaleBiasMeanVarStrides, + arrScaleBiasMeanVarStrides, + x.mData.data(), + scale.mData.data(), + bias.mData.data(), + epsilon, + PassThroughOp{}, + estimatedMean.mData.data(), + estimatedVariance.mData.data(), + y_ref.mData.data()); + + if(!batchNormInfer_ref.IsSupportedArgument(argument_ptr_ref.get())) + { + std::cout << "The runtime parameters are not supported by the reference instance, exiting!"
+ << std::endl; + return (false); + }; + + auto invoker_ptr_ref = batchNormInfer_ref.MakeInvokerPointer(); + + (void)invoker_ptr_ref->Run(argument_ptr_ref.get()); + } + + int num_kernel = 0; + bool pass = true; + + for(auto& inst_ptr : instance_ptrs) + { + auto argument_ptr = inst_ptr->MakeArgumentPointer(arrInOutLengths, + {arrInOutStrides, + aligned_scaleBiasMeanVarStrides, + aligned_scaleBiasMeanVarStrides, + aligned_scaleBiasMeanVarStrides, + aligned_scaleBiasMeanVarStrides}, + {arrInOutStrides}, + {x_dev.GetDeviceBuffer(), + estimatedMean_dev.GetDeviceBuffer(), + estimatedVariance_dev.GetDeviceBuffer(), + scale_dev.GetDeviceBuffer(), + bias_dev.GetDeviceBuffer()}, + {y_dev.GetDeviceBuffer()}, + Normalize{epsilon}); + + if(inst_ptr->IsSupportedArgument(argument_ptr.get())) + { + num_kernel++; + } + else + { + if(time_kernel) + { + std::cout << inst_ptr->GetTypeString() + << " skipped due to unsupported argument: " << std::endl; + } + + continue; + }; + + auto invoker_ptr = inst_ptr->MakeInvokerPointer(); + + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + size_t num_bytes = 0; + + // inputing of x, scale, bias, outputing of y + num_bytes += total_length * (sizeof(XDataType) + sizeof(YDataType)) + + invariant_length * + (sizeof(ScaleDataType) + sizeof(BiasDataType) + sizeof(MeanVarDataType)); + + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + if(time_kernel) + std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " + << inst_ptr->GetTypeString() << std::endl; + + if(avg_time < best_avg_time) + { + best_instance_name = inst_ptr->GetTypeString(); + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + using ck::utils::check_err; + bool single_pass; + + y_dev.FromDevice(y.mData.data()); + + if constexpr(ck::is_same_v) + single_pass = check_err(y.mData, y_ref.mData, "y results", 1e-2, 1e-2); + else + single_pass = check_err(y.mData, y_ref.mData, "y results", 
4e-3, 4e-3); + + pass = pass && single_pass; + }; + + if(do_dumpout) + { + using ck::host_common::dumpBufferToFile; + + // clang-format off + dumpBufferToFile("dump_x.bin", x.mData.data(), x.mDesc.GetElementSize()); + dumpBufferToFile("dump_y.bin", y.mData.data(), y.mDesc.GetElementSize()); + dumpBufferToFile("dump_y_ref.bin", y_ref.mData.data(), y_ref.mDesc.GetElementSize()); + // clang-format off + }; + } + + if(time_kernel) + { + std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_instance_name << std::endl; + } + + if(num_kernel == 0) + { + std::cout << "Error: No kernel is applicable" << std::endl; + return false; + } + + return pass; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt index e7a95a905f..bcf25f87e8 100644 --- a/profiler/src/CMakeLists.txt +++ b/profiler/src/CMakeLists.txt @@ -27,6 +27,7 @@ set(PROFILER_SOURCES profile_softmax.cpp profile_batchnorm_fwd.cpp profile_batchnorm_bwd.cpp + profile_batchnorm_infer.cpp ) set(PROFILER_EXECUTABLE ckProfiler) diff --git a/profiler/src/profile_batchnorm_infer.cpp b/profiler/src/profile_batchnorm_infer.cpp new file mode 100644 index 0000000000..92c16859c1 --- /dev/null +++ b/profiler/src/profile_batchnorm_infer.cpp @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include +#include + +#include "ck/library/utility/host_common_util.hpp" +#include "profiler/profile_batchnorm_infer_impl.hpp" +#include "profiler_operation_registry.hpp" + +using ck::index_t; + +using namespace std; + +static const struct option long_options[] = {{"inOutLengths", required_argument, nullptr, 'D'}, + {"reduceDims", required_argument, nullptr, 'R'}, + {"dumpout", required_argument, nullptr, 'o'}, + {"verify", required_argument, nullptr, 'v'}, + {"help", no_argument, nullptr, '?'}, + {nullptr, 0, nullptr, 0}}; + +class BatchnormInferArgParser +{ + private: + int option_index = 0; + + public: + std::vector inLengths; + std::vector reduceDims; + + bool do_verification = false; + bool do_dumpout = false; + + bool updateMovingAverage; + bool saveMeanAndInvVariance; + + int data_type = 0; + int init_method = 2; + bool time_kernel = false; + + BatchnormInferArgParser() = default; + ~BatchnormInferArgParser() = default; + + void show_usage(const char* cmd) + { + // clang-format off + std::cout << "Usage of " << cmd << std::endl; + std::cout << "--inOutLengths or -D, comma separated list of input tensor dimension lengths, must have 4 integers for nhwc" << std::endl; + std::cout << "--reduceDims or -R, comma separated list of dimensions to reduce on" << std::endl; + std::cout << "--verify or -v, 1/0 to indicate whether to verify the result by comparing with the host-based batch-normalization" << std::endl; + std::cout << "Arg1: data type (0: fp16, 1: fp32, 5: bf16, 6: fp64)" << std::endl; + std::cout << "Arg2: init method used for bnScale and bnBias (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)" << std::endl; + std::cout << "Arg3: time kernel (0=no, 1=yes)" << std::endl; + // clang-format on + }; + + int operator()(int argc, char* argv[]) + { + using ck::host_common::getTypeValuesFromString; + + int ch; + + optind++; // to skip the module name + + while(1) + { + ch = getopt_long(argc, argv, "D:R:v:o:", long_options,
&option_index); + if(ch == -1) + break; + switch(ch) + { + case 'D': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + inLengths = getTypeValuesFromString(optarg); + break; + case 'R': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + reduceDims = getTypeValuesFromString(optarg); + break; + case 'v': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + do_verification = static_cast(std::atoi(optarg)); + break; + case 'o': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + do_dumpout = static_cast(std::atoi(optarg)); + break; + case '?': + if(std::string(long_options[option_index].name) == "help") + { + show_usage(argv[0]); + return -1; + }; + break; + + default: + show_usage(argv[0]); + std::cerr << "Invalid cmd-line options!" << std::endl; + return -1; + }; + }; + + if(optind + 3 > argc) + throw std::runtime_error("Invalid cmd-line arguments, more arguments are needed!"); + + data_type = std::atoi(argv[optind++]); + init_method = std::atoi(argv[optind++]); + time_kernel = static_cast(std::atoi(argv[optind++])); + + if(data_type != 0 && data_type != 1 && data_type != 5 && data_type != 6) + return -1; + + return 0; + }; +}; // end of class BatchnormInferArgParser + +static const double epsilon = std::numeric_limits::epsilon(); + +int profile_batchnorm_infer(int argc, char* argv[]) +{ + using ck::profiler::profile_batchnorm_infer_impl; + + BatchnormInferArgParser arg_parser; + + if(arg_parser(argc, argv) != 0) + return -1; + + using F16 = ck::half_t; + using F32 = float; + using BF16 = ck::bhalf_t; + using F64 = double; + + if(arg_parser.data_type == 0) + { + if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3) + { + profile_batchnorm_infer_impl( + arg_parser.do_verification, + arg_parser.init_method, + arg_parser.do_dumpout, + arg_parser.time_kernel, + arg_parser.inLengths, + arg_parser.reduceDims, + epsilon); + }; + } + else if(arg_parser.data_type == 1) + {
if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3) + { + profile_batchnorm_infer_impl( + arg_parser.do_verification, + arg_parser.init_method, + arg_parser.do_dumpout, + arg_parser.time_kernel, + arg_parser.inLengths, + arg_parser.reduceDims, + epsilon); + }; + } + else if(arg_parser.data_type == 5) + { + if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3) + { + profile_batchnorm_infer_impl( + arg_parser.do_verification, + arg_parser.init_method, + arg_parser.do_dumpout, + arg_parser.time_kernel, + arg_parser.inLengths, + arg_parser.reduceDims, + epsilon); + }; + } + else if(arg_parser.data_type == 6) + { + if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3) + { + profile_batchnorm_infer_impl( + arg_parser.do_verification, + arg_parser.init_method, + arg_parser.do_dumpout, + arg_parser.time_kernel, + arg_parser.inLengths, + arg_parser.reduceDims, + epsilon); + }; + } + + return 0; +} + +REGISTER_PROFILER_OPERATION("bnorm_infer", "Batchnorm inference", profile_batchnorm_infer); diff --git a/test/batchnorm/CMakeLists.txt b/test/batchnorm/CMakeLists.txt index 52f1508682..2a528f9c37 100644 --- a/test/batchnorm/CMakeLists.txt +++ b/test/batchnorm/CMakeLists.txt @@ -1,4 +1,6 @@ add_gtest_executable(test_batchnorm_fwd_rank_4 batchnorm_fwd_rank_4.cpp) add_gtest_executable(test_batchnorm_bwd_rank_4 batchnorm_bwd_rank_4.cpp) +add_gtest_executable(test_batchnorm_infer_rank_4 batchnorm_infer_rank_4.cpp) target_link_libraries(test_batchnorm_fwd_rank_4 PRIVATE utility device_batchnorm_instance) target_link_libraries(test_batchnorm_bwd_rank_4 PRIVATE utility device_batchnorm_instance) +target_link_libraries(test_batchnorm_infer_rank_4 PRIVATE utility device_batchnorm_instance) diff --git a/test/batchnorm/batchnorm_infer_rank_4.cpp b/test/batchnorm/batchnorm_infer_rank_4.cpp new file mode 100644 index 0000000000..77fc1daae6 --- /dev/null +++ b/test/batchnorm/batchnorm_infer_rank_4.cpp @@ -0,0 +1,89 @@ +// 
SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include +#include + +#include "profiler/profile_batchnorm_infer_impl.hpp" + +using F16 = ck::half_t; +using F32 = float; +using BF16 = ck::bhalf_t; +using F64 = double; + +template +class TestBatchNormInferRank4 : public ::testing::Test +{ + private: + const double epsilon = std::numeric_limits::epsilon(); + + protected: + using XDataType = std::tuple_element_t<0, Tuple>; + using YDataType = std::tuple_element_t<1, Tuple>; + using AccDataType = std::tuple_element_t<2, Tuple>; + using ScaleDataType = std::tuple_element_t<3, Tuple>; + using BiasDataType = std::tuple_element_t<4, Tuple>; + using MeanVarDataType = std::tuple_element_t<5, Tuple>; + + std::vector> list_of_lengths = { + {128, 16, 3, 1024}, {128, 16, 6, 512}, {4, 4, 4, 4}, {32, 32, 32, 32}}; + std::vector reduceDims; + + template + void Run() + { + for(auto& inOutLengths : list_of_lengths) + { + bool pass = true; + + EXPECT_FALSE(reduceDims.size() != NumReduceDim); + + pass = pass && ck::profiler::profile_batchnorm_infer_impl( + true, 3, false, false, inOutLengths, reduceDims, epsilon); + + pass = pass && ck::profiler::profile_batchnorm_infer_impl( + true, 3, false, false, inOutLengths, reduceDims, epsilon); + + EXPECT_TRUE(pass); + } + } +}; + +using KernelTypes = ::testing::Types, + std::tuple, + std::tuple, + std::tuple>; + +TYPED_TEST_SUITE(TestBatchNormInferRank4, KernelTypes); + +// nhwc +TYPED_TEST(TestBatchNormInferRank4, nhwc) +{ + this->reduceDims = {0, 1, 2}; + this->template Run<3>(); +} + +// nchw +TYPED_TEST(TestBatchNormInferRank4, nchw) +{ + this->reduceDims = {0, 2, 3}; + this->template Run<3>(); +}