From 52d082baded81e349db6c826aa4308ce67a561f5 Mon Sep 17 00:00:00 2001
From: Qianfeng
Date: Fri, 25 Nov 2022 08:02:27 +0800
Subject: [PATCH] BatchNorm forward instance/external api/profiler/tests/client example (#511)

* Update the device_batchnorm_forward base class to include all template parameters of the problem description
* Add batchnorm forward instances and external API
* Add a batchnorm forward profiler module that uses the external API
* Add comments in the batchnorm_forward example to explain the dimensions in lengths[]
* Replace reference_batchnorm_forward_nhwc_c with the generic reference_batchnorm_forward
* Improve the batchnorm infer base API
* Add a batchnorm forward client example that shows how to use the batchnorm forward external API
* Add a test for batchnorm forward
* Tune the batchnorm profiler's initialization values and error threshold
* Add support for bhalf_t in instances/external API/tests
* Add support for int8_t in instances/external API/tests
* Add support for double in instances/external API/tests
* Let ScaleDataType and BiasDataType be the same as XDataType and YDataType when creating instances
* Check instance support before running the best instance in the batchnorm_fwd_nhwc client example
* Add a check for YElementwiseOp in the batchnorm forward external API
* Add more types to the batchnorm forward profiler
* Add more test lengths

Co-authored-by: rocking5566

[ROCm/composable_kernel commit: 4e6a5575bebd074812a4c12eafd37a599719083b]
---
 client_example/13_batchnorm/CMakeLists.txt    |   2 +
 .../13_batchnorm/batchnorm_fwd_nhwc.cpp       | 197 ++++
 .../34_batchnorm/batchnorm_forward_nhwc.cpp   |  24 +-
 example/34_batchnorm/batchnorm_infer_nhwc.cpp |  25 +-
 .../gpu/device/device_batchnorm_forward.hpp   |  31 +-
 .../gpu/device/device_batchnorm_infer.hpp     |  32 +-
 .../impl/device_batchnorm_forward_impl.hpp    |  11 +-
 .../cpu/reference_batchnorm_forward.hpp       | 368 +++++++++++++++
 .../reference_batchnorm_forward_nhwc_c.hpp    | 290 ------------
 .../cpu/reference_batchnorm_infer.hpp         | 300 ++++++++++++
 .../cpu/reference_batchnorm_infer_nhwc_c.hpp  | 204 --------
 .../gpu/batchnorm_forward.hpp                 | 130 ++++++
 .../ck/library/utility/host_common_util.hpp   |  60 +++
 .../gpu/batchnorm/CMakeLists.txt              |   7 +
 ...device_batchnorm_forward_bf16_instance.cpp | 147 ++++++
 .../device_batchnorm_forward_f16_instance.cpp | 147 ++++++
 .../device_batchnorm_forward_f32_instance.cpp | 145 ++++++
 .../device_batchnorm_forward_f64_instance.cpp | 145 ++++++
 .../device_batchnorm_forward_i8_instance.cpp  | 145 ++++++
 profiler/CMakeLists.txt                       |   2 +
 .../profile_batchnorm_forward_impl.hpp        | 440 ++++++++++++++++++
 profiler/src/profile_batchnorm_fwd.cpp        | 234 ++++++++++
 profiler/src/profiler.cpp                     |   8 +-
 test/CMakeLists.txt                           |   1 +
 test/batchnorm_fwd/CMakeLists.txt             |   2 +
 test/batchnorm_fwd/batchnorm_fwd_rank_4.cpp   | 110 +++++
 26 files changed, 2685 insertions(+), 522 deletions(-)
 create mode 100644 client_example/13_batchnorm/CMakeLists.txt
 create mode 100644 client_example/13_batchnorm/batchnorm_fwd_nhwc.cpp
 create mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward.hpp
 delete mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward_nhwc_c.hpp
 create mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer.hpp
 delete mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer_nhwc_c.hpp
 create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp
 create mode 100644 library/src/tensor_operation_instance/gpu/batchnorm/CMakeLists.txt
 create mode 100644 library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_bf16_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f16_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f32_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f64_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_i8_instance.cpp
 create mode 100644 profiler/include/profile_batchnorm_forward_impl.hpp
 create mode 100644 profiler/src/profile_batchnorm_fwd.cpp
 create mode 100644 test/batchnorm_fwd/CMakeLists.txt
 create mode 100644 test/batchnorm_fwd/batchnorm_fwd_rank_4.cpp

diff --git a/client_example/13_batchnorm/CMakeLists.txt b/client_example/13_batchnorm/CMakeLists.txt
new file mode 100644
index 0000000000..0ddea1a8f1
--- /dev/null
+++ b/client_example/13_batchnorm/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_executable(client_batchnorm_fwd_nhwc batchnorm_fwd_nhwc.cpp)
+target_link_libraries(client_batchnorm_fwd_nhwc PRIVATE composable_kernel::device_operations)
diff --git a/client_example/13_batchnorm/batchnorm_fwd_nhwc.cpp b/client_example/13_batchnorm/batchnorm_fwd_nhwc.cpp
new file mode 100644
index 0000000000..322667a46b
--- /dev/null
+++ b/client_example/13_batchnorm/batchnorm_fwd_nhwc.cpp
@@ -0,0 +1,197 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include
+#include
+#include
+#include
+#include
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp"
+
+using XDataType       = float;
+using YDataType       = float;
+using AccDataType     = float;
+using ScaleDataType   = AccDataType;
+using BiasDataType    = AccDataType;
+using MeanVarDataType = AccDataType;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+constexpr int Rank                  = 4;
+constexpr int NumBatchNormReduceDim = 3;
+
+const double epsilon       = std::numeric_limits::epsilon();
+const double averageFactor = 0.1;
+
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast(&p_mem_), mem_size);
+    }
+
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+    std::array xyLengths{16, 8, 128, 256};
+    std::array xyStrides{8 * 128 * 256, 128 * 256, 256, 1};
+    std::array scaleBiasMeanVarLengths{256};
+    std::array scaleBiasMeanVarStrides{1};
+    std::array reduceDims{0, 1, 2};
+
+    ck::index_t numXYElement =
+        std::accumulate(xyLengths.begin(), xyLengths.end(), 1, std::multiplies());
+
+    ck::index_t numScaleBiasMeanVarElement = std::accumulate(scaleBiasMeanVarLengths.begin(),
+                                                             scaleBiasMeanVarLengths.end(),
+                                                             1,
+                                                             std::multiplies());
+
+    SimpleDeviceMem x(sizeof(XDataType) * numXYElement);
+    SimpleDeviceMem y(sizeof(YDataType) * numXYElement);
+    SimpleDeviceMem scale(sizeof(ScaleDataType) * numScaleBiasMeanVarElement);
+    SimpleDeviceMem bias(sizeof(BiasDataType) * numScaleBiasMeanVarElement);
+    SimpleDeviceMem mean(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement);
+    SimpleDeviceMem invVariance(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement);
+
+    using DeviceOp = ck::tensor_operation::device::DeviceBatchNormFwd;
+
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    std::string best_op_name;
+    bool found            = false;
+    int best_op_id        = -1;
+    float best_ave_time   = std::numeric_limits::max();
+    float best_gb_per_sec = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    for(int i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr = op_ptrs[i];
+
+        auto argument_ptr = op_ptr->MakeArgumentPointer(xyLengths,
+                                                        xyStrides,
+                                                        xyStrides,
+                                                        reduceDims,
+                                                        scaleBiasMeanVarLengths,
+                                                        scaleBiasMeanVarStrides,
+                                                        scaleBiasMeanVarStrides,
+                                                        scaleBiasMeanVarStrides,
+                                                        x.GetDeviceBuffer(),
+                                                        scale.GetDeviceBuffer(),
+                                                        bias.GetDeviceBuffer(),
+                                                        epsilon,
+                                                        PassThrough{},
+                                                        y.GetDeviceBuffer(),
+                                                        mean.GetDeviceBuffer(),
+                                                        invVariance.GetDeviceBuffer(),
+                                                        averageFactor,
+                                                        nullptr,
+                                                        nullptr);
+
+        auto invoker_ptr    = op_ptr->MakeInvokerPointer();
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+
+            SimpleDeviceMem workspace(workspace_sz);
+
+            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
+
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            std::size_t num_bytes =
+                numXYElement * (sizeof(XDataType) + sizeof(YDataType)) +
+                numScaleBiasMeanVarElement * (sizeof(ScaleDataType) + sizeof(BiasDataType) +
+                                              sizeof(MeanVarDataType) + sizeof(MeanVarDataType));
+
+            float gb_per_sec = num_bytes / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
+                      << op_name << std::endl;
+
+            if(ave_time < best_ave_time)
+            {
+                found           = true;
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    if(found)
+    {
+        std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
+                  << best_op_name << std::endl;
+
+        // run the best instance
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+
+        auto argument_ptr = op_ptr->MakeArgumentPointer(xyLengths,
+                                                        xyStrides,
+                                                        xyStrides,
+                                                        reduceDims,
+                                                        scaleBiasMeanVarLengths,
+                                                        scaleBiasMeanVarStrides,
+                                                        scaleBiasMeanVarStrides,
+                                                        scaleBiasMeanVarStrides,
+                                                        x.GetDeviceBuffer(),
+                                                        scale.GetDeviceBuffer(),
+                                                        bias.GetDeviceBuffer(),
+                                                        epsilon,
+                                                        PassThrough{},
+                                                        y.GetDeviceBuffer(),
+                                                        mean.GetDeviceBuffer(),
+                                                        invVariance.GetDeviceBuffer(),
+                                                        averageFactor,
+                                                        nullptr,
+                                                        nullptr);
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+
+        std::cout << "Done" << std::endl;
+    }
+
+    return 0;
+}
diff --git a/example/34_batchnorm/batchnorm_forward_nhwc.cpp b/example/34_batchnorm/batchnorm_forward_nhwc.cpp
index 03f24eeb67..da36d65a29 100644
--- a/example/34_batchnorm/batchnorm_forward_nhwc.cpp
+++ b/example/34_batchnorm/batchnorm_forward_nhwc.cpp
@@ -15,7 +15,7 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/host_common_util.hpp"
"ck/library/utility/host_common_util.hpp" -#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward_nhwc_c.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward.hpp" #include "ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp" #include "ck/library/utility/host_common_util.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" @@ -142,6 +142,8 @@ bool bnorm_fwd_nhwc_test(bool do_verification, constexpr int Rank = 4; constexpr int NumReduceDim = 3; + // when using lengths[] to create a tensor, lengths[0] is the length of highest dimension + // eg. N of NHWC, so lengths[3] is the dimension C length of NHWC const std::vector scaleBiasMeanVarLengths = {inOutLengths[3]}; // input data of the batchnorm forward algorithm @@ -300,7 +302,7 @@ bool bnorm_fwd_nhwc_test(bool do_verification, i_inOutLengths, i_inOutStrides, i_inOutStrides, - {0, 1, 2}, + {0, 1, 2}, // indicates physical indices of reduce dimensions in lengths[] and strides[] i_scaleBiasMeanVarLengths, i_scaleBiasMeanVarStrides, i_scaleBiasMeanVarStrides, @@ -366,13 +368,15 @@ bool bnorm_fwd_nhwc_test(bool do_verification, { using ReferenceBatchNormFwdInstance = - ck::tensor_operation::host::ReferenceBatchNormFwd_Input_N_H_W_C_Output_C; + ck::tensor_operation::host::ReferenceBatchNormFwd; auto batchNormFwd_ref = ReferenceBatchNormFwdInstance{}; @@ -380,7 +384,7 @@ bool bnorm_fwd_nhwc_test(bool do_verification, i_inOutLengths, i_inOutStrides, i_inOutStrides, - {0, 1, 2}, + {0, 1, 2}, // indicates physical indices of reduce dimensions in lengths[] and strides[] i_scaleBiasMeanVarLengths, i_scaleBiasMeanVarStrides, i_scaleBiasMeanVarStrides, diff --git a/example/34_batchnorm/batchnorm_infer_nhwc.cpp b/example/34_batchnorm/batchnorm_infer_nhwc.cpp index 2dc9d6b789..dc2984851a 100644 --- a/example/34_batchnorm/batchnorm_infer_nhwc.cpp +++ b/example/34_batchnorm/batchnorm_infer_nhwc.cpp @@ -15,7 +15,8 @@ #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_common_util.hpp" -#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer_nhwc_c.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer.hpp" #include "batchnorm_infer_impl.hpp" @@ -124,6 +125,8 @@ bool bnorm_infer_nhwc_test(bool do_verification, constexpr int Rank = 4; constexpr int NumReduceDim = 3; + // when using lengths[] to create a tensor, lengths[0] is the length of highest dimension + // eg. 
N of NHWC, so lengths[3] is the dimension C length of NHWC const std::vector scaleBiasMeanVarLengths = {inOutLengths[3]}; // input data of the batchnorm forward algorithm @@ -260,20 +263,25 @@ bool bnorm_infer_nhwc_test(bool do_verification, if(do_verification) { + using PassThroughOp = ck::tensor_operation::element_wise::PassThrough; + using ReferenceBatchNormInferInstance = - ck::tensor_operation::host::ReferenceBatchNormInfer_Input_N_H_W_C_Output_C< - InOutDataType, - InOutDataType, - AccDataType, - AccDataType, - AccDataType, - AccDataType>; + ck::tensor_operation::host::ReferenceBatchNormInfer; auto batchNormInfer_ref = ReferenceBatchNormInferInstance{}; auto argument_ptr_ref = batchNormInfer_ref.MakeArgumentPointer(i_inOutLengths, i_inOutStrides, i_inOutStrides, + {0, 1, 2}, i_scaleBiasMeanVarLengths, i_scaleBiasMeanVarStrides, i_scaleBiasMeanVarStrides, @@ -282,6 +290,7 @@ bool bnorm_infer_nhwc_test(bool do_verification, bnScale.mData.data(), bnBias.mData.data(), epsilon, + PassThroughOp{}, estimatedMean.mData.data(), estimatedVariance.mData.data(), y_ref.mData.data()); diff --git a/include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp b/include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp index 019f377a5c..aa93dd9c19 100644 --- a/include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp +++ b/include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp @@ -13,7 +13,15 @@ namespace ck { namespace tensor_operation { namespace device { -template +template struct DeviceBatchNormFwd : public BaseOperator { virtual std::unique_ptr MakeArgumentPointer( @@ -40,9 +48,24 @@ struct DeviceBatchNormFwd : public BaseOperator virtual std::unique_ptr MakeInvokerPointer() = 0; }; -template -using DeviceBatchNormFwdPtr = - std::unique_ptr>; +template +using DeviceBatchNormFwdPtr = std::unique_ptr>; } // namespace device } // namespace tensor_operation diff --git a/include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp b/include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp index fabb2394c5..8a00fd9db3 100644 --- a/include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp +++ b/include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp @@ -13,13 +13,22 @@ namespace ck { namespace tensor_operation { namespace device { -template +template struct DeviceBatchNormInfer : public BaseOperator { virtual std::unique_ptr MakeArgumentPointer( const std::array xyLengths, const std::array xStrides, const std::array yStrides, + const std::array reduceDims, const std::array bnScaleBiasMeanVarLengths, const std::array bnScaleStrides, const std::array bnBiasStrides, @@ -28,6 +37,7 @@ struct DeviceBatchNormInfer : public BaseOperator const void* bnScale, const void* bnBias, double epsilon, + const YElementwiseOp y_elementwise_op, const void* estimatedMean, const void* estimatedInvVariance, void* p_y) = 0; @@ -35,8 +45,24 @@ struct DeviceBatchNormInfer : public BaseOperator virtual std::unique_ptr MakeInvokerPointer() = 0; }; -template -using DeviceBatchNormInferPtr = std::unique_ptr>; +template +using DeviceBatchNormInferPtr = std::unique_ptr>; } // namespace device } // namespace tensor_operation diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp index 220456955d..5a16ff765b 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp +++ 
b/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp @@ -42,8 +42,15 @@ template -struct DeviceBatchNormFwdImpl - : public DeviceBatchNormFwd +struct DeviceBatchNormFwdImpl : public DeviceBatchNormFwd { static_assert(Rank <= 6, "Bigger Rank size is not supported!"); static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize, diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward.hpp new file mode 100644 index 0000000000..dd0db31680 --- /dev/null +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward.hpp @@ -0,0 +1,368 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include + +#include "ck/utility/math_v2.hpp" +#include "ck/utility/ignore.hpp" +#include "ck/library/utility/host_common_util.hpp" +#include "ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp" + +namespace ck { +namespace tensor_operation { +namespace host { + +template +struct ReferenceBatchNormFwd : public device::DeviceBatchNormFwd +{ + static_assert(Rank <= 6, "Bigger Rank size is not supported!"); + + static constexpr index_t NumInvariantDim = Rank - NumBatchNormReduceDim; + + struct Argument : public device::BaseArgument + { + Argument(const std::array xyLengths, + const std::array xStrides, + const std::array yStrides, + const std::array reduceDims, + const std::array bnScaleBiasMeanVarLengths, + const std::array bnScaleStrides, + const std::array bnBiasStrides, + const std::array bnMeanVarStrides, + const XDataType* p_x, + const ScaleDataType* bnScale, + const BiasDataType* bnBias, + double epsilon, + const YElementwiseOp y_elementwise_op, + YDataType* p_y, + MeanVarDataType* resultSaveMean, + MeanVarDataType* resultSaveInvVariance, + double averageFactor, + MeanVarDataType* resultRunningMean, + MeanVarDataType* resultRunningVariance) + : reduceDims_(reduceDims), + bnScaleBiasMeanVarLengths_(bnScaleBiasMeanVarLengths), + bnScaleStrides_(bnScaleStrides), + bnBiasStrides_(bnBiasStrides), + bnMeanVarStrides_(bnMeanVarStrides), + p_x_(p_x), + bnScale_(bnScale), + bnBias_(bnBias), + y_elementwise_op_(y_elementwise_op), + p_y_(p_y), + resultSaveMean_(resultSaveMean), + resultSaveInvVariance_(resultSaveInvVariance), + resultRunningMean_(resultRunningMean), + resultRunningVariance_(resultRunningVariance) + { + using ck::host_common::get_index_set; + + if(std::any_of( + reduceDims.begin(), reduceDims.end(), [](int d) { return d < 0 || d >= Rank; })) + throw std::runtime_error("Invalid reduce dimensions!"); + + // get invariant_dims[] and invariant_lengths[] + for(int dim = 0, i = 0; dim < Rank; dim++) + if(std::none_of( + reduceDims.begin(), reduceDims.end(), [&](int d) { return d == dim; })) + { + invariantDims_[i] = dim; + invariant_lengths_[i] = xyLengths[dim]; + i++; + }; + + // get reduce_lengths_[] + for(int j = 0, i = 0; j < NumBatchNormReduceDim; j++) + { + int dim = reduceDims[j]; + reduce_lengths_[i++] = xyLengths[dim]; + }; + + for(int i = 0; i < NumInvariantDim; i++) + if(invariant_lengths_[i] != bnScaleBiasMeanVarLengths_[i]) + throw std::runtime_error("Invalid lengths parameters!"); + + for(int j = 0, i = 0; j < NumInvariantDim; j++) + { + int dim = invariantDims_[j]; + x_invariant_strides_[i] = xStrides[dim]; + y_invariant_strides_[i] = yStrides[dim]; + i++; + }; + + for(int j = 0, i = 
0; j < NumBatchNormReduceDim; j++) + { + int dim = reduceDims_[j]; + x_reduce_strides_[i] = xStrides[dim]; + y_reduce_strides_[i] = yStrides[dim]; + i++; + }; + + invariant_index_set_ = get_index_set(invariant_lengths_); + reduce_index_set_ = get_index_set(reduce_lengths_); + + epsilon_ = type_convert(epsilon); + averageFactor_ = type_convert(averageFactor); + + resultSave = (resultSaveMean != nullptr && resultSaveInvVariance != nullptr); + resultRunning = (resultRunningMean != nullptr && resultRunningVariance != nullptr); + } + + std::array reduceDims_; + std::array invariantDims_; + std::array invariant_lengths_; + std::array reduce_lengths_; + + const std::array bnScaleBiasMeanVarLengths_; + const std::array bnScaleStrides_; + const std::array bnBiasStrides_; + const std::array bnMeanVarStrides_; + + std::array x_invariant_strides_; + std::array y_invariant_strides_; + std::array x_reduce_strides_; + std::array y_reduce_strides_; + + const XDataType* p_x_; + const ScaleDataType* bnScale_; + const BiasDataType* bnBias_; + const YElementwiseOp y_elementwise_op_; + YDataType* p_y_; + + MeanVarDataType* resultSaveMean_; + MeanVarDataType* resultSaveInvVariance_; + MeanVarDataType* resultRunningMean_; + MeanVarDataType* resultRunningVariance_; + + bool resultSave, resultRunning; + + std::vector> invariant_index_set_; + std::vector> reduce_index_set_; + + AccDataType averageFactor_; + AccDataType epsilon_; + }; + + struct Invoker : public device::BaseInvoker + { + float Run(const Argument& arg) + { + using ck::host_common::get_offset_from_index; + + auto thread_reduce_func = [&](auto invariant_index) { + size_t x_invariant_offset = get_offset_from_index( + arg.x_invariant_strides_, invariant_index); + size_t y_invariant_offset = get_offset_from_index( + arg.y_invariant_strides_, invariant_index); + AccDataType mean = type_convert(0.0f); + AccDataType variance = type_convert(0.0f); + int32_t curr_count = 0; + + // compute mean, variance using welford method + for(const auto& reduce_index : arg.reduce_index_set_) + { + size_t x_reduce_offset = get_offset_from_index( + arg.x_reduce_strides_, reduce_index); + + auto x_offset = x_invariant_offset + x_reduce_offset; + + curr_count++; + + AccDataType x = type_convert(arg.p_x_[x_offset]); + + AccDataType delta = x - mean; + + mean += delta / curr_count; + + AccDataType delta2 = x - mean; + + variance += delta * delta2; + }; + + // actual variance + variance = variance / curr_count; + + // inv-variance defined as 1/sqrt(epsilon+variance) + AccDataType invVariance = + type_convert(1.0f) / ck::math::sqrt(arg.epsilon_ + variance); + + // save the mean/inv-variance if required + if(arg.resultSave) + { + size_t offset = get_offset_from_index(arg.bnMeanVarStrides_, + invariant_index); + + arg.resultSaveMean_[offset] = type_convert(mean); + arg.resultSaveInvVariance_[offset] = type_convert(invVariance); + }; + + // update the moving average if required + if(arg.resultRunning) + { + size_t offset = get_offset_from_index(arg.bnMeanVarStrides_, + invariant_index); + + AccDataType oneMinusAverageFactor = + type_convert(1.0) - arg.averageFactor_; + arg.resultRunningMean_[offset] = type_convert( + type_convert(arg.resultRunningMean_[offset]) * + oneMinusAverageFactor + + mean * arg.averageFactor_); + arg.resultRunningVariance_[offset] = type_convert( + arg.resultRunningVariance_[offset] * oneMinusAverageFactor + + variance * arg.averageFactor_); + }; + + size_t scale_offset = + get_offset_from_index(arg.bnScaleStrides_, invariant_index); + size_t bias_offset = + 
get_offset_from_index(arg.bnBiasStrides_, invariant_index); + + AccDataType scale = type_convert(arg.bnScale_[scale_offset]); + AccDataType bias = type_convert(arg.bnBias_[bias_offset]); + + // Normalization + for(const auto& reduce_index : arg.reduce_index_set_) + { + size_t x_reduce_offset = get_offset_from_index( + arg.x_reduce_strides_, reduce_index); + size_t y_reduce_offset = get_offset_from_index( + arg.y_reduce_strides_, reduce_index); + + auto x_offset = x_invariant_offset + x_reduce_offset; + auto y_offset = y_invariant_offset + y_reduce_offset; + + AccDataType x = type_convert(arg.p_x_[x_offset]); + + AccDataType norm_x = (x - mean) * invVariance; + + AccDataType y = scale * norm_x + bias; + + arg.y_elementwise_op_(y, y); + + arg.p_y_[y_offset] = type_convert(y); + }; + }; + + std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t work_per_thread = + (arg.invariant_index_set_.size() + num_thread - 1) / num_thread; + + std::vector threads(num_thread); + + for(std::size_t it = 0; it < num_thread; ++it) + { + std::size_t i_begin = it * work_per_thread; + std::size_t i_end = std::min(static_cast((it + 1) * work_per_thread), + arg.invariant_index_set_.size()); + + auto f = [=] { + for(std::size_t i = i_begin; i < i_end; ++i) + { + thread_reduce_func(arg.invariant_index_set_[i]); + } + }; + + threads[it] = joinable_thread(f); + } + + return (0.0f); + }; + + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /*stream_config*/ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + }; + }; + + bool IsSupportedArgument(const device::BaseArgument* p_arg) override + { + (void)p_arg; + + return (true); + }; + + std::unique_ptr + MakeArgumentPointer(const std::array xyLengths, + const std::array xStrides, + const std::array yStrides, + const std::array reduceDims, + const std::array bnScaleBiasMeanVarLengths, + const std::array bnScaleStrides, + const std::array bnBiasStrides, + const std::array bnMeanVarStrides, + const void* p_x, + const void* bnScale, + const void* bnBias, + double epsilon, + const YElementwiseOp y_elementwise_op, + void* p_y, + void* resultSaveMean, + void* resultSaveInvVariance, + double averageFactor, + void* resultRunningMean, + void* resultRunningVariance) override + { + return std::make_unique(xyLengths, + xStrides, + yStrides, + reduceDims, + bnScaleBiasMeanVarLengths, + bnScaleStrides, + bnBiasStrides, + bnMeanVarStrides, + static_cast(p_x), + static_cast(bnScale), + static_cast(bnBias), + epsilon, + y_elementwise_op, + static_cast(p_y), + static_cast(resultSaveMean), + static_cast(resultSaveInvVariance), + averageFactor, + static_cast(resultRunningMean), + static_cast(resultRunningVariance)); + }; + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(); + }; + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "Reference_BatchNorm_Forward" << std::endl; + // clang-format on + + return str.str(); + } +}; + +} // namespace host +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward_nhwc_c.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward_nhwc_c.hpp deleted file mode 100644 index c54766b6a0..0000000000 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward_nhwc_c.hpp +++ /dev/null @@ -1,290 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 
2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include -#include -#include -#include -#include - -#include "ck/utility/math_v2.hpp" -#include "ck/utility/ignore.hpp" -#include "ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp" - -namespace ck { -namespace tensor_operation { -namespace host { - -template -struct ReferenceBatchNormFwd_Input_N_H_W_C_Output_C - : public device::DeviceBatchNormFwd<4, 3, YElementwiseOp> -{ - struct Argument : public device::BaseArgument - { - Argument(const std::array xyLengths, - const std::array xStrides, - const std::array yStrides, - const std::array reduceDims, - const std::array bnScaleBiasMeanVarLengths, - const std::array bnScaleStrides, - const std::array bnBiasStrides, - const std::array bnMeanVarStrides, - const XDataType* p_x, - const ScaleDataType* bnScale, - const BiasDataType* bnBias, - double epsilon, - const YElementwiseOp y_elementwise_op, - YDataType* p_y, - MeanVarDataType* resultSaveMean, - MeanVarDataType* resultSaveInvVariance, - double averageFactor, - MeanVarDataType* resultRunningMean, - MeanVarDataType* resultRunningVariance) - : p_x_(p_x), - bnScale_(bnScale), - bnBias_(bnBias), - y_elementwise_op_(y_elementwise_op), - p_y_(p_y), - resultSaveMean_(resultSaveMean), - resultSaveInvVariance_(resultSaveInvVariance), - resultRunningMean_(resultRunningMean), - resultRunningVariance_(resultRunningVariance) - { - ignore = xStrides; - ignore = yStrides; - ignore = bnScaleStrides; - ignore = bnBiasStrides; - ignore = bnMeanVarStrides; - ignore = reduceDims; - - if(xyLengths.size() != 4 || bnScaleBiasMeanVarLengths.size() != 1 || - bnScaleBiasMeanVarLengths[0] != xyLengths[3]) - throw std::runtime_error("Invalid tensor dimensions!"); - - n = xyLengths[0]; - h = xyLengths[1]; - w = xyLengths[2]; - c = xyLengths[3]; - - epsilon_ = type_convert(epsilon); - averageFactor_ = type_convert(averageFactor); - - resultSave = (resultSaveMean != nullptr && resultSaveInvVariance != nullptr); - resultRunning = (resultRunningMean != nullptr && resultRunningVariance != nullptr); - } - - const XDataType* p_x_; - const ScaleDataType* bnScale_; - const BiasDataType* bnBias_; - const YElementwiseOp y_elementwise_op_; - YDataType* p_y_; - - MeanVarDataType* resultSaveMean_; - MeanVarDataType* resultSaveInvVariance_; - MeanVarDataType* resultRunningMean_; - MeanVarDataType* resultRunningVariance_; - - bool resultSave, resultRunning; - - index_t n, h, w, c; - - AccDataType averageFactor_; - AccDataType epsilon_; - }; - - struct Invoker : public device::BaseInvoker - { - float Run(const Argument& arg) - { - auto thread_reduce_func = [&](auto iC) { - index_t offset_C = iC; - AccDataType mean = type_convert(0.0f); - AccDataType variance = type_convert(0.0f); - int32_t curr_count = 0; - - // compute mean, variance using welford method - for(index_t iN = 0; iN < arg.n; iN++) - { - index_t offset_N = iN * arg.h * arg.w * arg.c; - for(index_t iH = 0; iH < arg.h; iH++) - { - index_t offset_H = iH * arg.w * arg.c; - for(index_t iW = 0; iW < arg.w; iW++) - { - index_t offset_W = iW * arg.c; - - auto offset = offset_N + offset_H + offset_W + offset_C; - - curr_count++; - - AccDataType x = type_convert(arg.p_x_[offset]); - - AccDataType delta = x - mean; - - mean += delta / curr_count; - - AccDataType delta2 = x - mean; - - variance += delta * delta2; - }; - } - }; - - // actual variance - variance = variance / curr_count; - - AccDataType invVariance = - type_convert(1.0f) / ck::math::sqrt(arg.epsilon_ + variance); - - // save the 
mean/invVariance if required - if(arg.resultSave) - { - arg.resultSaveMean_[iC] = type_convert(mean); - arg.resultSaveInvVariance_[iC] = type_convert(invVariance); - }; - - // update the moving average if required - if(arg.resultRunning) - { - AccDataType oneMinusAverageFactor = - type_convert(1.0) - arg.averageFactor_; - arg.resultRunningMean_[iC] = type_convert( - type_convert(arg.resultRunningMean_[iC]) * - oneMinusAverageFactor + - mean * arg.averageFactor_); - arg.resultRunningVariance_[iC] = type_convert( - arg.resultRunningVariance_[iC] * oneMinusAverageFactor + - variance * arg.averageFactor_); - }; - - // Normalization - for(index_t iN = 0; iN < arg.n; iN++) - { - index_t offset_N = iN * arg.h * arg.w * arg.c; - for(index_t iH = 0; iH < arg.h; iH++) - { - index_t offset_H = iH * arg.w * arg.c; - for(index_t iW = 0; iW < arg.w; iW++) - { - index_t offset_W = iW * arg.c; - - auto offset = offset_N + offset_H + offset_W + offset_C; - - AccDataType x = type_convert(arg.p_x_[offset]); - - AccDataType norm_x = - arg.bnScale_[iC] * (x - mean) * invVariance + arg.bnBias_[iC]; - - arg.p_y_[offset] = type_convert(norm_x); - }; - } - }; - }; - - std::size_t num_thread = std::thread::hardware_concurrency(); - std::size_t work_per_thread = (arg.c + num_thread - 1) / num_thread; - - std::vector threads(num_thread); - - for(std::size_t it = 0; it < num_thread; ++it) - { - std::size_t ic_begin = it * work_per_thread; - std::size_t ic_end = std::min(static_cast((it + 1) * work_per_thread), arg.c); - - auto f = [=] { - for(std::size_t ic = ic_begin; ic < ic_end; ++ic) - { - thread_reduce_func(ic); - } - }; - - threads[it] = joinable_thread(f); - } - - return (0.0f); - }; - - float Run(const device::BaseArgument* p_arg, - const StreamConfig& /*stream_config*/ = StreamConfig{}) override - { - return Run(*dynamic_cast(p_arg)); - }; - }; - - bool IsSupportedArgument(const device::BaseArgument* p_arg) override - { - (void)p_arg; - - return (true); - }; - - std::unique_ptr - MakeArgumentPointer(const std::array xyLengths, - const std::array xStrides, - const std::array yStrides, - const std::array reduceDims, - const std::array bnScaleBiasMeanVarLengths, - const std::array bnScaleStrides, - const std::array bnBiasStrides, - const std::array bnMeanVarStrides, - const void* p_x, - const void* bnScale, - const void* bnBias, - double epsilon, - const YElementwiseOp y_elementwise_op, - void* p_y, - void* resultSaveMean, - void* resultSaveInvVariance, - double averageFactor, - void* resultRunningMean, - void* resultRunningVariance) override - { - return std::make_unique(xyLengths, - xStrides, - yStrides, - reduceDims, - bnScaleBiasMeanVarLengths, - bnScaleStrides, - bnBiasStrides, - bnMeanVarStrides, - static_cast(p_x), - static_cast(bnScale), - static_cast(bnBias), - epsilon, - y_elementwise_op, - static_cast(p_y), - static_cast(resultSaveMean), - static_cast(resultSaveInvVariance), - averageFactor, - static_cast(resultRunningMean), - static_cast(resultRunningVariance)); - }; - - std::unique_ptr MakeInvokerPointer() override - { - return std::make_unique(); - }; - - std::string GetTypeString() const override - { - auto str = std::stringstream(); - - // clang-format off - str << "Reference_BatchNorm_Forward_NHWC_C<" << std::endl; - // clang-format on - - return str.str(); - } -}; - -} // namespace host -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer.hpp 
b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer.hpp new file mode 100644 index 0000000000..463c655ac1 --- /dev/null +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer.hpp @@ -0,0 +1,300 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include + +#include "ck/library/utility/host_common_util.hpp" +#include "ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp" + +namespace ck { +namespace tensor_operation { +namespace host { + +template +struct ReferenceBatchNormInfer : public device::DeviceBatchNormInfer +{ + static_assert(Rank <= 6, "Bigger Rank size is not supported!"); + + static constexpr index_t NumInvariantDim = Rank - NumBatchNormReduceDim; + + struct Argument : public device::BaseArgument + { + Argument(const std::array xyLengths, + const std::array xStrides, + const std::array yStrides, + const std::array reduceDims, + const std::array bnScaleBiasMeanVarLengths, + const std::array bnScaleStrides, + const std::array bnBiasStrides, + const std::array bnMeanVarStrides, + const XDataType* p_x, + const ScaleDataType* bnScale, + const BiasDataType* bnBias, + double epsilon, + const YElementwiseOp y_elementwise_op, + const MeanVarDataType* estimatedMean, + const MeanVarDataType* estimatedVariance, + YDataType* p_y) + : reduceDims_(reduceDims), + bnScaleBiasMeanVarLengths_(bnScaleBiasMeanVarLengths), + bnScaleStrides_(bnScaleStrides), + bnBiasStrides_(bnBiasStrides), + bnMeanVarStrides_(bnMeanVarStrides), + p_x_(p_x), + bnScale_(bnScale), + bnBias_(bnBias), + y_elementwise_op_(y_elementwise_op), + estimatedMean_(estimatedMean), + estimatedVariance_(estimatedVariance), + p_y_(p_y) + { + using ck::host_common::get_index_set; + + if(std::any_of( + reduceDims.begin(), reduceDims.end(), [](int d) { return d < 0 || d >= Rank; })) + throw std::runtime_error("Invalid reduce dimensions!"); + + // get invariant_dims[] and invariant_lengths[] + for(int dim = 0, i = 0; dim < Rank; dim++) + if(std::none_of( + reduceDims.begin(), reduceDims.end(), [&](int d) { return d == dim; })) + { + invariantDims_[i] = dim; + invariant_lengths_[i] = xyLengths[dim]; + i++; + }; + + // get reduce_lengths_[] + for(int j = 0, i = 0; j < NumBatchNormReduceDim; j++) + { + int dim = reduceDims[j]; + reduce_lengths_[i++] = xyLengths[dim]; + }; + + // check invariant_lengths_ and bnScaleBiasMeanVarLengths + for(int i = 0; i < NumInvariantDim; i++) + if(invariant_lengths_[i] != bnScaleBiasMeanVarLengths_[i]) + throw std::runtime_error("Invalid lengths parameters!"); + + for(int j = 0, i = 0; j < NumInvariantDim; j++) + { + int dim = invariantDims_[j]; + x_invariant_strides_[i] = xStrides[dim]; + y_invariant_strides_[i] = yStrides[dim]; + i++; + }; + + for(int j = 0, i = 0; j < NumBatchNormReduceDim; j++) + { + int dim = reduceDims_[j]; + x_reduce_strides_[i] = xStrides[dim]; + y_reduce_strides_[i] = yStrides[dim]; + i++; + }; + + invariant_index_set_ = get_index_set(invariant_lengths_); + reduce_index_set_ = get_index_set(reduce_lengths_); + + epsilon_ = type_convert(epsilon); + } + + std::array reduceDims_; + std::array invariantDims_; + std::array invariant_lengths_; + std::array reduce_lengths_; + + const std::array bnScaleBiasMeanVarLengths_; + const std::array bnScaleStrides_; + const std::array bnBiasStrides_; + const std::array bnMeanVarStrides_; + + std::array x_invariant_strides_; + std::array y_invariant_strides_; + 
std::array x_reduce_strides_; + std::array y_reduce_strides_; + + const XDataType* p_x_; + const ScaleDataType* bnScale_; + const BiasDataType* bnBias_; + const YElementwiseOp y_elementwise_op_; + + const MeanVarDataType* estimatedMean_; + const MeanVarDataType* estimatedVariance_; + + YDataType* p_y_; + + std::vector> invariant_index_set_; + std::vector> reduce_index_set_; + + AccDataType epsilon_; + }; + + struct Invoker : public device::BaseInvoker + { + float Run(const Argument& arg) + { + using ck::host_common::get_offset_from_index; + + auto thread_reduce_func = [&](auto invariant_index) { + size_t x_invariant_offset = get_offset_from_index( + arg.x_invariant_strides_, invariant_index); + size_t y_invariant_offset = get_offset_from_index( + arg.y_invariant_strides_, invariant_index); + + size_t mean_variance_offset = + get_offset_from_index(arg.bnMeanVarStrides_, invariant_index); + + AccDataType mean = arg.estimatedMean_[mean_variance_offset]; + AccDataType variance = arg.estimatedVariance_[mean_variance_offset]; + + // inv-variance defined as 1/sqrt(epsilon+variance) + AccDataType invVariance = + type_convert(1.0f) / std::sqrt(arg.epsilon_ + variance); + + size_t scale_offset = + get_offset_from_index(arg.bnScaleStrides_, invariant_index); + size_t bias_offset = + get_offset_from_index(arg.bnBiasStrides_, invariant_index); + + AccDataType scale = type_convert(arg.bnScale_[scale_offset]); + AccDataType bias = type_convert(arg.bnBias_[bias_offset]); + + // normalization + for(const auto& reduce_index : arg.reduce_index_set_) + { + size_t x_reduce_offset = get_offset_from_index( + arg.x_reduce_strides_, reduce_index); + size_t y_reduce_offset = get_offset_from_index( + arg.y_reduce_strides_, reduce_index); + + auto x_offset = x_invariant_offset + x_reduce_offset; + auto y_offset = y_invariant_offset + y_reduce_offset; + + AccDataType x = type_convert(arg.p_x_[x_offset]); + + AccDataType norm_x = (x - mean) * invVariance; + + AccDataType y = scale * norm_x + bias; + + arg.y_elementwise_op_(y, y); + + arg.p_y_[y_offset] = type_convert(y); + }; + }; + + std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t work_per_thread = + (arg.invariant_index_set_.size() + num_thread - 1) / num_thread; + + std::vector threads(num_thread); + + for(std::size_t it = 0; it < num_thread; ++it) + { + std::size_t i_begin = it * work_per_thread; + std::size_t i_end = std::min(static_cast((it + 1) * work_per_thread), + arg.invariant_index_set_.size()); + + auto f = [=] { + for(std::size_t i = i_begin; i < i_end; ++i) + { + thread_reduce_func(arg.invariant_index_set_[i]); + } + }; + + threads[it] = joinable_thread(f); + } + + return (0.0f); + }; + + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /*stream_config*/ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + }; + }; + + bool IsSupportedArgument(const device::BaseArgument* p_arg) override + { + (void)p_arg; + + return (true); + }; + + std::unique_ptr + MakeArgumentPointer(const std::array xyLengths, + const std::array xStrides, + const std::array yStrides, + const std::array reduceDims, + const std::array bnScaleBiasMeanVarLengths, + const std::array bnScaleStrides, + const std::array bnBiasStrides, + const std::array bnMeanVarStrides, + const void* p_x, + const void* bnScale, + const void* bnBias, + double epsilon, + const YElementwiseOp y_elementwise_op, + const void* estimatedMean, + const void* estimatedVariance, + void* p_y) override + { + return std::make_unique(xyLengths, + xStrides, + 
yStrides, + reduceDims, + bnScaleBiasMeanVarLengths, + bnScaleStrides, + bnBiasStrides, + bnMeanVarStrides, + static_cast(p_x), + static_cast(bnScale), + static_cast(bnBias), + epsilon, + y_elementwise_op, + static_cast(estimatedMean), + static_cast(estimatedVariance), + static_cast(p_y)); + }; + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(); + }; + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "Reference_BatchNorm_Infer<" << std::endl; + // clang-format on + + return str.str(); + } +}; + +} // namespace host +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer_nhwc_c.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer_nhwc_c.hpp deleted file mode 100644 index 01e9572740..0000000000 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer_nhwc_c.hpp +++ /dev/null @@ -1,204 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include -#include -#include -#include - -#include "ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp" - -namespace ck { -namespace tensor_operation { -namespace host { - -template -struct ReferenceBatchNormInfer_Input_N_H_W_C_Output_C : public device::DeviceBatchNormInfer<4, 3> -{ - struct Argument : public device::BaseArgument - { - Argument(const std::array xyLengths, - const std::array xStrides, - const std::array yStrides, - const std::array bnScaleBiasMeanVarLengths, - const std::array bnScaleStrides, - const std::array bnBiasStrides, - const std::array bnMeanVarStrides, - const XDataType* p_x, - const ScaleDataType* bnScale, - const BiasDataType* bnBias, - double epsilon, - const MeanVarDataType* estimatedMean, - const MeanVarDataType* estimatedVariance, - YDataType* p_y) - : p_x_(p_x), - bnScale_(bnScale), - bnBias_(bnBias), - epsilon_(epsilon), - estimatedMean_(estimatedMean), - estimatedVariance_(estimatedVariance), - p_y_(p_y) - { - ignore = xStrides; - ignore = yStrides; - ignore = bnScaleStrides; - ignore = bnBiasStrides; - ignore = bnMeanVarStrides; - - if(xyLengths.size() != 4 || bnScaleBiasMeanVarLengths.size() != 1 || - bnScaleBiasMeanVarLengths[0] != xyLengths[3]) - throw std::runtime_error("Invalid tensor dimensions!"); - - n_ = xyLengths[0]; - h_ = xyLengths[1]; - w_ = xyLengths[2]; - c_ = xyLengths[3]; - } - - const XDataType* p_x_; - const ScaleDataType* bnScale_; - const BiasDataType* bnBias_; - - double epsilon_; - - const MeanVarDataType* estimatedMean_; - const MeanVarDataType* estimatedVariance_; - - YDataType* p_y_; - - index_t n_, h_, w_, c_; - }; - - struct Invoker : public device::BaseInvoker - { - float Run(const Argument& arg) - { - auto thread_reduce_func = [&](auto iC) { - index_t offset_C = iC; - AccDataType mean = arg.estimatedMean_[offset_C]; - AccDataType variance = arg.estimatedVariance_[offset_C]; - - AccDataType invVariance = - type_convert(1.0f) / - std::sqrt(type_convert(arg.epsilon_) + variance); - - // Normalization - for(index_t iN = 0; iN < arg.n_; iN++) - { - index_t offset_N = iN * arg.h_ * arg.w_ * arg.c_; - for(index_t iH = 0; iH < arg.h_; iH++) - { - index_t offset_H = iH * arg.w_ * arg.c_; - for(index_t iW = 0; iW < arg.w_; iW++) - { - index_t offset_W = iW * arg.c_; - - auto offset = offset_N + offset_H + offset_W + offset_C; - - AccDataType x = 
type_convert(arg.p_x_[offset]); - - AccDataType norm_x = - arg.bnScale_[iC] * (x - mean) * invVariance + arg.bnBias_[iC]; - - arg.p_y_[offset] = type_convert(norm_x); - }; - } - }; - }; - - std::size_t num_thread = std::thread::hardware_concurrency(); - std::size_t work_per_thread = (arg.c_ + num_thread - 1) / num_thread; - - std::vector threads(num_thread); - - for(std::size_t it = 0; it < num_thread; ++it) - { - std::size_t ic_begin = it * work_per_thread; - std::size_t ic_end = std::min(static_cast((it + 1) * work_per_thread), arg.c_); - - auto f = [=] { - for(std::size_t ic = ic_begin; ic < ic_end; ++ic) - { - thread_reduce_func(ic); - } - }; - - threads[it] = joinable_thread(f); - } - - return (0.0f); - }; - - float Run(const device::BaseArgument* p_arg, - const StreamConfig& /*stream_config*/ = StreamConfig{}) override - { - return Run(*dynamic_cast(p_arg)); - }; - }; - - bool IsSupportedArgument(const device::BaseArgument* p_arg) override - { - (void)p_arg; - - return (true); - }; - - std::unique_ptr - MakeArgumentPointer(const std::array xyLengths, - const std::array xStrides, - const std::array yStrides, - const std::array bnScaleBiasMeanVarLengths, - const std::array bnScaleStrides, - const std::array bnBiasStrides, - const std::array bnMeanVarStrides, - const void* p_x, - const void* bnScale, - const void* bnBias, - double epsilon, - const void* estimatedMean, - const void* estimatedVariance, - void* p_y) override - { - return std::make_unique(xyLengths, - xStrides, - yStrides, - bnScaleBiasMeanVarLengths, - bnScaleStrides, - bnBiasStrides, - bnMeanVarStrides, - static_cast(p_x), - static_cast(bnScale), - static_cast(bnBias), - epsilon, - static_cast(estimatedMean), - static_cast(estimatedVariance), - static_cast(p_y)); - }; - - std::unique_ptr MakeInvokerPointer() override - { - return std::make_unique(); - }; - - std::string GetTypeString() const override - { - auto str = std::stringstream(); - - // clang-format off - str << "Reference_BatchNorm_Forward_NHWC_C<" << std::endl; - // clang-format on - - return str.str(); - } -}; - -} // namespace host -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp new file mode 100644 index 0000000000..9a06988ff8 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
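+//
+// A minimal usage sketch of the factory declared below (illustrative only; it
+// mirrors client_example/13_batchnorm, and the template-argument order follows
+// the parameter comment rows in the instance files, i.e. the X/Y/Acc/Scale/
+// Bias/MeanVar data types, then YElementwiseOp, Rank, NumReduceDim):
+//
+//   using DeviceOp = ck::tensor_operation::device::DeviceBatchNormFwd<
+//       float, float, float, float, float, float,
+//       ck::tensor_operation::element_wise::PassThrough, 4, 3>;
+//   const auto op_ptrs = ck::tensor_operation::device::instance::
+//       DeviceOperationInstanceFactory<DeviceOp>::GetInstances();
+//
+// Each returned instance exposes MakeArgumentPointer / IsSupportedArgument /
+// MakeInvokerPointer, as exercised in the client example.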
+ +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// FP16 +void add_device_batchnorm_forward_rank_4_3_f16_instances( + std::vector< + std::unique_ptr>>&); + +// FP32 +void add_device_batchnorm_forward_rank_4_3_f32_instances( + std::vector< + std::unique_ptr>>&); + +// BF16 +void add_device_batchnorm_forward_rank_4_3_bf16_instances( + std::vector< + std::unique_ptr>>&); + +// Int8 +void add_device_batchnorm_forward_rank_4_3_i8_instances( + std::vector>>&); + +// FP64 +void add_device_batchnorm_forward_rank_4_3_f64_instances( + std::vector< + std::unique_ptr>>&); + +template +struct DeviceOperationInstanceFactory< + ck::tensor_operation::device::DeviceBatchNormFwd> +{ + using DeviceOp = DeviceBatchNormFwd; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(Rank == 4 && NumReduceDim == 3 && is_same_v) + { + add_device_batchnorm_forward_rank_4_3_f16_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(Rank == 4 && NumReduceDim == 3 && is_same_v) + { + add_device_batchnorm_forward_rank_4_3_f32_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(Rank == 4 && NumReduceDim == 3 && is_same_v) + { + add_device_batchnorm_forward_rank_4_3_bf16_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(Rank == 4 && NumReduceDim == 3 && is_same_v) + { + add_device_batchnorm_forward_rank_4_3_i8_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(Rank == 4 && NumReduceDim == 3 && is_same_v) + { + add_device_batchnorm_forward_rank_4_3_f64_instances(op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/utility/host_common_util.hpp b/library/include/ck/library/utility/host_common_util.hpp index 31e5571eed..6f4466e8da 100644 --- a/library/include/ck/library/utility/host_common_util.hpp +++ b/library/include/ck/library/utility/host_common_util.hpp @@ -4,9 +4,11 @@ #pragma once #include +#include #include #include #include +#include #include "ck/ck.hpp" @@ -72,5 +74,63 @@ static inline std::vector getTypeValuesFromString(const char* cstr_values) return (values); } +template +static inline std::vector> +get_index_set(const std::array& dim_lengths) +{ + static_assert(NDim >= 1, "NDim >= 1 is required to use this function!"); + + if constexpr(NDim == 1) + { + std::vector> index_set; + + for(int i = 0; i < dim_lengths[0]; i++) + { + std::array index{i}; + + index_set.push_back(index); + }; + + return index_set; + } + else + { + std::vector> index_set; + std::array partial_dim_lengths; + + std::copy(dim_lengths.begin() + 1, dim_lengths.end(), partial_dim_lengths.begin()); + + std::vector> partial_index_set; + + partial_index_set = get_index_set(partial_dim_lengths); + + 
for(index_t i = 0; i < dim_lengths[0]; i++) + for(const auto& partial_index : partial_index_set) + { + std::array index; + + index[0] = i; + + std::copy(partial_index.begin(), partial_index.end(), index.begin() + 1); + + index_set.push_back(index); + }; + + return index_set; + }; +}; + +template +static inline size_t get_offset_from_index(const std::array& strides, + const std::array& index) +{ + size_t offset = 0; + + for(int i = 0; i < NDim; i++) + offset += index[i] * strides[i]; + + return (offset); +}; + } // namespace host_common } // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/batchnorm/CMakeLists.txt new file mode 100644 index 0000000000..c637693f10 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batchnorm/CMakeLists.txt @@ -0,0 +1,7 @@ +add_instance_library(device_batchnorm_instance + device_batchnorm_forward_f16_instance.cpp + device_batchnorm_forward_f32_instance.cpp + device_batchnorm_forward_bf16_instance.cpp + device_batchnorm_forward_i8_instance.cpp + device_batchnorm_forward_f64_instance.cpp +) diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_bf16_instance.cpp new file mode 100644 index 0000000000..cd1e05b113 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_bf16_instance.cpp @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = ck::bhalf_t; +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// clang-format off +template +using device_batchnorm_forward_bf16_blockwise_instances = + std::tuple < + // XDataType, YDataType, AccDataType, ScaleDataType, BiasDataType, MeanVarDataType, YElementwiseOp, Rank, NumReduceDim, UseMultiBlockInK, BLockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XSrcYDstVectorDim, XSrcVectorSize, YDstVectorSize, ScaleSrcVectorSize, BiasSrcVectorSize, MeanVarSrcDstVectorSize + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + 
DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl + >; +// clang-format on + +// clang-format off +template +using device_batchnorm_forward_bf16_multiblock_instances = + std::tuple < + // XDataType, YDataType, AccDataType, ScaleDataType, BiasDataType, MeanVarDataType, YElementwiseOp, Rank, NumReduceDim, UseMultiBlockInK, BLockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XSrcYDstVectorDim, XSrcVectorSize, YDstVectorSize, ScaleSrcVectorSize, BiasSrcVectorSize, MeanVarSrcDstVectorSize + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl + >; +// clang-format on + +void add_device_batchnorm_forward_rank_4_3_bf16_instances( + std::vector< + std::unique_ptr>>& + instances) +{ + add_device_operation_instances( + instances, device_batchnorm_forward_bf16_blockwise_instances<4, 3, PassThrough>{}); + add_device_operation_instances( + instances, device_batchnorm_forward_bf16_multiblock_instances<4, 3, PassThrough>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f16_instance.cpp new file mode 100644 index 0000000000..073dd583f9 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f16_instance.cpp @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
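+//
+// Note on the two instance families registered in this file: the "blockwise"
+// tuple uses UseMultiBlockInK = false, i.e. the reduction over the reduced (K)
+// dimensions stays within a single workgroup, while the "multiblock" tuple
+// uses UseMultiBlockInK = true and, as the flag name suggests, splits the K
+// reduction across workgroups, which is why callers query GetWorkSpaceSize()
+// and set a workspace (see the client example). The remaining template
+// parameters named in the comment row of each tuple are tiling and
+// vectorization knobs; device_batchnorm_forward_impl.hpp is the authoritative
+// parameter list.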
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// clang-format off +template +using device_batchnorm_forward_f16_blockwise_instances = + std::tuple < + // XDataType, YDataType, AccDataType, ScaleDataType, BiasDataType, MeanVarDataType, YElementwiseOp, Rank, NumReduceDim, UseMultiBlockInK, BLockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XSrcYDstVectorDim, XSrcVectorSize, YDstVectorSize, ScaleSrcVectorSize, BiasSrcVectorSize, MeanVarSrcDstVectorSize + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl + >; +// clang-format on + +// clang-format off +template +using device_batchnorm_forward_f16_multiblock_instances = + std::tuple < + // XDataType, YDataType, AccDataType, ScaleDataType, BiasDataType, MeanVarDataType, YElementwiseOp, Rank, NumReduceDim, UseMultiBlockInK, BLockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XSrcYDstVectorDim, XSrcVectorSize, YDstVectorSize, ScaleSrcVectorSize, BiasSrcVectorSize, MeanVarSrcDstVectorSize + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + 
DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl + >; +// clang-format on + +void add_device_batchnorm_forward_rank_4_3_f16_instances( + std::vector< + std::unique_ptr>>& + instances) +{ + add_device_operation_instances( + instances, device_batchnorm_forward_f16_blockwise_instances<4, 3, PassThrough>{}); + add_device_operation_instances( + instances, device_batchnorm_forward_f16_multiblock_instances<4, 3, PassThrough>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f32_instance.cpp new file mode 100644 index 0000000000..be63bd44c6 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f32_instance.cpp @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// clang-format off +template +using device_batchnorm_forward_f32_blockwise_instances = std::tuple< + // XDataType, YDataType, AccDataType, ScaleDataType, BiasDataType, MeanVarDataType, YElementwiseOp, Rank, NumReduceDim, UseMultiBlockInK, BLockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XSrcYDstVectorDim, XSrcVectorSize, YDstVectorSize, ScaleSrcVectorSize, BiasSrcVectorSize, MeanVarSrcDstVectorSize + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl + >; +// 
clang-format on + +// clang-format off +template +using device_batchnorm_forward_f32_multiblock_instances = + std::tuple < + // XDataType, YDataType, AccDataType, ScaleDataType, BiasDataType, MeanVarDataType, YElementwiseOp, Rank, NumReduceDim, UseMultiBlockInK, BLockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XSrcYDstVectorDim, XSrcVectorSize, YDstVectorSize, ScaleSrcVectorSize, BiasSrcVectorSize, MeanVarSrcDstVectorSize + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl + >; +// clang-format on + +void add_device_batchnorm_forward_rank_4_3_f32_instances( + std::vector< + std::unique_ptr>>& + instances) +{ + add_device_operation_instances( + instances, device_batchnorm_forward_f32_blockwise_instances<4, 3, PassThrough>{}); + add_device_operation_instances( + instances, device_batchnorm_forward_f32_multiblock_instances<4, 3, PassThrough>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f64_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f64_instance.cpp new file mode 100644 index 0000000000..fe87091e8d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f64_instance.cpp @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F64 = double; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// clang-format off +template +using device_batchnorm_forward_f64_blockwise_instances = std::tuple< + // XDataType, YDataType, AccDataType, ScaleDataType, BiasDataType, MeanVarDataType, YElementwiseOp, Rank, NumReduceDim, UseMultiBlockInK, BLockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XSrcYDstVectorDim, XSrcVectorSize, YDstVectorSize, ScaleSrcVectorSize, BiasSrcVectorSize, MeanVarSrcDstVectorSize + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl + >; +// clang-format on + +// clang-format off +template +using device_batchnorm_forward_f64_multiblock_instances = + std::tuple < + // XDataType, YDataType, AccDataType, ScaleDataType, BiasDataType, MeanVarDataType, YElementwiseOp, Rank, NumReduceDim, UseMultiBlockInK, BLockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XSrcYDstVectorDim, XSrcVectorSize, YDstVectorSize, ScaleSrcVectorSize, BiasSrcVectorSize, MeanVarSrcDstVectorSize + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + 
DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl + >; +// clang-format on + +void add_device_batchnorm_forward_rank_4_3_f64_instances( + std::vector< + std::unique_ptr>>& + instances) +{ + add_device_operation_instances( + instances, device_batchnorm_forward_f64_blockwise_instances<4, 3, PassThrough>{}); + add_device_operation_instances( + instances, device_batchnorm_forward_f64_multiblock_instances<4, 3, PassThrough>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_i8_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_i8_instance.cpp new file mode 100644 index 0000000000..88ce369e15 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_i8_instance.cpp @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using I8 = int8_t; +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// clang-format off +template +using device_batchnorm_forward_i8_blockwise_instances = std::tuple< + // XDataType, YDataType, AccDataType, ScaleDataType, BiasDataType, MeanVarDataType, YElementwiseOp, Rank, NumReduceDim, UseMultiBlockInK, BLockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XSrcYDstVectorDim, XSrcVectorSize, YDstVectorSize, ScaleSrcVectorSize, BiasSrcVectorSize, MeanVarSrcDstVectorSize + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl + >; +// clang-format on 
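+
+// To make the column legend concrete: written out in full, a single blockwise entry
+// from the tuple above would read roughly as follows (the numeric tuning values here
+// are only an illustration, not the values used by this patch):
+//
+//   DeviceBatchNormFwdImpl<I8, I8, F32, I8, I8, F32, PassThrough, Rank, NumReduceDim,
+//                          false, 256, 16, 16, 1, 1, 0, 1, 1, 1, 1, 1>
+//
+// i.e. a 256-thread block arranged as a 16 x 16 (M x K) thread cluster, one element
+// per thread in each dimension, and scalar (vector size 1) loads and stores.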
+ +// clang-format off +template +using device_batchnorm_forward_i8_multiblock_instances = + std::tuple < + // XDataType, YDataType, AccDataType, ScaleDataType, BiasDataType, MeanVarDataType, YElementwiseOp, Rank, NumReduceDim, UseMultiBlockInK, BLockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XSrcYDstVectorDim, XSrcVectorSize, YDstVectorSize, ScaleSrcVectorSize, BiasSrcVectorSize, MeanVarSrcDstVectorSize + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl + >; +// clang-format on + +void add_device_batchnorm_forward_rank_4_3_i8_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, device_batchnorm_forward_i8_blockwise_instances<4, 3, PassThrough>{}); + add_device_operation_instances( + instances, device_batchnorm_forward_i8_multiblock_instances<4, 3, PassThrough>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/profiler/CMakeLists.txt b/profiler/CMakeLists.txt index 9e1f6f5232..aad40cc79f 100644 --- a/profiler/CMakeLists.txt +++ b/profiler/CMakeLists.txt @@ -26,6 +26,7 @@ set(PROFILER_SOURCE src/profile_groupnorm.cpp src/profile_layernorm.cpp src/profile_softmax.cpp + src/profile_batchnorm_fwd.cpp ) add_executable(ckProfiler ${PROFILER_SOURCE}) @@ -57,5 +58,6 @@ target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instanc target_link_libraries(ckProfiler PRIVATE device_normalization_instance) target_link_libraries(ckProfiler PRIVATE device_softmax_instance) target_link_libraries(ckProfiler PRIVATE device_reduce_instance) +target_link_libraries(ckProfiler PRIVATE device_batchnorm_instance) rocm_install(TARGETS ckProfiler COMPONENT profiler) diff --git a/profiler/include/profile_batchnorm_forward_impl.hpp b/profiler/include/profile_batchnorm_forward_impl.hpp new file mode 100644 index 0000000000..b7fc435f07 --- /dev/null +++ b/profiler/include/profile_batchnorm_forward_impl.hpp @@ -0,0 +1,440 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
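+//
+// Profiling loop for batchnorm forward: generates host-side input tensors, queries
+// every registered DeviceBatchNormFwd instance from the instance factory, runs and
+// times each supported instance, reports the effective bandwidth, and optionally
+// verifies the device results against the CPU reference and dumps buffers to files.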
+
+#pragma once
+
+#include 
+#include 
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward.hpp"
+
+namespace ck {
+namespace profiler {
+
+template 
+bool profile_batchnorm_forward_impl(int do_verification,
+                                    int init_method,
+                                    bool do_dumpout,
+                                    bool time_kernel,
+                                    const std::vector inOutLengths,
+                                    const std::vector reduceDims,
+                                    bool updateMovingAverage,
+                                    bool saveMeanAndInvVariance,
+                                    double epsilon,
+                                    double averageFactor)
+{
+    if(inOutLengths.size() != Rank || reduceDims.size() != NumBatchNormReduceDim)
+    {
+        throw std::runtime_error("Invalid tensor lengths or number of reduce dimensions!");
+    };
+
+    std::vector scaleBiasMeanVarLengths;
+
+    // used for calculating the effective number of bytes transferred by each operation
+    size_t total_length;
+    size_t invariant_length = 1;
+
+    total_length =
+        std::accumulate(inOutLengths.begin(), inOutLengths.end(), 1, std::multiplies{});
+
+    if(std::any_of(reduceDims.begin(), reduceDims.end(), [](int d) { return d < 0 || d >= Rank; }))
+        throw std::runtime_error("Invalid reduce dimensions!");
+
+    for(int dim = 0; dim < Rank; dim++)
+    {
+        if(std::none_of(reduceDims.begin(), reduceDims.end(), [&](int d) { return dim == d; }))
+        {
+            scaleBiasMeanVarLengths.push_back(inOutLengths[dim]);
+            invariant_length *= inOutLengths[dim];
+        };
+    }
+
+    // input data of the batchnorm forward algorithm
+    Tensor x(inOutLengths);
+    Tensor bnScale(scaleBiasMeanVarLengths);
+    Tensor bnBias(scaleBiasMeanVarLengths);
+
+    // output data of the batchnorm forward algorithm
+    Tensor y_ref(inOutLengths);
+    Tensor y(inOutLengths);
+
+    Tensor resultSaveMean_ref(scaleBiasMeanVarLengths);
+    Tensor resultSaveInvVariance_ref(scaleBiasMeanVarLengths);
+
+    Tensor resultRunningMean_ref(scaleBiasMeanVarLengths);
+    Tensor resultRunningVariance_ref(scaleBiasMeanVarLengths);
+
+    auto inOutStrides            = x.mDesc.GetStrides();
+    auto scaleBiasMeanVarStrides = bnScale.mDesc.GetStrides();
+
+    std::size_t num_thread = std::thread::hardware_concurrency();
+
+    if(updateMovingAverage)
+    {
+        if constexpr(ck::is_same_v)
+        {
+            x.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+
+            const float x_mean       = 0.0f;
+            const float x_stddev     = 2.5f;
+            const float noise_stddev = 0.04f;
+
+            resultRunningMean_ref.GenerateTensorValue(
+                GeneratorTensor_4{x_mean, noise_stddev}, num_thread);
+
+            resultRunningVariance_ref.GenerateTensorValue(
+                GeneratorTensor_4{x_stddev * x_stddev, noise_stddev}, num_thread);
+        }
+        else
+        {
+            const float x_mean       = 0.0f;
+            const float x_stddev     = 1.0f;
+            const float noise_stddev = 0.04f;
+
+            // input data in normal distribution
+            x.GenerateTensorValue(GeneratorTensor_4{x_mean, x_stddev}, num_thread);
+
+            // initialize the runningMean to values with only a tiny deviation from the mean of
+            // the x values
+            resultRunningMean_ref.GenerateTensorValue(
+                GeneratorTensor_4{x_mean, noise_stddev}, num_thread);
+
+            // initialize the runningVariance to values with only a tiny deviation from the
+            // variance of the x values
+            resultRunningVariance_ref.GenerateTensorValue(
+                GeneratorTensor_4{x_stddev * x_stddev, noise_stddev}, num_thread);
+        };
+    }
+    else
+    {
+        if constexpr(ck::is_same_v)
+            x.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        else
+            x.GenerateTensorValue(GeneratorTensor_3{-1.0f, 1.0f}, num_thread);
+    };
+
+    if(do_verification)
+    {
+        if constexpr(ck::is_same_v && ck::is_same_v)
+        {
+            bnScale.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+            bnBias.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        }
+        else
+        {
+            switch(init_method)
+            {
+            case 0:
+                bnScale.GenerateTensorValue(GeneratorTensor_0{}, num_thread);
+                bnBias.GenerateTensorValue(GeneratorTensor_0{}, num_thread);
+                break;
+            case 1:
+                bnScale.GenerateTensorValue(GeneratorTensor_1{1}, num_thread);
+                bnBias.GenerateTensorValue(GeneratorTensor_1{0}, num_thread);
+                break;
+            case 2:
+                bnScale.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+                bnBias.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+                break;
+            default:
+                bnScale.GenerateTensorValue(GeneratorTensor_3{-1.0f, 1.0f}, num_thread);
+                bnBias.GenerateTensorValue(GeneratorTensor_3{-1.0f, 1.0f}, num_thread);
+            }
+        };
+    };
+
+    // these buffers are usually provided by the user application
+    DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
+    DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
+    DeviceMem bnScale_dev(sizeof(ScaleDataType) * bnScale.mDesc.GetElementSpaceSize());
+    DeviceMem bnBias_dev(sizeof(BiasDataType) * bnBias.mDesc.GetElementSpaceSize());
+
+    // mean_dev or resultSaveMean_dev
+    DeviceMem resultSaveMean_dev(sizeof(MeanVarDataType) *
+                                 resultSaveMean_ref.mDesc.GetElementSpaceSize());
+    // meansquare_dev or resultSaveInvVariance_dev
+    DeviceMem resultSaveInvVariance_dev(sizeof(MeanVarDataType) *
+                                        resultSaveInvVariance_ref.mDesc.GetElementSpaceSize());
+    // resultRunningMean_dev
+    DeviceMem resultRunningMean_dev(sizeof(MeanVarDataType) *
+                                    resultRunningMean_ref.mDesc.GetElementSpaceSize());
+    // resultRunningVariance_dev
+    DeviceMem resultRunningVariance_dev(sizeof(MeanVarDataType) *
+                                        resultRunningVariance_ref.mDesc.GetElementSpaceSize());
+
+    x_dev.ToDevice(x.mData.data());
+    bnScale_dev.ToDevice(bnScale.mData.data());
+    bnBias_dev.ToDevice(bnBias.mData.data());
+
+    if(updateMovingAverage)
+    {
+        resultRunningMean_dev.ToDevice(resultRunningMean_ref.mData.data());
+        resultRunningVariance_dev.ToDevice(resultRunningVariance_ref.mData.data());
+    };
+
+    // used for storing the device result for verification when updateMovingAverage is enabled
+    Tensor resultRunningMean(scaleBiasMeanVarLengths);
+    Tensor resultRunningVariance(scaleBiasMeanVarLengths);
+
+    // used for storing the device result for verification when saveMeanAndInvVariance is enabled
+    Tensor resultSaveMean(scaleBiasMeanVarLengths);
+    Tensor resultSaveInvVariance(scaleBiasMeanVarLengths);
+
+    std::array arrInOutLengths;
+    std::array arrInOutStrides;
+    std::array arrScaleBiasMeanVarLengths;
+    std::array arrScaleBiasMeanVarStrides;
+    std::array arrReduceDims;
+
+    std::copy(inOutLengths.begin(), inOutLengths.end(), arrInOutLengths.begin());
+    std::copy(inOutStrides.begin(), inOutStrides.end(), arrInOutStrides.begin());
+    std::copy(scaleBiasMeanVarLengths.begin(),
+              scaleBiasMeanVarLengths.end(),
+              arrScaleBiasMeanVarLengths.begin());
+    std::copy(scaleBiasMeanVarStrides.begin(),
+              scaleBiasMeanVarStrides.end(),
+              arrScaleBiasMeanVarStrides.begin());
+
+    std::copy(reduceDims.begin(), reduceDims.end(), arrReduceDims.begin());
+
+    using PassThroughOp = ck::tensor_operation::element_wise::PassThrough;
+
+    // add device batchnorm-forward instances
+    using DeviceOp =
+        ck::tensor_operation::device::DeviceBatchNormFwd;
+
+    // get device op instances
+    const auto instance_ptrs =
+        ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+            DeviceOp>::GetInstances();
+
+    std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;
+
+    std::string best_instance_name;
+    float best_avg_time   = std::numeric_limits::max();
+    float best_gb_per_sec = 0;
+
+    if(do_verification)
+    {
+        using ReferenceBatchNormFwdInstance =
+            ck::tensor_operation::host::ReferenceBatchNormFwd;
+
+        auto batchNormFwd_ref = ReferenceBatchNormFwdInstance{};
+
+        auto argument_ptr_ref = batchNormFwd_ref.MakeArgumentPointer(
+            arrInOutLengths,
+            arrInOutStrides,
+            arrInOutStrides,
+            arrReduceDims,
+            arrScaleBiasMeanVarLengths,
+            arrScaleBiasMeanVarStrides,
+            arrScaleBiasMeanVarStrides,
+            arrScaleBiasMeanVarStrides,
+            x.mData.data(),
+            bnScale.mData.data(),
+            bnBias.mData.data(),
+            epsilon,
+            PassThroughOp{},
+            y_ref.mData.data(),
+            saveMeanAndInvVariance ? resultSaveMean_ref.mData.data() : nullptr,
+            saveMeanAndInvVariance ? resultSaveInvVariance_ref.mData.data() : nullptr,
+            averageFactor,
+            updateMovingAverage ? resultRunningMean_ref.mData.data() : nullptr,
+            updateMovingAverage ? resultRunningVariance_ref.mData.data() : nullptr);
+
+        if(!batchNormFwd_ref.IsSupportedArgument(argument_ptr_ref.get()))
+        {
+            std::cout << "The runtime parameters are not supported by the reference instance, "
+                         "exiting!"
+                      << std::endl;
+            return (false);
+        };
+
+        auto invoker_ptr_ref = batchNormFwd_ref.MakeInvokerPointer();
+
+        (void)invoker_ptr_ref->Run(argument_ptr_ref.get());
+    }
+
+    int num_kernel = 0;
+    bool pass      = true;
+
+    for(auto& inst_ptr : instance_ptrs)
+    {
+        auto argument_ptr = inst_ptr->MakeArgumentPointer(
+            arrInOutLengths,
+            arrInOutStrides,
+            arrInOutStrides,
+            arrReduceDims,
+            arrScaleBiasMeanVarLengths,
+            arrScaleBiasMeanVarStrides,
+            arrScaleBiasMeanVarStrides,
+            arrScaleBiasMeanVarStrides,
+            x_dev.GetDeviceBuffer(),
+            bnScale_dev.GetDeviceBuffer(),
+            bnBias_dev.GetDeviceBuffer(),
+            epsilon,
+            PassThroughOp{},
+            y_dev.GetDeviceBuffer(),
+            saveMeanAndInvVariance ? resultSaveMean_dev.GetDeviceBuffer() : nullptr,
+            saveMeanAndInvVariance ? resultSaveInvVariance_dev.GetDeviceBuffer() : nullptr,
+            averageFactor,
+            updateMovingAverage ? resultRunningMean_dev.GetDeviceBuffer() : nullptr,
+            updateMovingAverage ? resultRunningVariance_dev.GetDeviceBuffer() : nullptr);
+
+        if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            num_kernel++;
+        }
+        else
+        {
+            if(time_kernel)
+            {
+                std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument"
+                          << std::endl;
+            }
+
+            continue;
+        };
+
+        size_t workspace_sz = inst_ptr->GetWorkSpaceSize(argument_ptr.get());
+
+        DeviceMem workspace_dev(workspace_sz);
+
+        inst_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
+
+        auto invoker_ptr = inst_ptr->MakeInvokerPointer();
+
+        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+        size_t num_bytes = 0;
+
+        // reading of x, scale and bias, writing of y
+        num_bytes += total_length * (sizeof(XDataType) + sizeof(YDataType)) +
+                     invariant_length * (sizeof(ScaleDataType) + sizeof(BiasDataType));
+
+        // writing of the saved mean and inv-variance
+        num_bytes += saveMeanAndInvVariance ? invariant_length * sizeof(MeanVarDataType) * 2 : 0;
+
+        // reading and writing of the moving mean and variance
+        num_bytes += updateMovingAverage ? invariant_length * sizeof(MeanVarDataType) * 4 : 0;
+
+        float gb_per_sec = num_bytes / 1.E6 / avg_time;
+
+        if(time_kernel)
+            std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, "
+                      << inst_ptr->GetTypeString() << std::endl;
+
+        if(avg_time < best_avg_time)
+        {
+            best_instance_name = inst_ptr->GetTypeString();
+            best_avg_time      = avg_time;
+            best_gb_per_sec    = gb_per_sec;
+        }
+
+        if(do_verification)
+        {
+            using ck::utils::check_err;
+            bool single_pass;
+
+            y_dev.FromDevice(y.mData.data());
+
+            if constexpr(ck::is_same_v)
+                single_pass = check_err(y.mData, y_ref.mData, "y results", 1e-2, 1e-2);
+            else
+                single_pass = check_err(y.mData, y_ref.mData, "y results", 4e-3, 4e-3);
+
+            if(updateMovingAverage)
+            {
+                resultRunningMean_dev.FromDevice(resultRunningMean.mData.data());
+                resultRunningVariance_dev.FromDevice(resultRunningVariance.mData.data());
+
+                // clang-format off
+                single_pass = single_pass && check_err(resultRunningMean.mData, resultRunningMean_ref.mData, "average mean results", 1.5e-5, 1.5e-5);
+                single_pass = single_pass && check_err(resultRunningVariance.mData, resultRunningVariance_ref.mData, "average variance results", 1e-5, 1e-5);
+                // clang-format on
+            };
+
+            if(saveMeanAndInvVariance)
+            {
+                resultSaveMean_dev.FromDevice(resultSaveMean.mData.data());
+                resultSaveInvVariance_dev.FromDevice(resultSaveInvVariance.mData.data());
+
+                // clang-format off
+                single_pass = single_pass && check_err(resultSaveMean.mData, resultSaveMean_ref.mData, "mean results", 3e-5, 3e-5);
+                single_pass = single_pass && check_err(resultSaveInvVariance.mData, resultSaveInvVariance_ref.mData, "inv-variance results", 7e-5, 7e-5);
+                // clang-format on
+            };
+
+            pass = pass && single_pass;
+        };
+
+        if(do_dumpout)
+        {
+            using ck::host_common::dumpBufferToFile;
+
+            // clang-format off
+            dumpBufferToFile("dump_x.bin", x.mData.data(), x.mDesc.GetElementSize());
+            dumpBufferToFile("dump_y.bin", y.mData.data(), y.mDesc.GetElementSize());
+            dumpBufferToFile("dump_y_ref.bin", y_ref.mData.data(), y_ref.mDesc.GetElementSize());
+            // clang-format on
+
+            if(saveMeanAndInvVariance)
+            {
+                // clang-format off
+                dumpBufferToFile("dump_mean.bin", resultSaveMean.mData.data(), resultSaveMean.mDesc.GetElementSize());
+                dumpBufferToFile("dump_mean_ref.bin", resultSaveMean_ref.mData.data(), resultSaveMean_ref.mDesc.GetElementSize());
+                dumpBufferToFile("dump_invvar.bin", resultSaveInvVariance.mData.data(), resultSaveInvVariance.mDesc.GetElementSize());
+                dumpBufferToFile("dump_invvar_ref.bin", resultSaveInvVariance_ref.mData.data(), resultSaveInvVariance_ref.mDesc.GetElementSize());
+                // clang-format on
+            };
+        };
+    }
+
+    if(time_kernel)
+    {
+        std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
+                  << best_instance_name << std::endl;
+    }
+
+    if(num_kernel == 0)
+    {
+        std::cout << "Error: No kernel is applicable" << std::endl;
+        return false;
+    }
+
+    return pass;
+}
+
+} // namespace profiler
+} // namespace ck
diff --git a/profiler/src/profile_batchnorm_fwd.cpp b/profiler/src/profile_batchnorm_fwd.cpp
new file mode 100644
index 0000000000..077963f828
--- /dev/null
+++ b/profiler/src/profile_batchnorm_fwd.cpp
@@ -0,0 +1,234 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
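+//
+// Command-line entry for the "bnorm_fwd" ckProfiler module: parses the -D/-R/-v/-o
+// options plus five positional arguments (data type, update-moving-average,
+// save-mean-and-inv-variance, init method, time-kernel) and dispatches to
+// profile_batchnorm_forward_impl for the selected data type. For example (the tensor
+// lengths here are arbitrary):
+//
+//   ckProfiler bnorm_fwd -D 128,16,16,1024 -R 0,1,2 -v 1 1 1 1 2 1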
+
+#include 
+#include 
+#include 
+
+#include "ck/library/utility/host_common_util.hpp"
+#include "profiler/include/profile_batchnorm_forward_impl.hpp"
+
+using ck::index_t;
+
+using namespace std;
+
+static const struct option long_options[] = {{"inOutLengths", required_argument, nullptr, 'D'},
+                                             {"reduceDims", required_argument, nullptr, 'R'},
+                                             {"dumpout", required_argument, nullptr, 'o'},
+                                             {"verify", required_argument, nullptr, 'v'},
+                                             {"help", no_argument, nullptr, '?'},
+                                             {nullptr, 0, nullptr, 0}};
+
+class BatchnormFwdArgParser
+{
+    private:
+    int option_index = 0;
+
+    public:
+    std::vector inLengths;
+    std::vector reduceDims;
+
+    bool do_verification = false;
+    bool do_dumpout      = false;
+
+    bool updateMovingAverage;
+    bool saveMeanAndInvVariance;
+
+    int data_type    = 0;
+    int init_method  = 2;
+    bool time_kernel = false;
+
+    BatchnormFwdArgParser()  = default;
+    ~BatchnormFwdArgParser() = default;
+
+    void show_usage(const char* cmd)
+    {
+        // clang-format off
+        std::cout << "Usage of " << cmd << std::endl;
+        std::cout << "--inOutLengths or -D, comma separated list of input tensor dimension lengths, must have 4 integers for nhwc" << std::endl;
+        std::cout << "--reduceDims or -R, comma separated list of dimensions to reduce on" << std::endl;
+        std::cout << "--verify or -v, 1/0 to indicate whether to verify the result by comparing with the host-based batch-normalization" << std::endl;
+        std::cout << "Arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bf16, 6: fp64)" << std::endl;
+        std::cout << "Arg2: 1/0 to indicate whether to update the moving average and variance (0=no, 1=yes)" << std::endl;
+        std::cout << "Arg3: 1/0 to indicate whether to save the calculated mean and invVariance (0=no, 1=yes)" << std::endl;
+        std::cout << "Arg4: init method used for bnScale and bnBias (0=no init, 1=single integer value, 2=scoped integer value, 3=decimal value)" << std::endl;
+        std::cout << "Arg5: time kernel (0=no, 1=yes)" << std::endl;
+        // clang-format on
+    };
+
+    int operator()(int argc, char* argv[])
+    {
+        using ck::host_common::getTypeValuesFromString;
+
+        int ch;
+
+        optind++; // to skip the module name
+
+        while(1)
+        {
+            ch = getopt_long(argc, argv, "D:R:v:o:", long_options, &option_index);
+            if(ch == -1)
+                break;
+            switch(ch)
+            {
+            case 'D':
+                if(!optarg)
+                    throw std::runtime_error("Invalid option format!");
+
+                inLengths = getTypeValuesFromString(optarg);
+                break;
+            case 'R':
+                if(!optarg)
+                    throw std::runtime_error("Invalid option format!");
+
+                reduceDims = getTypeValuesFromString(optarg);
+                break;
+            case 'v':
+                if(!optarg)
+                    throw std::runtime_error("Invalid option format!");
+
+                do_verification = static_cast(std::atoi(optarg));
+                break;
+            case 'o':
+                if(!optarg)
+                    throw std::runtime_error("Invalid option format!");
+
+                do_dumpout = static_cast(std::atoi(optarg));
+                break;
+            case '?':
+                if(std::string(long_options[option_index].name) == "help")
+                {
+                    show_usage(argv[0]);
+                    return -1;
+                };
+                break;
+
+            default:
+                show_usage(argv[0]);
+                std::cerr << "Invalid cmd-line options!"
+                          << std::endl;
+                return -1;
+            };
+        };
+
+        if(optind + 5 > argc)
+            throw std::runtime_error("Invalid cmd-line arguments, more arguments are needed!");
+
+        data_type              = std::atoi(argv[optind++]);
+        updateMovingAverage    = std::atoi(argv[optind++]);
+        saveMeanAndInvVariance = std::atoi(argv[optind++]);
+        init_method            = std::atoi(argv[optind++]);
+        time_kernel            = static_cast(std::atoi(argv[optind++]));
+
+        if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5 && data_type != 6)
+            return -1;
+
+        return 0;
+    };
+}; // end of class BatchnormFwdArgParser
+
+static const double epsilon       = std::numeric_limits::epsilon();
+static const double averageFactor = 0.1;
+
+int profile_batchnorm_forward(int argc, char* argv[])
+{
+    using ck::profiler::profile_batchnorm_forward_impl;
+
+    BatchnormFwdArgParser arg_parser;
+
+    if(arg_parser(argc, argv) != 0)
+        return -1;
+
+    using F16  = ck::half_t;
+    using F32  = float;
+    using BF16 = ck::bhalf_t;
+    using I8   = int8_t;
+    using F64  = double;
+
+    if(arg_parser.data_type == 0)
+    {
+        if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3)
+        {
+            profile_batchnorm_forward_impl(
+                arg_parser.do_verification,
+                arg_parser.init_method,
+                arg_parser.do_dumpout,
+                arg_parser.time_kernel,
+                arg_parser.inLengths,
+                arg_parser.reduceDims,
+                arg_parser.updateMovingAverage,
+                arg_parser.saveMeanAndInvVariance,
+                epsilon,
+                averageFactor);
+        };
+    }
+    else if(arg_parser.data_type == 1)
+    {
+        if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3)
+        {
+            profile_batchnorm_forward_impl(
+                arg_parser.do_verification,
+                arg_parser.init_method,
+                arg_parser.do_dumpout,
+                arg_parser.time_kernel,
+                arg_parser.inLengths,
+                arg_parser.reduceDims,
+                arg_parser.updateMovingAverage,
+                arg_parser.saveMeanAndInvVariance,
+                epsilon,
+                averageFactor);
+        };
+    }
+    else if(arg_parser.data_type == 3)
+    {
+        if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3)
+        {
+            profile_batchnorm_forward_impl(
+                arg_parser.do_verification,
+                arg_parser.init_method,
+                arg_parser.do_dumpout,
+                arg_parser.time_kernel,
+                arg_parser.inLengths,
+                arg_parser.reduceDims,
+                arg_parser.updateMovingAverage,
+                arg_parser.saveMeanAndInvVariance,
+                epsilon,
+                averageFactor);
+        };
+    }
+    else if(arg_parser.data_type == 5)
+    {
+        if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3)
+        {
+            profile_batchnorm_forward_impl(
+                arg_parser.do_verification,
+                arg_parser.init_method,
+                arg_parser.do_dumpout,
+                arg_parser.time_kernel,
+                arg_parser.inLengths,
+                arg_parser.reduceDims,
+                arg_parser.updateMovingAverage,
+                arg_parser.saveMeanAndInvVariance,
+                epsilon,
+                averageFactor);
+        };
+    }
+    else if(arg_parser.data_type == 6)
+    {
+        if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3)
+        {
+            profile_batchnorm_forward_impl(
+                arg_parser.do_verification,
+                arg_parser.init_method,
+                arg_parser.do_dumpout,
+                arg_parser.time_kernel,
+                arg_parser.inLengths,
+                arg_parser.reduceDims,
+                arg_parser.updateMovingAverage,
+                arg_parser.saveMeanAndInvVariance,
+                epsilon,
+                averageFactor);
+        };
+    }
+
+    return 0;
+}
diff --git a/profiler/src/profiler.cpp b/profiler/src/profiler.cpp
index 7b329464a8..4942d3c558 100644
--- a/profiler/src/profiler.cpp
+++ b/profiler/src/profiler.cpp
@@ -24,6 +24,7 @@ int profile_softmax(int, char*[]);
 int profile_layernorm(int, char*[]);
 int profile_groupnorm(int, char*[]);
 int profile_reduce(int, char*[]);
+int profile_batchnorm_forward(int, char*[]);
 
 static void
print_helper_message() " grouped_conv_fwd: Grouped Convolution Forward\n" " grouped_conv_bwd_weight: Grouped Convolution Backward Weight\n" " softmax: Softmax\n" - " reduce: Reduce\n"); + " reduce: Reduce\n" + " bnorm_fwd: Batchnorm forward\n"); // clang-format on } @@ -142,6 +144,10 @@ int main(int argc, char* argv[]) { return profile_groupnorm(argc, argv); } + else if(strcmp(argv[1], "bnorm_fwd") == 0) + { + return profile_batchnorm_forward(argc, argv); + } else { print_helper_message(); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 68b98ec8b9..57c11b55aa 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -53,3 +53,4 @@ add_subdirectory(softmax) add_subdirectory(normalization) add_subdirectory(data_type) add_subdirectory(elementwise_normalization) +add_subdirectory(batchnorm_fwd) diff --git a/test/batchnorm_fwd/CMakeLists.txt b/test/batchnorm_fwd/CMakeLists.txt new file mode 100644 index 0000000000..87361f9d0a --- /dev/null +++ b/test/batchnorm_fwd/CMakeLists.txt @@ -0,0 +1,2 @@ +add_gtest_executable(test_batchnorm_fwd_rank_4 batchnorm_fwd_rank_4.cpp) +target_link_libraries(test_batchnorm_fwd_rank_4 PRIVATE utility device_batchnorm_instance) diff --git a/test/batchnorm_fwd/batchnorm_fwd_rank_4.cpp b/test/batchnorm_fwd/batchnorm_fwd_rank_4.cpp new file mode 100644 index 0000000000..a19664a87c --- /dev/null +++ b/test/batchnorm_fwd/batchnorm_fwd_rank_4.cpp @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include +#include + +#include "profiler/include/profile_batchnorm_forward_impl.hpp" + +using F16 = ck::half_t; +using F32 = float; +using BF16 = ck::bhalf_t; +using I8 = int8_t; +using F64 = double; + +template +class TestBatchNormFwdRank4 : public ::testing::Test +{ + private: + const double epsilon = std::numeric_limits::epsilon(); + const double averageFactor = 0.1; + + protected: + using XDataType = std::tuple_element_t<0, Tuple>; + using YDataType = std::tuple_element_t<1, Tuple>; + using AccDataType = std::tuple_element_t<2, Tuple>; + using ScaleDataType = std::tuple_element_t<3, Tuple>; + using BiasDataType = std::tuple_element_t<4, Tuple>; + using MeanVarDataType = std::tuple_element_t<5, Tuple>; + + std::vector> list_of_lengths = { + {128, 16, 3, 1024}, {128, 16, 6, 512}, {1, 1, 1, 1}, {4, 4, 4, 4}, {32, 32, 32, 32}}; + std::vector reduceDims; + + template + void Run() + { + for(auto& inOutLengths : list_of_lengths) + { + bool pass = true; + + EXPECT_FALSE(reduceDims.size() != NumReduceDim); + + pass = + pass && ck::profiler::profile_batchnorm_forward_impl(true, + 3, + false, + false, + inOutLengths, + reduceDims, + true, + true, + epsilon, + averageFactor); + + pass = + pass && ck::profiler::profile_batchnorm_forward_impl(true, + 3, + false, + false, + inOutLengths, + reduceDims, + false, + false, + epsilon, + averageFactor); + + EXPECT_TRUE(pass); + } + } +}; + +using KernelTypes = ::testing::Types, + std::tuple, + std::tuple, + std::tuple, + std::tuple>; + +TYPED_TEST_SUITE(TestBatchNormFwdRank4, KernelTypes); + +// nhwc +TYPED_TEST(TestBatchNormFwdRank4, nhwc) +{ + this->reduceDims = {0, 1, 2}; + this->template Run<3>(); +} + +// nchw +TYPED_TEST(TestBatchNormFwdRank4, nchw) +{ + this->reduceDims = {0, 2, 3}; + this->template Run<3>(); +}
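
--
Note on the bandwidth figure computed in profile_batchnorm_forward_impl: num_bytes
counts one read of x plus one write of y (total_length elements each), one read apiece
of scale and bias (invariant_length elements each), two extra writes when the mean and
inv-variance are saved, and four extra transfers (read and write of the running mean
and variance) when the moving average is updated. As a worked example under the
invocation sketched above (-D 128,16,16,1024 in fp32, all options on): total_length =
128 * 16 * 16 * 1024 = 33,554,432, so x and y together account for 33,554,432 * 8
bytes (~268 MB), while the per-channel tensors add only 1024 * (4 + 4 + 2 * 4 + 4 * 4)
= 32,768 bytes; gb_per_sec then divides num_bytes / 1.E6 by the measured time in ms.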