mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-15 10:37:44 +00:00
Batchnorm inference instances, external API, client examples and gtests (#531)
* File renaming and class renaming for device element-wise operation
* Add batchnorm-infer instances, external API and client example
* Add batchnorm-infer profiler module and gtests
* Remove file device_elementwise_extension.hpp and move NormalizeInInfer operation to element_wise_operation.hpp
* Remove the use of a class alias for DeviceElementwiseForBatchNormInfer
* Rename class and file due to conflict from device_elementwise_2d.hpp
* Fix namespace in batchnorm_infer_nhwc client example
[ROCm/composable_kernel commit: a1b2441f8d]
This commit is contained in:
@@ -7,4 +7,8 @@ add_instance_library(device_batchnorm_instance
|
||||
device_batchnorm_backward_f32_instance.cpp
|
||||
device_batchnorm_backward_bf16_instance.cpp
|
||||
device_batchnorm_backward_f64_instance.cpp
|
||||
device_batchnorm_infer_f16_instance.cpp
|
||||
device_batchnorm_infer_f32_instance.cpp
|
||||
device_batchnorm_infer_bf16_instance.cpp
|
||||
device_batchnorm_infer_f64_instance.cpp
|
||||
)
|
||||
|
||||
@@ -0,0 +1,55 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
#include "ck/utility/data_type.hpp"
|
||||
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
using BF16 = ck::bhalf_t;
|
||||
using F32 = float;
|
||||
|
||||
using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer;
|
||||
|
||||
// clang-format off
|
||||
template <index_t Rank>
|
||||
using device_batchnorm_infer_bf16_instances =
|
||||
std::tuple <
|
||||
// Tuple<XDataType, MeanDataType, VarDataType, ScaleDataType, BiasDataType>, Tuple<YDataType>, NormalizeOp, Rank, MPerThread, Sequence<XVectorSize, MeanDataType, VarDataType, ScaleVectorSize, BiasVectorSize>, Sequence<YVectorSize>
|
||||
DeviceElementwiseImpl<Tuple<BF16, F32, F32, BF16, BF16>, Tuple<BF16>, Normalize, Rank, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
|
||||
DeviceElementwiseImpl<Tuple<BF16, F32, F32, BF16, BF16>, Tuple<BF16>, Normalize, Rank, 2, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
|
||||
DeviceElementwiseImpl<Tuple<BF16, F32, F32, BF16, BF16>, Tuple<BF16>, Normalize, Rank, 2, Sequence<2, 1, 1, 1, 1>, Sequence<2> >,
|
||||
DeviceElementwiseImpl<Tuple<BF16, F32, F32, BF16, BF16>, Tuple<BF16>, Normalize, Rank, 2, Sequence<1, 2, 2, 2, 2>, Sequence<1> >,
|
||||
DeviceElementwiseImpl<Tuple<BF16, F32, F32, BF16, BF16>, Tuple<BF16>, Normalize, Rank, 2, Sequence<2, 2, 2, 2, 2>, Sequence<2> >,
|
||||
DeviceElementwiseImpl<Tuple<BF16, F32, F32, BF16, BF16>, Tuple<BF16>, Normalize, Rank, 4, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
|
||||
DeviceElementwiseImpl<Tuple<BF16, F32, F32, BF16, BF16>, Tuple<BF16>, Normalize, Rank, 4, Sequence<2, 1, 1, 1, 1>, Sequence<2> >,
|
||||
DeviceElementwiseImpl<Tuple<BF16, F32, F32, BF16, BF16>, Tuple<BF16>, Normalize, Rank, 4, Sequence<1, 2, 2, 2, 2>, Sequence<1> >,
|
||||
DeviceElementwiseImpl<Tuple<BF16, F32, F32, BF16, BF16>, Tuple<BF16>, Normalize, Rank, 4, Sequence<2, 2, 2, 2, 2>, Sequence<2> >,
|
||||
DeviceElementwiseImpl<Tuple<BF16, F32, F32, BF16, BF16>, Tuple<BF16>, Normalize, Rank, 4, Sequence<4, 1, 1, 1, 1>, Sequence<4> >,
|
||||
DeviceElementwiseImpl<Tuple<BF16, F32, F32, BF16, BF16>, Tuple<BF16>, Normalize, Rank, 4, Sequence<1, 4, 4, 4, 4>, Sequence<1> >,
|
||||
DeviceElementwiseImpl<Tuple<BF16, F32, F32, BF16, BF16>, Tuple<BF16>, Normalize, Rank, 4, Sequence<4, 2, 2, 2, 2>, Sequence<4> >,
|
||||
DeviceElementwiseImpl<Tuple<BF16, F32, F32, BF16, BF16>, Tuple<BF16>, Normalize, Rank, 4, Sequence<2, 4, 4, 4, 4>, Sequence<2> >,
|
||||
DeviceElementwiseImpl<Tuple<BF16, F32, F32, BF16, BF16>, Tuple<BF16>, Normalize, Rank, 4, Sequence<4, 4, 4, 4, 4>, Sequence<4> >
|
||||
>;
|
||||
// clang-format on
|
||||
|
||||
void add_device_batchnorm_infer_rank_4_bf16_instances(
|
||||
std::vector<std::unique_ptr<
|
||||
DeviceElementwise<Tuple<BF16, F32, F32, BF16, BF16>, Tuple<BF16>, Normalize, 4>>>&
|
||||
instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_batchnorm_infer_bf16_instances<4>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,54 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
#include "ck/utility/data_type.hpp"
|
||||
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer;
|
||||
|
||||
// clang-format off
|
||||
template <index_t Rank>
|
||||
using device_batchnorm_infer_f16_instances =
|
||||
std::tuple <
|
||||
// Tuple<XDataType, MeanDataType, VarDataType, ScaleDataType, BiasDataType>, Tuple<YDataType>, NormalizeOp, Rank, MPerThread, Sequence<XVectorSize, MeanDataType, VarDataType, ScaleVectorSize, BiasVectorSize>, Sequence<YVectorSize>
|
||||
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
|
||||
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 2, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
|
||||
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 2, Sequence<2, 1, 1, 1, 1>, Sequence<2> >,
|
||||
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 2, Sequence<1, 2, 2, 2, 2>, Sequence<1> >,
|
||||
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 2, Sequence<2, 2, 2, 2, 2>, Sequence<2> >,
|
||||
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 4, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
|
||||
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 4, Sequence<2, 1, 1, 1, 1>, Sequence<2> >,
|
||||
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 4, Sequence<1, 2, 2, 2, 2>, Sequence<1> >,
|
||||
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 4, Sequence<2, 2, 2, 2, 2>, Sequence<2> >,
|
||||
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 4, Sequence<4, 1, 1, 1, 1>, Sequence<4> >,
|
||||
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 4, Sequence<1, 4, 4, 4, 4>, Sequence<1> >,
|
||||
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 4, Sequence<4, 2, 2, 2, 2>, Sequence<4> >,
|
||||
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 4, Sequence<2, 4, 4, 4, 4>, Sequence<2> >,
|
||||
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 4, Sequence<4, 4, 4, 4, 4>, Sequence<4> >
|
||||
>;
|
||||
// clang-format on
|
||||
|
||||
void add_device_batchnorm_infer_rank_4_f16_instances(
|
||||
std::vector<std::unique_ptr<
|
||||
DeviceElementwise<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 4>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_batchnorm_infer_f16_instances<4>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,52 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
using F32 = float;
|
||||
|
||||
using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer;
|
||||
|
||||
// clang-format off
|
||||
template <index_t Rank>
|
||||
using device_batchnorm_infer_f32_instances =
|
||||
std::tuple <
|
||||
// Tuple<XDataType, MeanDataType, VarDataType, ScaleDataType, BiasDataType>, Tuple<YDataType>, NormalizeOp, Rank, MPerThread, Sequence<XVectorSize, MeanDataType, VarDataType, ScaleVectorSize, BiasVectorSize>, Sequence<YVectorSize>
|
||||
DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
|
||||
DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 2, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
|
||||
DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 2, Sequence<2, 1, 1, 1, 1>, Sequence<2> >,
|
||||
DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 2, Sequence<1, 2, 2, 2, 2>, Sequence<1> >,
|
||||
DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 2, Sequence<2, 2, 2, 2, 2>, Sequence<2> >,
|
||||
DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 4, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
|
||||
DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 4, Sequence<2, 1, 1, 1, 1>, Sequence<2> >,
|
||||
DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 4, Sequence<1, 2, 2, 2, 2>, Sequence<1> >,
|
||||
DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 4, Sequence<2, 2, 2, 2, 2>, Sequence<2> >,
|
||||
DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 4, Sequence<4, 1, 1, 1, 1>, Sequence<4> >,
|
||||
DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 4, Sequence<1, 4, 4, 4, 4>, Sequence<1> >,
|
||||
DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 4, Sequence<4, 2, 2, 2, 2>, Sequence<4> >,
|
||||
DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 4, Sequence<2, 4, 4, 4, 4>, Sequence<2> >,
|
||||
DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 4, Sequence<4, 4, 4, 4, 4>, Sequence<4> >
|
||||
>;
|
||||
// clang-format on
|
||||
|
||||
void add_device_batchnorm_infer_rank_4_f32_instances(
|
||||
std::vector<std::unique_ptr<
|
||||
DeviceElementwise<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, 4>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_batchnorm_infer_f32_instances<4>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,47 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
using F64 = double;
|
||||
|
||||
using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer;
|
||||
|
||||
// clang-format off
|
||||
template <index_t Rank>
|
||||
using device_batchnorm_infer_f64_instances =
|
||||
std::tuple <
|
||||
// Tuple<XDataType, MeanDataType, VarDataType, ScaleDataType, BiasDataType>, Tuple<YDataType>, NormalizeOp, Rank, MPerThread, Sequence<XVectorSize, MeanDataType, VarDataType, ScaleVectorSize, BiasVectorSize>, Sequence<YVectorSize>
|
||||
DeviceElementwiseImpl<Tuple<F64, F64, F64, F64, F64>, Tuple<F64>, Normalize, Rank, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
|
||||
DeviceElementwiseImpl<Tuple<F64, F64, F64, F64, F64>, Tuple<F64>, Normalize, Rank, 2, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
|
||||
DeviceElementwiseImpl<Tuple<F64, F64, F64, F64, F64>, Tuple<F64>, Normalize, Rank, 2, Sequence<2, 1, 1, 1, 1>, Sequence<2> >,
|
||||
DeviceElementwiseImpl<Tuple<F64, F64, F64, F64, F64>, Tuple<F64>, Normalize, Rank, 2, Sequence<1, 2, 2, 2, 2>, Sequence<1> >,
|
||||
DeviceElementwiseImpl<Tuple<F64, F64, F64, F64, F64>, Tuple<F64>, Normalize, Rank, 2, Sequence<2, 2, 2, 2, 2>, Sequence<2> >,
|
||||
DeviceElementwiseImpl<Tuple<F64, F64, F64, F64, F64>, Tuple<F64>, Normalize, Rank, 4, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
|
||||
DeviceElementwiseImpl<Tuple<F64, F64, F64, F64, F64>, Tuple<F64>, Normalize, Rank, 4, Sequence<2, 1, 1, 1, 1>, Sequence<2> >,
|
||||
DeviceElementwiseImpl<Tuple<F64, F64, F64, F64, F64>, Tuple<F64>, Normalize, Rank, 4, Sequence<1, 2, 2, 2, 2>, Sequence<1> >,
|
||||
DeviceElementwiseImpl<Tuple<F64, F64, F64, F64, F64>, Tuple<F64>, Normalize, Rank, 4, Sequence<2, 2, 2, 2, 2>, Sequence<2> >
|
||||
>;
|
||||
// clang-format on
|
||||
|
||||
void add_device_batchnorm_infer_rank_4_f64_instances(
|
||||
std::vector<std::unique_ptr<
|
||||
DeviceElementwise<Tuple<F64, F64, F64, F64, F64>, Tuple<F64>, Normalize, 4>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_batchnorm_infer_f64_instances<4>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -6,7 +6,7 @@
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
@@ -28,15 +28,15 @@ using Normalize = ck::tensor_operation::element_wise::Normalize;
|
||||
using device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances = std::tuple<
|
||||
// clang-format off
|
||||
//###################|<in, mean, square_mean, gamma, beta>| <out>| functor| NDim| MPerThread| <in, mean, square_mean, gamma, beta ScalarPerVector>| <out ScalarPerVector>|
|
||||
DeviceElementwise<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 8, Sequence<8, 1, 1, 8, 8>, Sequence<8> >,
|
||||
DeviceElementwise<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 4, Sequence<4, 1, 1, 4, 4>, Sequence<4> >,
|
||||
DeviceElementwise<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 2, Sequence<2, 1, 1, 2, 2>, Sequence<2> >,
|
||||
DeviceElementwise<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >
|
||||
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 8, Sequence<8, 1, 1, 8, 8>, Sequence<8> >,
|
||||
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 4, Sequence<4, 1, 1, 4, 4>, Sequence<4> >,
|
||||
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 2, Sequence<2, 1, 1, 2, 2>, Sequence<2> >,
|
||||
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
void add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances(
|
||||
std::vector<DeviceElementwiseBasePtr<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2>>&
|
||||
std::vector<DeviceElementwisePtr<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2>>&
|
||||
instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
|
||||
Reference in New Issue
Block a user