Batchnorm inference instances, external API, client examples and gtests (#531)

* File renaming and class renaming for device element-wise operation

* Add batchnorm-infer instances, external API and client example

* Add batchnorm-infer profiler module and gtests

* Remove file device_elementwise_extension.hpp and move NormalizeInInfer operation to element_wise_operation.hpp

* Remove the using of class aliasing for DeviceElementwiseForBatchNormInfer

* Rename class and file due to conflict from device_elementwise_2d.hpp

* Fix namespace in batcnnorm_infer_nhwc client example

[ROCm/composable_kernel commit: a1b2441f8d]
This commit is contained in:
Qianfeng
2023-01-26 07:09:04 +08:00
committed by GitHub
parent 4180697c64
commit b148acfaaa
29 changed files with 1260 additions and 80 deletions

View File

@@ -6,7 +6,7 @@
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
@@ -23,13 +23,13 @@ using CDataType = F16;
using Add = ck::tensor_operation::element_wise::Add;
using DeviceElementwiseAddInstance =
ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ABDataType, ABDataType>,
ck::Tuple<CDataType>,
Add,
2,
8,
ck::Sequence<8, 8>,
ck::Sequence<8>>;
ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ABDataType, ABDataType>,
ck::Tuple<CDataType>,
Add,
2,
8,
ck::Sequence<8, 8>,
ck::Sequence<8>>;
template <typename HostTensorA,
typename HostTensorB,

View File

@@ -6,7 +6,7 @@
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
@@ -23,13 +23,13 @@ using CDataType = F16;
using Add = ck::tensor_operation::element_wise::Add;
using DeviceElementwiseAddInstance =
ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ABDataType, ABDataType>,
ck::Tuple<CDataType>,
Add,
3,
8,
ck::Sequence<1, 8>,
ck::Sequence<8>>;
ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ABDataType, ABDataType>,
ck::Tuple<CDataType>,
Add,
3,
8,
ck::Sequence<1, 8>,
ck::Sequence<8>>;
template <typename HostTensorA, typename HostTensorB, typename HostTensorC, typename Functor>
void host_broadcast3D_am_bmnk(HostTensorC& C,

View File

@@ -5,7 +5,7 @@
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
@@ -21,13 +21,13 @@ using CDataType = F16;
using Add = ck::tensor_operation::element_wise::Add;
using DeviceElementwiseAddInstance =
ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ABDataType, ABDataType>,
ck::Tuple<CDataType>,
Add,
1,
8,
ck::Sequence<8, 8>,
ck::Sequence<8>>;
ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ABDataType, ABDataType>,
ck::Tuple<CDataType>,
Add,
1,
8,
ck::Sequence<8, 8>,
ck::Sequence<8>>;
template <typename HostTensorA, typename HostTensorB, typename HostTensorC, typename Functor>
void host_elementwise1D(

View File

@@ -6,7 +6,7 @@
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
@@ -23,13 +23,13 @@ using CDataType = F16;
using Add = ck::tensor_operation::element_wise::Add;
using DeviceElementwiseAddInstance =
ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ABDataType, ABDataType>,
ck::Tuple<CDataType>,
Add,
4,
8,
ck::Sequence<8, 8>,
ck::Sequence<8>>;
ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ABDataType, ABDataType>,
ck::Tuple<CDataType>,
Add,
4,
8,
ck::Sequence<8, 8>,
ck::Sequence<8>>;
template <typename HostTensorA, typename HostTensorB, typename HostTensorC, typename Functor>
void host_elementwise4D(HostTensorC& C,

View File

@@ -10,7 +10,7 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/device_memory.hpp"
@@ -95,7 +95,7 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataTyp
using NormalizeFunctor = ck::tensor_operation::element_wise::Normalize;
// A:x, B:E[x], C:E[x^2], D:Gamma, E:Beta , F:y
using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwise<
using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwiseImpl<
ck::Tuple<EDataType,
R0DataType,
R1DataType,

View File

@@ -10,7 +10,7 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/device_memory.hpp"
@@ -92,7 +92,7 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataTyp
using NormalizeFunctor = ck::tensor_operation::element_wise::Normalize;
// A:x, B:E[x], C:E[x^2], D:Gamma, E:Beta , F:y
using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwise<
using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwiseImpl<
ck::Tuple<EDataType,
R0DataType,
R1DataType,

View File

@@ -10,7 +10,7 @@
#include "ck/utility/sequence.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
#include "batchnorm_common.hpp"
@@ -46,7 +46,7 @@ int bnorm_infer(
static_assert(NumBatchNormReduceDim < Rank,
"Invalid number of reduced dimensions for batchnorm!");
using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwise<
using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwiseImpl<
ck::Tuple<XDataType, AccDataType, AccDataType, AccDataType, AccDataType>, // x, mean,
// variance,
// scale,

View File

@@ -3,7 +3,7 @@
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
@@ -19,13 +19,13 @@ using BDataType = F16;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceElementwisePermuteInstance =
ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ADataType>,
ck::Tuple<BDataType>,
PassThrough,
4,
8,
ck::Sequence<8>,
ck::Sequence<1>>;
ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>,
ck::Tuple<BDataType>,
PassThrough,
4,
8,
ck::Sequence<8>,
ck::Sequence<1>>;
template <typename HostTensorA, typename HostTensorB, typename Functor>
void host_elementwise4D(HostTensorB& B_nhwc, const HostTensorA& A_nchw, Functor functor)

View File

@@ -3,7 +3,7 @@
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise_2d.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
@@ -17,15 +17,15 @@ using BDataType = F16;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceElementwisePermuteInstance =
ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ADataType>,
ck::Tuple<BDataType>,
PassThrough,
3, // NumDim_M
1, // NumDim_N
8,
8,
ck::Sequence<8>,
ck::Sequence<8>>;
ck::tensor_operation::device::DeviceElementwise2dImpl<ck::Tuple<ADataType>,
ck::Tuple<BDataType>,
PassThrough,
3, // NumDim_M
1, // NumDim_N
8,
8,
ck::Sequence<8>,
ck::Sequence<8>>;
template <typename HostTensorA, typename HostTensorB, typename Functor>
void host_elementwise4D(HostTensorB& B_nhwc,