mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-19 22:39:03 +00:00
Batchnorm inference instances, external API, client examples and gtests (#531)
* File renaming and class renaming for device element-wise operation * Add batchnorm-infer instances, external API and client example * Add batchnorm-infer profiler module and gtests * Remove file device_elementwise_extension.hpp and move NormalizeInInfer operation to element_wise_operation.hpp * Remove the using of class aliasing for DeviceElementwiseForBatchNormInfer * Rename class and file due to conflict from device_elementwise_2d.hpp * Fix namespace in batcnnorm_infer_nhwc client example
This commit is contained in:
@@ -8,7 +8,7 @@
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp"
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
add_executable(client_batchnorm_fwd_nhwc batchnorm_fwd_nhwc.cpp)
|
||||
add_executable(client_batchnorm_bwd_nhwc batchnorm_bwd_nhwc.cpp)
|
||||
add_executable(client_batchnorm_infer_nhwc batchnorm_infer_nhwc.cpp)
|
||||
target_link_libraries(client_batchnorm_fwd_nhwc PRIVATE composable_kernel::device_operations)
|
||||
target_link_libraries(client_batchnorm_bwd_nhwc PRIVATE composable_kernel::device_operations)
|
||||
target_link_libraries(client_batchnorm_infer_nhwc PRIVATE composable_kernel::device_operations)
|
||||
|
||||
189
client_example/13_batchnorm/batchnorm_infer_nhwc.cpp
Normal file
189
client_example/13_batchnorm/batchnorm_infer_nhwc.cpp
Normal file
@@ -0,0 +1,189 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <functional>
|
||||
#include <numeric>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp"
|
||||
|
||||
using XDataType = float;
|
||||
using YDataType = float;
|
||||
using ScaleDataType = float;
|
||||
using BiasDataType = float;
|
||||
using MeanVarDataType = float;
|
||||
|
||||
constexpr int Rank = 4;
|
||||
constexpr int NumBatchNormReduceDim = 3;
|
||||
|
||||
using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer;
|
||||
|
||||
const double epsilon = std::numeric_limits<float>::epsilon();
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
std::array<ck::index_t, Rank> xyLengths{16, 8, 128, 256};
|
||||
std::array<ck::index_t, Rank> xyStrides{8 * 128 * 256, 128 * 256, 256, 1};
|
||||
std::array<ck::index_t, Rank - NumBatchNormReduceDim> scaleBiasMeanVarLengths{256};
|
||||
std::array<ck::index_t, Rank - NumBatchNormReduceDim> scaleBiasMeanVarStrides{1};
|
||||
std::array<int, NumBatchNormReduceDim> reduceDims{0, 1, 2};
|
||||
std::array<int, Rank - NumBatchNormReduceDim> invariantDims{3};
|
||||
|
||||
ck::index_t numXYElement =
|
||||
std::accumulate(xyLengths.begin(), xyLengths.end(), 1, std::multiplies<ck::index_t>());
|
||||
|
||||
ck::index_t numScaleBiasMeanVarElement = std::accumulate(scaleBiasMeanVarLengths.begin(),
|
||||
scaleBiasMeanVarLengths.end(),
|
||||
1,
|
||||
std::multiplies<ck::index_t>());
|
||||
|
||||
SimpleDeviceMem x(sizeof(XDataType) * numXYElement);
|
||||
SimpleDeviceMem y(sizeof(YDataType) * numXYElement);
|
||||
SimpleDeviceMem scale(sizeof(ScaleDataType) * numScaleBiasMeanVarElement);
|
||||
SimpleDeviceMem bias(sizeof(BiasDataType) * numScaleBiasMeanVarElement);
|
||||
SimpleDeviceMem mean(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement);
|
||||
SimpleDeviceMem variance(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement);
|
||||
|
||||
// values in variance need be non-negative
|
||||
(void)hipMemset(
|
||||
variance.GetDeviceBuffer(), 0, sizeof(MeanVarDataType) * numScaleBiasMeanVarElement);
|
||||
|
||||
std::array<ck::index_t, Rank> aligned_scaleBiasMeanVarStrides{0};
|
||||
|
||||
int i = 0;
|
||||
for(auto dim : invariantDims)
|
||||
{
|
||||
assert(xyLengths[dim] == scaleBiasMeanVarLengths[i]);
|
||||
|
||||
aligned_scaleBiasMeanVarStrides[dim] = scaleBiasMeanVarStrides[i];
|
||||
i++;
|
||||
};
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceElementwise<
|
||||
ck::Tuple<XDataType, MeanVarDataType, MeanVarDataType, ScaleDataType, BiasDataType>,
|
||||
ck::Tuple<YDataType>,
|
||||
Normalize,
|
||||
Rank>;
|
||||
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
float best_ave_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(xyLengths,
|
||||
{xyStrides,
|
||||
aligned_scaleBiasMeanVarStrides,
|
||||
aligned_scaleBiasMeanVarStrides,
|
||||
aligned_scaleBiasMeanVarStrides,
|
||||
aligned_scaleBiasMeanVarStrides},
|
||||
{xyStrides},
|
||||
{x.GetDeviceBuffer(),
|
||||
mean.GetDeviceBuffer(),
|
||||
variance.GetDeviceBuffer(),
|
||||
scale.GetDeviceBuffer(),
|
||||
bias.GetDeviceBuffer()},
|
||||
{y.GetDeviceBuffer()},
|
||||
Normalize{epsilon});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t num_bytes =
|
||||
numXYElement * (sizeof(XDataType) + sizeof(YDataType)) +
|
||||
numScaleBiasMeanVarElement * (sizeof(ScaleDataType) + sizeof(BiasDataType) +
|
||||
sizeof(MeanVarDataType) + sizeof(MeanVarDataType));
|
||||
|
||||
float gb_per_sec = num_bytes / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
|
||||
<< op_name << std::endl;
|
||||
|
||||
if(ave_time < best_ave_time)
|
||||
{
|
||||
found = true;
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_ave_time = ave_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
if(found)
|
||||
{
|
||||
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
|
||||
<< best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(xyLengths,
|
||||
{xyStrides,
|
||||
aligned_scaleBiasMeanVarStrides,
|
||||
aligned_scaleBiasMeanVarStrides,
|
||||
aligned_scaleBiasMeanVarStrides,
|
||||
aligned_scaleBiasMeanVarStrides},
|
||||
{xyStrides},
|
||||
{x.GetDeviceBuffer(),
|
||||
mean.GetDeviceBuffer(),
|
||||
variance.GetDeviceBuffer(),
|
||||
scale.GetDeviceBuffer(),
|
||||
bias.GetDeviceBuffer()},
|
||||
{y.GetDeviceBuffer()},
|
||||
Normalize{epsilon});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
|
||||
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
@@ -23,13 +23,13 @@ using CDataType = F16;
|
||||
using Add = ck::tensor_operation::element_wise::Add;
|
||||
|
||||
using DeviceElementwiseAddInstance =
|
||||
ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ABDataType, ABDataType>,
|
||||
ck::Tuple<CDataType>,
|
||||
Add,
|
||||
2,
|
||||
8,
|
||||
ck::Sequence<8, 8>,
|
||||
ck::Sequence<8>>;
|
||||
ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ABDataType, ABDataType>,
|
||||
ck::Tuple<CDataType>,
|
||||
Add,
|
||||
2,
|
||||
8,
|
||||
ck::Sequence<8, 8>,
|
||||
ck::Sequence<8>>;
|
||||
|
||||
template <typename HostTensorA,
|
||||
typename HostTensorB,
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
|
||||
|
||||
#include "ck/library/utility/algorithm.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
@@ -23,13 +23,13 @@ using CDataType = F16;
|
||||
using Add = ck::tensor_operation::element_wise::Add;
|
||||
|
||||
using DeviceElementwiseAddInstance =
|
||||
ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ABDataType, ABDataType>,
|
||||
ck::Tuple<CDataType>,
|
||||
Add,
|
||||
3,
|
||||
8,
|
||||
ck::Sequence<1, 8>,
|
||||
ck::Sequence<8>>;
|
||||
ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ABDataType, ABDataType>,
|
||||
ck::Tuple<CDataType>,
|
||||
Add,
|
||||
3,
|
||||
8,
|
||||
ck::Sequence<1, 8>,
|
||||
ck::Sequence<8>>;
|
||||
|
||||
template <typename HostTensorA, typename HostTensorB, typename HostTensorC, typename Functor>
|
||||
void host_broadcast3D_am_bmnk(HostTensorC& C,
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
@@ -21,13 +21,13 @@ using CDataType = F16;
|
||||
using Add = ck::tensor_operation::element_wise::Add;
|
||||
|
||||
using DeviceElementwiseAddInstance =
|
||||
ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ABDataType, ABDataType>,
|
||||
ck::Tuple<CDataType>,
|
||||
Add,
|
||||
1,
|
||||
8,
|
||||
ck::Sequence<8, 8>,
|
||||
ck::Sequence<8>>;
|
||||
ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ABDataType, ABDataType>,
|
||||
ck::Tuple<CDataType>,
|
||||
Add,
|
||||
1,
|
||||
8,
|
||||
ck::Sequence<8, 8>,
|
||||
ck::Sequence<8>>;
|
||||
|
||||
template <typename HostTensorA, typename HostTensorB, typename HostTensorC, typename Functor>
|
||||
void host_elementwise1D(
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
|
||||
|
||||
#include "ck/library/utility/algorithm.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
@@ -23,13 +23,13 @@ using CDataType = F16;
|
||||
using Add = ck::tensor_operation::element_wise::Add;
|
||||
|
||||
using DeviceElementwiseAddInstance =
|
||||
ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ABDataType, ABDataType>,
|
||||
ck::Tuple<CDataType>,
|
||||
Add,
|
||||
4,
|
||||
8,
|
||||
ck::Sequence<8, 8>,
|
||||
ck::Sequence<8>>;
|
||||
ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ABDataType, ABDataType>,
|
||||
ck::Tuple<CDataType>,
|
||||
Add,
|
||||
4,
|
||||
8,
|
||||
ck::Sequence<8, 8>,
|
||||
ck::Sequence<8>>;
|
||||
|
||||
template <typename HostTensorA, typename HostTensorB, typename HostTensorC, typename Functor>
|
||||
void host_elementwise4D(HostTensorC& C,
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
@@ -95,7 +95,7 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataTyp
|
||||
using NormalizeFunctor = ck::tensor_operation::element_wise::Normalize;
|
||||
|
||||
// A:x, B:E[x], C:E[x^2], D:Gamma, E:Beta , F:y
|
||||
using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwise<
|
||||
using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwiseImpl<
|
||||
ck::Tuple<EDataType,
|
||||
R0DataType,
|
||||
R1DataType,
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
@@ -92,7 +92,7 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataTyp
|
||||
using NormalizeFunctor = ck::tensor_operation::element_wise::Normalize;
|
||||
|
||||
// A:x, B:E[x], C:E[x^2], D:Gamma, E:Beta , F:y
|
||||
using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwise<
|
||||
using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwiseImpl<
|
||||
ck::Tuple<EDataType,
|
||||
R0DataType,
|
||||
R1DataType,
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
#include "ck/utility/sequence.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
#include "ck/utility/reduction_operator.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
|
||||
|
||||
#include "batchnorm_common.hpp"
|
||||
|
||||
@@ -46,7 +46,7 @@ int bnorm_infer(
|
||||
static_assert(NumBatchNormReduceDim < Rank,
|
||||
"Invalid number of reduced dimensions for batchnorm!");
|
||||
|
||||
using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwise<
|
||||
using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwiseImpl<
|
||||
ck::Tuple<XDataType, AccDataType, AccDataType, AccDataType, AccDataType>, // x, mean,
|
||||
// variance,
|
||||
// scale,
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
|
||||
|
||||
#include "ck/library/utility/algorithm.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
@@ -19,13 +19,13 @@ using BDataType = F16;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using DeviceElementwisePermuteInstance =
|
||||
ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ADataType>,
|
||||
ck::Tuple<BDataType>,
|
||||
PassThrough,
|
||||
4,
|
||||
8,
|
||||
ck::Sequence<8>,
|
||||
ck::Sequence<1>>;
|
||||
ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>,
|
||||
ck::Tuple<BDataType>,
|
||||
PassThrough,
|
||||
4,
|
||||
8,
|
||||
ck::Sequence<8>,
|
||||
ck::Sequence<1>>;
|
||||
|
||||
template <typename HostTensorA, typename HostTensorB, typename Functor>
|
||||
void host_elementwise4D(HostTensorB& B_nhwc, const HostTensorA& A_nchw, Functor functor)
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_elementwise_2d.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp"
|
||||
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
@@ -17,15 +17,15 @@ using BDataType = F16;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using DeviceElementwisePermuteInstance =
|
||||
ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ADataType>,
|
||||
ck::Tuple<BDataType>,
|
||||
PassThrough,
|
||||
3, // NumDim_M
|
||||
1, // NumDim_N
|
||||
8,
|
||||
8,
|
||||
ck::Sequence<8>,
|
||||
ck::Sequence<8>>;
|
||||
ck::tensor_operation::device::DeviceElementwise2dImpl<ck::Tuple<ADataType>,
|
||||
ck::Tuple<BDataType>,
|
||||
PassThrough,
|
||||
3, // NumDim_M
|
||||
1, // NumDim_N
|
||||
8,
|
||||
8,
|
||||
ck::Sequence<8>,
|
||||
ck::Sequence<8>>;
|
||||
|
||||
template <typename HostTensorA, typename HostTensorB, typename Functor>
|
||||
void host_elementwise4D(HostTensorB& B_nhwc,
|
||||
|
||||
@@ -17,7 +17,7 @@ template <typename InDataTypeTuple,
|
||||
typename OutDataTypeTuple,
|
||||
typename ElementwiseOperation,
|
||||
index_t NumDim>
|
||||
struct DeviceElementwiseBase : public BaseOperator
|
||||
struct DeviceElementwise : public BaseOperator
|
||||
{
|
||||
static constexpr int NumInput = InDataTypeTuple::Size();
|
||||
static constexpr int NumOutput = OutDataTypeTuple::Size();
|
||||
@@ -37,8 +37,8 @@ template <typename InDataTypeTuple,
|
||||
typename OutDataTypeTuple,
|
||||
typename ElementwiseOperation,
|
||||
index_t NumDim>
|
||||
using DeviceElementwiseBasePtr = std::unique_ptr<
|
||||
DeviceElementwiseBase<InDataTypeTuple, OutDataTypeTuple, ElementwiseOperation, NumDim>>;
|
||||
using DeviceElementwisePtr = std::unique_ptr<
|
||||
DeviceElementwise<InDataTypeTuple, OutDataTypeTuple, ElementwiseOperation, NumDim>>;
|
||||
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
#include "ck/utility/math.hpp"
|
||||
#include "ck/utility/sequence.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_elementwise_base.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp"
|
||||
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
|
||||
|
||||
@@ -26,10 +26,10 @@ template <typename InDataTypeTuple,
|
||||
index_t NPerThread,
|
||||
typename InScalarPerVectorSeq,
|
||||
typename OutScalarPerVectorSeq>
|
||||
struct DeviceElementwise : public DeviceElementwiseBase<InDataTypeTuple,
|
||||
OutDataTypeTuple,
|
||||
ElementwiseOperation,
|
||||
NumDim_m + NumDim_n>
|
||||
struct DeviceElementwise2dImpl : public DeviceElementwise<InDataTypeTuple,
|
||||
OutDataTypeTuple,
|
||||
ElementwiseOperation,
|
||||
NumDim_m + NumDim_n>
|
||||
{
|
||||
static constexpr index_t NumDim = NumDim_m + NumDim_n;
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
#include "ck/utility/math.hpp"
|
||||
#include "ck/utility/sequence.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_elementwise_base.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp"
|
||||
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
|
||||
|
||||
@@ -25,8 +25,8 @@ template <typename InDataTypeTuple,
|
||||
index_t MPerThread,
|
||||
typename InScalarPerVectorSeq,
|
||||
typename OutScalarPerVectorSeq>
|
||||
struct DeviceElementwise
|
||||
: public DeviceElementwiseBase<InDataTypeTuple, OutDataTypeTuple, ElementwiseOperation, NumDim>
|
||||
struct DeviceElementwiseImpl
|
||||
: public DeviceElementwise<InDataTypeTuple, OutDataTypeTuple, ElementwiseOperation, NumDim>
|
||||
{
|
||||
static constexpr int NumInput = InDataTypeTuple::Size();
|
||||
static constexpr int NumOutput = OutDataTypeTuple::Size();
|
||||
@@ -314,6 +314,40 @@ struct Normalize
|
||||
double epsilon_;
|
||||
};
|
||||
|
||||
// used by BatchNorm inference
|
||||
// y = gamma * (x-mean) / sqrt(epsilon+variance) + beta
|
||||
// The data type of mean and variance is used as AccDataType
|
||||
struct NormalizeInInfer
|
||||
{
|
||||
NormalizeInInfer(double epsilon = 1e-4) : epsilon_(epsilon) {}
|
||||
|
||||
template <typename T1, typename T2, typename T3, typename T4>
|
||||
__host__ __device__ constexpr void operator()(T1& y,
|
||||
const T1& x,
|
||||
const T2& mean,
|
||||
const T2& variance,
|
||||
const T3& gamma,
|
||||
const T4& beta) const
|
||||
{
|
||||
static_assert(std::is_same<T2, float>::value || std::is_same<T2, double>::value,
|
||||
"Data type is not supported by this operation!");
|
||||
|
||||
using ck::type_convert;
|
||||
using ck::math::sqrt;
|
||||
|
||||
T2 tmp_x, tmp_y;
|
||||
|
||||
tmp_x = type_convert<T2>(x);
|
||||
|
||||
tmp_y = ((tmp_x - mean) / sqrt(variance + type_convert<T2>(epsilon_))) *
|
||||
type_convert<T2>(gamma) +
|
||||
type_convert<T2>(beta);
|
||||
y = type_convert<T1>(tmp_y);
|
||||
};
|
||||
|
||||
double epsilon_;
|
||||
};
|
||||
|
||||
template <typename Y, typename X>
|
||||
struct UnaryTypeConvert;
|
||||
|
||||
|
||||
@@ -0,0 +1,117 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// FP16
|
||||
void add_device_batchnorm_infer_rank_4_f16_instances(
|
||||
std::vector<std::unique_ptr<ck::tensor_operation::device::DeviceElementwise<
|
||||
ck::Tuple<F16, F32, F32, F16, F16>,
|
||||
ck::Tuple<F16>,
|
||||
ck::tensor_operation::element_wise::NormalizeInInfer,
|
||||
4>>>&);
|
||||
|
||||
// FP32
|
||||
void add_device_batchnorm_infer_rank_4_f32_instances(
|
||||
std::vector<std::unique_ptr<ck::tensor_operation::device::DeviceElementwise<
|
||||
ck::Tuple<F32, F32, F32, F32, F32>,
|
||||
ck::Tuple<F32>,
|
||||
ck::tensor_operation::element_wise::NormalizeInInfer,
|
||||
4>>>&);
|
||||
|
||||
// BF16
|
||||
void add_device_batchnorm_infer_rank_4_bf16_instances(
|
||||
std::vector<std::unique_ptr<ck::tensor_operation::device::DeviceElementwise<
|
||||
ck::Tuple<BF16, F32, F32, BF16, BF16>,
|
||||
ck::Tuple<BF16>,
|
||||
ck::tensor_operation::element_wise::NormalizeInInfer,
|
||||
4>>>&);
|
||||
|
||||
// FP64
|
||||
void add_device_batchnorm_infer_rank_4_f64_instances(
|
||||
std::vector<std::unique_ptr<ck::tensor_operation::device::DeviceElementwise<
|
||||
ck::Tuple<F64, F64, F64, F64, F64>,
|
||||
ck::Tuple<F64>,
|
||||
ck::tensor_operation::element_wise::NormalizeInInfer,
|
||||
4>>>&);
|
||||
|
||||
template <typename XDataType,
|
||||
typename YDataType,
|
||||
typename ScaleDataType,
|
||||
typename BiasDataType,
|
||||
typename MeanVarDataType,
|
||||
index_t Rank>
|
||||
struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceElementwise<
|
||||
ck::Tuple<XDataType, MeanVarDataType, MeanVarDataType, ScaleDataType, BiasDataType>,
|
||||
ck::Tuple<YDataType>,
|
||||
ck::tensor_operation::element_wise::NormalizeInInfer,
|
||||
Rank>>
|
||||
{
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceElementwise<
|
||||
ck::Tuple<XDataType, MeanVarDataType, MeanVarDataType, ScaleDataType, BiasDataType>,
|
||||
ck::Tuple<YDataType>,
|
||||
ck::tensor_operation::element_wise::NormalizeInInfer,
|
||||
Rank>;
|
||||
|
||||
static auto GetInstances()
|
||||
{
|
||||
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
|
||||
|
||||
if constexpr(is_same_v<XDataType, F16> && is_same_v<YDataType, F16> &&
|
||||
is_same_v<ScaleDataType, F16> && is_same_v<BiasDataType, F16> &&
|
||||
is_same_v<MeanVarDataType, F32>)
|
||||
{
|
||||
if constexpr(Rank == 4)
|
||||
{
|
||||
add_device_batchnorm_infer_rank_4_f16_instances(op_ptrs);
|
||||
}
|
||||
}
|
||||
else if constexpr(is_same_v<XDataType, F32> && is_same_v<YDataType, F32> &&
|
||||
is_same_v<ScaleDataType, F32> && is_same_v<BiasDataType, F32> &&
|
||||
is_same_v<MeanVarDataType, F32>)
|
||||
{
|
||||
if constexpr(Rank == 4)
|
||||
{
|
||||
add_device_batchnorm_infer_rank_4_f32_instances(op_ptrs);
|
||||
}
|
||||
}
|
||||
else if constexpr(is_same_v<XDataType, BF16> && is_same_v<YDataType, BF16> &&
|
||||
is_same_v<ScaleDataType, BF16> && is_same_v<BiasDataType, BF16> &&
|
||||
is_same_v<MeanVarDataType, F32>)
|
||||
{
|
||||
if constexpr(Rank == 4)
|
||||
{
|
||||
add_device_batchnorm_infer_rank_4_bf16_instances(op_ptrs);
|
||||
}
|
||||
}
|
||||
else if constexpr(is_same_v<XDataType, F64> && is_same_v<YDataType, F64> &&
|
||||
is_same_v<ScaleDataType, F64> && is_same_v<BiasDataType, F64> &&
|
||||
is_same_v<MeanVarDataType, F64>)
|
||||
{
|
||||
if constexpr(Rank == 4)
|
||||
{
|
||||
add_device_batchnorm_infer_rank_4_f64_instances(op_ptrs);
|
||||
}
|
||||
}
|
||||
|
||||
return op_ptrs;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -7,7 +7,7 @@
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
@@ -18,11 +18,8 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
using Normalize = ck::tensor_operation::element_wise::Normalize;
|
||||
using DeviceNormalizeFromMeanMeanSquarePtr = ck::tensor_operation::device::DeviceElementwiseBasePtr<
|
||||
Tuple<half_t, float, float, half_t, half_t>,
|
||||
Tuple<half_t>,
|
||||
Normalize,
|
||||
2>;
|
||||
using DeviceNormalizeFromMeanMeanSquarePtr = ck::tensor_operation::device::
|
||||
DeviceElementwisePtr<Tuple<half_t, float, float, half_t, half_t>, Tuple<half_t>, Normalize, 2>;
|
||||
|
||||
void add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances(
|
||||
std::vector<DeviceNormalizeFromMeanMeanSquarePtr>& instances);
|
||||
|
||||
@@ -7,4 +7,8 @@ add_instance_library(device_batchnorm_instance
|
||||
device_batchnorm_backward_f32_instance.cpp
|
||||
device_batchnorm_backward_bf16_instance.cpp
|
||||
device_batchnorm_backward_f64_instance.cpp
|
||||
device_batchnorm_infer_f16_instance.cpp
|
||||
device_batchnorm_infer_f32_instance.cpp
|
||||
device_batchnorm_infer_bf16_instance.cpp
|
||||
device_batchnorm_infer_f64_instance.cpp
|
||||
)
|
||||
|
||||
@@ -0,0 +1,55 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
#include "ck/utility/data_type.hpp"
|
||||
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
using BF16 = ck::bhalf_t;
|
||||
using F32 = float;
|
||||
|
||||
using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer;
|
||||
|
||||
// clang-format off
|
||||
template <index_t Rank>
|
||||
using device_batchnorm_infer_bf16_instances =
|
||||
std::tuple <
|
||||
// Tuple<XDataType, MeanDataType, VarDataType, ScaleDataType, BiasDataType>, Tuple<YDataType>, NormalizeOp, Rank, MPerThread, Sequence<XVectorSize, MeanDataType, VarDataType, ScaleVectorSize, BiasVectorSize>, Sequence<YVectorSize>
|
||||
DeviceElementwiseImpl<Tuple<BF16, F32, F32, BF16, BF16>, Tuple<BF16>, Normalize, Rank, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
|
||||
DeviceElementwiseImpl<Tuple<BF16, F32, F32, BF16, BF16>, Tuple<BF16>, Normalize, Rank, 2, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
|
||||
DeviceElementwiseImpl<Tuple<BF16, F32, F32, BF16, BF16>, Tuple<BF16>, Normalize, Rank, 2, Sequence<2, 1, 1, 1, 1>, Sequence<2> >,
|
||||
DeviceElementwiseImpl<Tuple<BF16, F32, F32, BF16, BF16>, Tuple<BF16>, Normalize, Rank, 2, Sequence<1, 2, 2, 2, 2>, Sequence<1> >,
|
||||
DeviceElementwiseImpl<Tuple<BF16, F32, F32, BF16, BF16>, Tuple<BF16>, Normalize, Rank, 2, Sequence<2, 2, 2, 2, 2>, Sequence<2> >,
|
||||
DeviceElementwiseImpl<Tuple<BF16, F32, F32, BF16, BF16>, Tuple<BF16>, Normalize, Rank, 4, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
|
||||
DeviceElementwiseImpl<Tuple<BF16, F32, F32, BF16, BF16>, Tuple<BF16>, Normalize, Rank, 4, Sequence<2, 1, 1, 1, 1>, Sequence<2> >,
|
||||
DeviceElementwiseImpl<Tuple<BF16, F32, F32, BF16, BF16>, Tuple<BF16>, Normalize, Rank, 4, Sequence<1, 2, 2, 2, 2>, Sequence<1> >,
|
||||
DeviceElementwiseImpl<Tuple<BF16, F32, F32, BF16, BF16>, Tuple<BF16>, Normalize, Rank, 4, Sequence<2, 2, 2, 2, 2>, Sequence<2> >,
|
||||
DeviceElementwiseImpl<Tuple<BF16, F32, F32, BF16, BF16>, Tuple<BF16>, Normalize, Rank, 4, Sequence<4, 1, 1, 1, 1>, Sequence<4> >,
|
||||
DeviceElementwiseImpl<Tuple<BF16, F32, F32, BF16, BF16>, Tuple<BF16>, Normalize, Rank, 4, Sequence<1, 4, 4, 4, 4>, Sequence<1> >,
|
||||
DeviceElementwiseImpl<Tuple<BF16, F32, F32, BF16, BF16>, Tuple<BF16>, Normalize, Rank, 4, Sequence<4, 2, 2, 2, 2>, Sequence<4> >,
|
||||
DeviceElementwiseImpl<Tuple<BF16, F32, F32, BF16, BF16>, Tuple<BF16>, Normalize, Rank, 4, Sequence<2, 4, 4, 4, 4>, Sequence<2> >,
|
||||
DeviceElementwiseImpl<Tuple<BF16, F32, F32, BF16, BF16>, Tuple<BF16>, Normalize, Rank, 4, Sequence<4, 4, 4, 4, 4>, Sequence<4> >
|
||||
>;
|
||||
// clang-format on
|
||||
|
||||
void add_device_batchnorm_infer_rank_4_bf16_instances(
|
||||
std::vector<std::unique_ptr<
|
||||
DeviceElementwise<Tuple<BF16, F32, F32, BF16, BF16>, Tuple<BF16>, Normalize, 4>>>&
|
||||
instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_batchnorm_infer_bf16_instances<4>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,54 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
#include "ck/utility/data_type.hpp"
|
||||
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer;
|
||||
|
||||
// clang-format off
|
||||
// Pre-instantiated DeviceElementwiseImpl configurations for batchnorm
// inference with F16 input/output/scale/bias and F32 running mean/variance.
// Entries differ only in the per-thread tile size (MPerThread) and the
// per-operand scalar-per-vector access widths.
template <index_t Rank>
using device_batchnorm_infer_f16_instances =
    std::tuple <
        // Tuple<XDataType, MeanDataType, VarDataType, ScaleDataType, BiasDataType>, Tuple<YDataType>, NormalizeOp, Rank, MPerThread, Sequence<XVectorSize, MeanVectorSize, VarVectorSize, ScaleVectorSize, BiasVectorSize>, Sequence<YVectorSize>
        DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
        DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 2, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
        DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 2, Sequence<2, 1, 1, 1, 1>, Sequence<2> >,
        DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 2, Sequence<1, 2, 2, 2, 2>, Sequence<1> >,
        DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 2, Sequence<2, 2, 2, 2, 2>, Sequence<2> >,
        DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 4, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
        DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 4, Sequence<2, 1, 1, 1, 1>, Sequence<2> >,
        DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 4, Sequence<1, 2, 2, 2, 2>, Sequence<1> >,
        DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 4, Sequence<2, 2, 2, 2, 2>, Sequence<2> >,
        DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 4, Sequence<4, 1, 1, 1, 1>, Sequence<4> >,
        DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 4, Sequence<1, 4, 4, 4, 4>, Sequence<1> >,
        DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 4, Sequence<4, 2, 2, 2, 2>, Sequence<4> >,
        DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 4, Sequence<2, 4, 4, 4, 4>, Sequence<2> >,
        DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 4, Sequence<4, 4, 4, 4, 4>, Sequence<4> >
    >;
|
||||
// clang-format on
|
||||
|
||||
// Appends the rank-4 F16 batchnorm-inference instances (F16 x/y/scale/bias,
// F32 running mean/variance) to the caller-provided list, type-erased behind
// the DeviceElementwise interface used by the external API and profiler.
void add_device_batchnorm_infer_rank_4_f16_instances(
    std::vector<std::unique_ptr<
        DeviceElementwise<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 4>>>& instances)
{
    add_device_operation_instances(instances, device_batchnorm_infer_f16_instances<4>{});
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,52 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
using F32 = float;
|
||||
|
||||
using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer;
|
||||
|
||||
// clang-format off
|
||||
// Pre-instantiated DeviceElementwiseImpl configurations for batchnorm
// inference with F32 used for every tensor (x, y, scale, bias, running
// mean/variance). Entries differ only in the per-thread tile size
// (MPerThread) and the per-operand scalar-per-vector access widths.
template <index_t Rank>
using device_batchnorm_infer_f32_instances =
    std::tuple <
        // Tuple<XDataType, MeanDataType, VarDataType, ScaleDataType, BiasDataType>, Tuple<YDataType>, NormalizeOp, Rank, MPerThread, Sequence<XVectorSize, MeanVectorSize, VarVectorSize, ScaleVectorSize, BiasVectorSize>, Sequence<YVectorSize>
        DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
        DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 2, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
        DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 2, Sequence<2, 1, 1, 1, 1>, Sequence<2> >,
        DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 2, Sequence<1, 2, 2, 2, 2>, Sequence<1> >,
        DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 2, Sequence<2, 2, 2, 2, 2>, Sequence<2> >,
        DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 4, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
        DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 4, Sequence<2, 1, 1, 1, 1>, Sequence<2> >,
        DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 4, Sequence<1, 2, 2, 2, 2>, Sequence<1> >,
        DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 4, Sequence<2, 2, 2, 2, 2>, Sequence<2> >,
        DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 4, Sequence<4, 1, 1, 1, 1>, Sequence<4> >,
        DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 4, Sequence<1, 4, 4, 4, 4>, Sequence<1> >,
        DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 4, Sequence<4, 2, 2, 2, 2>, Sequence<4> >,
        DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 4, Sequence<2, 4, 4, 4, 4>, Sequence<2> >,
        DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 4, Sequence<4, 4, 4, 4, 4>, Sequence<4> >
    >;
|
||||
// clang-format on
|
||||
|
||||
// Appends the rank-4 F32 batchnorm-inference instances to the caller-provided
// list, type-erased behind the DeviceElementwise interface used by the
// external API and profiler.
void add_device_batchnorm_infer_rank_4_f32_instances(
    std::vector<std::unique_ptr<
        DeviceElementwise<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, 4>>>& instances)
{
    add_device_operation_instances(instances, device_batchnorm_infer_f32_instances<4>{});
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,47 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
using F64 = double;
|
||||
|
||||
using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer;
|
||||
|
||||
// clang-format off
|
||||
// Pre-instantiated DeviceElementwiseImpl configurations for batchnorm
// inference with F64 used for every tensor. Entries differ only in the
// per-thread tile size (MPerThread) and the per-operand scalar-per-vector
// access widths. NOTE(review): unlike the f16/f32/bf16 lists, no width-4
// configurations are instantiated here — presumably because of the 8-byte
// element size; confirm before adding more.
template <index_t Rank>
using device_batchnorm_infer_f64_instances =
    std::tuple <
        // Tuple<XDataType, MeanDataType, VarDataType, ScaleDataType, BiasDataType>, Tuple<YDataType>, NormalizeOp, Rank, MPerThread, Sequence<XVectorSize, MeanVectorSize, VarVectorSize, ScaleVectorSize, BiasVectorSize>, Sequence<YVectorSize>
        DeviceElementwiseImpl<Tuple<F64, F64, F64, F64, F64>, Tuple<F64>, Normalize, Rank, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
        DeviceElementwiseImpl<Tuple<F64, F64, F64, F64, F64>, Tuple<F64>, Normalize, Rank, 2, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
        DeviceElementwiseImpl<Tuple<F64, F64, F64, F64, F64>, Tuple<F64>, Normalize, Rank, 2, Sequence<2, 1, 1, 1, 1>, Sequence<2> >,
        DeviceElementwiseImpl<Tuple<F64, F64, F64, F64, F64>, Tuple<F64>, Normalize, Rank, 2, Sequence<1, 2, 2, 2, 2>, Sequence<1> >,
        DeviceElementwiseImpl<Tuple<F64, F64, F64, F64, F64>, Tuple<F64>, Normalize, Rank, 2, Sequence<2, 2, 2, 2, 2>, Sequence<2> >,
        DeviceElementwiseImpl<Tuple<F64, F64, F64, F64, F64>, Tuple<F64>, Normalize, Rank, 4, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
        DeviceElementwiseImpl<Tuple<F64, F64, F64, F64, F64>, Tuple<F64>, Normalize, Rank, 4, Sequence<2, 1, 1, 1, 1>, Sequence<2> >,
        DeviceElementwiseImpl<Tuple<F64, F64, F64, F64, F64>, Tuple<F64>, Normalize, Rank, 4, Sequence<1, 2, 2, 2, 2>, Sequence<1> >,
        DeviceElementwiseImpl<Tuple<F64, F64, F64, F64, F64>, Tuple<F64>, Normalize, Rank, 4, Sequence<2, 2, 2, 2, 2>, Sequence<2> >
    >;
|
||||
// clang-format on
|
||||
|
||||
// Appends the rank-4 F64 batchnorm-inference instances to the caller-provided
// list, type-erased behind the DeviceElementwise interface used by the
// external API and profiler.
void add_device_batchnorm_infer_rank_4_f64_instances(
    std::vector<std::unique_ptr<
        DeviceElementwise<Tuple<F64, F64, F64, F64, F64>, Tuple<F64>, Normalize, 4>>>& instances)
{
    add_device_operation_instances(instances, device_batchnorm_infer_f64_instances<4>{});
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -6,7 +6,7 @@
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
@@ -28,15 +28,15 @@ using Normalize = ck::tensor_operation::element_wise::Normalize;
|
||||
using device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances = std::tuple<
|
||||
// clang-format off
|
||||
//###################|<in, mean, square_mean, gamma, beta>| <out>| functor| NDim| MPerThread| <in, mean, square_mean, gamma, beta ScalarPerVector>| <out ScalarPerVector>|
|
||||
DeviceElementwise<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 8, Sequence<8, 1, 1, 8, 8>, Sequence<8> >,
|
||||
DeviceElementwise<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 4, Sequence<4, 1, 1, 4, 4>, Sequence<4> >,
|
||||
DeviceElementwise<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 2, Sequence<2, 1, 1, 2, 2>, Sequence<2> >,
|
||||
DeviceElementwise<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >
|
||||
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 8, Sequence<8, 1, 1, 8, 8>, Sequence<8> >,
|
||||
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 4, Sequence<4, 1, 1, 4, 4>, Sequence<4> >,
|
||||
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 2, Sequence<2, 1, 1, 2, 2>, Sequence<2> >,
|
||||
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
void add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances(
|
||||
std::vector<DeviceElementwiseBasePtr<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2>>&
|
||||
std::vector<DeviceElementwisePtr<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2>>&
|
||||
instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
|
||||
335
profiler/include/profiler/profile_batchnorm_infer_impl.hpp
Normal file
335
profiler/include/profiler/profile_batchnorm_infer_impl.hpp
Normal file
@@ -0,0 +1,335 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iomanip>
|
||||
#include <stdexcept>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace profiler {
|
||||
|
||||
template <typename XDataType,
|
||||
typename YDataType,
|
||||
typename AccDataType,
|
||||
typename ScaleDataType,
|
||||
typename BiasDataType,
|
||||
typename MeanVarDataType,
|
||||
index_t Rank,
|
||||
index_t NumBatchNormReduceDim>
|
||||
bool profile_batchnorm_infer_impl(int do_verification,
|
||||
int init_method,
|
||||
bool do_dumpout,
|
||||
bool time_kernel,
|
||||
const std::vector<size_t> inOutLengths,
|
||||
const std::vector<int> reduceDims,
|
||||
double epsilon)
|
||||
{
|
||||
if(inOutLengths.size() != Rank || reduceDims.size() != NumBatchNormReduceDim)
|
||||
{
|
||||
throw std::runtime_error("Invalid tensor lengths or number of reduce dimensions!");
|
||||
};
|
||||
|
||||
std::vector<size_t> scaleBiasMeanVarLengths;
|
||||
std::vector<int> invariantDims;
|
||||
|
||||
// used for calculating the effective transferred bytes by each operation
|
||||
size_t total_length;
|
||||
size_t invariant_length = 1;
|
||||
|
||||
total_length =
|
||||
std::accumulate(inOutLengths.begin(), inOutLengths.end(), 1, std::multiplies<size_t>{});
|
||||
|
||||
if(std::any_of(reduceDims.begin(), reduceDims.end(), [](int d) { return d < 0 || d >= Rank; }))
|
||||
throw std::runtime_error("Invalid reduce dimensions!");
|
||||
|
||||
for(int dim = 0; dim < Rank; dim++)
|
||||
{
|
||||
if(std::none_of(reduceDims.begin(), reduceDims.end(), [&](int d) { return dim == d; }))
|
||||
{
|
||||
invariantDims.push_back(dim);
|
||||
scaleBiasMeanVarLengths.push_back(inOutLengths[dim]);
|
||||
invariant_length *= inOutLengths[dim];
|
||||
};
|
||||
}
|
||||
|
||||
// input data of the batchnorm infer algorithm
|
||||
Tensor<XDataType> x(inOutLengths);
|
||||
Tensor<ScaleDataType> scale(scaleBiasMeanVarLengths);
|
||||
Tensor<BiasDataType> bias(scaleBiasMeanVarLengths);
|
||||
Tensor<MeanVarDataType> estimatedMean(scaleBiasMeanVarLengths);
|
||||
Tensor<MeanVarDataType> estimatedVariance(scaleBiasMeanVarLengths);
|
||||
|
||||
// output data of the batchnorm infer algorithm
|
||||
Tensor<YDataType> y_ref(inOutLengths);
|
||||
Tensor<YDataType> y(inOutLengths);
|
||||
|
||||
auto inOutStrides = x.mDesc.GetStrides();
|
||||
auto scaleBiasMeanVarStrides = scale.mDesc.GetStrides();
|
||||
|
||||
std::size_t num_thread = std::thread::hardware_concurrency();
|
||||
|
||||
const float x_mean = 0.0f;
|
||||
const float x_stddev = 1.0f;
|
||||
const float noise_stddev = 0.04f;
|
||||
|
||||
// input data in normal distribution
|
||||
x.GenerateTensorValue(GeneratorTensor_4<XDataType>{x_mean, x_stddev}, num_thread);
|
||||
|
||||
// initialize the estimatedMean to be values with tiny variation to the mean of the x
|
||||
// values
|
||||
estimatedMean.GenerateTensorValue(GeneratorTensor_4<MeanVarDataType>{x_mean, noise_stddev},
|
||||
num_thread);
|
||||
|
||||
// initialize the estimatedVariance to be values with tiny variation to the variance of
|
||||
// the x values
|
||||
estimatedVariance.GenerateTensorValue(
|
||||
GeneratorTensor_4<MeanVarDataType>{x_stddev * x_stddev, noise_stddev}, num_thread);
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
switch(init_method)
|
||||
{
|
||||
case 0:
|
||||
scale.GenerateTensorValue(GeneratorTensor_0<ScaleDataType>{}, num_thread);
|
||||
bias.GenerateTensorValue(GeneratorTensor_0<BiasDataType>{}, num_thread);
|
||||
break;
|
||||
case 1:
|
||||
scale.GenerateTensorValue(GeneratorTensor_1<ScaleDataType>{1}, num_thread);
|
||||
bias.GenerateTensorValue(GeneratorTensor_1<BiasDataType>{0}, num_thread);
|
||||
break;
|
||||
case 2:
|
||||
scale.GenerateTensorValue(GeneratorTensor_2<ScaleDataType>{-5, 5}, num_thread);
|
||||
bias.GenerateTensorValue(GeneratorTensor_2<BiasDataType>{-5, 5}, num_thread);
|
||||
break;
|
||||
default:
|
||||
scale.GenerateTensorValue(GeneratorTensor_3<ScaleDataType>{-1.0f, 1.0f}, num_thread);
|
||||
bias.GenerateTensorValue(GeneratorTensor_3<BiasDataType>{-1.0f, 1.0f}, num_thread);
|
||||
}
|
||||
};
|
||||
|
||||
// these buffers are usually provided by the user application
|
||||
DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
|
||||
DeviceMem y_dev(sizeof(XDataType) * y.mDesc.GetElementSpaceSize());
|
||||
DeviceMem scale_dev(sizeof(ScaleDataType) * scale.mDesc.GetElementSpaceSize());
|
||||
DeviceMem bias_dev(sizeof(BiasDataType) * bias.mDesc.GetElementSpaceSize());
|
||||
|
||||
// estimatedMean_dev
|
||||
DeviceMem estimatedMean_dev(sizeof(MeanVarDataType) *
|
||||
estimatedMean.mDesc.GetElementSpaceSize());
|
||||
// estimatedVariance_dev
|
||||
DeviceMem estimatedVariance_dev(sizeof(MeanVarDataType) *
|
||||
estimatedVariance.mDesc.GetElementSpaceSize());
|
||||
|
||||
x_dev.ToDevice(x.mData.data());
|
||||
scale_dev.ToDevice(scale.mData.data());
|
||||
bias_dev.ToDevice(bias.mData.data());
|
||||
estimatedMean_dev.ToDevice(estimatedMean.mData.data());
|
||||
estimatedVariance_dev.ToDevice(estimatedVariance.mData.data());
|
||||
|
||||
std::array<index_t, Rank> arrInOutLengths;
|
||||
std::array<index_t, Rank> arrInOutStrides;
|
||||
std::array<index_t, Rank - NumBatchNormReduceDim> arrScaleBiasMeanVarLengths;
|
||||
std::array<index_t, Rank - NumBatchNormReduceDim> arrScaleBiasMeanVarStrides;
|
||||
std::array<int, NumBatchNormReduceDim> arrReduceDims;
|
||||
|
||||
std::copy(inOutLengths.begin(), inOutLengths.end(), arrInOutLengths.begin());
|
||||
std::copy(inOutStrides.begin(), inOutStrides.end(), arrInOutStrides.begin());
|
||||
std::copy(scaleBiasMeanVarLengths.begin(),
|
||||
scaleBiasMeanVarLengths.end(),
|
||||
arrScaleBiasMeanVarLengths.begin());
|
||||
std::copy(scaleBiasMeanVarStrides.begin(),
|
||||
scaleBiasMeanVarStrides.end(),
|
||||
arrScaleBiasMeanVarStrides.begin());
|
||||
|
||||
std::copy(reduceDims.begin(), reduceDims.end(), arrReduceDims.begin());
|
||||
|
||||
std::array<index_t, Rank> aligned_scaleBiasMeanVarStrides{0};
|
||||
|
||||
int i = 0;
|
||||
for(auto dim : invariantDims)
|
||||
{
|
||||
assert(inOutLengths[dim] == scaleBiasMeanVarLengths[i]);
|
||||
|
||||
aligned_scaleBiasMeanVarStrides[dim] = scaleBiasMeanVarStrides[i];
|
||||
i++;
|
||||
};
|
||||
|
||||
using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer;
|
||||
|
||||
// add device batchnorm-infer instances
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceElementwise<
|
||||
ck::Tuple<XDataType, MeanVarDataType, MeanVarDataType, ScaleDataType, BiasDataType>,
|
||||
ck::Tuple<YDataType>,
|
||||
Normalize,
|
||||
Rank>;
|
||||
|
||||
// get device op instances
|
||||
const auto instance_ptrs =
|
||||
ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_instance_name;
|
||||
float best_avg_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
using PassThroughOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
using ReferenceBatchNormInferInstance =
|
||||
ck::tensor_operation::host::ReferenceBatchNormInfer<XDataType,
|
||||
YDataType,
|
||||
AccDataType,
|
||||
ScaleDataType,
|
||||
BiasDataType,
|
||||
MeanVarDataType,
|
||||
PassThroughOp,
|
||||
Rank,
|
||||
NumBatchNormReduceDim>;
|
||||
auto batchNormInfer_ref = ReferenceBatchNormInferInstance{};
|
||||
|
||||
auto argument_ptr_ref =
|
||||
batchNormInfer_ref.MakeArgumentPointer(arrInOutLengths,
|
||||
arrInOutStrides,
|
||||
arrInOutStrides,
|
||||
arrReduceDims,
|
||||
arrScaleBiasMeanVarLengths,
|
||||
arrScaleBiasMeanVarStrides,
|
||||
arrScaleBiasMeanVarStrides,
|
||||
arrScaleBiasMeanVarStrides,
|
||||
x.mData.data(),
|
||||
scale.mData.data(),
|
||||
bias.mData.data(),
|
||||
epsilon,
|
||||
PassThroughOp{},
|
||||
estimatedMean.mData.data(),
|
||||
estimatedVariance.mData.data(),
|
||||
y_ref.mData.data());
|
||||
|
||||
if(!batchNormInfer_ref.IsSupportedArgument(argument_ptr_ref.get()))
|
||||
{
|
||||
std::cout << "The runtime parameters not supported by the reference instance, exiting!"
|
||||
<< std::endl;
|
||||
return (false);
|
||||
};
|
||||
|
||||
auto invoker_ptr_ref = batchNormInfer_ref.MakeInvokerPointer();
|
||||
|
||||
(void)invoker_ptr_ref->Run(argument_ptr_ref.get());
|
||||
}
|
||||
|
||||
int num_kernel = 0;
|
||||
bool pass = true;
|
||||
|
||||
for(auto& inst_ptr : instance_ptrs)
|
||||
{
|
||||
auto argument_ptr = inst_ptr->MakeArgumentPointer(arrInOutLengths,
|
||||
{arrInOutStrides,
|
||||
aligned_scaleBiasMeanVarStrides,
|
||||
aligned_scaleBiasMeanVarStrides,
|
||||
aligned_scaleBiasMeanVarStrides,
|
||||
aligned_scaleBiasMeanVarStrides},
|
||||
{arrInOutStrides},
|
||||
{x_dev.GetDeviceBuffer(),
|
||||
estimatedMean_dev.GetDeviceBuffer(),
|
||||
estimatedVariance_dev.GetDeviceBuffer(),
|
||||
scale_dev.GetDeviceBuffer(),
|
||||
bias_dev.GetDeviceBuffer()},
|
||||
{y_dev.GetDeviceBuffer()},
|
||||
Normalize{epsilon});
|
||||
|
||||
if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
num_kernel++;
|
||||
}
|
||||
else
|
||||
{
|
||||
if(time_kernel)
|
||||
{
|
||||
std::cout << inst_ptr->GetTypeString()
|
||||
<< " skipped due to unsupported argument: " << std::endl;
|
||||
}
|
||||
|
||||
continue;
|
||||
};
|
||||
|
||||
auto invoker_ptr = inst_ptr->MakeInvokerPointer();
|
||||
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
size_t num_bytes = 0;
|
||||
|
||||
// inputing of x, scale, bias, outputing of y
|
||||
num_bytes += total_length * (sizeof(XDataType) + sizeof(YDataType)) +
|
||||
invariant_length *
|
||||
(sizeof(ScaleDataType) + sizeof(BiasDataType) + sizeof(MeanVarDataType));
|
||||
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
if(time_kernel)
|
||||
std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, "
|
||||
<< inst_ptr->GetTypeString() << std::endl;
|
||||
|
||||
if(avg_time < best_avg_time)
|
||||
{
|
||||
best_instance_name = inst_ptr->GetTypeString();
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
using ck::utils::check_err;
|
||||
bool single_pass;
|
||||
|
||||
y_dev.FromDevice(y.mData.data());
|
||||
|
||||
if constexpr(ck::is_same_v<YDataType, ck::bhalf_t>)
|
||||
single_pass = check_err(y.mData, y_ref.mData, "y results", 1e-2, 1e-2);
|
||||
else
|
||||
single_pass = check_err(y.mData, y_ref.mData, "y results", 4e-3, 4e-3);
|
||||
|
||||
pass = pass && single_pass;
|
||||
};
|
||||
|
||||
if(do_dumpout)
|
||||
{
|
||||
using ck::host_common::dumpBufferToFile;
|
||||
|
||||
// clang-format off
|
||||
dumpBufferToFile("dump_x.bin", x.mData.data(), x.mDesc.GetElementSize());
|
||||
dumpBufferToFile("dump_y.bin", y.mData.data(), y.mDesc.GetElementSize());
|
||||
dumpBufferToFile("dump_y_ref.bin", y_ref.mData.data(), y_ref.mDesc.GetElementSize());
|
||||
// clang-format off
|
||||
};
|
||||
}
|
||||
|
||||
if(time_kernel)
|
||||
{
|
||||
std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
|
||||
<< best_instance_name << std::endl;
|
||||
}
|
||||
|
||||
if(num_kernel == 0)
|
||||
{
|
||||
std::cout << "Error: No kernel is applicable" << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
return pass;
|
||||
}
|
||||
|
||||
} // namespace profiler
|
||||
} // namespace ck
|
||||
@@ -27,6 +27,7 @@ set(PROFILER_SOURCES
|
||||
profile_softmax.cpp
|
||||
profile_batchnorm_fwd.cpp
|
||||
profile_batchnorm_bwd.cpp
|
||||
profile_batchnorm_infer.cpp
|
||||
)
|
||||
|
||||
set(PROFILER_EXECUTABLE ckProfiler)
|
||||
|
||||
202
profiler/src/profile_batchnorm_infer.cpp
Normal file
202
profiler/src/profile_batchnorm_infer.cpp
Normal file
@@ -0,0 +1,202 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <getopt.h>
|
||||
|
||||
#include "ck/library/utility/host_common_util.hpp"
|
||||
#include "profiler/profile_batchnorm_infer_impl.hpp"
|
||||
#include "profiler_operation_registry.hpp"
|
||||
|
||||
using ck::index_t;
|
||||
|
||||
using namespace std;
|
||||
|
||||
static const struct option long_options[] = {{"inOutLengths", required_argument, nullptr, 'D'},
|
||||
{"reduceDims", required_argument, nullptr, 'R'},
|
||||
{"dumpout", required_argument, nullptr, 'o'},
|
||||
{"verify", required_argument, nullptr, 'v'},
|
||||
{"help", no_argument, nullptr, '?'},
|
||||
{nullptr, 0, nullptr, 0}};
|
||||
|
||||
class BatchnormInferArgParser
|
||||
{
|
||||
private:
|
||||
int option_index = 0;
|
||||
|
||||
public:
|
||||
std::vector<size_t> inLengths;
|
||||
std::vector<int> reduceDims;
|
||||
|
||||
bool do_verification = false;
|
||||
bool do_dumpout = false;
|
||||
|
||||
bool updateMovingAverage;
|
||||
bool saveMeanAndInvVariance;
|
||||
|
||||
int data_type = 0;
|
||||
int init_method = 2;
|
||||
bool time_kernel = false;
|
||||
|
||||
BatchnormInferArgParser() = default;
|
||||
~BatchnormInferArgParser() = default;
|
||||
|
||||
void show_usage(const char* cmd)
|
||||
{
|
||||
// clang-format off
|
||||
std::cout << "Usage of " << cmd << std::endl;
|
||||
std::cout << "--inOutLengths or -D, comma separated list of input tensor dimension lengths, must have 4 integers for nhwc" << std::endl;
|
||||
std::cout << "--reduceDims or -R, comma separated list of dimensions to reduce on" << std::endl;
|
||||
std::cout << "--verify or -v, 1/0 to indicate whether to verify the result by comparing with the host-based batch-normalization" << std::endl;
|
||||
std::cout << "Arg1: data type (0: fp16, 1: fp32, 5: bp16, 6: fp64)" << std::endl;
|
||||
std::cout << "Arg2: init method used for bnScale and bnBias (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)" << std::endl;
|
||||
std::cout << "Arg3: time kernel (0=no, 1=yes)" << std::endl;
|
||||
// clang-format on
|
||||
};
|
||||
|
||||
int operator()(int argc, char* argv[])
|
||||
{
|
||||
using ck::host_common::getTypeValuesFromString;
|
||||
|
||||
int ch;
|
||||
|
||||
optind++; // to skip the module name
|
||||
|
||||
while(1)
|
||||
{
|
||||
ch = getopt_long(argc, argv, "D:R:v:o:", long_options, &option_index);
|
||||
if(ch == -1)
|
||||
break;
|
||||
switch(ch)
|
||||
{
|
||||
case 'D':
|
||||
if(!optarg)
|
||||
throw std::runtime_error("Invalid option format!");
|
||||
|
||||
inLengths = getTypeValuesFromString<size_t>(optarg);
|
||||
break;
|
||||
case 'R':
|
||||
if(!optarg)
|
||||
throw std::runtime_error("Invalid option format!");
|
||||
|
||||
reduceDims = getTypeValuesFromString<int>(optarg);
|
||||
break;
|
||||
case 'v':
|
||||
if(!optarg)
|
||||
throw std::runtime_error("Invalid option format!");
|
||||
|
||||
do_verification = static_cast<bool>(std::atoi(optarg));
|
||||
break;
|
||||
case 'o':
|
||||
if(!optarg)
|
||||
throw std::runtime_error("Invalid option format!");
|
||||
|
||||
do_dumpout = static_cast<bool>(std::atoi(optarg));
|
||||
break;
|
||||
case '?':
|
||||
if(std::string(long_options[option_index].name) == "help")
|
||||
{
|
||||
show_usage(argv[0]);
|
||||
return -1;
|
||||
};
|
||||
break;
|
||||
|
||||
default:
|
||||
show_usage(argv[0]);
|
||||
std::cerr << "Invalid cmd-line options!" << std::endl;
|
||||
return -1;
|
||||
};
|
||||
};
|
||||
|
||||
if(optind + 3 > argc)
|
||||
throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
|
||||
|
||||
data_type = std::atoi(argv[optind++]);
|
||||
init_method = std::atoi(argv[optind++]);
|
||||
time_kernel = static_cast<bool>(std::atoi(argv[optind++]));
|
||||
|
||||
if(data_type != 0 && data_type != 1 && data_type != 5 && data_type != 6)
|
||||
return -1;
|
||||
|
||||
return 0;
|
||||
};
|
||||
}; // end of class AppArgs
|
||||
|
||||
static const double epsilon = std::numeric_limits<float>::epsilon();
|
||||
|
||||
int profile_batchnorm_infer(int argc, char* argv[])
|
||||
{
|
||||
using ck::profiler::profile_batchnorm_infer_impl;
|
||||
|
||||
BatchnormInferArgParser arg_parser;
|
||||
|
||||
if(arg_parser(argc, argv) != 0)
|
||||
return -1;
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
using BF16 = ck::bhalf_t;
|
||||
using F64 = double;
|
||||
|
||||
if(arg_parser.data_type == 0)
|
||||
{
|
||||
if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3)
|
||||
{
|
||||
profile_batchnorm_infer_impl<F16, F16, F32, F16, F16, F32, 4, 3>(
|
||||
arg_parser.do_verification,
|
||||
arg_parser.init_method,
|
||||
arg_parser.do_dumpout,
|
||||
arg_parser.time_kernel,
|
||||
arg_parser.inLengths,
|
||||
arg_parser.reduceDims,
|
||||
epsilon);
|
||||
};
|
||||
}
|
||||
else if(arg_parser.data_type == 1)
|
||||
{
|
||||
if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3)
|
||||
{
|
||||
profile_batchnorm_infer_impl<F32, F32, F32, F32, F32, F32, 4, 3>(
|
||||
arg_parser.do_verification,
|
||||
arg_parser.init_method,
|
||||
arg_parser.do_dumpout,
|
||||
arg_parser.time_kernel,
|
||||
arg_parser.inLengths,
|
||||
arg_parser.reduceDims,
|
||||
epsilon);
|
||||
};
|
||||
}
|
||||
else if(arg_parser.data_type == 5)
|
||||
{
|
||||
if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3)
|
||||
{
|
||||
profile_batchnorm_infer_impl<BF16, BF16, F32, BF16, BF16, F32, 4, 3>(
|
||||
arg_parser.do_verification,
|
||||
arg_parser.init_method,
|
||||
arg_parser.do_dumpout,
|
||||
arg_parser.time_kernel,
|
||||
arg_parser.inLengths,
|
||||
arg_parser.reduceDims,
|
||||
epsilon);
|
||||
};
|
||||
}
|
||||
else if(arg_parser.data_type == 6)
|
||||
{
|
||||
if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3)
|
||||
{
|
||||
profile_batchnorm_infer_impl<F64, F64, F64, F64, F64, F64, 4, 3>(
|
||||
arg_parser.do_verification,
|
||||
arg_parser.init_method,
|
||||
arg_parser.do_dumpout,
|
||||
arg_parser.time_kernel,
|
||||
arg_parser.inLengths,
|
||||
arg_parser.reduceDims,
|
||||
epsilon);
|
||||
};
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
REGISTER_PROFILER_OPERATION("bnorm_infer", "Batchnorm inference", profile_batchnorm_infer);
|
||||
@@ -1,4 +1,6 @@
|
||||
add_gtest_executable(test_batchnorm_fwd_rank_4 batchnorm_fwd_rank_4.cpp)
|
||||
add_gtest_executable(test_batchnorm_bwd_rank_4 batchnorm_bwd_rank_4.cpp)
|
||||
add_gtest_executable(test_batchnorm_infer_rank_4 batchnorm_infer_rank_4.cpp)
|
||||
target_link_libraries(test_batchnorm_fwd_rank_4 PRIVATE utility device_batchnorm_instance)
|
||||
target_link_libraries(test_batchnorm_bwd_rank_4 PRIVATE utility device_batchnorm_instance)
|
||||
target_link_libraries(test_batchnorm_infer_rank_4 PRIVATE utility device_batchnorm_instance)
|
||||
|
||||
89
test/batchnorm/batchnorm_infer_rank_4.cpp
Normal file
89
test/batchnorm/batchnorm_infer_rank_4.cpp
Normal file
@@ -0,0 +1,89 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <initializer_list>
|
||||
#include <vector>
|
||||
#include <tuple>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "profiler/profile_batchnorm_infer_impl.hpp"
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
using BF16 = ck::bhalf_t;
|
||||
using F64 = double;
|
||||
|
||||
template <typename Tuple>
|
||||
class TestBatchNormInferRank4 : public ::testing::Test
|
||||
{
|
||||
private:
|
||||
const double epsilon = std::numeric_limits<float>::epsilon();
|
||||
|
||||
protected:
|
||||
using XDataType = std::tuple_element_t<0, Tuple>;
|
||||
using YDataType = std::tuple_element_t<1, Tuple>;
|
||||
using AccDataType = std::tuple_element_t<2, Tuple>;
|
||||
using ScaleDataType = std::tuple_element_t<3, Tuple>;
|
||||
using BiasDataType = std::tuple_element_t<4, Tuple>;
|
||||
using MeanVarDataType = std::tuple_element_t<5, Tuple>;
|
||||
|
||||
std::vector<std::vector<size_t>> list_of_lengths = {
|
||||
{128, 16, 3, 1024}, {128, 16, 6, 512}, {4, 4, 4, 4}, {32, 32, 32, 32}};
|
||||
std::vector<int> reduceDims;
|
||||
|
||||
template <int NumReduceDim>
|
||||
void Run()
|
||||
{
|
||||
for(auto& inOutLengths : list_of_lengths)
|
||||
{
|
||||
bool pass = true;
|
||||
|
||||
EXPECT_FALSE(reduceDims.size() != NumReduceDim);
|
||||
|
||||
pass = pass && ck::profiler::profile_batchnorm_infer_impl<XDataType,
|
||||
YDataType,
|
||||
AccDataType,
|
||||
ScaleDataType,
|
||||
BiasDataType,
|
||||
MeanVarDataType,
|
||||
4,
|
||||
NumReduceDim>(
|
||||
true, 3, false, false, inOutLengths, reduceDims, epsilon);
|
||||
|
||||
pass = pass && ck::profiler::profile_batchnorm_infer_impl<XDataType,
|
||||
YDataType,
|
||||
AccDataType,
|
||||
ScaleDataType,
|
||||
BiasDataType,
|
||||
MeanVarDataType,
|
||||
4,
|
||||
NumReduceDim>(
|
||||
true, 3, false, false, inOutLengths, reduceDims, epsilon);
|
||||
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, F16, F16, F32>,
|
||||
std::tuple<F32, F32, F32, F32, F32, F32>,
|
||||
std::tuple<BF16, BF16, F32, BF16, BF16, F32>,
|
||||
std::tuple<F64, F64, F64, F64, F64, F64>>;
|
||||
|
||||
TYPED_TEST_SUITE(TestBatchNormInferRank4, KernelTypes);
|
||||
|
||||
// nhwc
|
||||
TYPED_TEST(TestBatchNormInferRank4, nhwc)
|
||||
{
|
||||
this->reduceDims = {0, 1, 2};
|
||||
this->template Run<3>();
|
||||
}
|
||||
|
||||
// nchw
|
||||
TYPED_TEST(TestBatchNormInferRank4, nchw)
|
||||
{
|
||||
this->reduceDims = {0, 2, 3};
|
||||
this->template Run<3>();
|
||||
}
|
||||
Reference in New Issue
Block a user