Batchnorm-forward and Batchnorm-infer Implemented using generic kernels (#320)

* Implement multiple-reduction in one kernel (kernels, device ops, examples)

* Add generic elementwise kernel and device interface

* Add generator for normal-distributed data initialization

* Add host refer implementation of batchnorm-forward and batchnorm-infer

* Add examples for implementing batchnorm-forward and batchnorm-infer using generic kernels

* Remove un-needed including in batchnorm example

* Renaming generic_elementwise to elementiwise in kernel and device classes/functions

* Change in gemm_layernorm examples to use DeviceElementwise instead of Device5AryElementwise

* Change in exampe 19_binary_elementwise to use DeviceElementwise instead of DeviceBinaryElementwise

* Change in device_cgemm_4gemm_xdl_cshuffle.hpp to use kernel_elementwise instead of kernel_binary_elementwise

* Add DeviceElementwiseBase and use it in device_normalize_instance.cpp

* Removing and renaming files

* Update to synchronize gemm_layernorm client example to the generic element-wise device op API

* Update to synchronize with the latest headers directory and HostTensorDescriptor interface renaming

* Merge two static member functions in device_elementwise.hpp

* Remove unary_elementwise_1d kernel and device
This commit is contained in:
Qianfeng
2022-08-15 23:11:02 +08:00
committed by GitHub
parent 5ee304595c
commit 53ea4713af
47 changed files with 5195 additions and 1707 deletions

View File

@@ -0,0 +1,2 @@
add_example_executable(example_batchnorm_forward batchnorm_forward_nhwc.cpp)
add_example_executable(example_batchnorm_infer batchnorm_infer_nhwc.cpp)

View File

@@ -0,0 +1,56 @@
# Instructions for ```batchnorm nhwc``` Example
## Run ```batchnorm forward nhwc```
```bash
# -D <xxx> : input 4-d tensor lengths
# -v <x> : verification (0=no, 1=yes)
#arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64)
#arg2: 1/0 to indicate whether to update the moving average and variance (0=no, 1=yes)
#arg3: 1/0 to indicate whether to save result mean/invVariance (0=no, 1=yes)
#arg4: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
#arg5: time kernel (0=no, 1=yes)
./bin/example_batchnorm_forward -D 128,16,16,1024 -v 1 0 0 1 2 1
```
Result
```
./bin/example_batchnorm_forward -D 128,16,16,1024 -v 1 0 0 1 2 1
launch_and_time_kernel: grid_dim {64, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
launch_and_time_kernel: grid_dim {120, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
launch_and_time_kernel: grid_dim {120, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
Perf: 2.08231 ms, 354.519 GB/s
```
Result
```
./bin/example_batchnorm_forward -D 128,16,16,1024 -v 1 0 1 0 2 0
echo $?
0
```
## Run ```batchnorm infer nhwc```
```bash
# -D <xxx> : input 4-d tensor lengths
# -v <x> : verification (0=no, 1=yes)
#arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64)
#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
#arg3: time kernel (0=no, 1=yes)
./bin/example_batchnorm_infer -D 128,16,16,1024 -v 1 0 2 1
```
Result
```
./bin/example_batchnorm_infer -D 128,16,16,1024 -v 1 0 2 1
launch_and_time_kernel: grid_dim {120, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
Perf: 1.28235 ms, 523.329 GB/s
```

View File

@@ -0,0 +1,181 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cassert>
#include <vector>
#include <array>
#include <type_traits>
#include "ck/utility/data_type.hpp"
// binary operation used to calculate invVariance from mean and meansquare
struct InvVariance
{
InvVariance(double epsilon) : epsilon_(epsilon){};
template <typename T>
__host__ __device__ constexpr void operator()(T& y, const T& mean, const T& meansquare) const
{
static_assert(std::is_same<T, float>::value || std::is_same<T, double>::value,
"Data type is not supported by this operation!");
using ck::type_convert;
using ck::math::sqrt;
T tmp_epsilon = type_convert<T>(epsilon_);
y = meansquare - mean * mean;
y = 1.0f / sqrt(tmp_epsilon + y);
};
double epsilon_;
};
// (4-in, 2-out) element-wise operation used to update the moving average of mean and variance
struct MovingAverage
{
MovingAverage(double factor) : factor_(factor){};
template <typename T>
__host__ __device__ constexpr void operator()(T& y0,
T& y1,
const T& mean,
const T& runningMean,
const T& meansquare,
const T& runningVariance) const
{
static_assert(std::is_same<T, float>::value || std::is_same<T, double>::value,
"Data type is not supported by this operation!");
using ck::type_convert;
T tmp_factor = type_convert<T>(factor_);
T variance = meansquare - mean * mean;
y0 = runningMean * (type_convert<T>(1.0f) - tmp_factor) + mean * tmp_factor;
y1 = runningVariance * (type_convert<T>(1.0f) - tmp_factor) + variance * tmp_factor;
};
double factor_;
};
struct MovingAverageAndInvVariance
{
MovingAverageAndInvVariance(double epsilon, double factor)
: epsilon_(epsilon), factor_(factor){};
template <typename T>
__host__ __device__ constexpr void operator()(T& y0, // resultRunningMean
T& y1, // resultRunningVariance
T& y2, // saveInvVariance
const T& mean,
const T& runningMean,
const T& meansquare,
const T& runningVariance) const
{
static_assert(std::is_same<T, float>::value || std::is_same<T, double>::value,
"Data type is not supported by this operation!");
using ck::type_convert;
using ck::math::sqrt;
T tmp_epsilon = type_convert<T>(epsilon_);
T tmp_factor = type_convert<T>(factor_);
T variance = meansquare - mean * mean;
y0 = runningMean * (type_convert<T>(1.0f) - tmp_factor) + mean * tmp_factor;
y1 = runningVariance * (type_convert<T>(1.0f) - tmp_factor) + variance * tmp_factor;
y2 = 1.0f / sqrt(tmp_epsilon + variance);
};
double epsilon_;
double factor_;
};
struct NormalizeInInfer
{
NormalizeInInfer(double epsilon = 1e-4) : epsilon_(epsilon) {}
template <typename T1, typename T2>
__host__ __device__ constexpr void operator()(T1& y,
const T1& x,
const T2& mean,
const T2& variance,
const T2& gamma,
const T2& beta) const
{
static_assert(std::is_same<T2, float>::value || std::is_same<T2, double>::value,
"Data type is not supported by this operation!");
using ck::type_convert;
using ck::math::sqrt;
T2 tmp_x, tmp_y;
tmp_x = type_convert<T2>(x);
tmp_y = ((tmp_x - mean) / sqrt(variance + type_convert<T2>(epsilon_))) * gamma + beta;
y = type_convert<T1>(tmp_y);
};
double epsilon_;
};
struct NormalizeInForward
{
NormalizeInForward(double epsilon = 1e-4) : epsilon_(epsilon) {}
template <typename T1, typename T2>
__host__ __device__ constexpr void operator()(T1& y,
const T1& x,
const T2& mean,
const T2& meansquare,
const T2& gamma,
const T2& beta) const
{
static_assert(std::is_same<T2, float>::value || std::is_same<T2, double>::value,
"Data type is not supported by this operation!");
using ck::type_convert;
using ck::math::sqrt;
T2 tmp_x, tmp_y;
T2 variance = meansquare - mean * mean;
tmp_x = type_convert<T2>(x);
tmp_y = ((tmp_x - mean) / sqrt(variance + type_convert<T2>(epsilon_))) * gamma + beta;
y = type_convert<T1>(tmp_y);
};
double epsilon_;
};
template <int Rank, int NumReduceDim>
static inline std::array<int, Rank - NumReduceDim>
get_invariant_dims(const std::array<int, NumReduceDim>& reduceDims)
{
int reduceFlag = 0;
// flag the bits for the reduceDims
for(int i = 0; i < NumReduceDim; i++)
{
reduceFlag |= 1 << reduceDims[i];
};
std::array<int, Rank - NumReduceDim> invariantDims;
// collect invariant dimensions
int dim = 0;
for(int i = 0; i < Rank; i++)
if((reduceFlag & (1 << i)) == 0)
{
invariantDims[dim] = i;
dim++;
};
return invariantDims;
};

View File

@@ -0,0 +1,295 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cassert>
#include <vector>
#include "ck/ck.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/device_multiple_reduce_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
#include "batchnorm_common.hpp"
template <typename InOutDataType,
typename AccDataType,
ck::index_t Rank,
ck::index_t NumBatchNormReduceDim,
bool fastest_dim_is_reduced = false>
int bnorm_fwd(bool time_kernel,
bool updateMovingAverage,
bool saveMeanAndInvVariance,
const std::array<int, NumBatchNormReduceDim> reduceDims,
const std::array<ck::index_t, Rank> xyLengths,
const std::array<ck::index_t, Rank> xStrides,
const std::array<ck::index_t, Rank> yStrides,
const std::array<ck::index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarLengths,
const std::array<ck::index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarStrides,
const void* p_x,
const void* p_scale,
const void* p_bias,
void* p_y,
double exponentialAverageFactor,
void* p_runningMean,
void* p_runningVariance,
double epsilon,
void* p_saveMean,
void* p_saveInvVariance,
void* p_tmp_mean,
void* p_tmp_meansquare)
{
static_assert(NumBatchNormReduceDim < Rank,
"Invalid number of reduced dimensions for batchnorm!");
constexpr ck::index_t NumScaleBiasMeanVarDim = Rank - NumBatchNormReduceDim;
using InElementwiseOperation_Mean = ck::tensor_operation::element_wise::PassThrough;
using AccElementwiseOperation_Mean = ck::tensor_operation::element_wise::UnaryDivide;
using InElementwiseOperation_Meansquare = ck::tensor_operation::element_wise::UnarySquare;
using AccElementwiseOperation_Meansquare = ck::tensor_operation::element_wise::UnaryDivide;
using DeviceMeanAndMeansquareInstance =
ck::tensor_operation::device::DeviceMultipleReduceMultiBlock<
2,
InOutDataType,
AccDataType,
ck::Tuple<AccDataType, AccDataType>,
Rank,
NumBatchNormReduceDim,
ck::reduce::Add,
ck::Tuple<InElementwiseOperation_Mean, InElementwiseOperation_Meansquare>,
ck::Tuple<AccElementwiseOperation_Mean, AccElementwiseOperation_Meansquare>,
ck::InMemoryDataOperationEnum::Set,
false, // PropagateNan
256,
16,
16,
1,
1,
fastest_dim_is_reduced ? 1 : 0,
1,
ck::Sequence<1, 1>>;
using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwise<
ck::Tuple<InOutDataType, AccDataType, AccDataType, AccDataType, AccDataType>, // x, mean,
// meansquare,
// scale, bias
ck::Tuple<InOutDataType>, // y
NormalizeInForward,
Rank,
2, // MPerthread
ck::Sequence<1, 1, 1, 1, 1>, // scalarPerVector: x, mean, meansquare, scale, bias
ck::Sequence<1>>; // scalarPerVector: y
using DeviceInvVarianceInstance = ck::tensor_operation::device::DeviceElementwise<
ck::Tuple<AccDataType, AccDataType>, // mean, meansquare
ck::Tuple<AccDataType>, // invVariance
InvVariance,
NumScaleBiasMeanVarDim,
2, // MPerthread
ck::Sequence<1, 1>, // scalarPerVector: mean, meansquare
ck::Sequence<1>>; // scalarPerVector: invVariance
using DeviceMovingAverageInstance = ck::tensor_operation::device::DeviceElementwise<
ck::Tuple<AccDataType, AccDataType, AccDataType, AccDataType>, // old moving mean, new mean,
// old moving variance, new
// meansquare
ck::Tuple<AccDataType, AccDataType>, // updated moving mean, updated moving variance
MovingAverage,
NumScaleBiasMeanVarDim,
4, // MPerthread
ck::Sequence<1, 1, 1, 1>, // scalarPerVector: old moving mean, new mean, old moving
// variance, new meansquare
ck::Sequence<1, 1>>; // scalarPerVector: updated moving mean, updated moving variance
using DeviceMovingAverageAndInvVarianceInstance =
ck::tensor_operation::device::DeviceElementwise<
ck::Tuple<AccDataType, AccDataType, AccDataType, AccDataType>, // old moving mean, new
// mean, old moving
// variance, new
// meansquare
ck::Tuple<AccDataType, AccDataType, AccDataType>, // updated moving mean, updated moving
// variancem, invVariance
MovingAverageAndInvVariance,
NumScaleBiasMeanVarDim,
4, // MPerthread
ck::Sequence<1, 1, 1, 1>, // scalarPerVector: old moving mean, new mean, old moving
// variance, new meansquare
ck::Sequence<1, 1, 1>>; // scalarPerVector: updated moving mean, updated moving variance
auto invariantDims = get_invariant_dims<Rank, NumBatchNormReduceDim>(reduceDims);
std::array<ck::index_t, Rank> aligned_scaleBiasMeanVarStrides{0};
int i = 0;
for(auto dim : invariantDims)
{
assert(xyLengths[dim] == bnScaleBiasMeanVarLengths[i]);
aligned_scaleBiasMeanVarStrides[dim] = bnScaleBiasMeanVarStrides[i];
i++;
};
int32_t reduceLength = 1;
for(auto dim : reduceDims)
reduceLength *= xyLengths[dim];
int32_t invariantLength = 1;
for(auto dim : invariantDims)
invariantLength *= xyLengths[dim];
size_t total_length = static_cast<size_t>(invariantLength) * reduceLength;
float avg_time = 0.0f;
std::size_t num_bytes = 0;
auto dev_mean_and_meansquare = DeviceMeanAndMeansquareInstance{};
void* p_mean = saveMeanAndInvVariance ? p_saveMean : p_tmp_mean;
const AccDataType alpha = ck::type_convert<AccDataType>(1.0f);
const AccDataType beta = ck::type_convert<AccDataType>(0.0f);
auto argument_ptr1 = dev_mean_and_meansquare.MakeArgumentPointer(
xyLengths,
xStrides,
bnScaleBiasMeanVarLengths,
{bnScaleBiasMeanVarStrides, bnScaleBiasMeanVarStrides},
reduceDims,
{&alpha, &alpha},
{&beta, &beta},
p_x,
{p_mean, p_tmp_meansquare},
ck::make_tuple(InElementwiseOperation_Mean{}, InElementwiseOperation_Meansquare{}),
ck::make_tuple(AccElementwiseOperation_Mean{reduceLength},
AccElementwiseOperation_Meansquare{reduceLength}));
auto dev_normalize = DeviceNormalizeInstance{};
auto argument_ptr2 =
dev_normalize.MakeArgumentPointer(xyLengths,
{xStrides,
aligned_scaleBiasMeanVarStrides,
aligned_scaleBiasMeanVarStrides,
aligned_scaleBiasMeanVarStrides,
aligned_scaleBiasMeanVarStrides},
{yStrides},
{p_x, p_mean, p_tmp_meansquare, p_scale, p_bias},
{p_y},
NormalizeInForward{epsilon});
if(!dev_mean_and_meansquare.IsSupportedArgument(argument_ptr1.get()) ||
!dev_normalize.IsSupportedArgument(argument_ptr2.get()))
{
std::cout << "The runtime parameters seems not supported by the Devic, exiting!"
<< std::endl;
return (-1);
};
auto invoker_ptr1 = dev_mean_and_meansquare.MakeInvokerPointer();
auto invoker_ptr2 = dev_normalize.MakeInvokerPointer();
avg_time += invoker_ptr1->Run(argument_ptr1.get(), StreamConfig{nullptr, time_kernel});
avg_time += invoker_ptr2->Run(argument_ptr2.get(), StreamConfig{nullptr, time_kernel});
num_bytes +=
(total_length * sizeof(InOutDataType) + invariantLength * 2 * sizeof(AccDataType)) + // No.1
(total_length * (1 * sizeof(InOutDataType) + 4 * sizeof(AccDataType)) +
total_length * sizeof(InOutDataType)); // No.2
if(saveMeanAndInvVariance && updateMovingAverage)
{
auto dev_moving_average_inv_variance = DeviceMovingAverageAndInvVarianceInstance{};
auto argument_ptr3 = dev_moving_average_inv_variance.MakeArgumentPointer(
bnScaleBiasMeanVarLengths,
{bnScaleBiasMeanVarStrides,
bnScaleBiasMeanVarStrides,
bnScaleBiasMeanVarStrides,
bnScaleBiasMeanVarStrides},
{bnScaleBiasMeanVarStrides, bnScaleBiasMeanVarStrides, bnScaleBiasMeanVarStrides},
{p_mean, p_runningMean, p_tmp_meansquare, p_runningVariance},
{p_runningMean, p_runningVariance, p_saveInvVariance},
MovingAverageAndInvVariance{epsilon, exponentialAverageFactor});
if(!dev_moving_average_inv_variance.IsSupportedArgument(argument_ptr3.get()))
{
std::cout << "Runtime parameters not supported by the Device, exiting!" << std::endl;
return (-1);
};
auto invoker_ptr3 = dev_moving_average_inv_variance.MakeInvokerPointer();
avg_time += invoker_ptr3->Run(argument_ptr3.get(), StreamConfig{nullptr, time_kernel});
num_bytes += invariantLength * (4 + 3) * sizeof(AccDataType) * 2; // No.5
}
else if(saveMeanAndInvVariance)
{
auto dev_inv_variance = DeviceInvVarianceInstance{};
auto argument_ptr3 = dev_inv_variance.MakeArgumentPointer(
bnScaleBiasMeanVarLengths,
{bnScaleBiasMeanVarStrides, bnScaleBiasMeanVarStrides},
{bnScaleBiasMeanVarStrides},
{p_mean, p_tmp_meansquare},
{p_saveInvVariance},
InvVariance{epsilon});
if(!dev_inv_variance.IsSupportedArgument(argument_ptr3.get()))
{
std::cout << "Runtime parameters not supported by the Device, exiting!" << std::endl;
return (-1);
};
auto invoker_ptr3 = dev_inv_variance.MakeInvokerPointer();
avg_time += invoker_ptr3->Run(argument_ptr3.get(), StreamConfig{nullptr, time_kernel});
num_bytes += invariantLength * (2 + 1) * sizeof(AccDataType);
}
else if(updateMovingAverage)
{
auto dev_moving_average = DeviceMovingAverageInstance{};
auto argument_ptr3 = dev_moving_average.MakeArgumentPointer(
bnScaleBiasMeanVarLengths,
{bnScaleBiasMeanVarStrides,
bnScaleBiasMeanVarStrides,
bnScaleBiasMeanVarStrides,
bnScaleBiasMeanVarStrides},
{bnScaleBiasMeanVarStrides, bnScaleBiasMeanVarStrides},
{p_mean, p_runningMean, p_tmp_meansquare, p_runningVariance},
{p_runningMean, p_runningVariance},
MovingAverage{exponentialAverageFactor});
if(!dev_moving_average.IsSupportedArgument(argument_ptr3.get()))
{
std::cout << "Runtime parameters not supported by the Device, exiting!" << std::endl;
return (-1);
};
auto invoker_ptr3 = dev_moving_average.MakeInvokerPointer();
avg_time += invoker_ptr3->Run(argument_ptr3.get(), StreamConfig{nullptr, time_kernel});
num_bytes += invariantLength * (4 + 2) * sizeof(AccDataType) * 2; // No.5
};
if(time_kernel)
{
float gb_per_sec = num_bytes / 1.E6 / avg_time;
std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
};
return (0);
};

View File

@@ -0,0 +1,466 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <limits>
#include <iostream>
#include <vector>
#include <array>
#include <algorithm>
#include <getopt.h>
#include "ck/ck.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward_nhwc_c.hpp"
#include "batchnorm_forward_impl.hpp"
template <typename InOutDataType, typename AccDataType>
using ReferenceBatchNormFwdInstance =
ck::tensor_operation::host::ReferenceBatchNormFwd_Input_N_H_W_C_Output_C<InOutDataType,
AccDataType>;
static struct option long_options[] = {{"inOutLengths", required_argument, nullptr, 'D'},
{"verify", required_argument, nullptr, 'v'},
{"help", no_argument, nullptr, '?'},
{nullptr, 0, nullptr, 0}};
class BatchNormFwdArg
{
private:
int option_index = 0;
public:
std::vector<size_t> inOutLengths;
bool do_verification = false;
bool updateMovingAverage;
bool saveMeanAndInvVariance;
int data_type = 0;
int init_method = 2;
bool time_kernel = false;
public:
void show_usage(const char* cmd)
{
std::cout << "Usage of " << cmd << std::endl;
std::cout << "--inOutLengths or -D, comma separated list of input tensor dimension "
"lengths, must have 4 integers for nhwc"
<< std::endl;
std::cout << "--verify or -v, 1/0 to indicate whether to verify the batch-normalization "
"result by "
"comparing with the host-based batch-normalization"
<< std::endl;
std::cout << "Arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64)" << std::endl;
std::cout << "Arg2: 1/0 to indicate whether to update the moving average and variance "
"(0=no, 1=yes)"
<< std::endl;
std::cout << "Arg3: 1/0 to indicate whether to save the calculated mean and invVariance "
"(0=no, 1=yes)"
<< std::endl;
std::cout << "Arg4: init method used for bnScale and bnBias (0=no init, 1=single integer "
"value, 2=scope integer "
"value, 3=decimal value)"
<< std::endl;
std::cout << "Arg5: time kernel (0=no, 1=yes)" << std::endl;
};
int processArgs(int argc, char* argv[])
{
using ck::host_common::getTypeValuesFromString;
int ch;
while(1)
{
ch = getopt_long(argc, argv, "D:v:", long_options, &option_index);
if(ch == -1)
break;
switch(ch)
{
case 'D':
if(!optarg)
throw std::runtime_error("Invalid option format!");
inOutLengths = getTypeValuesFromString<size_t>(optarg);
if(inOutLengths.size() != 4)
throw std::runtime_error(
"NHWC tensor layout should have 4 length values specified!");
break;
case 'v':
if(!optarg)
throw std::runtime_error("Invalid option format!");
do_verification = static_cast<bool>(std::atoi(optarg));
break;
case '?':
if(std::string(long_options[option_index].name) == "help")
{
show_usage(argv[0]);
return (-1);
};
break;
default: show_usage(argv[0]); return (-1);
};
};
if(optind + 5 > argc)
throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
data_type = std::atoi(argv[optind++]);
updateMovingAverage = std::atoi(argv[optind++]);
saveMeanAndInvVariance = std::atoi(argv[optind++]);
init_method = std::atoi(argv[optind++]);
time_kernel = static_cast<bool>(std::atoi(argv[optind]));
if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5 && data_type != 6)
return (-1);
return (0);
};
};
using namespace ck;
template <typename InOutDataType, typename AccDataType>
bool bnorm_fwd_nhwc_test(bool do_verification,
int init_method,
bool time_kernel,
const std::vector<size_t> inOutLengths,
bool updateMovingAverage,
bool saveMeanAndInvVariance,
double averageFactor,
double epsilon)
{
// for NHWC BatchNorm calculation of mean and meansquare
constexpr int Rank = 4;
constexpr int NumReduceDim = 3;
const std::vector<size_t> scaleBiasMeanVarLengths = {inOutLengths[3]};
// input data of the batchnorm forward algorithm
Tensor<InOutDataType> x(inOutLengths);
Tensor<AccDataType> bnScale(scaleBiasMeanVarLengths);
Tensor<AccDataType> bnBias(scaleBiasMeanVarLengths);
// output data of the batchnorm forward algorithm
Tensor<InOutDataType> y_ref(inOutLengths);
Tensor<InOutDataType> y(inOutLengths);
Tensor<AccDataType> resultSaveMean_ref(scaleBiasMeanVarLengths);
Tensor<AccDataType> resultSaveInvVariance_ref(scaleBiasMeanVarLengths);
Tensor<AccDataType> resultRunningMean_ref(scaleBiasMeanVarLengths);
Tensor<AccDataType> resultRunningVariance_ref(scaleBiasMeanVarLengths);
auto inOutStrides = x.mDesc.GetStrides();
auto scaleBiasMeanVarStrides = bnScale.mDesc.GetStrides();
std::size_t num_thread = std::thread::hardware_concurrency();
if(updateMovingAverage)
{
if constexpr(std::is_same<InOutDataType, int8_t>::value)
{
x.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
const float x_mean = 0.0f;
const float x_stddev = 2.5f;
const float noise_stddev = 0.04f;
resultRunningMean_ref.GenerateTensorValue(
GeneratorTensor_4<AccDataType>{x_mean, noise_stddev}, num_thread);
resultRunningVariance_ref.GenerateTensorValue(
GeneratorTensor_4<AccDataType>{x_stddev * x_stddev, noise_stddev}, num_thread);
}
else
{
const float x_mean = 0.0f;
const float x_stddev = 1.0f;
const float noise_stddev = 0.04f;
// input data in normal distribution
x.GenerateTensorValue(GeneratorTensor_4<InOutDataType>{x_mean, x_stddev}, num_thread);
// initialize the runningMean to be values with tiny variation to the mean of the x
// values
resultRunningMean_ref.GenerateTensorValue(
GeneratorTensor_4<AccDataType>{x_mean, noise_stddev}, num_thread);
// initialize the runningVariance to be values with tiny variation to the variance of
// the x values
resultRunningVariance_ref.GenerateTensorValue(
GeneratorTensor_4<AccDataType>{x_stddev * x_stddev, noise_stddev}, num_thread);
};
}
else
{
if constexpr(std::is_same<InOutDataType, int8_t>::value)
x.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
else
x.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0f, 5.0f}, num_thread);
};
if(do_verification)
{
switch(init_method)
{
case 0:
bnScale.GenerateTensorValue(GeneratorTensor_0<AccDataType>{}, num_thread);
bnBias.GenerateTensorValue(GeneratorTensor_0<AccDataType>{}, num_thread);
break;
case 1:
bnScale.GenerateTensorValue(GeneratorTensor_1<AccDataType>{1}, num_thread);
bnBias.GenerateTensorValue(GeneratorTensor_1<AccDataType>{0}, num_thread);
break;
case 2:
bnScale.GenerateTensorValue(GeneratorTensor_2<AccDataType>{-5, 5}, num_thread);
bnBias.GenerateTensorValue(GeneratorTensor_2<AccDataType>{-5, 5}, num_thread);
break;
default:
bnScale.GenerateTensorValue(GeneratorTensor_3<AccDataType>{-5.0f, 5.0f}, num_thread);
bnBias.GenerateTensorValue(GeneratorTensor_3<AccDataType>{-5.0f, 5.0f}, num_thread);
}
};
// these buffers are usually provided by the user application
DeviceMem x_dev(sizeof(InOutDataType) * x.mDesc.GetElementSpaceSize());
DeviceMem y_dev(sizeof(InOutDataType) * y.mDesc.GetElementSpaceSize());
DeviceMem bnScale_dev(sizeof(AccDataType) * bnScale.mDesc.GetElementSpaceSize());
DeviceMem bnBias_dev(sizeof(AccDataType) * bnBias.mDesc.GetElementSpaceSize());
// mean_dev or resultSaveMean_dev
DeviceMem resultSaveMean_dev(sizeof(AccDataType) *
resultSaveMean_ref.mDesc.GetElementSpaceSize());
// meansquare_dev or resultSaveInvVariance_dev
DeviceMem resultSaveInvVariance_dev(sizeof(AccDataType) *
resultSaveInvVariance_ref.mDesc.GetElementSpaceSize());
// resultRunningMean_dev
DeviceMem resultRunningMean_dev(sizeof(AccDataType) *
resultRunningMean_ref.mDesc.GetElementSpaceSize());
// resultRunningVariance_dev
DeviceMem resultRunningVariance_dev(sizeof(AccDataType) *
resultRunningVariance_ref.mDesc.GetElementSpaceSize());
x_dev.ToDevice(x.mData.data());
bnScale_dev.ToDevice(bnScale.mData.data());
bnBias_dev.ToDevice(bnBias.mData.data());
if(updateMovingAverage)
{
resultRunningMean_dev.ToDevice(resultRunningMean_ref.mData.data());
resultRunningVariance_dev.ToDevice(resultRunningVariance_ref.mData.data());
};
std::array<index_t, Rank> i_inOutLengths;
std::array<index_t, Rank> i_inOutStrides;
std::array<index_t, Rank - NumReduceDim> i_scaleBiasMeanVarLengths;
std::array<index_t, Rank - NumReduceDim> i_scaleBiasMeanVarStrides;
std::copy(inOutLengths.begin(), inOutLengths.end(), i_inOutLengths.begin());
std::copy(inOutStrides.begin(), inOutStrides.end(), i_inOutStrides.begin());
std::copy(scaleBiasMeanVarLengths.begin(),
scaleBiasMeanVarLengths.end(),
i_scaleBiasMeanVarLengths.begin());
std::copy(scaleBiasMeanVarStrides.begin(),
scaleBiasMeanVarStrides.end(),
i_scaleBiasMeanVarStrides.begin());
int result = 0;
// used for saving meansquare
DeviceMem workspace(sizeof(AccDataType) * 2 * resultSaveMean_ref.mDesc.GetElementSpaceSize() +
128);
void* p_tmp_mean = workspace.GetDeviceBuffer();
void* p_tmp_meansquare =
static_cast<char*>(p_tmp_mean) +
(sizeof(AccDataType) * resultSaveMean_ref.mDesc.GetElementSpaceSize() + 63) / 64 * 64;
result = bnorm_fwd<InOutDataType, AccDataType, Rank, NumReduceDim, false>(
time_kernel,
updateMovingAverage,
saveMeanAndInvVariance,
{0, 1, 2},
i_inOutLengths,
i_inOutStrides,
i_inOutStrides,
i_scaleBiasMeanVarLengths,
i_scaleBiasMeanVarStrides,
x_dev.GetDeviceBuffer(),
bnScale_dev.GetDeviceBuffer(),
bnBias_dev.GetDeviceBuffer(),
y_dev.GetDeviceBuffer(),
averageFactor,
updateMovingAverage ? resultRunningMean_dev.GetDeviceBuffer() : nullptr,
updateMovingAverage ? resultRunningVariance_dev.GetDeviceBuffer() : nullptr,
epsilon,
saveMeanAndInvVariance ? resultSaveMean_dev.GetDeviceBuffer() : nullptr,
saveMeanAndInvVariance ? resultSaveInvVariance_dev.GetDeviceBuffer() : nullptr,
p_tmp_mean,
p_tmp_meansquare);
if(result < 0)
return (false);
bool pass = true;
if(do_verification)
{
auto batchNormFwd_ref = ReferenceBatchNormFwdInstance<InOutDataType, AccDataType>{};
auto argument_ptr_ref = batchNormFwd_ref.MakeArgumentPointer(
i_inOutLengths,
i_inOutStrides,
i_inOutStrides,
i_scaleBiasMeanVarLengths,
i_scaleBiasMeanVarStrides,
x.mData.data(),
bnScale.mData.data(),
bnBias.mData.data(),
y_ref.mData.data(),
0.1, // exponentialAverageFactor
updateMovingAverage ? resultRunningMean_ref.mData.data() : nullptr, // resultRunningMean
updateMovingAverage ? resultRunningVariance_ref.mData.data()
: nullptr, // resultRunningVariance
epsilon,
saveMeanAndInvVariance ? resultSaveMean_ref.mData.data() : nullptr,
saveMeanAndInvVariance ? resultSaveInvVariance_ref.mData.data() : nullptr);
if(!batchNormFwd_ref.IsSupportedArgument(argument_ptr_ref.get()))
{
std::cout
<< "The runtime parameters seems not supported by the BatchNorm instance, exiting!"
<< std::endl;
return (-2);
};
auto invoker_ptr_ref = batchNormFwd_ref.MakeInvokerPointer();
(void)invoker_ptr_ref->Run(argument_ptr_ref.get());
y_dev.FromDevice(y.mData.data());
pass = pass && ck::utils::check_err(y.mData, y_ref.mData);
if(updateMovingAverage)
{
Tensor<AccDataType> resultRunningMean(scaleBiasMeanVarLengths);
Tensor<AccDataType> resultRunningVariance(scaleBiasMeanVarLengths);
resultRunningMean_dev.FromDevice(resultRunningMean.mData.data());
resultRunningVariance_dev.FromDevice(resultRunningVariance.mData.data());
pass =
pass && ck::utils::check_err(resultRunningMean.mData, resultRunningMean_ref.mData);
pass = pass && ck::utils::check_err(resultRunningVariance.mData,
resultRunningVariance_ref.mData);
};
if(saveMeanAndInvVariance)
{
Tensor<AccDataType> resultSaveMean(scaleBiasMeanVarLengths);
Tensor<AccDataType> resultSaveInvVariance(scaleBiasMeanVarLengths);
resultSaveMean_dev.FromDevice(resultSaveMean.mData.data());
resultSaveInvVariance_dev.FromDevice(resultSaveInvVariance.mData.data());
pass = pass && ck::utils::check_err(resultSaveMean.mData, resultSaveMean_ref.mData);
pass = pass && ck::utils::check_err(resultSaveInvVariance.mData,
resultSaveInvVariance_ref.mData);
};
};
return (pass);
};
const double epsilon = std::numeric_limits<float>::epsilon();
static const double averageFactor = 0.1;
int main(int argc, char* argv[])
{
bool pass = true;
if(argc > 1)
{
BatchNormFwdArg arg;
if(arg.processArgs(argc, argv) < 0)
return (-1);
if(arg.data_type == 0)
{
pass = bnorm_fwd_nhwc_test<ck::half_t, float>(arg.do_verification,
arg.init_method,
arg.time_kernel,
arg.inOutLengths,
arg.updateMovingAverage,
arg.saveMeanAndInvVariance,
averageFactor,
epsilon);
}
else if(arg.data_type == 1)
{
pass = bnorm_fwd_nhwc_test<float, float>(arg.do_verification,
arg.init_method,
arg.time_kernel,
arg.inOutLengths,
arg.updateMovingAverage,
arg.saveMeanAndInvVariance,
averageFactor,
epsilon);
}
else if(arg.data_type == 3)
{
pass = bnorm_fwd_nhwc_test<int8_t, float>(arg.do_verification,
arg.init_method,
arg.time_kernel,
arg.inOutLengths,
arg.updateMovingAverage,
arg.saveMeanAndInvVariance,
averageFactor,
epsilon);
}
else if(arg.data_type == 5)
{
pass = bnorm_fwd_nhwc_test<ck::bhalf_t, float>(arg.do_verification,
arg.init_method,
arg.time_kernel,
arg.inOutLengths,
arg.updateMovingAverage,
arg.saveMeanAndInvVariance,
averageFactor,
epsilon);
}
else if(arg.data_type == 6)
{
pass = bnorm_fwd_nhwc_test<double, double>(arg.do_verification,
arg.init_method,
arg.time_kernel,
arg.inOutLengths,
arg.updateMovingAverage,
arg.saveMeanAndInvVariance,
averageFactor,
epsilon);
}
}
else
{
pass = bnorm_fwd_nhwc_test<ck::half_t, float>(true,
2,
false, // don't time kernel
{128, 16, 16, 1024},
true,
false,
averageFactor,
epsilon);
};
return (pass ? 0 : 1);
}

View File

@@ -0,0 +1,119 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cassert>
#include <vector>
#include "ck/ck.hpp"
#include "ck/utility/sequence.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
#include "batchnorm_common.hpp"
template <typename InOutDataType,
typename AccDataType,
ck::index_t Rank,
ck::index_t NumBatchNormReduceDim,
bool fastest_dim_is_reduced = false>
int bnorm_infer(
bool time_kernel,
const std::array<int, NumBatchNormReduceDim> reduceDims,
const std::array<ck::index_t, Rank> xyLengths,
const std::array<ck::index_t, Rank> xStrides,
const std::array<ck::index_t, Rank> yStrides,
const std::array<ck::index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarLengths,
const std::array<ck::index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarStrides,
const void* p_x,
const void* p_scale,
const void* p_bias,
double epsilon,
const void* p_estimatedMean,
const void* p_estimatedVariance,
void* p_y)
{
(void)bnScaleBiasMeanVarLengths;
static_assert(NumBatchNormReduceDim < Rank,
"Invalid number of reduced dimensions for batchnorm!");
using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwise<
ck::Tuple<InOutDataType, AccDataType, AccDataType, AccDataType, AccDataType>, // x, mean,
// variance,
// scale,
// bias,
ck::Tuple<InOutDataType>, // y
NormalizeInInfer,
Rank,
2, // MPerthread
ck::Sequence<1, 1, 1, 1, 1>, // x, mean, variance, scale, bias
ck::Sequence<1>>; // scalarPerVector: y
auto invariantDims = get_invariant_dims<Rank, NumBatchNormReduceDim>(reduceDims);
std::array<ck::index_t, Rank> aligned_scaleBiasMeanVarStrides{0};
int i = 0;
for(auto dim : invariantDims)
{
assert(xyLengths[dim] == bnScaleBiasMeanVarLengths[i]);
aligned_scaleBiasMeanVarStrides[dim] = bnScaleBiasMeanVarStrides[i];
i++;
};
int32_t reduceLength = 1;
for(auto dim : reduceDims)
reduceLength *= xyLengths[dim];
int32_t invariantLength = 1;
for(auto dim : invariantDims)
invariantLength *= xyLengths[dim];
size_t total_length = static_cast<size_t>(invariantLength) * reduceLength;
float avg_time = 0.0f;
std::size_t num_bytes = 0;
auto dev_normalize = DeviceNormalizeInstance{};
auto argument_ptr1 = dev_normalize.MakeArgumentPointer(
xyLengths,
{xStrides,
aligned_scaleBiasMeanVarStrides,
aligned_scaleBiasMeanVarStrides,
aligned_scaleBiasMeanVarStrides,
aligned_scaleBiasMeanVarStrides},
{yStrides},
{p_x, p_estimatedMean, p_estimatedVariance, p_scale, p_bias},
{p_y},
NormalizeInInfer{epsilon});
if(!dev_normalize.IsSupportedArgument(argument_ptr1.get()))
{
std::cout << "The runtime parameters seems not supported by the Devic, exiting!"
<< std::endl;
return (-1);
};
auto invoker_ptr1 = dev_normalize.MakeInvokerPointer();
avg_time += invoker_ptr1->Run(argument_ptr1.get(), StreamConfig{nullptr, time_kernel});
num_bytes += (total_length * (1 * sizeof(InOutDataType) + 4 * sizeof(AccDataType)) +
total_length * sizeof(InOutDataType));
if(time_kernel)
{
float gb_per_sec = num_bytes / 1.E6 / avg_time;
std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
};
return (0);
};

View File

@@ -0,0 +1,346 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <limits>
#include <iostream>
#include <vector>
#include <array>
#include <algorithm>
#include <getopt.h>
#include "ck/ck.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer_nhwc_c.hpp"
#include "batchnorm_infer_impl.hpp"
template <typename InOutDataType, typename AccDataType>
using ReferenceBatchNormInferInstance =
ck::tensor_operation::host::ReferenceBatchNormInfer_Input_N_H_W_C_Output_C<InOutDataType,
AccDataType>;
static struct option long_options[] = {{"inOutLengths", required_argument, nullptr, 'D'},
{"verify", required_argument, nullptr, 'v'},
{"help", no_argument, nullptr, '?'},
{nullptr, 0, nullptr, 0}};
class BatchNormInferArg
{
private:
int option_index = 0;
public:
std::vector<size_t> inOutLengths;
bool do_verification = false;
int data_type = 0;
int init_method = 2;
bool time_kernel = false;
public:
void show_usage(const char* cmd)
{
std::cout << "Usage of " << cmd << std::endl;
std::cout << "--inOutLengths or -D, comma separated list of input tensor dimension "
"lengths, must have 4 integers for nhwc"
<< std::endl;
std::cout << "--verify or -v, 1/0 to indicate whether to verify the batch-normalization "
"result by "
"comparing with the host-based batch-normalization"
<< std::endl;
std::cout << "Arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64)" << std::endl;
std::cout << "Arg2: init method used for bnScale and bnBias (0=no init, 1=single integer "
"value, 2=scope integer "
"value, 3=decimal value)"
<< std::endl;
std::cout << "Arg3: time kernel (0=no, 1=yes)" << std::endl;
};
int processArgs(int argc, char* argv[])
{
using ck::host_common::getTypeValuesFromString;
int ch;
while(1)
{
ch = getopt_long(argc, argv, "D:v:", long_options, &option_index);
if(ch == -1)
break;
switch(ch)
{
case 'D':
if(!optarg)
throw std::runtime_error("Invalid option format!");
inOutLengths = getTypeValuesFromString<size_t>(optarg);
if(inOutLengths.size() != 4)
throw std::runtime_error(
"NHWC tensor layout should have 4 length values specified!");
break;
case 'v':
if(!optarg)
throw std::runtime_error("Invalid option format!");
do_verification = static_cast<bool>(std::atoi(optarg));
break;
case '?':
if(std::string(long_options[option_index].name) == "help")
{
show_usage(argv[0]);
return (-1);
};
break;
default: show_usage(argv[0]); return (-1);
};
};
if(optind + 3 > argc)
throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
data_type = std::atoi(argv[optind++]);
init_method = std::atoi(argv[optind++]);
time_kernel = static_cast<bool>(std::atoi(argv[optind]));
if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5 && data_type != 6)
return (-1);
return (0);
};
};
using namespace ck;
template <typename InOutDataType, typename AccDataType>
bool bnorm_infer_nhwc_test(bool do_verification,
int init_method,
bool time_kernel,
const std::vector<size_t> inOutLengths,
double epsilon)
{
// for NHWC BatchNorm calculation of mean and meansquare
constexpr int Rank = 4;
constexpr int NumReduceDim = 3;
const std::vector<size_t> scaleBiasMeanVarLengths = {inOutLengths[3]};
// input data of the batchnorm forward algorithm
Tensor<InOutDataType> x(inOutLengths);
Tensor<AccDataType> bnScale(scaleBiasMeanVarLengths);
Tensor<AccDataType> bnBias(scaleBiasMeanVarLengths);
// output data of the batchnorm forward algorithm
Tensor<InOutDataType> y_ref(inOutLengths);
Tensor<InOutDataType> y(inOutLengths);
Tensor<AccDataType> estimatedMean(scaleBiasMeanVarLengths);
Tensor<AccDataType> estimatedVariance(scaleBiasMeanVarLengths);
auto inOutStrides = x.mDesc.GetStrides();
auto scaleBiasMeanVarStrides = bnScale.mDesc.GetStrides();
std::size_t num_thread = std::thread::hardware_concurrency();
if constexpr(std::is_same<InOutDataType, int8_t>::value)
{
x.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
const float x_mean = 0.0f;
const float x_stddev = 2.5f;
const float noise_stddev = 0.0001f;
estimatedMean.GenerateTensorValue(GeneratorTensor_4<AccDataType>{x_mean, noise_stddev},
num_thread);
estimatedVariance.GenerateTensorValue(
GeneratorTensor_4<AccDataType>{x_stddev * x_stddev, noise_stddev}, num_thread);
}
else
{
const float x_mean = 0.0f;
const float x_stddev = 1.0f;
const float noise_stddev = 0.0001f;
x.GenerateTensorValue(GeneratorTensor_4<InOutDataType>{x_mean, x_stddev}, num_thread);
// initialize the savedMean to be values with tiny variation to the mean of the x values
estimatedMean.GenerateTensorValue(GeneratorTensor_4<AccDataType>{x_mean, noise_stddev},
num_thread);
// initialize the variance to be values with tiny variation to the variance of the x values
estimatedVariance.GenerateTensorValue(
GeneratorTensor_4<AccDataType>{x_stddev * x_stddev, noise_stddev}, num_thread);
};
if(do_verification)
{
switch(init_method)
{
case 0:
bnScale.GenerateTensorValue(GeneratorTensor_0<AccDataType>{}, num_thread);
bnBias.GenerateTensorValue(GeneratorTensor_0<AccDataType>{}, num_thread);
break;
case 1:
bnScale.GenerateTensorValue(GeneratorTensor_1<AccDataType>{1}, num_thread);
bnBias.GenerateTensorValue(GeneratorTensor_1<AccDataType>{0}, num_thread);
break;
case 2:
bnScale.GenerateTensorValue(GeneratorTensor_2<AccDataType>{-5, 5}, num_thread);
bnBias.GenerateTensorValue(GeneratorTensor_2<AccDataType>{-5, 5}, num_thread);
break;
default:
bnScale.GenerateTensorValue(GeneratorTensor_3<AccDataType>{-5.0f, 5.0f}, num_thread);
bnBias.GenerateTensorValue(GeneratorTensor_3<AccDataType>{-5.0f, 5.0f}, num_thread);
}
};
// these buffers are usually provided by the user application
DeviceMem x_dev(sizeof(InOutDataType) * x.mDesc.GetElementSpaceSize());
DeviceMem y_dev(sizeof(InOutDataType) * y.mDesc.GetElementSpaceSize());
DeviceMem bnScale_dev(sizeof(AccDataType) * bnScale.mDesc.GetElementSpaceSize());
DeviceMem bnBias_dev(sizeof(AccDataType) * bnBias.mDesc.GetElementSpaceSize());
// mean_dev or resultSaveMean_dev
DeviceMem estimatedMean_dev(sizeof(AccDataType) * estimatedMean.mDesc.GetElementSpaceSize());
// meansquare_dev or resultSaveInvVariance_dev
DeviceMem estimatedVariance_dev(sizeof(AccDataType) *
estimatedVariance.mDesc.GetElementSpaceSize());
x_dev.ToDevice(x.mData.data());
bnScale_dev.ToDevice(bnScale.mData.data());
bnBias_dev.ToDevice(bnBias.mData.data());
estimatedMean_dev.ToDevice(estimatedMean.mData.data());
estimatedVariance_dev.ToDevice(estimatedVariance.mData.data());
using ck::index_t;
std::array<index_t, Rank> i_inOutLengths;
std::array<index_t, Rank> i_inOutStrides;
std::array<index_t, Rank - NumReduceDim> i_scaleBiasMeanVarLengths;
std::array<index_t, Rank - NumReduceDim> i_scaleBiasMeanVarStrides;
std::copy(inOutLengths.begin(), inOutLengths.end(), i_inOutLengths.begin());
std::copy(inOutStrides.begin(), inOutStrides.end(), i_inOutStrides.begin());
std::copy(scaleBiasMeanVarLengths.begin(),
scaleBiasMeanVarLengths.end(),
i_scaleBiasMeanVarLengths.begin());
std::copy(scaleBiasMeanVarStrides.begin(),
scaleBiasMeanVarStrides.end(),
i_scaleBiasMeanVarStrides.begin());
int result = 0;
result = bnorm_infer<InOutDataType, AccDataType, Rank, NumReduceDim, false>(
time_kernel,
{0, 1, 2},
i_inOutLengths,
i_inOutStrides,
i_inOutStrides,
i_scaleBiasMeanVarLengths,
i_scaleBiasMeanVarStrides,
x_dev.GetDeviceBuffer(),
bnScale_dev.GetDeviceBuffer(),
bnBias_dev.GetDeviceBuffer(),
epsilon,
estimatedMean_dev.GetDeviceBuffer(),
estimatedVariance_dev.GetDeviceBuffer(),
y_dev.GetDeviceBuffer());
if(result < 0)
return (false);
bool pass = true;
if(do_verification)
{
auto batchNormInfer_ref = ReferenceBatchNormInferInstance<InOutDataType, AccDataType>{};
auto argument_ptr_ref =
batchNormInfer_ref.MakeArgumentPointer(i_inOutLengths,
i_inOutStrides,
i_inOutStrides,
i_scaleBiasMeanVarLengths,
i_scaleBiasMeanVarStrides,
x.mData.data(),
bnScale.mData.data(),
bnBias.mData.data(),
epsilon,
estimatedMean.mData.data(),
estimatedVariance.mData.data(),
y_ref.mData.data());
if(!batchNormInfer_ref.IsSupportedArgument(argument_ptr_ref.get()))
{
std::cout
<< "The runtime parameters seems not supported by the BatchNorm instance, exiting!"
<< std::endl;
return (-2);
};
auto invoker_ptr_ref = batchNormInfer_ref.MakeInvokerPointer();
(void)invoker_ptr_ref->Run(argument_ptr_ref.get());
y_dev.FromDevice(y.mData.data());
pass = pass && ck::utils::check_err(y.mData, y_ref.mData);
};
return (pass);
};
static const double epsilon = std::numeric_limits<float>::epsilon();
int main(int argc, char* argv[])
{
bool pass = true;
if(argc > 1)
{
BatchNormInferArg arg;
if(arg.processArgs(argc, argv) < 0)
return (-1);
if(arg.data_type == 0)
{
pass = bnorm_infer_nhwc_test<ck::half_t, float>(
arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths, epsilon);
}
else if(arg.data_type == 1)
{
pass = bnorm_infer_nhwc_test<float, float>(
arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths, epsilon);
}
else if(arg.data_type == 3)
{
pass = bnorm_infer_nhwc_test<int8_t, float>(
arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths, epsilon);
}
else if(arg.data_type == 5)
{
pass = bnorm_infer_nhwc_test<ck::bhalf_t, float>(
arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths, epsilon);
}
else if(arg.data_type == 6)
{
pass = bnorm_infer_nhwc_test<double, double>(
arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths, epsilon);
};
}
else
{
pass = bnorm_infer_nhwc_test<ck::half_t, float>(true,
2,
false, // don't time kernel
{128, 16, 16, 1024},
epsilon);
};
return (pass ? 0 : 1);
}