mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-17 19:40:04 +00:00
Reorganize project folders (#6)
This commit is contained in:
4
example/34_batchnorm/CMakeLists.txt
Normal file
4
example/34_batchnorm/CMakeLists.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
# Batchnorm NHWC examples: forward training (current and obsolete variants), forward inferring and backward.
add_example_executable(example_batchnorm_forward_training batchnorm_forward_training_nhwc.cpp)
|
||||
add_example_executable(example_batchnorm_forward_training_obsolete batchnorm_forward_training_nhwc_obsolete.cpp)
|
||||
add_example_executable(example_batchnorm_forward_inferring batchnorm_forward_inferring_nhwc.cpp)
|
||||
add_example_executable(example_batchnorm_backward batchnorm_backward_nhwc.cpp)
|
||||
81
example/34_batchnorm/README.md
Normal file
81
example/34_batchnorm/README.md
Normal file
@@ -0,0 +1,81 @@
|
||||
# Instructions for ```batchnorm nhwc``` Example
|
||||
|
||||
## Run ```batchnorm forward nhwc```
|
||||
```bash
|
||||
# -D <xxx> : input 4-d tensor lengths
|
||||
# -v <x> : verification (0=no, 1=yes)
|
||||
#arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64)
|
||||
#arg2: 1/0 to indicate whether to update the moving average and variance (0=no, 1=yes)
|
||||
#arg3: 1/0 to indicate whether to save result mean/invVariance (0=no, 1=yes)
|
||||
#arg4: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
|
||||
#arg5: time kernel (0=no, 1=yes)
|
||||
./bin/example_batchnorm_forward_training -D 128,16,16,1024 -v 1 0 0 1 2 1
|
||||
```
|
||||
|
||||
Result
|
||||
```
|
||||
./bin/example_batchnorm_forward_training -D 128,16,16,1024 -v 1 0 0 1 2 1
|
||||
launch_and_time_kernel: grid_dim {64, 1, 1}, block_dim {256, 1, 1}
|
||||
Warm up 1 time
|
||||
Start running 10 times...
|
||||
launch_and_time_kernel: grid_dim {120, 1, 1}, block_dim {256, 1, 1}
|
||||
Warm up 1 time
|
||||
Start running 10 times...
|
||||
launch_and_time_kernel: grid_dim {120, 1, 1}, block_dim {256, 1, 1}
|
||||
Warm up 1 time
|
||||
Start running 10 times...
|
||||
Perf: 2.08231 ms, 354.519 GB/s
|
||||
```
|
||||
|
||||
Result
|
||||
```
|
||||
./bin/example_batchnorm_forward_training -D 128,16,16,1024 -v 1 0 1 0 2 0
|
||||
echo $?
|
||||
0
|
||||
```
|
||||
|
||||
## Run ```batchnorm infer nhwc```
|
||||
```bash
|
||||
# -D <xxx> : input 4-d tensor lengths
|
||||
# -v <x> : verification (0=no, 1=yes)
|
||||
#arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64)
|
||||
#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
|
||||
#arg3: time kernel (0=no, 1=yes)
|
||||
./bin/example_batchnorm_forward_inferring -D 128,16,16,1024 -v 1 0 2 1
|
||||
```
|
||||
|
||||
Result
|
||||
```
|
||||
./bin/example_batchnorm_forward_inferring -D 128,16,16,1024 -v 1 0 2 1
|
||||
launch_and_time_kernel: grid_dim {120, 1, 1}, block_dim {256, 1, 1}
|
||||
Warm up 1 time
|
||||
Start running 10 times...
|
||||
Perf: 1.28235 ms, 523.329 GB/s
|
||||
```
|
||||
|
||||
## Run ```batchnorm backward nhwc```
|
||||
```bash
|
||||
# -D <xxx> : input 4-d tensor lengths
|
||||
# -v <x> : verification (0=no, 1=yes)
|
||||
#arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64)
|
||||
#arg2: 1/0 to indicate whether to use saved mean and invVariance (0=no, 1=yes)
|
||||
#arg3: init method used for dy and bnScale (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
|
||||
#arg4: time kernel (0=no, 1=yes)
|
||||
#arg5: use multi-block welford (0=no, 1=yes)
|
||||
./bin/example_batchnorm_backward -D 128,16,3,1024 -v 1 0 0 3 1 1
|
||||
```
|
||||
|
||||
Result
|
||||
```
|
||||
./bin/example_batchnorm_backward -D 128,16,3,1024 -v 1 0 0 3 1 1
|
||||
launch_and_time_kernel: grid_dim {6144, 1, 1}, block_dim {256, 1, 1}
|
||||
Warm up 1 time
|
||||
Start running 10 times...
|
||||
launch_and_time_kernel: grid_dim {6144, 1, 1}, block_dim {256, 1, 1}
|
||||
Warm up 1 time
|
||||
Start running 10 times...
|
||||
launch_and_time_kernel: grid_dim {6144, 1, 1}, block_dim {256, 1, 1}
|
||||
Warm up 1 time
|
||||
Start running 10 times...
|
||||
Perf: 0.411026 ms, 91.8702 GB/s
|
||||
```
|
||||
506
example/34_batchnorm/batchnorm_backward_nhwc.cpp
Normal file
506
example/34_batchnorm/batchnorm_backward_nhwc.cpp
Normal file
@@ -0,0 +1,506 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <limits>
|
||||
#include <iostream>
|
||||
#include <getopt.h>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/host_common_util.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp"
|
||||
|
||||
// Long-option table handed to getopt_long(); the all-zero entry terminates the table.
static option long_options[] = {
    {"inOutLengths", required_argument, nullptr, 'D'},
    {"verify", required_argument, nullptr, 'v'},
    {"help", no_argument, nullptr, '?'},
    {nullptr, 0, nullptr, 0},
};
|
||||
|
||||
class BatchNormBwdArg
|
||||
{
|
||||
private:
|
||||
int option_index = 0;
|
||||
|
||||
public:
|
||||
std::vector<size_t> inOutLengths;
|
||||
|
||||
bool do_verification = false;
|
||||
|
||||
bool haveSavedMeanInvVar;
|
||||
|
||||
int data_type = 0;
|
||||
int init_method = 3;
|
||||
bool time_kernel = false;
|
||||
bool use_multiblock_welford = false;
|
||||
|
||||
public:
|
||||
void show_usage(const char* cmd)
|
||||
{
|
||||
// clang-format off
|
||||
std::cout << "Usage of " << cmd << std::endl;
|
||||
std::cout << "--inOutLengths or -D, comma separated list of input tensor dimension lengths, must have 4 integers for nhwc" << std::endl;
|
||||
std::cout << "--verify or -v, 1/0 to indicate whether to verify the result by comparing with the host-based batch-normalization" << std::endl;
|
||||
std::cout << "Arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64)" << std::endl;
|
||||
std::cout << "Arg2 -- 1/0 to indicate whether to use saved mean and invVariance" << std::endl;
|
||||
std::cout << "Arg3 -- init method used for dy and bnScale (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)" << std::endl;
|
||||
std::cout << "Arg4 -- time kernel (0=no, 1=yes)" << std::endl;
|
||||
std::cout << "Arg5: use multi-block welford (0=n0, 1=yes)" << std::endl;
|
||||
// clang-format on
|
||||
};
|
||||
|
||||
int processArgs(int argc, char* argv[])
|
||||
{
|
||||
using ck::host_common::getTypeValuesFromString;
|
||||
|
||||
int ch;
|
||||
|
||||
while(1)
|
||||
{
|
||||
ch = getopt_long(argc, argv, "D:v:", long_options, &option_index);
|
||||
if(ch == -1)
|
||||
break;
|
||||
switch(ch)
|
||||
{
|
||||
case 'D':
|
||||
if(!optarg)
|
||||
throw std::runtime_error("Invalid option format!");
|
||||
|
||||
inOutLengths = getTypeValuesFromString<size_t>(optarg);
|
||||
|
||||
if(inOutLengths.size() != 4)
|
||||
throw std::runtime_error(
|
||||
"NHWC tensor layout should have 4 length values specified!");
|
||||
break;
|
||||
case 'v':
|
||||
if(!optarg)
|
||||
throw std::runtime_error("Invalid option format!");
|
||||
|
||||
do_verification = static_cast<bool>(std::atoi(optarg));
|
||||
break;
|
||||
case '?':
|
||||
if(std::string(long_options[option_index].name) == "help")
|
||||
{
|
||||
show_usage(argv[0]);
|
||||
return (-1);
|
||||
};
|
||||
break;
|
||||
default: show_usage(argv[0]); return (-1);
|
||||
};
|
||||
};
|
||||
|
||||
if(optind + 5 > argc)
|
||||
throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
|
||||
|
||||
data_type = std::atoi(argv[optind++]);
|
||||
haveSavedMeanInvVar = std::atoi(argv[optind++]);
|
||||
init_method = std::atoi(argv[optind++]);
|
||||
time_kernel = static_cast<bool>(std::atoi(argv[optind++]));
|
||||
use_multiblock_welford = static_cast<bool>(std::atoi(argv[optind]));
|
||||
|
||||
return (0);
|
||||
};
|
||||
};
|
||||
|
||||
using namespace ck;
|
||||
|
||||
template <typename XDataType, typename AccDataType, bool UseMultiblockInK>
|
||||
bool bnorm_bwd_nhwc_test(bool do_verification,
|
||||
int init_method,
|
||||
bool time_kernel,
|
||||
const std::vector<size_t> inOutLengths,
|
||||
bool haveSavedMeanInvVar,
|
||||
double epsilon)
|
||||
{
|
||||
// for NHWC BatchNorm calculation of mean and meansquare
|
||||
constexpr index_t Rank = 4;
|
||||
constexpr index_t NumReduceDim = 3;
|
||||
|
||||
using ScaleDataType = XDataType;
|
||||
|
||||
const std::vector<size_t> scaleBiasMeanVarLengths = {inOutLengths[3]};
|
||||
|
||||
// input data of the batchnorm backward algorithm
|
||||
Tensor<XDataType> x(inOutLengths);
|
||||
Tensor<AccDataType> dy(inOutLengths);
|
||||
|
||||
Tensor<ScaleDataType> bnScale(scaleBiasMeanVarLengths);
|
||||
|
||||
Tensor<AccDataType> savedMean(scaleBiasMeanVarLengths);
|
||||
Tensor<AccDataType> savedInvVar(scaleBiasMeanVarLengths);
|
||||
// savedVariance is only used for initializing savedInvVar
|
||||
Tensor<AccDataType> savedVariance(scaleBiasMeanVarLengths);
|
||||
|
||||
// output data of the batchnorm backward algorithm
|
||||
Tensor<AccDataType> dx_ref(inOutLengths);
|
||||
Tensor<AccDataType> dx(inOutLengths);
|
||||
|
||||
Tensor<AccDataType> dscale(scaleBiasMeanVarLengths);
|
||||
Tensor<AccDataType> dbias(scaleBiasMeanVarLengths);
|
||||
|
||||
Tensor<AccDataType> dscale_ref(scaleBiasMeanVarLengths);
|
||||
Tensor<AccDataType> dbias_ref(scaleBiasMeanVarLengths);
|
||||
|
||||
auto inOutStrides = dy.mDesc.GetStrides();
|
||||
auto scaleBiasMeanVarStrides = dscale.mDesc.GetStrides();
|
||||
|
||||
std::size_t num_thread = std::thread::hardware_concurrency();
|
||||
|
||||
if(haveSavedMeanInvVar)
|
||||
{
|
||||
const float x_mean = 0.0f;
|
||||
const float x_stddev = 1.0f;
|
||||
const float noise_stddev = 0.0001f;
|
||||
|
||||
// input data in normal distribution
|
||||
x.GenerateTensorValue(GeneratorTensor_4<XDataType>{x_mean, x_stddev}, num_thread);
|
||||
|
||||
// initialize the savedMean to be values with tiny variation to the mean of the x values
|
||||
savedMean.GenerateTensorValue(GeneratorTensor_4<AccDataType>{x_mean, noise_stddev},
|
||||
num_thread);
|
||||
|
||||
// initialize the variance to be values with tiny variation to the variance of the x values
|
||||
savedVariance.GenerateTensorValue(
|
||||
GeneratorTensor_4<AccDataType>{x_stddev * x_stddev, noise_stddev}, num_thread);
|
||||
|
||||
auto it_src = savedVariance.mData.begin();
|
||||
auto it_dst = savedInvVar.mData.begin();
|
||||
float tmp_epsilon = std::numeric_limits<float>::epsilon();
|
||||
|
||||
while(it_src != savedVariance.mData.end())
|
||||
{
|
||||
*it_dst = type_convert<AccDataType>(
|
||||
1.0f / std::sqrtf(type_convert<float>(*it_src) + tmp_epsilon));
|
||||
|
||||
it_src++;
|
||||
it_dst++;
|
||||
};
|
||||
}
|
||||
else
|
||||
{
|
||||
const float x_mean = 0.0f;
|
||||
const float x_stddev = 1.0f;
|
||||
|
||||
// input data in normal distribution
|
||||
x.GenerateTensorValue(GeneratorTensor_4<XDataType>{x_mean, x_stddev}, num_thread);
|
||||
};
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
switch(init_method)
|
||||
{
|
||||
case 0:
|
||||
dy.GenerateTensorValue(GeneratorTensor_0<AccDataType>{}, num_thread);
|
||||
bnScale.GenerateTensorValue(GeneratorTensor_0<ScaleDataType>{}, num_thread);
|
||||
break;
|
||||
case 1:
|
||||
dy.GenerateTensorValue(GeneratorTensor_1<AccDataType>{1}, num_thread);
|
||||
bnScale.GenerateTensorValue(GeneratorTensor_1<ScaleDataType>{1}, num_thread);
|
||||
break;
|
||||
case 2:
|
||||
dy.GenerateTensorValue(GeneratorTensor_2<AccDataType>{-2, 2}, num_thread);
|
||||
bnScale.GenerateTensorValue(GeneratorTensor_2<ScaleDataType>{-5, 5}, num_thread);
|
||||
break;
|
||||
default:
|
||||
dy.GenerateTensorValue(GeneratorTensor_3<AccDataType>{-0.2f, 0.2f}, num_thread);
|
||||
bnScale.GenerateTensorValue(GeneratorTensor_3<ScaleDataType>{-0.5f, 0.5f}, num_thread);
|
||||
}
|
||||
};
|
||||
|
||||
// input data of the batchnorm backward algorithm
|
||||
DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
|
||||
DeviceMem dy_dev(sizeof(AccDataType) * dy.mDesc.GetElementSpaceSize());
|
||||
|
||||
DeviceMem bnScale_dev(sizeof(ScaleDataType) * bnScale.mDesc.GetElementSpaceSize());
|
||||
|
||||
DeviceMem savedMean_dev(sizeof(AccDataType) * savedMean.mDesc.GetElementSpaceSize());
|
||||
DeviceMem savedInvVar_dev(sizeof(AccDataType) * savedInvVar.mDesc.GetElementSpaceSize());
|
||||
|
||||
// output data of the batchnorm backward algorithm
|
||||
DeviceMem dx_dev(sizeof(AccDataType) * dx.mDesc.GetElementSpaceSize());
|
||||
|
||||
DeviceMem dscale_dev(sizeof(AccDataType) * dscale.mDesc.GetElementSpaceSize());
|
||||
DeviceMem dbias_dev(sizeof(AccDataType) * dbias.mDesc.GetElementSpaceSize());
|
||||
|
||||
x_dev.ToDevice(x.mData.data());
|
||||
dy_dev.ToDevice(dy.mData.data());
|
||||
bnScale_dev.ToDevice(bnScale.mData.data());
|
||||
|
||||
if(haveSavedMeanInvVar)
|
||||
{
|
||||
savedMean_dev.ToDevice(savedMean.mData.data());
|
||||
savedInvVar_dev.ToDevice(savedInvVar.mData.data());
|
||||
};
|
||||
|
||||
std::array<index_t, Rank> i_inOutLengths;
|
||||
std::array<index_t, Rank> i_inOutStrides;
|
||||
std::array<index_t, Rank - NumReduceDim> i_scaleBiasMeanVarLengths;
|
||||
std::array<index_t, Rank - NumReduceDim> i_scaleBiasMeanVarStrides;
|
||||
|
||||
std::copy(inOutLengths.begin(), inOutLengths.end(), i_inOutLengths.begin());
|
||||
std::copy(inOutStrides.begin(), inOutStrides.end(), i_inOutStrides.begin());
|
||||
std::copy(scaleBiasMeanVarLengths.begin(),
|
||||
scaleBiasMeanVarLengths.end(),
|
||||
i_scaleBiasMeanVarLengths.begin());
|
||||
std::copy(scaleBiasMeanVarStrides.begin(),
|
||||
scaleBiasMeanVarStrides.end(),
|
||||
i_scaleBiasMeanVarStrides.begin());
|
||||
|
||||
using PassThroughOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
using DeviceBatchNormBwdInstance =
|
||||
ck::tensor_operation::device::DeviceBatchNormBwdImpl<XDataType,
|
||||
AccDataType,
|
||||
AccDataType,
|
||||
AccDataType,
|
||||
ScaleDataType, // ScaleDataType
|
||||
AccDataType, // DscaleDbiasDataType
|
||||
AccDataType, // MeanVarDataType
|
||||
PassThroughOp,
|
||||
Rank,
|
||||
NumReduceDim,
|
||||
UseMultiblockInK,
|
||||
256,
|
||||
16,
|
||||
16,
|
||||
1,
|
||||
2,
|
||||
0,
|
||||
1, // XSrcVectorSize
|
||||
1, // DySrcVectorSize
|
||||
1, // DxDstVectorSize
|
||||
1, // ScaleSrcVectorSize
|
||||
1, // DscaleDbiasDstVectorSize
|
||||
1>; // MeanVarSrcVectorSize
|
||||
|
||||
auto batchnorm_bwd = DeviceBatchNormBwdInstance{};
|
||||
|
||||
auto argument_ptr = batchnorm_bwd.MakeArgumentPointer(
|
||||
i_inOutLengths,
|
||||
i_inOutStrides,
|
||||
i_inOutStrides,
|
||||
i_inOutStrides,
|
||||
{0, 1, 2},
|
||||
i_scaleBiasMeanVarLengths,
|
||||
i_scaleBiasMeanVarStrides,
|
||||
i_scaleBiasMeanVarStrides,
|
||||
i_scaleBiasMeanVarStrides,
|
||||
x_dev.GetDeviceBuffer(),
|
||||
dy_dev.GetDeviceBuffer(),
|
||||
bnScale_dev.GetDeviceBuffer(),
|
||||
haveSavedMeanInvVar ? savedMean_dev.GetDeviceBuffer() : nullptr,
|
||||
haveSavedMeanInvVar ? savedInvVar_dev.GetDeviceBuffer() : nullptr,
|
||||
epsilon,
|
||||
PassThroughOp{},
|
||||
dx_dev.GetDeviceBuffer(),
|
||||
dscale_dev.GetDeviceBuffer(),
|
||||
dbias_dev.GetDeviceBuffer());
|
||||
|
||||
if(!batchnorm_bwd.IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
std::cout << "The runtime parameters seems not supported by the BatchNorm device instance, "
|
||||
"exiting!"
|
||||
<< std::endl;
|
||||
return (false);
|
||||
};
|
||||
|
||||
size_t workspace_sz = batchnorm_bwd.GetWorkSpaceSize(argument_ptr.get());
|
||||
|
||||
DeviceMem workspace_dev(workspace_sz);
|
||||
|
||||
batchnorm_bwd.SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
|
||||
|
||||
auto invoker_ptr = batchnorm_bwd.MakeInvokerPointer();
|
||||
|
||||
if(time_kernel)
|
||||
{
|
||||
float avg_time = 0.0f;
|
||||
size_t num_bytes = 0;
|
||||
|
||||
size_t total_length = inOutLengths[0] * inOutLengths[1] * inOutLengths[2] * inOutLengths[3];
|
||||
size_t invariant_length = inOutLengths[3];
|
||||
|
||||
avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
// inputing of x, dy, scale, outputing of dx, dscale, dbias
|
||||
num_bytes +=
|
||||
total_length * sizeof(XDataType) * 3 + invariant_length * sizeof(AccDataType) * 3;
|
||||
|
||||
// outputing of mean, inv-variance
|
||||
num_bytes += haveSavedMeanInvVar ? invariant_length * sizeof(AccDataType) * 2 : 0;
|
||||
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
|
||||
}
|
||||
else
|
||||
(void)invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
bool pass = true;
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
using ReferenceBatchNormBwdInstance =
|
||||
ck::tensor_operation::host::ReferenceBatchNormBwd<XDataType,
|
||||
AccDataType,
|
||||
AccDataType,
|
||||
AccDataType,
|
||||
ScaleDataType, // ScaleDataType
|
||||
AccDataType,
|
||||
AccDataType,
|
||||
PassThroughOp,
|
||||
Rank,
|
||||
NumReduceDim>;
|
||||
|
||||
auto batchNormBwd_ref = ReferenceBatchNormBwdInstance{};
|
||||
|
||||
auto argument_ptr_ref = batchNormBwd_ref.MakeArgumentPointer(
|
||||
i_inOutLengths,
|
||||
i_inOutStrides,
|
||||
i_inOutStrides,
|
||||
i_inOutStrides,
|
||||
{0, 1, 2},
|
||||
i_scaleBiasMeanVarLengths,
|
||||
i_scaleBiasMeanVarStrides,
|
||||
i_scaleBiasMeanVarStrides,
|
||||
i_scaleBiasMeanVarStrides,
|
||||
x.mData.data(),
|
||||
dy.mData.data(),
|
||||
bnScale.mData.data(),
|
||||
haveSavedMeanInvVar ? savedMean.mData.data() : nullptr,
|
||||
haveSavedMeanInvVar ? savedInvVar.mData.data() : nullptr,
|
||||
epsilon,
|
||||
PassThroughOp{},
|
||||
dx_ref.mData.data(),
|
||||
dscale_ref.mData.data(),
|
||||
dbias_ref.mData.data());
|
||||
|
||||
if(!batchNormBwd_ref.IsSupportedArgument(argument_ptr_ref.get()))
|
||||
{
|
||||
std::cout
|
||||
<< "The runtime parameters seems not supported by the device instance, exiting!"
|
||||
<< std::endl;
|
||||
return (false);
|
||||
};
|
||||
|
||||
auto invoker_ptr_ref = batchNormBwd_ref.MakeInvokerPointer();
|
||||
|
||||
(void)invoker_ptr_ref->Run(argument_ptr_ref.get());
|
||||
|
||||
dx_dev.FromDevice(dx.mData.data());
|
||||
dscale_dev.FromDevice(dscale.data());
|
||||
dbias_dev.FromDevice(dbias.data());
|
||||
|
||||
// clang-format off
|
||||
pass = pass && ck::utils::check_err(dbias.mData, dbias_ref.mData, "dBias result:", 2e-4, 2e-4);
|
||||
pass = pass && ck::utils::check_err(dscale.mData, dscale_ref.mData, "dScale result:", 2e-4, 2e-4);
|
||||
pass = pass && ck::utils::check_err(dx.mData, dx_ref.mData, "dx result:");
|
||||
// clang-format on
|
||||
};
|
||||
|
||||
return (pass);
|
||||
};
|
||||
|
||||
static const double epsilon = std::numeric_limits<float>::epsilon();
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool pass = true;
|
||||
|
||||
if(argc > 1)
|
||||
{
|
||||
BatchNormBwdArg arg;
|
||||
|
||||
if(arg.processArgs(argc, argv) < 0)
|
||||
return (-1);
|
||||
|
||||
if(arg.data_type == 0)
|
||||
{
|
||||
if(arg.use_multiblock_welford)
|
||||
pass = bnorm_bwd_nhwc_test<ck::half_t, float, true>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.haveSavedMeanInvVar,
|
||||
epsilon);
|
||||
else
|
||||
pass = bnorm_bwd_nhwc_test<ck::half_t, float, false>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.haveSavedMeanInvVar,
|
||||
epsilon);
|
||||
}
|
||||
else if(arg.data_type == 1)
|
||||
{
|
||||
if(arg.use_multiblock_welford)
|
||||
pass = bnorm_bwd_nhwc_test<float, float, true>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.haveSavedMeanInvVar,
|
||||
epsilon);
|
||||
else
|
||||
pass = bnorm_bwd_nhwc_test<float, float, false>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.haveSavedMeanInvVar,
|
||||
epsilon);
|
||||
}
|
||||
else if(arg.data_type == 5)
|
||||
{
|
||||
if(arg.use_multiblock_welford)
|
||||
pass = bnorm_bwd_nhwc_test<ck::bhalf_t, float, true>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.haveSavedMeanInvVar,
|
||||
epsilon);
|
||||
else
|
||||
pass = bnorm_bwd_nhwc_test<ck::bhalf_t, float, false>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.haveSavedMeanInvVar,
|
||||
epsilon);
|
||||
}
|
||||
else if(arg.data_type == 6)
|
||||
{
|
||||
if(arg.use_multiblock_welford)
|
||||
pass = bnorm_bwd_nhwc_test<double, double, true>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.haveSavedMeanInvVar,
|
||||
epsilon);
|
||||
else
|
||||
pass = bnorm_bwd_nhwc_test<double, double, false>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.haveSavedMeanInvVar,
|
||||
epsilon);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
pass = bnorm_bwd_nhwc_test<ck::half_t, float, true>(true,
|
||||
3,
|
||||
false, // don't time kernel
|
||||
{128, 16, 6, 512},
|
||||
false,
|
||||
epsilon);
|
||||
|
||||
pass = pass && bnorm_bwd_nhwc_test<ck::half_t, float, false>(true,
|
||||
3,
|
||||
false, // don't time kernel
|
||||
{128, 16, 3, 1024},
|
||||
false,
|
||||
epsilon);
|
||||
};
|
||||
|
||||
return (pass ? 0 : 1);
|
||||
}
|
||||
68
example/34_batchnorm/batchnorm_common.hpp
Normal file
68
example/34_batchnorm/batchnorm_common.hpp
Normal file
@@ -0,0 +1,68 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cassert>
|
||||
#include <vector>
|
||||
#include <array>
|
||||
#include <type_traits>
|
||||
|
||||
#include "ck/utility/data_type.hpp"
|
||||
|
||||
struct NormalizeInInfer
|
||||
{
|
||||
NormalizeInInfer(double epsilon = 1e-4) : epsilon_(epsilon) {}
|
||||
|
||||
template <typename T1, typename T2, typename T3, typename T4>
|
||||
__host__ __device__ constexpr void operator()(T1& y,
|
||||
const T1& x,
|
||||
const T2& mean,
|
||||
const T2& variance,
|
||||
const T3& gamma,
|
||||
const T4& beta) const
|
||||
{
|
||||
static_assert(std::is_same<T2, float>::value || std::is_same<T2, double>::value,
|
||||
"Data type is not supported by this operation!");
|
||||
|
||||
using ck::type_convert;
|
||||
using ck::math::sqrt;
|
||||
|
||||
T2 tmp_x, tmp_y;
|
||||
|
||||
tmp_x = type_convert<T2>(x);
|
||||
|
||||
tmp_y = ((tmp_x - mean) / sqrt(variance + type_convert<T2>(epsilon_))) *
|
||||
type_convert<T2>(gamma) +
|
||||
type_convert<T2>(beta);
|
||||
y = type_convert<T1>(tmp_y);
|
||||
};
|
||||
|
||||
double epsilon_;
|
||||
};
|
||||
|
||||
/// Returns, in ascending order, the dimension indices in [0, Rank) that are not listed in
/// reduceDims.
///
/// @tparam Rank         total number of dimensions; limited to 31 because the reduce
///                      dimensions are tracked as bits of an int
/// @tparam NumReduceDim number of entries in reduceDims
/// @param reduceDims    dimension indices to exclude; each must lie in [0, Rank) and must
///                      not repeat (checked with assert)
/// @return the Rank - NumReduceDim remaining dimension indices
template <int Rank, int NumReduceDim>
static inline std::array<int, Rank - NumReduceDim>
get_invariant_dims(const std::array<int, NumReduceDim>& reduceDims)
{
    static_assert(Rank <= 31, "Rank is too large for the int bit-mask used below!");

    int reduceFlag = 0;

    // flag the bits for the reduceDims
    for(const int reduceDim : reduceDims)
    {
        // an index outside [0, Rank) would make the shift undefined or flag a bit that the
        // collecting loop never looks at
        assert(reduceDim >= 0 && reduceDim < Rank);

        reduceFlag |= 1 << reduceDim;
    }

    // value-initialized so that no element is ever left indeterminate
    std::array<int, Rank - NumReduceDim> invariantDims{};

    // collect invariant dimensions
    int dim = 0;
    for(int i = 0; i < Rank; i++)
    {
        if((reduceFlag & (1 << i)) == 0)
        {
            // can only fail when reduceDims holds duplicates, which leaves more unflagged
            // dimensions than the result array has room for
            assert(dim < Rank - NumReduceDim);

            invariantDims[dim] = i;
            dim++;
        }
    }

    return invariantDims;
}
|
||||
366
example/34_batchnorm/batchnorm_forward_inferring_nhwc.cpp
Normal file
366
example/34_batchnorm/batchnorm_forward_inferring_nhwc.cpp
Normal file
@@ -0,0 +1,366 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <limits>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <array>
|
||||
#include <algorithm>
|
||||
#include <getopt.h>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/utility/algorithm.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/host_common_util.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer.hpp"
|
||||
|
||||
#include "batchnorm_infer_impl.hpp"
|
||||
|
||||
// Long-option table handed to getopt_long(); the all-zero entry terminates the table.
static option long_options[] = {
    {"inOutLengths", required_argument, nullptr, 'D'},
    {"verify", required_argument, nullptr, 'v'},
    {"help", no_argument, nullptr, '?'},
    {nullptr, 0, nullptr, 0},
};
|
||||
|
||||
class BatchNormInferArg
|
||||
{
|
||||
private:
|
||||
int option_index = 0;
|
||||
|
||||
public:
|
||||
std::vector<size_t> inOutLengths;
|
||||
|
||||
bool do_verification = false;
|
||||
|
||||
int data_type = 0;
|
||||
int init_method = 2;
|
||||
bool time_kernel = false;
|
||||
|
||||
public:
|
||||
void show_usage(const char* cmd)
|
||||
{
|
||||
std::cout << "Usage of " << cmd << std::endl;
|
||||
std::cout << "--inOutLengths or -D, comma separated list of input tensor dimension "
|
||||
"lengths, must have 4 integers for nhwc"
|
||||
<< std::endl;
|
||||
std::cout << "--verify or -v, 1/0 to indicate whether to verify the batch-normalization "
|
||||
"result by "
|
||||
"comparing with the host-based batch-normalization"
|
||||
<< std::endl;
|
||||
std::cout << "Arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64)" << std::endl;
|
||||
std::cout << "Arg2: init method used for bnScale and bnBias (0=no init, 1=single integer "
|
||||
"value, 2=scope integer "
|
||||
"value, 3=decimal value)"
|
||||
<< std::endl;
|
||||
std::cout << "Arg3: time kernel (0=no, 1=yes)" << std::endl;
|
||||
};
|
||||
|
||||
int processArgs(int argc, char* argv[])
|
||||
{
|
||||
using ck::host_common::getTypeValuesFromString;
|
||||
|
||||
int ch;
|
||||
|
||||
while(1)
|
||||
{
|
||||
ch = getopt_long(argc, argv, "D:v:", long_options, &option_index);
|
||||
if(ch == -1)
|
||||
break;
|
||||
switch(ch)
|
||||
{
|
||||
case 'D':
|
||||
if(!optarg)
|
||||
throw std::runtime_error("Invalid option format!");
|
||||
|
||||
inOutLengths = getTypeValuesFromString<size_t>(optarg);
|
||||
|
||||
if(inOutLengths.size() != 4)
|
||||
throw std::runtime_error(
|
||||
"NHWC tensor layout should have 4 length values specified!");
|
||||
break;
|
||||
case 'v':
|
||||
if(!optarg)
|
||||
throw std::runtime_error("Invalid option format!");
|
||||
|
||||
do_verification = static_cast<bool>(std::atoi(optarg));
|
||||
break;
|
||||
case '?':
|
||||
if(std::string(long_options[option_index].name) == "help")
|
||||
{
|
||||
show_usage(argv[0]);
|
||||
return (-1);
|
||||
};
|
||||
break;
|
||||
default: show_usage(argv[0]); return (-1);
|
||||
};
|
||||
};
|
||||
|
||||
if(optind + 3 > argc)
|
||||
throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
|
||||
|
||||
data_type = std::atoi(argv[optind++]);
|
||||
init_method = std::atoi(argv[optind++]);
|
||||
time_kernel = static_cast<bool>(std::atoi(argv[optind]));
|
||||
|
||||
if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5 && data_type != 6)
|
||||
return (-1);
|
||||
|
||||
return (0);
|
||||
};
|
||||
};
|
||||
|
||||
using namespace ck;
|
||||
|
||||
template <typename InOutDataType, typename AccDataType>
|
||||
bool bnorm_infer_nhwc_test(bool do_verification,
|
||||
int init_method,
|
||||
bool time_kernel,
|
||||
const std::vector<size_t> inOutLengths,
|
||||
double epsilon)
|
||||
{
|
||||
// for NHWC BatchNorm calculation of mean and meansquare
|
||||
constexpr int Rank = 4;
|
||||
constexpr int NumReduceDim = 3;
|
||||
|
||||
// when using lengths[] to create a tensor, lengths[0] is the length of highest dimension
|
||||
// eg. N of NHWC, so lengths[3] is the dimension C length of NHWC
|
||||
const std::vector<size_t> scaleBiasMeanVarLengths = {inOutLengths[3]};
|
||||
|
||||
// input data of the batchnorm forward algorithm
|
||||
Tensor<InOutDataType> x(inOutLengths);
|
||||
Tensor<AccDataType> bnScale(scaleBiasMeanVarLengths);
|
||||
Tensor<AccDataType> bnBias(scaleBiasMeanVarLengths);
|
||||
|
||||
// output data of the batchnorm forward algorithm
|
||||
Tensor<InOutDataType> y_ref(inOutLengths);
|
||||
Tensor<InOutDataType> y(inOutLengths);
|
||||
|
||||
Tensor<AccDataType> estimatedMean(scaleBiasMeanVarLengths);
|
||||
Tensor<AccDataType> estimatedVariance(scaleBiasMeanVarLengths);
|
||||
|
||||
auto inOutStrides = x.mDesc.GetStrides();
|
||||
auto scaleBiasMeanVarStrides = bnScale.mDesc.GetStrides();
|
||||
|
||||
std::size_t num_thread = std::thread::hardware_concurrency();
|
||||
|
||||
if constexpr(std::is_same<InOutDataType, int8_t>::value)
|
||||
{
|
||||
x.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
|
||||
|
||||
const float x_mean = 0.0f;
|
||||
const float x_stddev = 2.5f;
|
||||
const float noise_stddev = 0.0001f;
|
||||
|
||||
estimatedMean.GenerateTensorValue(GeneratorTensor_4<AccDataType>{x_mean, noise_stddev},
|
||||
num_thread);
|
||||
|
||||
estimatedVariance.GenerateTensorValue(
|
||||
GeneratorTensor_4<AccDataType>{x_stddev * x_stddev, noise_stddev}, num_thread);
|
||||
}
|
||||
else
|
||||
{
|
||||
const float x_mean = 0.0f;
|
||||
const float x_stddev = 1.0f;
|
||||
const float noise_stddev = 0.0001f;
|
||||
|
||||
x.GenerateTensorValue(GeneratorTensor_4<InOutDataType>{x_mean, x_stddev}, num_thread);
|
||||
|
||||
// initialize the savedMean to be values with tiny variation to the mean of the x values
|
||||
estimatedMean.GenerateTensorValue(GeneratorTensor_4<AccDataType>{x_mean, noise_stddev},
|
||||
num_thread);
|
||||
|
||||
// initialize the variance to be values with tiny variation to the variance of the x values
|
||||
estimatedVariance.GenerateTensorValue(
|
||||
GeneratorTensor_4<AccDataType>{x_stddev * x_stddev, noise_stddev}, num_thread);
|
||||
};
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
switch(init_method)
|
||||
{
|
||||
case 0:
|
||||
bnScale.GenerateTensorValue(GeneratorTensor_0<AccDataType>{}, num_thread);
|
||||
bnBias.GenerateTensorValue(GeneratorTensor_0<AccDataType>{}, num_thread);
|
||||
break;
|
||||
case 1:
|
||||
bnScale.GenerateTensorValue(GeneratorTensor_1<AccDataType>{1}, num_thread);
|
||||
bnBias.GenerateTensorValue(GeneratorTensor_1<AccDataType>{0}, num_thread);
|
||||
break;
|
||||
case 2:
|
||||
bnScale.GenerateTensorValue(GeneratorTensor_2<AccDataType>{-5, 5}, num_thread);
|
||||
bnBias.GenerateTensorValue(GeneratorTensor_2<AccDataType>{-5, 5}, num_thread);
|
||||
break;
|
||||
default:
|
||||
bnScale.GenerateTensorValue(GeneratorTensor_3<AccDataType>{-5.0f, 5.0f}, num_thread);
|
||||
bnBias.GenerateTensorValue(GeneratorTensor_3<AccDataType>{-5.0f, 5.0f}, num_thread);
|
||||
}
|
||||
};
|
||||
|
||||
// these buffers are usually provided by the user application
|
||||
DeviceMem x_dev(sizeof(InOutDataType) * x.mDesc.GetElementSpaceSize());
|
||||
DeviceMem y_dev(sizeof(InOutDataType) * y.mDesc.GetElementSpaceSize());
|
||||
DeviceMem bnScale_dev(sizeof(AccDataType) * bnScale.mDesc.GetElementSpaceSize());
|
||||
DeviceMem bnBias_dev(sizeof(AccDataType) * bnBias.mDesc.GetElementSpaceSize());
|
||||
|
||||
// mean_dev or resultSaveMean_dev
|
||||
DeviceMem estimatedMean_dev(sizeof(AccDataType) * estimatedMean.mDesc.GetElementSpaceSize());
|
||||
// meansquare_dev or resultSaveInvVariance_dev
|
||||
DeviceMem estimatedVariance_dev(sizeof(AccDataType) *
|
||||
estimatedVariance.mDesc.GetElementSpaceSize());
|
||||
|
||||
x_dev.ToDevice(x.mData.data());
|
||||
bnScale_dev.ToDevice(bnScale.mData.data());
|
||||
bnBias_dev.ToDevice(bnBias.mData.data());
|
||||
estimatedMean_dev.ToDevice(estimatedMean.mData.data());
|
||||
estimatedVariance_dev.ToDevice(estimatedVariance.mData.data());
|
||||
|
||||
using ck::index_t;
|
||||
|
||||
std::array<index_t, Rank> i_inOutLengths;
|
||||
std::array<index_t, Rank> i_inOutStrides;
|
||||
std::array<index_t, Rank - NumReduceDim> i_scaleBiasMeanVarLengths;
|
||||
std::array<index_t, Rank - NumReduceDim> i_scaleBiasMeanVarStrides;
|
||||
|
||||
ck::ranges::copy(inOutLengths, i_inOutLengths.begin());
|
||||
ck::ranges::copy(inOutStrides, i_inOutStrides.begin());
|
||||
ck::ranges::copy(scaleBiasMeanVarLengths, i_scaleBiasMeanVarLengths.begin());
|
||||
ck::ranges::copy(scaleBiasMeanVarStrides, i_scaleBiasMeanVarStrides.begin());
|
||||
|
||||
int result = 0;
|
||||
|
||||
result = bnorm_infer<InOutDataType,
|
||||
InOutDataType,
|
||||
AccDataType,
|
||||
AccDataType,
|
||||
AccDataType,
|
||||
AccDataType,
|
||||
Rank,
|
||||
NumReduceDim,
|
||||
false>(time_kernel,
|
||||
{0, 1, 2},
|
||||
i_inOutLengths,
|
||||
i_inOutStrides,
|
||||
i_inOutStrides,
|
||||
i_scaleBiasMeanVarLengths,
|
||||
i_scaleBiasMeanVarStrides,
|
||||
i_scaleBiasMeanVarStrides,
|
||||
i_scaleBiasMeanVarStrides,
|
||||
x_dev.GetDeviceBuffer(),
|
||||
bnScale_dev.GetDeviceBuffer(),
|
||||
bnBias_dev.GetDeviceBuffer(),
|
||||
epsilon,
|
||||
estimatedMean_dev.GetDeviceBuffer(),
|
||||
estimatedVariance_dev.GetDeviceBuffer(),
|
||||
y_dev.GetDeviceBuffer());
|
||||
|
||||
if(result < 0)
|
||||
return (false);
|
||||
|
||||
bool pass = true;
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
using PassThroughOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
using ReferenceBatchNormInferInstance =
|
||||
ck::tensor_operation::host::ReferenceBatchNormInfer<InOutDataType,
|
||||
InOutDataType,
|
||||
AccDataType,
|
||||
AccDataType,
|
||||
AccDataType,
|
||||
AccDataType,
|
||||
PassThroughOp,
|
||||
Rank,
|
||||
NumReduceDim>;
|
||||
auto batchNormInfer_ref = ReferenceBatchNormInferInstance{};
|
||||
|
||||
auto argument_ptr_ref =
|
||||
batchNormInfer_ref.MakeArgumentPointer(i_inOutLengths,
|
||||
i_inOutStrides,
|
||||
i_inOutStrides,
|
||||
{0, 1, 2},
|
||||
i_scaleBiasMeanVarLengths,
|
||||
i_scaleBiasMeanVarStrides,
|
||||
i_scaleBiasMeanVarStrides,
|
||||
i_scaleBiasMeanVarStrides,
|
||||
x.mData.data(),
|
||||
bnScale.mData.data(),
|
||||
bnBias.mData.data(),
|
||||
epsilon,
|
||||
PassThroughOp{},
|
||||
estimatedMean.mData.data(),
|
||||
estimatedVariance.mData.data(),
|
||||
y_ref.mData.data());
|
||||
|
||||
if(!batchNormInfer_ref.IsSupportedArgument(argument_ptr_ref.get()))
|
||||
{
|
||||
std::cout
|
||||
<< "The runtime parameters seems not supported by the BatchNorm instance, exiting!"
|
||||
<< std::endl;
|
||||
return (-2);
|
||||
};
|
||||
|
||||
auto invoker_ptr_ref = batchNormInfer_ref.MakeInvokerPointer();
|
||||
|
||||
(void)invoker_ptr_ref->Run(argument_ptr_ref.get());
|
||||
|
||||
y_dev.FromDevice(y.mData.data());
|
||||
pass = pass && ck::utils::check_err(y, y_ref);
|
||||
};
|
||||
|
||||
return (pass);
|
||||
};
|
||||
|
||||
static const double epsilon = std::numeric_limits<float>::epsilon();
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool pass = true;
|
||||
|
||||
if(argc > 1)
|
||||
{
|
||||
BatchNormInferArg arg;
|
||||
|
||||
if(arg.processArgs(argc, argv) < 0)
|
||||
return (-1);
|
||||
|
||||
if(arg.data_type == 0)
|
||||
{
|
||||
pass = bnorm_infer_nhwc_test<ck::half_t, float>(
|
||||
arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths, epsilon);
|
||||
}
|
||||
else if(arg.data_type == 1)
|
||||
{
|
||||
pass = bnorm_infer_nhwc_test<float, float>(
|
||||
arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths, epsilon);
|
||||
}
|
||||
else if(arg.data_type == 3)
|
||||
{
|
||||
pass = bnorm_infer_nhwc_test<int8_t, float>(
|
||||
arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths, epsilon);
|
||||
}
|
||||
else if(arg.data_type == 5)
|
||||
{
|
||||
pass = bnorm_infer_nhwc_test<ck::bhalf_t, float>(
|
||||
arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths, epsilon);
|
||||
}
|
||||
else if(arg.data_type == 6)
|
||||
{
|
||||
pass = bnorm_infer_nhwc_test<double, double>(
|
||||
arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths, epsilon);
|
||||
};
|
||||
}
|
||||
else
|
||||
{
|
||||
pass = bnorm_infer_nhwc_test<ck::half_t, float>(true,
|
||||
2,
|
||||
false, // don't time kernel
|
||||
{128, 16, 16, 1024},
|
||||
epsilon);
|
||||
};
|
||||
|
||||
return (pass ? 0 : 1);
|
||||
}
|
||||
598
example/34_batchnorm/batchnorm_forward_training_nhwc.cpp
Normal file
598
example/34_batchnorm/batchnorm_forward_training_nhwc.cpp
Normal file
@@ -0,0 +1,598 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <limits>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <array>
|
||||
#include <algorithm>
|
||||
#include <getopt.h>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/utility/algorithm.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/host_common_util.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp"
|
||||
#include "ck/library/utility/host_common_util.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
static struct option long_options[] = {{"inOutLengths", required_argument, nullptr, 'D'},
|
||||
{"verify", required_argument, nullptr, 'v'},
|
||||
{"help", no_argument, nullptr, '?'},
|
||||
{nullptr, 0, nullptr, 0}};
|
||||
|
||||
class BatchNormFwdArg
|
||||
{
|
||||
private:
|
||||
int option_index = 0;
|
||||
|
||||
public:
|
||||
std::vector<size_t> inOutLengths;
|
||||
|
||||
bool do_verification = false;
|
||||
|
||||
bool updateMovingAverage;
|
||||
bool saveMeanAndInvVariance;
|
||||
|
||||
int data_type = 0;
|
||||
int init_method = 2;
|
||||
bool time_kernel = false;
|
||||
bool use_multiblock_welford = false;
|
||||
|
||||
public:
|
||||
void show_usage(const char* cmd)
|
||||
{
|
||||
std::cout << "Usage of " << cmd << std::endl;
|
||||
std::cout << "--inOutLengths or -D, comma separated list of input tensor dimension "
|
||||
"lengths, must have 4 integers for nhwc"
|
||||
<< std::endl;
|
||||
std::cout << "--verify or -v, 1/0 to indicate whether to verify the batch-normalization "
|
||||
"result by "
|
||||
"comparing with the host-based batch-normalization"
|
||||
<< std::endl;
|
||||
std::cout << "Arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64)" << std::endl;
|
||||
std::cout << "Arg2: 1/0 to indicate whether to update the moving average and variance "
|
||||
"(0=no, 1=yes)"
|
||||
<< std::endl;
|
||||
std::cout << "Arg3: 1/0 to indicate whether to save the calculated mean and invVariance "
|
||||
"(0=no, 1=yes)"
|
||||
<< std::endl;
|
||||
std::cout << "Arg4: init method used for bnScale and bnBias (0=no init, 1=single integer "
|
||||
"value, 2=scope integer "
|
||||
"value, 3=decimal value)"
|
||||
<< std::endl;
|
||||
std::cout << "Arg5: time kernel (0=no, 1=yes)" << std::endl;
|
||||
std::cout << "Arg6: use multi-block welford (0=n0, 1=yes)" << std::endl;
|
||||
};
|
||||
|
||||
int processArgs(int argc, char* argv[])
|
||||
{
|
||||
using ck::host_common::getTypeValuesFromString;
|
||||
|
||||
int ch;
|
||||
|
||||
while(1)
|
||||
{
|
||||
ch = getopt_long(argc, argv, "D:v:", long_options, &option_index);
|
||||
if(ch == -1)
|
||||
break;
|
||||
switch(ch)
|
||||
{
|
||||
case 'D':
|
||||
if(!optarg)
|
||||
throw std::runtime_error("Invalid option format!");
|
||||
|
||||
inOutLengths = getTypeValuesFromString<size_t>(optarg);
|
||||
|
||||
if(inOutLengths.size() != 4)
|
||||
throw std::runtime_error(
|
||||
"NHWC tensor layout should have 4 length values specified!");
|
||||
break;
|
||||
case 'v':
|
||||
if(!optarg)
|
||||
throw std::runtime_error("Invalid option format!");
|
||||
|
||||
do_verification = static_cast<bool>(std::atoi(optarg));
|
||||
break;
|
||||
case '?':
|
||||
if(std::string(long_options[option_index].name) == "help")
|
||||
{
|
||||
show_usage(argv[0]);
|
||||
return (-1);
|
||||
};
|
||||
break;
|
||||
default: show_usage(argv[0]); return (-1);
|
||||
};
|
||||
};
|
||||
|
||||
if(optind + 6 > argc)
|
||||
throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
|
||||
|
||||
data_type = std::atoi(argv[optind++]);
|
||||
updateMovingAverage = std::atoi(argv[optind++]);
|
||||
saveMeanAndInvVariance = std::atoi(argv[optind++]);
|
||||
init_method = std::atoi(argv[optind++]);
|
||||
time_kernel = static_cast<bool>(std::atoi(argv[optind++]));
|
||||
use_multiblock_welford = static_cast<bool>(std::atoi(argv[optind]));
|
||||
|
||||
if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5 && data_type != 6)
|
||||
return (-1);
|
||||
|
||||
return (0);
|
||||
};
|
||||
};
|
||||
|
||||
using namespace ck;
|
||||
|
||||
template <typename InOutDataType, typename AccDataType, bool UseMultiblockInK>
|
||||
bool bnorm_fwd_nhwc_test(bool do_verification,
|
||||
int init_method,
|
||||
bool time_kernel,
|
||||
const std::vector<size_t> inOutLengths,
|
||||
bool updateMovingAverage,
|
||||
bool saveMeanAndInvVariance,
|
||||
double averageFactor,
|
||||
double epsilon)
|
||||
{
|
||||
// for NHWC BatchNorm calculation of mean and meansquare
|
||||
constexpr int Rank = 4;
|
||||
constexpr int NumReduceDim = 3;
|
||||
|
||||
// when using lengths[] to create a tensor, lengths[0] is the length of highest dimension
|
||||
// eg. N of NHWC, so lengths[3] is the dimension C length of NHWC
|
||||
const std::vector<size_t> scaleBiasMeanVarLengths = {inOutLengths[3]};
|
||||
|
||||
// input data of the batchnorm forward algorithm
|
||||
Tensor<InOutDataType> x(inOutLengths);
|
||||
Tensor<AccDataType> bnScale(scaleBiasMeanVarLengths);
|
||||
Tensor<AccDataType> bnBias(scaleBiasMeanVarLengths);
|
||||
|
||||
// output data of the batchnorm forward algorithm
|
||||
Tensor<InOutDataType> y_ref(inOutLengths);
|
||||
Tensor<InOutDataType> y(inOutLengths);
|
||||
|
||||
Tensor<AccDataType> resultSaveMean_ref(scaleBiasMeanVarLengths);
|
||||
Tensor<AccDataType> resultSaveInvVariance_ref(scaleBiasMeanVarLengths);
|
||||
|
||||
Tensor<AccDataType> resultRunningMean_ref(scaleBiasMeanVarLengths);
|
||||
Tensor<AccDataType> resultRunningVariance_ref(scaleBiasMeanVarLengths);
|
||||
|
||||
auto inOutStrides = x.mDesc.GetStrides();
|
||||
auto scaleBiasMeanVarStrides = bnScale.mDesc.GetStrides();
|
||||
|
||||
std::size_t num_thread = std::thread::hardware_concurrency();
|
||||
|
||||
if(updateMovingAverage)
|
||||
{
|
||||
if constexpr(std::is_same<InOutDataType, int8_t>::value)
|
||||
{
|
||||
x.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
|
||||
|
||||
const float x_mean = 0.0f;
|
||||
const float x_stddev = 2.5f;
|
||||
const float noise_stddev = 0.04f;
|
||||
|
||||
resultRunningMean_ref.GenerateTensorValue(
|
||||
GeneratorTensor_4<AccDataType>{x_mean, noise_stddev}, num_thread);
|
||||
|
||||
resultRunningVariance_ref.GenerateTensorValue(
|
||||
GeneratorTensor_4<AccDataType>{x_stddev * x_stddev, noise_stddev}, num_thread);
|
||||
}
|
||||
else
|
||||
{
|
||||
const float x_mean = 0.0f;
|
||||
const float x_stddev = 1.0f;
|
||||
const float noise_stddev = 0.04f;
|
||||
|
||||
// input data in normal distribution
|
||||
x.GenerateTensorValue(GeneratorTensor_4<InOutDataType>{x_mean, x_stddev}, num_thread);
|
||||
|
||||
// initialize the runningMean to be values with tiny variation to the mean of the x
|
||||
// values
|
||||
resultRunningMean_ref.GenerateTensorValue(
|
||||
GeneratorTensor_4<AccDataType>{x_mean, noise_stddev}, num_thread);
|
||||
|
||||
// initialize the runningVariance to be values with tiny variation to the variance of
|
||||
// the x values
|
||||
resultRunningVariance_ref.GenerateTensorValue(
|
||||
GeneratorTensor_4<AccDataType>{x_stddev * x_stddev, noise_stddev}, num_thread);
|
||||
};
|
||||
}
|
||||
else
|
||||
{
|
||||
if constexpr(std::is_same<InOutDataType, int8_t>::value)
|
||||
x.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
|
||||
else
|
||||
x.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0f, 5.0f}, num_thread);
|
||||
};
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
switch(init_method)
|
||||
{
|
||||
case 0:
|
||||
bnScale.GenerateTensorValue(GeneratorTensor_0<AccDataType>{}, num_thread);
|
||||
bnBias.GenerateTensorValue(GeneratorTensor_0<AccDataType>{}, num_thread);
|
||||
break;
|
||||
case 1:
|
||||
bnScale.GenerateTensorValue(GeneratorTensor_1<AccDataType>{1}, num_thread);
|
||||
bnBias.GenerateTensorValue(GeneratorTensor_1<AccDataType>{0}, num_thread);
|
||||
break;
|
||||
case 2:
|
||||
bnScale.GenerateTensorValue(GeneratorTensor_2<AccDataType>{-5, 5}, num_thread);
|
||||
bnBias.GenerateTensorValue(GeneratorTensor_2<AccDataType>{-5, 5}, num_thread);
|
||||
break;
|
||||
default:
|
||||
bnScale.GenerateTensorValue(GeneratorTensor_3<AccDataType>{-5.0f, 5.0f}, num_thread);
|
||||
bnBias.GenerateTensorValue(GeneratorTensor_3<AccDataType>{-5.0f, 5.0f}, num_thread);
|
||||
}
|
||||
};
|
||||
|
||||
// these buffers are usually provided by the user application
|
||||
DeviceMem x_dev(sizeof(InOutDataType) * x.mDesc.GetElementSpaceSize());
|
||||
DeviceMem y_dev(sizeof(InOutDataType) * y.mDesc.GetElementSpaceSize());
|
||||
DeviceMem bnScale_dev(sizeof(AccDataType) * bnScale.mDesc.GetElementSpaceSize());
|
||||
DeviceMem bnBias_dev(sizeof(AccDataType) * bnBias.mDesc.GetElementSpaceSize());
|
||||
|
||||
// mean_dev or resultSaveMean_dev
|
||||
DeviceMem resultSaveMean_dev(sizeof(AccDataType) *
|
||||
resultSaveMean_ref.mDesc.GetElementSpaceSize());
|
||||
// meansquare_dev or resultSaveInvVariance_dev
|
||||
DeviceMem resultSaveInvVariance_dev(sizeof(AccDataType) *
|
||||
resultSaveInvVariance_ref.mDesc.GetElementSpaceSize());
|
||||
// resultRunningMean_dev
|
||||
DeviceMem resultRunningMean_dev(sizeof(AccDataType) *
|
||||
resultRunningMean_ref.mDesc.GetElementSpaceSize());
|
||||
// resultRunningVariance_dev
|
||||
DeviceMem resultRunningVariance_dev(sizeof(AccDataType) *
|
||||
resultRunningVariance_ref.mDesc.GetElementSpaceSize());
|
||||
|
||||
x_dev.ToDevice(x.mData.data());
|
||||
bnScale_dev.ToDevice(bnScale.mData.data());
|
||||
bnBias_dev.ToDevice(bnBias.mData.data());
|
||||
|
||||
if(updateMovingAverage)
|
||||
{
|
||||
resultRunningMean_dev.ToDevice(resultRunningMean_ref.mData.data());
|
||||
resultRunningVariance_dev.ToDevice(resultRunningVariance_ref.mData.data());
|
||||
};
|
||||
|
||||
std::array<index_t, Rank> i_inOutLengths;
|
||||
std::array<index_t, Rank> i_inOutStrides;
|
||||
std::array<index_t, Rank - NumReduceDim> i_scaleBiasMeanVarLengths;
|
||||
std::array<index_t, Rank - NumReduceDim> i_scaleBiasMeanVarStrides;
|
||||
|
||||
ck::ranges::copy(inOutLengths, i_inOutLengths.begin());
|
||||
ck::ranges::copy(inOutStrides, i_inOutStrides.begin());
|
||||
ck::ranges::copy(scaleBiasMeanVarLengths, i_scaleBiasMeanVarLengths.begin());
|
||||
ck::ranges::copy(scaleBiasMeanVarStrides, i_scaleBiasMeanVarStrides.begin());
|
||||
|
||||
using PassThroughOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
using DeviceBatchNormFwdInstance =
|
||||
ck::tensor_operation::device::DeviceBatchNormFwdImpl<InOutDataType,
|
||||
InOutDataType,
|
||||
AccDataType,
|
||||
AccDataType, // ScaleDataType
|
||||
AccDataType, // BiasDataType
|
||||
AccDataType, // MeanVarDataType
|
||||
PassThroughOp, // YElementwiseOp
|
||||
Rank,
|
||||
NumReduceDim,
|
||||
UseMultiblockInK,
|
||||
256,
|
||||
16,
|
||||
16,
|
||||
1,
|
||||
2,
|
||||
0,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1>;
|
||||
|
||||
auto batchnorm_fwd = DeviceBatchNormFwdInstance{};
|
||||
|
||||
auto argument_ptr = batchnorm_fwd.MakeArgumentPointer(
|
||||
i_inOutLengths,
|
||||
i_inOutStrides,
|
||||
i_inOutStrides,
|
||||
{0, 1, 2}, // indicates physical indices of reduce dimensions in lengths[] and strides[]
|
||||
i_scaleBiasMeanVarLengths,
|
||||
i_scaleBiasMeanVarStrides,
|
||||
i_scaleBiasMeanVarStrides,
|
||||
i_scaleBiasMeanVarStrides,
|
||||
x_dev.GetDeviceBuffer(),
|
||||
bnScale_dev.GetDeviceBuffer(),
|
||||
bnBias_dev.GetDeviceBuffer(),
|
||||
epsilon,
|
||||
PassThroughOp{},
|
||||
y_dev.GetDeviceBuffer(),
|
||||
saveMeanAndInvVariance ? resultSaveMean_dev.GetDeviceBuffer() : nullptr,
|
||||
saveMeanAndInvVariance ? resultSaveInvVariance_dev.GetDeviceBuffer() : nullptr,
|
||||
averageFactor,
|
||||
updateMovingAverage ? resultRunningMean_dev.GetDeviceBuffer() : nullptr,
|
||||
updateMovingAverage ? resultRunningVariance_dev.GetDeviceBuffer() : nullptr);
|
||||
|
||||
if(!batchnorm_fwd.IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
std::cout << "The runtime parameters seems not supported by the BatchNorm device instance, "
|
||||
"exiting!"
|
||||
<< std::endl;
|
||||
return (false);
|
||||
};
|
||||
|
||||
size_t workspace_sz = batchnorm_fwd.GetWorkSpaceSize(argument_ptr.get());
|
||||
|
||||
DeviceMem workspace_dev(workspace_sz);
|
||||
|
||||
batchnorm_fwd.SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
|
||||
|
||||
auto invoker_ptr = batchnorm_fwd.MakeInvokerPointer();
|
||||
|
||||
if(time_kernel)
|
||||
{
|
||||
float avg_time = 0.0f;
|
||||
size_t num_bytes = 0;
|
||||
|
||||
size_t total_length = inOutLengths[0] * inOutLengths[1] * inOutLengths[2] * inOutLengths[3];
|
||||
size_t invariant_length = inOutLengths[3];
|
||||
|
||||
avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
// inputing of x, scale, bias, outputing of y
|
||||
num_bytes +=
|
||||
total_length * sizeof(InOutDataType) * 2 + invariant_length * sizeof(AccDataType) * 2;
|
||||
|
||||
// outputing of mean, inv-variance
|
||||
num_bytes += saveMeanAndInvVariance ? invariant_length * sizeof(AccDataType) * 2 : 0;
|
||||
|
||||
// updating of moving mean, variance
|
||||
num_bytes += updateMovingAverage ? invariant_length * sizeof(AccDataType) * 4 : 0;
|
||||
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
|
||||
}
|
||||
else
|
||||
(void)invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
bool pass = true;
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
|
||||
using ReferenceBatchNormFwdInstance =
|
||||
ck::tensor_operation::host::ReferenceBatchNormFwd<InOutDataType,
|
||||
InOutDataType,
|
||||
AccDataType,
|
||||
AccDataType,
|
||||
AccDataType,
|
||||
AccDataType,
|
||||
PassThroughOp,
|
||||
Rank,
|
||||
NumReduceDim>;
|
||||
|
||||
auto batchNormFwd_ref = ReferenceBatchNormFwdInstance{};
|
||||
|
||||
auto argument_ptr_ref = batchNormFwd_ref.MakeArgumentPointer(
|
||||
i_inOutLengths,
|
||||
i_inOutStrides,
|
||||
i_inOutStrides,
|
||||
{0, 1, 2}, // indicates physical indices of reduce dimensions in lengths[] and strides[]
|
||||
i_scaleBiasMeanVarLengths,
|
||||
i_scaleBiasMeanVarStrides,
|
||||
i_scaleBiasMeanVarStrides,
|
||||
i_scaleBiasMeanVarStrides,
|
||||
x.mData.data(),
|
||||
bnScale.mData.data(),
|
||||
bnBias.mData.data(),
|
||||
epsilon,
|
||||
PassThroughOp{},
|
||||
y_ref.mData.data(),
|
||||
saveMeanAndInvVariance ? resultSaveMean_ref.mData.data() : nullptr,
|
||||
saveMeanAndInvVariance ? resultSaveInvVariance_ref.mData.data() : nullptr,
|
||||
averageFactor,
|
||||
updateMovingAverage ? resultRunningMean_ref.mData.data() : nullptr,
|
||||
updateMovingAverage ? resultRunningVariance_ref.mData.data() : nullptr);
|
||||
|
||||
if(!batchNormFwd_ref.IsSupportedArgument(argument_ptr_ref.get()))
|
||||
{
|
||||
std::cout << "The runtime parameters seems not supported by the BatchNorm reference "
|
||||
"instance, exiting!"
|
||||
<< std::endl;
|
||||
return (false);
|
||||
};
|
||||
|
||||
auto invoker_ptr_ref = batchNormFwd_ref.MakeInvokerPointer();
|
||||
|
||||
(void)invoker_ptr_ref->Run(argument_ptr_ref.get());
|
||||
|
||||
y_dev.FromDevice(y.mData.data());
|
||||
pass = pass && ck::utils::check_err(y, y_ref, "Incorrect normalized output values");
|
||||
|
||||
if(updateMovingAverage)
|
||||
{
|
||||
Tensor<AccDataType> resultRunningMean(scaleBiasMeanVarLengths);
|
||||
Tensor<AccDataType> resultRunningVariance(scaleBiasMeanVarLengths);
|
||||
|
||||
resultRunningMean_dev.FromDevice(resultRunningMean.mData.data());
|
||||
resultRunningVariance_dev.FromDevice(resultRunningVariance.mData.data());
|
||||
|
||||
pass = pass && ck::utils::check_err(resultRunningMean,
|
||||
resultRunningMean_ref,
|
||||
"Incorrect running mean values");
|
||||
pass = pass && ck::utils::check_err(resultRunningVariance,
|
||||
resultRunningVariance_ref,
|
||||
"Incorrect running variance values");
|
||||
};
|
||||
|
||||
if(saveMeanAndInvVariance)
|
||||
{
|
||||
using ck::host_common::dumpBufferToFile;
|
||||
|
||||
Tensor<AccDataType> resultSaveMean(scaleBiasMeanVarLengths);
|
||||
Tensor<AccDataType> resultSaveInvVariance(scaleBiasMeanVarLengths);
|
||||
|
||||
resultSaveMean_dev.FromDevice(resultSaveMean.mData.data());
|
||||
resultSaveInvVariance_dev.FromDevice(resultSaveInvVariance.mData.data());
|
||||
|
||||
pass = pass && ck::utils::check_err(
|
||||
resultSaveMean, resultSaveMean_ref, "Incorrect saved mean values");
|
||||
pass = pass && ck::utils::check_err(resultSaveInvVariance,
|
||||
resultSaveInvVariance_ref,
|
||||
"Incorrect saved invvariance values");
|
||||
};
|
||||
};
|
||||
|
||||
return (pass);
|
||||
};
|
||||
|
||||
const double epsilon = std::numeric_limits<float>::epsilon();
|
||||
static const double averageFactor = 0.1;
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool pass = true;
|
||||
|
||||
if(argc > 1)
|
||||
{
|
||||
BatchNormFwdArg arg;
|
||||
|
||||
if(arg.processArgs(argc, argv) < 0)
|
||||
return (-1);
|
||||
|
||||
if(arg.data_type == 0)
|
||||
{
|
||||
if(arg.use_multiblock_welford)
|
||||
pass = bnorm_fwd_nhwc_test<ck::half_t, float, true>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.updateMovingAverage,
|
||||
arg.saveMeanAndInvVariance,
|
||||
averageFactor,
|
||||
epsilon);
|
||||
else
|
||||
pass = bnorm_fwd_nhwc_test<ck::half_t, float, false>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.updateMovingAverage,
|
||||
arg.saveMeanAndInvVariance,
|
||||
averageFactor,
|
||||
epsilon);
|
||||
}
|
||||
else if(arg.data_type == 1)
|
||||
{
|
||||
if(arg.use_multiblock_welford)
|
||||
pass = bnorm_fwd_nhwc_test<float, float, true>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.updateMovingAverage,
|
||||
arg.saveMeanAndInvVariance,
|
||||
averageFactor,
|
||||
epsilon);
|
||||
else
|
||||
pass = bnorm_fwd_nhwc_test<float, float, false>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.updateMovingAverage,
|
||||
arg.saveMeanAndInvVariance,
|
||||
averageFactor,
|
||||
epsilon);
|
||||
}
|
||||
else if(arg.data_type == 3)
|
||||
{
|
||||
if(arg.use_multiblock_welford)
|
||||
pass = bnorm_fwd_nhwc_test<int8_t, float, true>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.updateMovingAverage,
|
||||
arg.saveMeanAndInvVariance,
|
||||
averageFactor,
|
||||
epsilon);
|
||||
else
|
||||
pass = bnorm_fwd_nhwc_test<int8_t, float, false>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.updateMovingAverage,
|
||||
arg.saveMeanAndInvVariance,
|
||||
averageFactor,
|
||||
epsilon);
|
||||
}
|
||||
else if(arg.data_type == 5)
|
||||
{
|
||||
if(arg.use_multiblock_welford)
|
||||
pass = bnorm_fwd_nhwc_test<ck::bhalf_t, float, true>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.updateMovingAverage,
|
||||
arg.saveMeanAndInvVariance,
|
||||
averageFactor,
|
||||
epsilon);
|
||||
else
|
||||
pass = bnorm_fwd_nhwc_test<ck::bhalf_t, float, false>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.updateMovingAverage,
|
||||
arg.saveMeanAndInvVariance,
|
||||
averageFactor,
|
||||
epsilon);
|
||||
}
|
||||
else if(arg.data_type == 6)
|
||||
{
|
||||
if(arg.use_multiblock_welford)
|
||||
pass = bnorm_fwd_nhwc_test<double, double, true>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.updateMovingAverage,
|
||||
arg.saveMeanAndInvVariance,
|
||||
averageFactor,
|
||||
epsilon);
|
||||
else
|
||||
pass = bnorm_fwd_nhwc_test<double, double, false>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.updateMovingAverage,
|
||||
arg.saveMeanAndInvVariance,
|
||||
averageFactor,
|
||||
epsilon);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
pass = bnorm_fwd_nhwc_test<ck::half_t, float, true>(true,
|
||||
2,
|
||||
false, // don't time kernel
|
||||
{128, 16, 6, 512},
|
||||
true,
|
||||
true,
|
||||
averageFactor,
|
||||
epsilon);
|
||||
|
||||
pass = pass && bnorm_fwd_nhwc_test<ck::half_t, float, false>(true,
|
||||
2,
|
||||
false, // don't time kernel
|
||||
{128, 16, 3, 1024},
|
||||
true,
|
||||
true,
|
||||
averageFactor,
|
||||
epsilon);
|
||||
};
|
||||
|
||||
return (pass ? 0 : 1);
|
||||
}
|
||||
@@ -0,0 +1,598 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <limits>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <array>
|
||||
#include <algorithm>
|
||||
#include <getopt.h>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/utility/algorithm.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/host_common_util.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl_obsolete.hpp"
|
||||
#include "ck/library/utility/host_common_util.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
static struct option long_options[] = {{"inOutLengths", required_argument, nullptr, 'D'},
|
||||
{"verify", required_argument, nullptr, 'v'},
|
||||
{"help", no_argument, nullptr, '?'},
|
||||
{nullptr, 0, nullptr, 0}};
|
||||
|
||||
class BatchNormFwdArg
|
||||
{
|
||||
private:
|
||||
int option_index = 0;
|
||||
|
||||
public:
|
||||
std::vector<size_t> inOutLengths;
|
||||
|
||||
bool do_verification = false;
|
||||
|
||||
bool updateMovingAverage;
|
||||
bool saveMeanAndInvVariance;
|
||||
|
||||
int data_type = 0;
|
||||
int init_method = 2;
|
||||
bool time_kernel = false;
|
||||
bool use_multiblock_welford = false;
|
||||
|
||||
public:
|
||||
void show_usage(const char* cmd)
|
||||
{
|
||||
std::cout << "Usage of " << cmd << std::endl;
|
||||
std::cout << "--inOutLengths or -D, comma separated list of input tensor dimension "
|
||||
"lengths, must have 4 integers for nhwc"
|
||||
<< std::endl;
|
||||
std::cout << "--verify or -v, 1/0 to indicate whether to verify the batch-normalization "
|
||||
"result by "
|
||||
"comparing with the host-based batch-normalization"
|
||||
<< std::endl;
|
||||
std::cout << "Arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64)" << std::endl;
|
||||
std::cout << "Arg2: 1/0 to indicate whether to update the moving average and variance "
|
||||
"(0=no, 1=yes)"
|
||||
<< std::endl;
|
||||
std::cout << "Arg3: 1/0 to indicate whether to save the calculated mean and invVariance "
|
||||
"(0=no, 1=yes)"
|
||||
<< std::endl;
|
||||
std::cout << "Arg4: init method used for bnScale and bnBias (0=no init, 1=single integer "
|
||||
"value, 2=scope integer "
|
||||
"value, 3=decimal value)"
|
||||
<< std::endl;
|
||||
std::cout << "Arg5: time kernel (0=no, 1=yes)" << std::endl;
|
||||
std::cout << "Arg6: use multi-block welford (0=n0, 1=yes)" << std::endl;
|
||||
};
|
||||
|
||||
int processArgs(int argc, char* argv[])
|
||||
{
|
||||
using ck::host_common::getTypeValuesFromString;
|
||||
|
||||
int ch;
|
||||
|
||||
while(1)
|
||||
{
|
||||
ch = getopt_long(argc, argv, "D:v:", long_options, &option_index);
|
||||
if(ch == -1)
|
||||
break;
|
||||
switch(ch)
|
||||
{
|
||||
case 'D':
|
||||
if(!optarg)
|
||||
throw std::runtime_error("Invalid option format!");
|
||||
|
||||
inOutLengths = getTypeValuesFromString<size_t>(optarg);
|
||||
|
||||
if(inOutLengths.size() != 4)
|
||||
throw std::runtime_error(
|
||||
"NHWC tensor layout should have 4 length values specified!");
|
||||
break;
|
||||
case 'v':
|
||||
if(!optarg)
|
||||
throw std::runtime_error("Invalid option format!");
|
||||
|
||||
do_verification = static_cast<bool>(std::atoi(optarg));
|
||||
break;
|
||||
case '?':
|
||||
if(std::string(long_options[option_index].name) == "help")
|
||||
{
|
||||
show_usage(argv[0]);
|
||||
return (-1);
|
||||
};
|
||||
break;
|
||||
default: show_usage(argv[0]); return (-1);
|
||||
};
|
||||
};
|
||||
|
||||
if(optind + 6 > argc)
|
||||
throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
|
||||
|
||||
data_type = std::atoi(argv[optind++]);
|
||||
updateMovingAverage = std::atoi(argv[optind++]);
|
||||
saveMeanAndInvVariance = std::atoi(argv[optind++]);
|
||||
init_method = std::atoi(argv[optind++]);
|
||||
time_kernel = static_cast<bool>(std::atoi(argv[optind++]));
|
||||
use_multiblock_welford = static_cast<bool>(std::atoi(argv[optind]));
|
||||
|
||||
if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5 && data_type != 6)
|
||||
return (-1);
|
||||
|
||||
return (0);
|
||||
};
|
||||
};
|
||||
|
||||
using namespace ck;
|
||||
|
||||
template <typename InOutDataType, typename AccDataType, bool UseMultiblockInK>
|
||||
bool bnorm_fwd_nhwc_test(bool do_verification,
|
||||
int init_method,
|
||||
bool time_kernel,
|
||||
const std::vector<size_t> inOutLengths,
|
||||
bool updateMovingAverage,
|
||||
bool saveMeanAndInvVariance,
|
||||
double averageFactor,
|
||||
double epsilon)
|
||||
{
|
||||
// for NHWC BatchNorm calculation of mean and meansquare
|
||||
constexpr int Rank = 4;
|
||||
constexpr int NumReduceDim = 3;
|
||||
|
||||
// when using lengths[] to create a tensor, lengths[0] is the length of highest dimension
|
||||
// eg. N of NHWC, so lengths[3] is the dimension C length of NHWC
|
||||
const std::vector<size_t> scaleBiasMeanVarLengths = {inOutLengths[3]};
|
||||
|
||||
// input data of the batchnorm forward algorithm
|
||||
Tensor<InOutDataType> x(inOutLengths);
|
||||
Tensor<AccDataType> bnScale(scaleBiasMeanVarLengths);
|
||||
Tensor<AccDataType> bnBias(scaleBiasMeanVarLengths);
|
||||
|
||||
// output data of the batchnorm forward algorithm
|
||||
Tensor<InOutDataType> y_ref(inOutLengths);
|
||||
Tensor<InOutDataType> y(inOutLengths);
|
||||
|
||||
Tensor<AccDataType> resultSaveMean_ref(scaleBiasMeanVarLengths);
|
||||
Tensor<AccDataType> resultSaveInvVariance_ref(scaleBiasMeanVarLengths);
|
||||
|
||||
Tensor<AccDataType> resultRunningMean_ref(scaleBiasMeanVarLengths);
|
||||
Tensor<AccDataType> resultRunningVariance_ref(scaleBiasMeanVarLengths);
|
||||
|
||||
auto inOutStrides = x.mDesc.GetStrides();
|
||||
auto scaleBiasMeanVarStrides = bnScale.mDesc.GetStrides();
|
||||
|
||||
std::size_t num_thread = std::thread::hardware_concurrency();
|
||||
|
||||
if(updateMovingAverage)
|
||||
{
|
||||
if constexpr(std::is_same<InOutDataType, int8_t>::value)
|
||||
{
|
||||
x.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
|
||||
|
||||
const float x_mean = 0.0f;
|
||||
const float x_stddev = 2.5f;
|
||||
const float noise_stddev = 0.04f;
|
||||
|
||||
resultRunningMean_ref.GenerateTensorValue(
|
||||
GeneratorTensor_4<AccDataType>{x_mean, noise_stddev}, num_thread);
|
||||
|
||||
resultRunningVariance_ref.GenerateTensorValue(
|
||||
GeneratorTensor_4<AccDataType>{x_stddev * x_stddev, noise_stddev}, num_thread);
|
||||
}
|
||||
else
|
||||
{
|
||||
const float x_mean = 0.0f;
|
||||
const float x_stddev = 1.0f;
|
||||
const float noise_stddev = 0.04f;
|
||||
|
||||
// input data in normal distribution
|
||||
x.GenerateTensorValue(GeneratorTensor_4<InOutDataType>{x_mean, x_stddev}, num_thread);
|
||||
|
||||
// initialize the runningMean to be values with tiny variation to the mean of the x
|
||||
// values
|
||||
resultRunningMean_ref.GenerateTensorValue(
|
||||
GeneratorTensor_4<AccDataType>{x_mean, noise_stddev}, num_thread);
|
||||
|
||||
// initialize the runningVariance to be values with tiny variation to the variance of
|
||||
// the x values
|
||||
resultRunningVariance_ref.GenerateTensorValue(
|
||||
GeneratorTensor_4<AccDataType>{x_stddev * x_stddev, noise_stddev}, num_thread);
|
||||
};
|
||||
}
|
||||
else
|
||||
{
|
||||
if constexpr(std::is_same<InOutDataType, int8_t>::value)
|
||||
x.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
|
||||
else
|
||||
x.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0f, 5.0f}, num_thread);
|
||||
};
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
switch(init_method)
|
||||
{
|
||||
case 0:
|
||||
bnScale.GenerateTensorValue(GeneratorTensor_0<AccDataType>{}, num_thread);
|
||||
bnBias.GenerateTensorValue(GeneratorTensor_0<AccDataType>{}, num_thread);
|
||||
break;
|
||||
case 1:
|
||||
bnScale.GenerateTensorValue(GeneratorTensor_1<AccDataType>{1}, num_thread);
|
||||
bnBias.GenerateTensorValue(GeneratorTensor_1<AccDataType>{0}, num_thread);
|
||||
break;
|
||||
case 2:
|
||||
bnScale.GenerateTensorValue(GeneratorTensor_2<AccDataType>{-5, 5}, num_thread);
|
||||
bnBias.GenerateTensorValue(GeneratorTensor_2<AccDataType>{-5, 5}, num_thread);
|
||||
break;
|
||||
default:
|
||||
bnScale.GenerateTensorValue(GeneratorTensor_3<AccDataType>{-5.0f, 5.0f}, num_thread);
|
||||
bnBias.GenerateTensorValue(GeneratorTensor_3<AccDataType>{-5.0f, 5.0f}, num_thread);
|
||||
}
|
||||
};
|
||||
|
||||
// these buffers are usually provided by the user application
|
||||
DeviceMem x_dev(sizeof(InOutDataType) * x.mDesc.GetElementSpaceSize());
|
||||
DeviceMem y_dev(sizeof(InOutDataType) * y.mDesc.GetElementSpaceSize());
|
||||
DeviceMem bnScale_dev(sizeof(AccDataType) * bnScale.mDesc.GetElementSpaceSize());
|
||||
DeviceMem bnBias_dev(sizeof(AccDataType) * bnBias.mDesc.GetElementSpaceSize());
|
||||
|
||||
// mean_dev or resultSaveMean_dev
|
||||
DeviceMem resultSaveMean_dev(sizeof(AccDataType) *
|
||||
resultSaveMean_ref.mDesc.GetElementSpaceSize());
|
||||
// meansquare_dev or resultSaveInvVariance_dev
|
||||
DeviceMem resultSaveInvVariance_dev(sizeof(AccDataType) *
|
||||
resultSaveInvVariance_ref.mDesc.GetElementSpaceSize());
|
||||
// resultRunningMean_dev
|
||||
DeviceMem resultRunningMean_dev(sizeof(AccDataType) *
|
||||
resultRunningMean_ref.mDesc.GetElementSpaceSize());
|
||||
// resultRunningVariance_dev
|
||||
DeviceMem resultRunningVariance_dev(sizeof(AccDataType) *
|
||||
resultRunningVariance_ref.mDesc.GetElementSpaceSize());
|
||||
|
||||
x_dev.ToDevice(x.mData.data());
|
||||
bnScale_dev.ToDevice(bnScale.mData.data());
|
||||
bnBias_dev.ToDevice(bnBias.mData.data());
|
||||
|
||||
if(updateMovingAverage)
|
||||
{
|
||||
resultRunningMean_dev.ToDevice(resultRunningMean_ref.mData.data());
|
||||
resultRunningVariance_dev.ToDevice(resultRunningVariance_ref.mData.data());
|
||||
};
|
||||
|
||||
std::array<index_t, Rank> i_inOutLengths;
|
||||
std::array<index_t, Rank> i_inOutStrides;
|
||||
std::array<index_t, Rank - NumReduceDim> i_scaleBiasMeanVarLengths;
|
||||
std::array<index_t, Rank - NumReduceDim> i_scaleBiasMeanVarStrides;
|
||||
|
||||
ck::ranges::copy(inOutLengths, i_inOutLengths.begin());
|
||||
ck::ranges::copy(inOutStrides, i_inOutStrides.begin());
|
||||
ck::ranges::copy(scaleBiasMeanVarLengths, i_scaleBiasMeanVarLengths.begin());
|
||||
ck::ranges::copy(scaleBiasMeanVarStrides, i_scaleBiasMeanVarStrides.begin());
|
||||
|
||||
using PassThroughOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
using DeviceBatchNormFwdInstance =
|
||||
ck::tensor_operation::device::DeviceBatchNormFwdImpl<InOutDataType,
|
||||
InOutDataType,
|
||||
AccDataType,
|
||||
AccDataType, // ScaleDataType
|
||||
AccDataType, // BiasDataType
|
||||
AccDataType, // MeanVarDataType
|
||||
PassThroughOp, // YElementwiseOp
|
||||
Rank,
|
||||
NumReduceDim,
|
||||
UseMultiblockInK,
|
||||
256,
|
||||
16,
|
||||
16,
|
||||
1,
|
||||
2,
|
||||
0,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
1>;
|
||||
|
||||
auto batchnorm_fwd = DeviceBatchNormFwdInstance{};
|
||||
|
||||
auto argument_ptr = batchnorm_fwd.MakeArgumentPointer(
|
||||
i_inOutLengths,
|
||||
i_inOutStrides,
|
||||
i_inOutStrides,
|
||||
{0, 1, 2}, // indicates physical indices of reduce dimensions in lengths[] and strides[]
|
||||
i_scaleBiasMeanVarLengths,
|
||||
i_scaleBiasMeanVarStrides,
|
||||
i_scaleBiasMeanVarStrides,
|
||||
i_scaleBiasMeanVarStrides,
|
||||
x_dev.GetDeviceBuffer(),
|
||||
bnScale_dev.GetDeviceBuffer(),
|
||||
bnBias_dev.GetDeviceBuffer(),
|
||||
epsilon,
|
||||
PassThroughOp{},
|
||||
y_dev.GetDeviceBuffer(),
|
||||
saveMeanAndInvVariance ? resultSaveMean_dev.GetDeviceBuffer() : nullptr,
|
||||
saveMeanAndInvVariance ? resultSaveInvVariance_dev.GetDeviceBuffer() : nullptr,
|
||||
averageFactor,
|
||||
updateMovingAverage ? resultRunningMean_dev.GetDeviceBuffer() : nullptr,
|
||||
updateMovingAverage ? resultRunningVariance_dev.GetDeviceBuffer() : nullptr);
|
||||
|
||||
if(!batchnorm_fwd.IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
std::cout << "The runtime parameters seems not supported by the BatchNorm device instance, "
|
||||
"exiting!"
|
||||
<< std::endl;
|
||||
return (false);
|
||||
};
|
||||
|
||||
size_t workspace_sz = batchnorm_fwd.GetWorkSpaceSize(argument_ptr.get());
|
||||
|
||||
DeviceMem workspace_dev(workspace_sz);
|
||||
|
||||
batchnorm_fwd.SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
|
||||
|
||||
auto invoker_ptr = batchnorm_fwd.MakeInvokerPointer();
|
||||
|
||||
if(time_kernel)
|
||||
{
|
||||
float avg_time = 0.0f;
|
||||
size_t num_bytes = 0;
|
||||
|
||||
size_t total_length = inOutLengths[0] * inOutLengths[1] * inOutLengths[2] * inOutLengths[3];
|
||||
size_t invariant_length = inOutLengths[3];
|
||||
|
||||
avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
// inputing of x, scale, bias, outputing of y
|
||||
num_bytes +=
|
||||
total_length * sizeof(InOutDataType) * 2 + invariant_length * sizeof(AccDataType) * 2;
|
||||
|
||||
// outputing of mean, inv-variance
|
||||
num_bytes += saveMeanAndInvVariance ? invariant_length * sizeof(AccDataType) * 2 : 0;
|
||||
|
||||
// updating of moving mean, variance
|
||||
num_bytes += updateMovingAverage ? invariant_length * sizeof(AccDataType) * 4 : 0;
|
||||
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
|
||||
}
|
||||
else
|
||||
(void)invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
bool pass = true;
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
|
||||
using ReferenceBatchNormFwdInstance =
|
||||
ck::tensor_operation::host::ReferenceBatchNormFwd<InOutDataType,
|
||||
InOutDataType,
|
||||
AccDataType,
|
||||
AccDataType,
|
||||
AccDataType,
|
||||
AccDataType,
|
||||
PassThroughOp,
|
||||
Rank,
|
||||
NumReduceDim>;
|
||||
|
||||
auto batchNormFwd_ref = ReferenceBatchNormFwdInstance{};
|
||||
|
||||
auto argument_ptr_ref = batchNormFwd_ref.MakeArgumentPointer(
|
||||
i_inOutLengths,
|
||||
i_inOutStrides,
|
||||
i_inOutStrides,
|
||||
{0, 1, 2}, // indicates physical indices of reduce dimensions in lengths[] and strides[]
|
||||
i_scaleBiasMeanVarLengths,
|
||||
i_scaleBiasMeanVarStrides,
|
||||
i_scaleBiasMeanVarStrides,
|
||||
i_scaleBiasMeanVarStrides,
|
||||
x.mData.data(),
|
||||
bnScale.mData.data(),
|
||||
bnBias.mData.data(),
|
||||
epsilon,
|
||||
PassThroughOp{},
|
||||
y_ref.mData.data(),
|
||||
saveMeanAndInvVariance ? resultSaveMean_ref.mData.data() : nullptr,
|
||||
saveMeanAndInvVariance ? resultSaveInvVariance_ref.mData.data() : nullptr,
|
||||
averageFactor,
|
||||
updateMovingAverage ? resultRunningMean_ref.mData.data() : nullptr,
|
||||
updateMovingAverage ? resultRunningVariance_ref.mData.data() : nullptr);
|
||||
|
||||
if(!batchNormFwd_ref.IsSupportedArgument(argument_ptr_ref.get()))
|
||||
{
|
||||
std::cout << "The runtime parameters seems not supported by the BatchNorm reference "
|
||||
"instance, exiting!"
|
||||
<< std::endl;
|
||||
return (false);
|
||||
};
|
||||
|
||||
auto invoker_ptr_ref = batchNormFwd_ref.MakeInvokerPointer();
|
||||
|
||||
(void)invoker_ptr_ref->Run(argument_ptr_ref.get());
|
||||
|
||||
y_dev.FromDevice(y.mData.data());
|
||||
pass = pass && ck::utils::check_err(y, y_ref, "Incorrect normalized output values");
|
||||
|
||||
if(updateMovingAverage)
|
||||
{
|
||||
Tensor<AccDataType> resultRunningMean(scaleBiasMeanVarLengths);
|
||||
Tensor<AccDataType> resultRunningVariance(scaleBiasMeanVarLengths);
|
||||
|
||||
resultRunningMean_dev.FromDevice(resultRunningMean.mData.data());
|
||||
resultRunningVariance_dev.FromDevice(resultRunningVariance.mData.data());
|
||||
|
||||
pass = pass && ck::utils::check_err(resultRunningMean,
|
||||
resultRunningMean_ref,
|
||||
"Incorrect running mean values");
|
||||
pass = pass && ck::utils::check_err(resultRunningVariance,
|
||||
resultRunningVariance_ref,
|
||||
"Incorrect running variance values");
|
||||
};
|
||||
|
||||
if(saveMeanAndInvVariance)
|
||||
{
|
||||
using ck::host_common::dumpBufferToFile;
|
||||
|
||||
Tensor<AccDataType> resultSaveMean(scaleBiasMeanVarLengths);
|
||||
Tensor<AccDataType> resultSaveInvVariance(scaleBiasMeanVarLengths);
|
||||
|
||||
resultSaveMean_dev.FromDevice(resultSaveMean.mData.data());
|
||||
resultSaveInvVariance_dev.FromDevice(resultSaveInvVariance.mData.data());
|
||||
|
||||
pass = pass && ck::utils::check_err(
|
||||
resultSaveMean, resultSaveMean_ref, "Incorrect saved mean values");
|
||||
pass = pass && ck::utils::check_err(resultSaveInvVariance,
|
||||
resultSaveInvVariance_ref,
|
||||
"Incorrect saved invvariance values");
|
||||
};
|
||||
};
|
||||
|
||||
return (pass);
|
||||
};
|
||||
|
||||
// Epsilon handed to bnorm_fwd_nhwc_test by main(): the machine epsilon of float, stored as a
// double because that is the parameter type the test function takes.
static constexpr double epsilon = std::numeric_limits<float>::epsilon();

// Value handed to bnorm_fwd_nhwc_test by main() as its averageFactor argument.
static constexpr double averageFactor = 0.1;
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool pass = true;
|
||||
|
||||
if(argc > 1)
|
||||
{
|
||||
BatchNormFwdArg arg;
|
||||
|
||||
if(arg.processArgs(argc, argv) < 0)
|
||||
return (-1);
|
||||
|
||||
if(arg.data_type == 0)
|
||||
{
|
||||
if(arg.use_multiblock_welford)
|
||||
pass = bnorm_fwd_nhwc_test<ck::half_t, float, true>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.updateMovingAverage,
|
||||
arg.saveMeanAndInvVariance,
|
||||
averageFactor,
|
||||
epsilon);
|
||||
else
|
||||
pass = bnorm_fwd_nhwc_test<ck::half_t, float, false>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.updateMovingAverage,
|
||||
arg.saveMeanAndInvVariance,
|
||||
averageFactor,
|
||||
epsilon);
|
||||
}
|
||||
else if(arg.data_type == 1)
|
||||
{
|
||||
if(arg.use_multiblock_welford)
|
||||
pass = bnorm_fwd_nhwc_test<float, float, true>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.updateMovingAverage,
|
||||
arg.saveMeanAndInvVariance,
|
||||
averageFactor,
|
||||
epsilon);
|
||||
else
|
||||
pass = bnorm_fwd_nhwc_test<float, float, false>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.updateMovingAverage,
|
||||
arg.saveMeanAndInvVariance,
|
||||
averageFactor,
|
||||
epsilon);
|
||||
}
|
||||
else if(arg.data_type == 3)
|
||||
{
|
||||
if(arg.use_multiblock_welford)
|
||||
pass = bnorm_fwd_nhwc_test<int8_t, float, true>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.updateMovingAverage,
|
||||
arg.saveMeanAndInvVariance,
|
||||
averageFactor,
|
||||
epsilon);
|
||||
else
|
||||
pass = bnorm_fwd_nhwc_test<int8_t, float, false>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.updateMovingAverage,
|
||||
arg.saveMeanAndInvVariance,
|
||||
averageFactor,
|
||||
epsilon);
|
||||
}
|
||||
else if(arg.data_type == 5)
|
||||
{
|
||||
if(arg.use_multiblock_welford)
|
||||
pass = bnorm_fwd_nhwc_test<ck::bhalf_t, float, true>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.updateMovingAverage,
|
||||
arg.saveMeanAndInvVariance,
|
||||
averageFactor,
|
||||
epsilon);
|
||||
else
|
||||
pass = bnorm_fwd_nhwc_test<ck::bhalf_t, float, false>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.updateMovingAverage,
|
||||
arg.saveMeanAndInvVariance,
|
||||
averageFactor,
|
||||
epsilon);
|
||||
}
|
||||
else if(arg.data_type == 6)
|
||||
{
|
||||
if(arg.use_multiblock_welford)
|
||||
pass = bnorm_fwd_nhwc_test<double, double, true>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.updateMovingAverage,
|
||||
arg.saveMeanAndInvVariance,
|
||||
averageFactor,
|
||||
epsilon);
|
||||
else
|
||||
pass = bnorm_fwd_nhwc_test<double, double, false>(arg.do_verification,
|
||||
arg.init_method,
|
||||
arg.time_kernel,
|
||||
arg.inOutLengths,
|
||||
arg.updateMovingAverage,
|
||||
arg.saveMeanAndInvVariance,
|
||||
averageFactor,
|
||||
epsilon);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
pass = bnorm_fwd_nhwc_test<ck::half_t, float, true>(true,
|
||||
2,
|
||||
false, // don't time kernel
|
||||
{128, 16, 6, 512},
|
||||
true,
|
||||
true,
|
||||
averageFactor,
|
||||
epsilon);
|
||||
|
||||
pass = pass && bnorm_fwd_nhwc_test<ck::half_t, float, false>(true,
|
||||
2,
|
||||
false, // don't time kernel
|
||||
{128, 16, 3, 1024},
|
||||
true,
|
||||
true,
|
||||
averageFactor,
|
||||
epsilon);
|
||||
};
|
||||
|
||||
return (pass ? 0 : 1);
|
||||
}
|
||||
136
example/34_batchnorm/batchnorm_infer_impl.hpp
Normal file
136
example/34_batchnorm/batchnorm_infer_impl.hpp
Normal file
@@ -0,0 +1,136 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cassert>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/utility/sequence.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
#include "ck/utility/reduction_operator.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
|
||||
|
||||
#include "batchnorm_common.hpp"
|
||||
|
||||
template <typename XDataType,
|
||||
typename YDataType,
|
||||
typename AccDataType,
|
||||
typename ScaleDataType,
|
||||
typename BiasDataType,
|
||||
typename MeanVarDataType,
|
||||
ck::index_t Rank,
|
||||
ck::index_t NumBatchNormReduceDim,
|
||||
bool fastest_dim_is_reduced = false>
|
||||
int bnorm_infer(
|
||||
bool time_kernel,
|
||||
const std::array<int, NumBatchNormReduceDim> reduceDims,
|
||||
const std::array<ck::index_t, Rank> xyLengths,
|
||||
const std::array<ck::index_t, Rank> xStrides,
|
||||
const std::array<ck::index_t, Rank> yStrides,
|
||||
const std::array<ck::index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarLengths,
|
||||
const std::array<ck::index_t, Rank - NumBatchNormReduceDim> bnScaleStrides,
|
||||
const std::array<ck::index_t, Rank - NumBatchNormReduceDim> bnBiasStrides,
|
||||
const std::array<ck::index_t, Rank - NumBatchNormReduceDim> bnMeanVarStrides,
|
||||
const void* p_x,
|
||||
const void* p_scale,
|
||||
const void* p_bias,
|
||||
double epsilon,
|
||||
const void* p_estimatedMean,
|
||||
const void* p_estimatedVariance,
|
||||
void* p_y)
|
||||
{
|
||||
(void)bnScaleBiasMeanVarLengths;
|
||||
|
||||
static_assert(NumBatchNormReduceDim < Rank,
|
||||
"Invalid number of reduced dimensions for batchnorm!");
|
||||
|
||||
using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwiseImpl<
|
||||
ck::Tuple<XDataType, AccDataType, AccDataType, AccDataType, AccDataType>, // x, mean,
|
||||
// variance,
|
||||
// scale,
|
||||
// bias,
|
||||
ck::Tuple<YDataType>, // y
|
||||
NormalizeInInfer,
|
||||
Rank,
|
||||
64, // BlockSize
|
||||
32, // MPerBlock
|
||||
32, // NPerBlock
|
||||
4, // MPerthread
|
||||
4, // NPerthread
|
||||
ck::Sequence<1, 0>, // ThreadClusterArrangeOrder
|
||||
ck::Sequence<1, 1, 1, 1, 1>, // x, mean, variance, scale, bias
|
||||
ck::Sequence<1>>; // scalarPerVector: y
|
||||
|
||||
auto invariantDims = get_invariant_dims<Rank, NumBatchNormReduceDim>(reduceDims);
|
||||
std::array<ck::index_t, Rank> aligned_bnScaleStrides{0};
|
||||
std::array<ck::index_t, Rank> aligned_bnBiasStrides{0};
|
||||
std::array<ck::index_t, Rank> aligned_bnMeanVarStrides{0};
|
||||
|
||||
int i = 0;
|
||||
for(auto dim : invariantDims)
|
||||
{
|
||||
assert(xyLengths[dim] == bnScaleBiasMeanVarLengths[i]);
|
||||
|
||||
aligned_bnScaleStrides[dim] = bnScaleStrides[i];
|
||||
aligned_bnBiasStrides[dim] = bnBiasStrides[i];
|
||||
aligned_bnMeanVarStrides[dim] = bnMeanVarStrides[i];
|
||||
i++;
|
||||
};
|
||||
|
||||
int32_t reduceLength = 1;
|
||||
|
||||
for(auto dim : reduceDims)
|
||||
reduceLength *= xyLengths[dim];
|
||||
|
||||
int32_t invariantLength = 1;
|
||||
|
||||
for(auto dim : invariantDims)
|
||||
invariantLength *= xyLengths[dim];
|
||||
|
||||
size_t total_length = static_cast<size_t>(invariantLength) * reduceLength;
|
||||
|
||||
float avg_time = 0.0f;
|
||||
std::size_t num_bytes = 0;
|
||||
|
||||
auto dev_normalize = DeviceNormalizeInstance{};
|
||||
|
||||
auto argument_ptr1 = dev_normalize.MakeArgumentPointer(
|
||||
xyLengths,
|
||||
{xStrides,
|
||||
aligned_bnMeanVarStrides,
|
||||
aligned_bnMeanVarStrides,
|
||||
aligned_bnScaleStrides,
|
||||
aligned_bnBiasStrides},
|
||||
{yStrides},
|
||||
{p_x, p_estimatedMean, p_estimatedVariance, p_scale, p_bias},
|
||||
{p_y},
|
||||
NormalizeInInfer{epsilon});
|
||||
|
||||
if(!dev_normalize.IsSupportedArgument(argument_ptr1.get()))
|
||||
{
|
||||
std::cout << "The runtime parameters seems not supported by the Devic, exiting!"
|
||||
<< std::endl;
|
||||
|
||||
return (-1);
|
||||
};
|
||||
|
||||
auto invoker_ptr1 = dev_normalize.MakeInvokerPointer();
|
||||
|
||||
avg_time += invoker_ptr1->Run(argument_ptr1.get(), StreamConfig{nullptr, time_kernel});
|
||||
|
||||
num_bytes += total_length * sizeof(XDataType) +
|
||||
invariantLength *
|
||||
(sizeof(ScaleDataType) + sizeof(BiasDataType) + 2 * sizeof(MeanVarDataType)) +
|
||||
total_length * sizeof(YDataType);
|
||||
|
||||
if(time_kernel)
|
||||
{
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
|
||||
};
|
||||
|
||||
return (0);
|
||||
};
|
||||
Reference in New Issue
Block a user