mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-19 22:39:03 +00:00
BatchNorm backward implementation (#461)
* Implemented batchnorm-backward blockwise and multiblock kernels
* Add batchnorm-backward device op
* Add batchnorm-backward host-reference op
* Add batchnorm-backward example
* Rename parameters in batchnorm backward kernels and device op
* Change the example to loosen the threshold for ScaleDiff checking
* Add comments to explain the implementation of batchnorm-backward
* Rename parameters again in batchnorm backward kernels
* Improve the expression calculation for performance
* Add batchnorm backward to README
* Add comments to explain inv-variance in batchnorm forward and backward
* Rename the batchnorm forward training and inferring examples
* Add/update the comments for batchnorm-backward kernels
* Rename again
* Add block_sync_lds between two consecutive blockwise reductions
* Move the common expression 1/N out of the static_for loops
* Add dy_elementwise_op
* Rename in the backward example again
* Add checking for reduceDims in reference_batchnorm_backward
* Update comments and code format
* Rename in the comments
* Move the common expression out of the loop in reference_batchnorm_backward_nhwc_c
* Add block_sync_lds() between blockwise reductions again
* Fix comments again
* Remove int8 from batchnorm-forward instances since it is not needed for forward training and could fail tests
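For reference, the quantities these kernels produce follow the standard batch-normalization backward derivation (a hedged summary for orientation, not text from the PR; here N is the size of the reduced N*H*W dimensions, invVar = 1/sqrt(variance + epsilon) as saved or recomputed via Welford, and dy is taken after dy_elementwise_op):

```math
d\beta = \sum_{i=1}^{N} dy_i, \qquad
d\gamma = \sum_{i=1}^{N} dy_i \, \hat{x}_i, \qquad
dx_i = \frac{\gamma \cdot \mathrm{invVar}}{N}\left(N \, dy_i - d\beta - \hat{x}_i \, d\gamma\right),
\qquad \hat{x}_i = (x_i - \mu) \cdot \mathrm{invVar}
```

Factoring 1/N (and gamma * invVar) out of the per-element loop is the "common expression" optimization mentioned in the commit message above.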
@@ -1,2 +1,3 @@
-add_example_executable(example_batchnorm_forward batchnorm_forward_nhwc.cpp)
-add_example_executable(example_batchnorm_infer batchnorm_infer_nhwc.cpp)
+add_example_executable(example_batchnorm_forward_training batchnorm_forward_training_nhwc.cpp)
+add_example_executable(example_batchnorm_forward_inferring batchnorm_forward_inferring_nhwc.cpp)
+add_example_executable(example_batchnorm_backward batchnorm_backward_nhwc.cpp)
@@ -53,4 +53,29 @@ Start running 10 times...
Perf: 1.28235 ms, 523.329 GB/s
```

## Run ```batchnorm backward nhwc```
```bash
# -D <xxx> : input 4-d tensor lengths
# -v <x>   : verification (0=no, 1=yes)
# Arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bf16, 6: fp64)
# Arg2: 1/0 to indicate whether to use saved mean and invVariance
# Arg3: init method used for dy and bnScale (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
# Arg4: time kernel (0=no, 1=yes)
# Arg5: use multi-block welford (0=no, 1=yes)
./bin/example_batchnorm_backward -D 128,16,3,1024 -v 1 0 0 3 1 1
```

Result
```
./bin/example_batchnorm_backward -D 128,16,3,1024 -v 1 0 0 3 1 1
launch_and_time_kernel: grid_dim {6144, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
launch_and_time_kernel: grid_dim {6144, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
launch_and_time_kernel: grid_dim {6144, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
Perf: 0.411026 ms, 91.8702 GB/s
```
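Note (added for orientation): in the multi-block Welford path without saved mean/invVariance, the device op launches three kernels per run (multiblock Welford first half; Welford second half fused with the first half of the dscale/dbias reduction; final reduction fused with the dx computation), which is why the timed output above shows three warm-up/run groups.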
502
example/34_batchnorm/batchnorm_backward_nhwc.cpp
Normal file
@@ -0,0 +1,502 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <cmath>
#include <limits>
#include <iostream>
#include <thread>
#include <getopt.h>

#include "ck/ck.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward_nhwc_c.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp"

static struct option long_options[] = {{"inOutLengths", required_argument, nullptr, 'D'},
                                       {"verify", required_argument, nullptr, 'v'},
                                       {"help", no_argument, nullptr, '?'},
                                       {nullptr, 0, nullptr, 0}};

class BatchNormBwdArg
{
    private:
    int option_index = 0;

    public:
    std::vector<size_t> inOutLengths;

    bool do_verification = false;

    bool haveSavedMeanInvVar;

    int data_type               = 0;
    int init_method             = 3;
    bool time_kernel            = false;
    bool use_multiblock_welford = false;

    public:
    void show_usage(const char* cmd)
    {
        // clang-format off
        std::cout << "Usage of " << cmd << std::endl;
        std::cout << "--inOutLengths or -D, comma separated list of input tensor dimension lengths, must have 4 integers for nhwc" << std::endl;
        std::cout << "--verify or -v, 1/0 to indicate whether to verify the result by comparing with the host-based batch-normalization" << std::endl;
        std::cout << "Arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bf16, 6: fp64)" << std::endl;
        std::cout << "Arg2: 1/0 to indicate whether to use saved mean and invVariance" << std::endl;
        std::cout << "Arg3: init method used for dy and bnScale (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)" << std::endl;
        std::cout << "Arg4: time kernel (0=no, 1=yes)" << std::endl;
        std::cout << "Arg5: use multi-block welford (0=no, 1=yes)" << std::endl;
        // clang-format on
    };

    int processArgs(int argc, char* argv[])
    {
        using ck::host_common::getTypeValuesFromString;

        int ch;

        while(1)
        {
            ch = getopt_long(argc, argv, "D:v:", long_options, &option_index);
            if(ch == -1)
                break;
            switch(ch)
            {
            case 'D':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");

                inOutLengths = getTypeValuesFromString<size_t>(optarg);

                if(inOutLengths.size() != 4)
                    throw std::runtime_error(
                        "NHWC tensor layout should have 4 length values specified!");
                break;
            case 'v':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");

                do_verification = static_cast<bool>(std::atoi(optarg));
                break;
            case '?':
                if(std::string(long_options[option_index].name) == "help")
                {
                    show_usage(argv[0]);
                    return (-1);
                };
                break;
            default: show_usage(argv[0]); return (-1);
            };
        };

        if(optind + 5 > argc)
            throw std::runtime_error("Invalid cmd-line arguments, more arguments are needed!");

        data_type              = std::atoi(argv[optind++]);
        haveSavedMeanInvVar    = std::atoi(argv[optind++]);
        init_method            = std::atoi(argv[optind++]);
        time_kernel            = static_cast<bool>(std::atoi(argv[optind++]));
        use_multiblock_welford = static_cast<bool>(std::atoi(argv[optind]));

        return (0);
    };
};

using namespace ck;

template <typename InOutDataType, typename AccDataType, bool UseMultiblockInK>
bool bnorm_bwd_nhwc_test(bool do_verification,
                         int init_method,
                         bool time_kernel,
                         const std::vector<size_t> inOutLengths,
                         bool haveSavedMeanInvVar,
                         double epsilon)
{
    // for NHWC BatchNorm, mean and inv-variance are computed over the N, H, W dimensions
    constexpr index_t Rank         = 4;
    constexpr index_t NumReduceDim = 3;

    const std::vector<size_t> scaleBiasMeanVarLengths = {inOutLengths[3]};

    // input data of the batchnorm backward algorithm
    Tensor<InOutDataType> x(inOutLengths);
    Tensor<InOutDataType> dy(inOutLengths);

    Tensor<AccDataType> bnScale(scaleBiasMeanVarLengths);

    Tensor<AccDataType> savedMean(scaleBiasMeanVarLengths);
    Tensor<AccDataType> savedInvVar(scaleBiasMeanVarLengths);
    // savedVariance is only used for initializing savedInvVar
    Tensor<AccDataType> savedVariance(scaleBiasMeanVarLengths);

    // output data of the batchnorm backward algorithm
    Tensor<InOutDataType> dx_ref(inOutLengths);
    Tensor<InOutDataType> dx(inOutLengths);

    Tensor<AccDataType> dscale(scaleBiasMeanVarLengths);
    Tensor<AccDataType> dbias(scaleBiasMeanVarLengths);

    Tensor<AccDataType> dscale_ref(scaleBiasMeanVarLengths);
    Tensor<AccDataType> dbias_ref(scaleBiasMeanVarLengths);

    auto inOutStrides            = dy.mDesc.GetStrides();
    auto scaleBiasMeanVarStrides = dscale.mDesc.GetStrides();

    std::size_t num_thread = std::thread::hardware_concurrency();

    if(haveSavedMeanInvVar)
    {
        const float x_mean       = 0.0f;
        const float x_stddev     = 1.0f;
        const float noise_stddev = 0.0001f;

        // input data in normal distribution
        x.GenerateTensorValue(GeneratorTensor_4<InOutDataType>{x_mean, x_stddev}, num_thread);

        // initialize the savedMean to be values with tiny variation to the mean of the x values
        savedMean.GenerateTensorValue(GeneratorTensor_4<AccDataType>{x_mean, noise_stddev},
                                      num_thread);

        // initialize the variance to be values with tiny variation to the variance of the x values
        savedVariance.GenerateTensorValue(
            GeneratorTensor_4<AccDataType>{x_stddev * x_stddev, noise_stddev}, num_thread);

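        // comment added for clarity: convert the generated variance into inv-variance,
        // invVariance = 1/sqrt(variance + epsilon), which is the form the backward
        // kernels consume (the epsilon here is just the float machine epsilon)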
        auto it_src       = savedVariance.mData.begin();
        auto it_dst       = savedInvVar.mData.begin();
        float tmp_epsilon = std::numeric_limits<float>::epsilon();

        while(it_src != savedVariance.mData.end())
        {
            *it_dst = type_convert<AccDataType>(
                1.0f / std::sqrt(type_convert<float>(*it_src) + tmp_epsilon));

            it_src++;
            it_dst++;
        };
    }
    else
    {
        const float x_mean   = 0.0f;
        const float x_stddev = 1.0f;

        // input data in normal distribution
        x.GenerateTensorValue(GeneratorTensor_4<InOutDataType>{x_mean, x_stddev}, num_thread);
    };

    if(do_verification)
    {
        switch(init_method)
        {
        case 0:
            dy.GenerateTensorValue(GeneratorTensor_0<InOutDataType>{}, num_thread);
            bnScale.GenerateTensorValue(GeneratorTensor_0<InOutDataType>{}, num_thread);
            break;
        case 1:
            dy.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
            bnScale.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
            break;
        case 2:
            dy.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
            bnScale.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
            break;
        default:
            dy.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-0.2f, 0.2f}, num_thread);
            bnScale.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-0.5f, 0.5f}, num_thread);
        }
    };

    // input data of the batchnorm backward algorithm
    DeviceMem x_dev(sizeof(InOutDataType) * x.mDesc.GetElementSpaceSize());
    DeviceMem dy_dev(sizeof(InOutDataType) * dy.mDesc.GetElementSpaceSize());

    DeviceMem bnScale_dev(sizeof(AccDataType) * bnScale.mDesc.GetElementSpaceSize());

    DeviceMem savedMean_dev(sizeof(AccDataType) * savedMean.mDesc.GetElementSpaceSize());
    DeviceMem savedInvVar_dev(sizeof(AccDataType) * savedInvVar.mDesc.GetElementSpaceSize());

    // output data of the batchnorm backward algorithm
    DeviceMem dx_dev(sizeof(InOutDataType) * dx.mDesc.GetElementSpaceSize());

    DeviceMem dscale_dev(sizeof(AccDataType) * dscale.mDesc.GetElementSpaceSize());
    DeviceMem dbias_dev(sizeof(AccDataType) * dbias.mDesc.GetElementSpaceSize());

    x_dev.ToDevice(x.mData.data());
    dy_dev.ToDevice(dy.mData.data());
    bnScale_dev.ToDevice(bnScale.mData.data());

    if(haveSavedMeanInvVar)
    {
        savedMean_dev.ToDevice(savedMean.mData.data());
        savedInvVar_dev.ToDevice(savedInvVar.mData.data());
    };

    std::array<index_t, Rank> i_inOutLengths;
    std::array<index_t, Rank> i_inOutStrides;
    std::array<index_t, Rank - NumReduceDim> i_scaleBiasMeanVarLengths;
    std::array<index_t, Rank - NumReduceDim> i_scaleBiasMeanVarStrides;

    std::copy(inOutLengths.begin(), inOutLengths.end(), i_inOutLengths.begin());
    std::copy(inOutStrides.begin(), inOutStrides.end(), i_inOutStrides.begin());
    std::copy(scaleBiasMeanVarLengths.begin(),
              scaleBiasMeanVarLengths.end(),
              i_scaleBiasMeanVarLengths.begin());
    std::copy(scaleBiasMeanVarStrides.begin(),
              scaleBiasMeanVarStrides.end(),
              i_scaleBiasMeanVarStrides.begin());

    using PassThroughOp = ck::tensor_operation::element_wise::PassThrough;

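    // comment added for clarity: the unnamed numeric template arguments below are, in order,
    // BlockSize = 256, MThreadClusterSize = 16, KThreadClusterSize = 16, MThreadSliceSize = 1,
    // KThreadSliceSize = 2 and XDyDxVectorDim = 0 (see DeviceBatchNormBwdImpl)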
    using DeviceBatchNormBwdInstance =
        ck::tensor_operation::device::DeviceBatchNormBwdImpl<InOutDataType,
                                                             InOutDataType,
                                                             InOutDataType,
                                                             AccDataType,
                                                             AccDataType, // ScaleDataType
                                                             AccDataType, // BiasDataType
                                                             AccDataType, // MeanVarDataType
                                                             PassThroughOp,
                                                             Rank,
                                                             NumReduceDim,
                                                             UseMultiblockInK,
                                                             256,
                                                             16,
                                                             16,
                                                             1,
                                                             2,
                                                             0,
                                                             1, // XSrcVectorSize
                                                             1, // DySrcVectorSize
                                                             1, // DxDstVectorSize
                                                             1, // ScaleSrcDstVectorSize
                                                             1, // BiasDstVectorSize
                                                             1>; // MeanVarSrcVectorSize

    auto batchnorm_bwd = DeviceBatchNormBwdInstance{};

    auto argument_ptr = batchnorm_bwd.MakeArgumentPointer(
        i_inOutLengths,
        i_inOutStrides,
        i_inOutStrides,
        i_inOutStrides,
        {0, 1, 2},
        i_scaleBiasMeanVarLengths,
        i_scaleBiasMeanVarStrides,
        i_scaleBiasMeanVarStrides,
        i_scaleBiasMeanVarStrides,
        x_dev.GetDeviceBuffer(),
        dy_dev.GetDeviceBuffer(),
        bnScale_dev.GetDeviceBuffer(),
        haveSavedMeanInvVar ? savedMean_dev.GetDeviceBuffer() : nullptr,
        haveSavedMeanInvVar ? savedInvVar_dev.GetDeviceBuffer() : nullptr,
        epsilon,
        PassThroughOp{},
        dx_dev.GetDeviceBuffer(),
        dscale_dev.GetDeviceBuffer(),
        dbias_dev.GetDeviceBuffer());

    if(!batchnorm_bwd.IsSupportedArgument(argument_ptr.get()))
    {
        std::cout << "The runtime parameters seem not to be supported by the BatchNorm device "
                     "instance, exiting!"
                  << std::endl;
        return (false);
    };

    size_t workspace_sz = batchnorm_bwd.GetWorkSpaceSize(argument_ptr.get());

    DeviceMem workspace_dev(workspace_sz);

    batchnorm_bwd.SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());

    auto invoker_ptr = batchnorm_bwd.MakeInvokerPointer();

    if(time_kernel)
    {
        float avg_time   = 0.0f;
        size_t num_bytes = 0;

        size_t total_length = inOutLengths[0] * inOutLengths[1] * inOutLengths[2] * inOutLengths[3];
        size_t invariant_length = inOutLengths[3];

        avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

        // reading of x, dy, scale; writing of dx, dscale, dbias
        num_bytes +=
            total_length * sizeof(InOutDataType) * 3 + invariant_length * sizeof(AccDataType) * 3;

        // reading of saved mean and inv-variance
        num_bytes += haveSavedMeanInvVar ? invariant_length * sizeof(AccDataType) * 2 : 0;

        float gb_per_sec = num_bytes / 1.E6 / avg_time;

        std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
    }
    else
        (void)invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

    bool pass = true;

    if(do_verification)
    {
        using ReferenceBatchNormBwdInstance =
            ck::tensor_operation::host::ReferenceBatchNormBwd_Input_N_H_W_C_Output_C<
                InOutDataType,
                InOutDataType,
                InOutDataType,
                AccDataType,
                AccDataType,
                AccDataType,
                AccDataType,
                PassThroughOp>;

        auto batchNormBwd_ref = ReferenceBatchNormBwdInstance{};

        auto argument_ptr_ref = batchNormBwd_ref.MakeArgumentPointer(
            i_inOutLengths,
            i_inOutStrides,
            i_inOutStrides,
            i_inOutStrides,
            {0, 1, 2},
            i_scaleBiasMeanVarLengths,
            i_scaleBiasMeanVarStrides,
            i_scaleBiasMeanVarStrides,
            i_scaleBiasMeanVarStrides,
            x.mData.data(),
            dy.mData.data(),
            bnScale.mData.data(),
            haveSavedMeanInvVar ? savedMean.mData.data() : nullptr,
            haveSavedMeanInvVar ? savedInvVar.mData.data() : nullptr,
            epsilon,
            PassThroughOp{},
            dx_ref.mData.data(),
            dscale_ref.mData.data(),
            dbias_ref.mData.data());

        if(!batchNormBwd_ref.IsSupportedArgument(argument_ptr_ref.get()))
        {
            std::cout
                << "The runtime parameters seem not to be supported by the reference instance, "
                   "exiting!"
                << std::endl;
            return (false);
        };

        auto invoker_ptr_ref = batchNormBwd_ref.MakeInvokerPointer();

        (void)invoker_ptr_ref->Run(argument_ptr_ref.get());

        dx_dev.FromDevice(dx.mData.data());
        dscale_dev.FromDevice(dscale.mData.data());
        dbias_dev.FromDevice(dbias.mData.data());

        // clang-format off
        pass = pass && ck::utils::check_err(dbias.mData, dbias_ref.mData, "dBias result:", 1e-5, 1e-5);
        pass = pass && ck::utils::check_err(dscale.mData, dscale_ref.mData, "dScale result:", 1e-5, 2e-4);
        pass = pass && ck::utils::check_err(dx.mData, dx_ref.mData, "dx result:");
        // clang-format on
    };

    return (pass);
};

static const double epsilon = std::numeric_limits<float>::epsilon();

int main(int argc, char* argv[])
{
    bool pass = true;

    if(argc > 1)
    {
        BatchNormBwdArg arg;

        if(arg.processArgs(argc, argv) < 0)
            return (-1);

        if(arg.data_type == 0)
        {
            if(arg.use_multiblock_welford)
                pass = bnorm_bwd_nhwc_test<ck::half_t, float, true>(
                    arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths,
                    arg.haveSavedMeanInvVar, epsilon);
            else
                pass = bnorm_bwd_nhwc_test<ck::half_t, float, false>(
                    arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths,
                    arg.haveSavedMeanInvVar, epsilon);
        }
        else if(arg.data_type == 1)
        {
            if(arg.use_multiblock_welford)
                pass = bnorm_bwd_nhwc_test<float, float, true>(
                    arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths,
                    arg.haveSavedMeanInvVar, epsilon);
            else
                pass = bnorm_bwd_nhwc_test<float, float, false>(
                    arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths,
                    arg.haveSavedMeanInvVar, epsilon);
        }
        else if(arg.data_type == 5)
        {
            if(arg.use_multiblock_welford)
                pass = bnorm_bwd_nhwc_test<ck::bhalf_t, float, true>(
                    arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths,
                    arg.haveSavedMeanInvVar, epsilon);
            else
                pass = bnorm_bwd_nhwc_test<ck::bhalf_t, float, false>(
                    arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths,
                    arg.haveSavedMeanInvVar, epsilon);
        }
        else if(arg.data_type == 6)
        {
            if(arg.use_multiblock_welford)
                pass = bnorm_bwd_nhwc_test<double, double, true>(
                    arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths,
                    arg.haveSavedMeanInvVar, epsilon);
            else
                pass = bnorm_bwd_nhwc_test<double, double, false>(
                    arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths,
                    arg.haveSavedMeanInvVar, epsilon);
        }
    }
    else
    {
        pass = bnorm_bwd_nhwc_test<ck::half_t, float, true>(
            true, 3, false /* don't time kernel */, {128, 16, 6, 512}, false, epsilon);

        pass = pass && bnorm_bwd_nhwc_test<ck::half_t, float, false>(
                           true, 3, false /* don't time kernel */, {128, 16, 3, 1024}, false, epsilon);
    };

    return (pass ? 0 : 1);
}
@@ -0,0 +1,51 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <array>
#include <memory>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <index_t Rank, index_t NumBatchNormReduceDim, typename DyElementwiseOp>
struct DeviceBatchNormBwd : public BaseOperator
{
    static constexpr index_t NumInvariantDim = Rank - NumBatchNormReduceDim;

    virtual std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const std::array<index_t, Rank> xyLengths,
                        const std::array<index_t, Rank> xStrides,
                        const std::array<index_t, Rank> dyStrides,
                        const std::array<index_t, Rank> dxStrides,
                        const std::array<int, NumBatchNormReduceDim> reduceDims,
                        const std::array<ck::index_t, NumInvariantDim> bnScaleBiasMeanVarLengths,
                        const std::array<ck::index_t, NumInvariantDim> bnScaleStrides,
                        const std::array<ck::index_t, NumInvariantDim> bnBiasStrides,
                        const std::array<ck::index_t, NumInvariantDim> bnMeanVarStrides,
                        const void* p_x,
                        const void* p_dy,
                        const void* p_scale,
                        const void* p_savedMean,
                        const void* p_savedInvVar,
                        double epsilon,
                        const DyElementwiseOp dy_elementwise_op,
                        void* p_dx,
                        void* p_dscale,
                        void* p_dbias) = 0;

    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};

template <index_t Rank, index_t NumBatchNormReduceDim, typename DyElementwiseOp>
using DeviceBatchNormBwdPtr =
    std::unique_ptr<DeviceBatchNormBwd<Rank, NumBatchNormReduceDim, DyElementwiseOp>>;

} // namespace device
} // namespace tensor_operation
} // namespace ck
@@ -0,0 +1,866 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iostream>
#include <sstream>

#include "ck/utility/reduction_operator.hpp"
#include "ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_batchnorm_backward_blockwise_welford.hpp"
#include "ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_first_half.hpp"
#include "ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_multiblock_reduce_first_half.hpp"
#include "ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/welford_helper.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <typename XDataType,
          typename DxDataType,
          typename DyDataType,
          typename AccDataType,
          typename ScaleDataType,
          typename BiasDataType,
          typename MeanVarDataType,
          typename DyElementwiseOp,
          index_t Rank,
          index_t NumBatchNormReduceDim,
          bool UseMultiblockInK,
          index_t BlockSize,
          index_t MThreadClusterSize,
          index_t KThreadClusterSize,
          index_t MThreadSliceSize,
          index_t KThreadSliceSize,
          index_t XDyDxVectorDim,
          index_t XSrcVectorSize,
          index_t DySrcVectorSize,
          index_t DxDstVectorSize,
          index_t ScaleSrcDstVectorSize,
          index_t BiasDstVectorSize,
          index_t MeanVarSrcVectorSize>
struct DeviceBatchNormBwdImpl
    : public DeviceBatchNormBwd<Rank, NumBatchNormReduceDim, DyElementwiseOp>
{
    static_assert(Rank <= 6, "Rank larger than 6 is not supported!");
    static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
                  "Invalid thread cluster size assignments!");

    static_assert((XDyDxVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0 &&
                   MThreadSliceSize % DySrcVectorSize == 0 &&
                   MThreadSliceSize % DxDstVectorSize == 0) ||
                      (XDyDxVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0 &&
                       KThreadSliceSize % DySrcVectorSize == 0 &&
                       KThreadSliceSize % DxDstVectorSize == 0),
                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");

    static constexpr index_t NumInvariantDim = Rank - NumBatchNormReduceDim;

    static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
    static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;

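    // comment added for clarity: merge the invariant dimensions into M and the reduce
    // dimensions into K, then right-pad M to a multiple of M_BlockTileSize and K up to
    // blkGroupSize * numBlockTileIteration * K_BlockTileSize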
    static auto MakeXY2dDescriptor(const std::array<index_t, Rank>& xyLengths,
                                   const std::array<index_t, Rank>& xyStrides,
                                   int blkGroupSize,
                                   int numBlockTileIteration)
    {
        const auto tupleXYLengths =
            generate_tuple([&](auto I) { return xyLengths[I]; }, Number<Rank>{});
        const auto tupleXYStrides =
            generate_tuple([&](auto I) { return xyStrides[I]; }, Number<Rank>{});

        const auto raw_grid_desc = make_naive_tensor_descriptor(tupleXYLengths, tupleXYStrides);

        const auto grid_desc_m_k = [&]() {
            using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type;
            using ReduceDims    = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;

            const auto reduceDimLengths =
                generate_tuple([&](auto I) { return xyLengths[NumInvariantDim + I]; },
                               Number<NumBatchNormReduceDim>{});
            const auto invariantDimLengths =
                generate_tuple([&](auto I) { return xyLengths[I]; }, Number<NumInvariantDim>{});

            return transform_tensor_descriptor(
                raw_grid_desc,
                make_tuple(make_merge_transform(invariantDimLengths),
                           make_merge_transform(reduceDimLengths)),
                make_tuple(InvariantDims{}, ReduceDims{}),
                make_tuple(Sequence<0>{}, Sequence<1>{}));
        }();

        const auto invariantLength = grid_desc_m_k.GetLength(Number<0>{});
        const auto reduceLength    = grid_desc_m_k.GetLength(Number<1>{});

        const int workSizePerBlock = K_BlockTileSize * numBlockTileIteration;
        const auto mPad =
            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
        const auto kPad = workSizePerBlock * blkGroupSize - reduceLength;

        auto grid_desc_m_k_padded =
            transform_tensor_descriptor(grid_desc_m_k,
                                        make_tuple(make_right_pad_transform(invariantLength, mPad),
                                                   make_right_pad_transform(reduceLength, kPad)),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}));

        return (grid_desc_m_k_padded);
    };

    static auto MakeMultiblockFirstReduceOutputMG2dDescriptor(int invariantLength, int blkGroupSize)
    {
        const auto grid_desc_m_g =
            make_naive_tensor_descriptor_packed(make_tuple(invariantLength, blkGroupSize));

        const auto mPad =
            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;

        auto grid_desc_m_g_padded =
            transform_tensor_descriptor(grid_desc_m_g,
                                        make_tuple(make_right_pad_transform(invariantLength, mPad),
                                                   make_pass_through_transform(blkGroupSize)),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}));

        return (grid_desc_m_g_padded);
    };

    static auto MakeMultiblockFinalReduceInputMK2dDescriptor(int invariantLength, int blkGroupSize)
    {
        const auto reduceLength = blkGroupSize;
        const auto grid_desc_m_k =
            make_naive_tensor_descriptor_packed(make_tuple(invariantLength, reduceLength));

        const auto mPad =
            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
        const auto kPad =
            math::integer_least_multiple(reduceLength, KThreadClusterSize) - reduceLength;

        auto grid_desc_m_k_padded =
            transform_tensor_descriptor(grid_desc_m_k,
                                        make_tuple(make_right_pad_transform(invariantLength, mPad),
                                                   make_right_pad_transform(reduceLength, kPad)),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}));

        return (grid_desc_m_k_padded);
    };

    static auto
    MakeScaleBiasMeanVar1dDescriptor(const std::array<index_t, NumInvariantDim>& lengths,
                                     const std::array<index_t, NumInvariantDim>& strides)
    {
        const auto tupleLengths =
            generate_tuple([&](auto I) { return lengths[I]; }, Number<NumInvariantDim>{});
        const auto tupleStrides =
            generate_tuple([&](auto I) { return strides[I]; }, Number<NumInvariantDim>{});

        auto raw_grid_desc = make_naive_tensor_descriptor(tupleLengths, tupleStrides);

        auto grid_desc_m = transform_tensor_descriptor(
            raw_grid_desc,
            make_tuple(make_merge_transform(tupleLengths)),
            make_tuple(typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type{}),
            make_tuple(Sequence<0>{}));

        const auto invariantLength = grid_desc_m.GetLength(Number<0>{});

        const auto mPad =
            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;

        auto grid_desc_m_padded =
            transform_tensor_descriptor(grid_desc_m,
                                        make_tuple(make_right_pad_transform(invariantLength, mPad)),
                                        make_tuple(Sequence<0>{}),
                                        make_tuple(Sequence<0>{}));
        return (grid_desc_m_padded);
    };

    using XYGridDesc_M_K      = decltype(MakeXY2dDescriptor({1}, {1}, 1, 1));
    using ScaleBiasGridDesc_M = decltype(MakeScaleBiasMeanVar1dDescriptor({1}, {1}));
    using MeanVarGridDesc_M   = ScaleBiasGridDesc_M;

    struct Argument : public BaseArgument
    {
        Argument(const std::array<index_t, Rank> xyLengths,
                 const std::array<index_t, Rank> xStrides,
                 const std::array<index_t, Rank> dyStrides,
                 const std::array<index_t, Rank> dxStrides,
                 const std::array<int, NumBatchNormReduceDim> reduceDims,
                 const std::array<ck::index_t, NumInvariantDim> bnScaleBiasMeanVarLengths,
                 const std::array<ck::index_t, NumInvariantDim> bnScaleStrides,
                 const std::array<ck::index_t, NumInvariantDim> bnBiasStrides,
                 const std::array<ck::index_t, NumInvariantDim> bnMeanVarStrides,
                 const XDataType* p_x,
                 const DyDataType* p_dy,
                 const ScaleDataType* p_scale,
                 const MeanVarDataType* p_savedMean,
                 const MeanVarDataType* p_savedInvVar,
                 const DyElementwiseOp dy_elementwise_op,
                 double epsilon,
                 DxDataType* p_dx,
                 ScaleDataType* p_dscale,
                 BiasDataType* p_dbias)
            : bnScaleBiasMeanVarLengths_(bnScaleBiasMeanVarLengths),
              bnScaleStrides_(bnScaleStrides),
              bnBiasStrides_(bnBiasStrides),
              bnMeanVarStrides_(bnMeanVarStrides),
              p_x_(p_x),
              p_dy_(p_dy),
              p_scale_(p_scale),
              p_savedMean_(p_savedMean),
              p_savedInvVar_(p_savedInvVar),
              dy_elementwise_op_(dy_elementwise_op),
              p_dx_(p_dx),
              p_dscale_(p_dscale),
              p_dbias_(p_dbias)
        {
            xyLengths_ =
                shuffle_tensor_dimensions<Rank, NumBatchNormReduceDim>(xyLengths, reduceDims);
            xStrides_ =
                shuffle_tensor_dimensions<Rank, NumBatchNormReduceDim>(xStrides, reduceDims);
            dyStrides_ =
                shuffle_tensor_dimensions<Rank, NumBatchNormReduceDim>(dyStrides, reduceDims);
            dxStrides_ =
                shuffle_tensor_dimensions<Rank, NumBatchNormReduceDim>(dxStrides, reduceDims);

            std::tie(invariant_length, reduce_length) =
                get_2d_lengths<Rank, NumBatchNormReduceDim>(xyLengths_);

            epsilon_ = type_convert<AccDataType>(epsilon);

            haveSavedMeanInvVar_ = (p_savedMean_ != nullptr && p_savedInvVar_ != nullptr);

            if(UseMultiblockInK)
            {
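                // worked example (comment added for clarity, matching the README run): with
                // -D 128,16,3,1024 we have reduce_length = 128*16*3 = 6144 and
                // K_BlockTileSize = 32, so iterations = 2 gives blkGroupSize = 96 <= 128;
                // with invariant_length = 1024 and M_BlockTileSize = 16 this yields
                // gridSize = 64 * 96 = 6144 blocks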
                int iterations = 1;
                while(true)
                {
                    int testBlkGroupSize = (reduce_length + (K_BlockTileSize * iterations) - 1) /
                                           (K_BlockTileSize * iterations);

                    // we want blkGroupSize to be no more than 128
                    if(testBlkGroupSize <= 128)
                        break;

                    iterations++;
                };

                blkGroupSize = (reduce_length + (K_BlockTileSize * iterations) - 1) /
                               (K_BlockTileSize * iterations);

                numBlockTileIteration = iterations;
            }
            else
            {
                blkGroupSize          = 1;
                numBlockTileIteration = (reduce_length + K_BlockTileSize - 1) / K_BlockTileSize;
            };

            gridSize = (invariant_length + M_BlockTileSize - 1) / M_BlockTileSize * blkGroupSize;

            x_grid_desc_m_k =
                MakeXY2dDescriptor(xyLengths_, xStrides_, blkGroupSize, numBlockTileIteration);
            dy_grid_desc_m_k =
                MakeXY2dDescriptor(xyLengths_, dyStrides_, blkGroupSize, numBlockTileIteration);
            dx_grid_desc_m_k =
                MakeXY2dDescriptor(xyLengths_, dxStrides_, blkGroupSize, numBlockTileIteration);
            scale_grid_desc_m =
                MakeScaleBiasMeanVar1dDescriptor(bnScaleBiasMeanVarLengths, bnScaleStrides);
            bias_grid_desc_m =
                MakeScaleBiasMeanVar1dDescriptor(bnScaleBiasMeanVarLengths, bnBiasStrides);
            mean_var_grid_desc_m =
                MakeScaleBiasMeanVar1dDescriptor(bnScaleBiasMeanVarLengths, bnMeanVarStrides);
        }

        AccDataType epsilon_;

        bool haveSavedMeanInvVar_;

        std::array<index_t, Rank> xyLengths_;
        std::array<index_t, Rank> xStrides_;
        std::array<index_t, Rank> dyStrides_;
        std::array<index_t, Rank> dxStrides_;

        std::array<index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarLengths_;
        std::array<index_t, Rank - NumBatchNormReduceDim> bnScaleStrides_;
        std::array<index_t, Rank - NumBatchNormReduceDim> bnBiasStrides_;
        std::array<index_t, Rank - NumBatchNormReduceDim> bnMeanVarStrides_;

        const XDataType* p_x_;
        const DyDataType* p_dy_;
        const ScaleDataType* p_scale_;
        const MeanVarDataType* p_savedMean_;
        const MeanVarDataType* p_savedInvVar_;
        const DyElementwiseOp dy_elementwise_op_;
        DxDataType* p_dx_;
        ScaleDataType* p_dscale_;
        BiasDataType* p_dbias_;

        long_index_t invariant_length;
        long_index_t reduce_length;

        int blkGroupSize;
        int numBlockTileIteration;
        size_t gridSize;

        XYGridDesc_M_K x_grid_desc_m_k;
        XYGridDesc_M_K dy_grid_desc_m_k;
        XYGridDesc_M_K dx_grid_desc_m_k;
        ScaleBiasGridDesc_M scale_grid_desc_m;
        ScaleBiasGridDesc_M bias_grid_desc_m;
        MeanVarGridDesc_M mean_var_grid_desc_m;

        void* workspace_mean;
        void* workspace_variance;
        void* workspace_count;

        void* workspace_savedMean;
        void* workspace_savedInvVar;

        void* workspace_reduce_dscale;
        void* workspace_reduce_dbias;
    };

    size_t GetWorkSpaceSize(const BaseArgument* pArg) const override
    {
        const Argument* pArg_ = dynamic_cast<const Argument*>(pArg);

        size_t workspace_size = 0;

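        // comment added for clarity: each sub-buffer is over-allocated by 64 bytes so that
        // SetWorkSpacePointer() can round every buffer start up to a 64-byte boundary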
        if(UseMultiblockInK && pArg_->blkGroupSize > 1)
        {
            // workspace for the partial reduced result for dscale
            workspace_size +=
                pArg_->invariant_length * pArg_->blkGroupSize * sizeof(ScaleDataType) + 64;

            // workspace for the partial reduced result for dbias
            workspace_size +=
                pArg_->invariant_length * pArg_->blkGroupSize * sizeof(BiasDataType) + 64;

            if(!pArg_->haveSavedMeanInvVar_)
            {
                // workspace for welford intermediate mean
                workspace_size +=
                    pArg_->invariant_length * pArg_->blkGroupSize * sizeof(MeanVarDataType) + 64;

                // workspace for welford intermediate variance
                workspace_size +=
                    pArg_->invariant_length * pArg_->blkGroupSize * sizeof(MeanVarDataType) + 64;

                // workspace for welford intermediate count
                workspace_size +=
                    pArg_->invariant_length * pArg_->blkGroupSize * sizeof(int32_t) + 64;

                // workspace for welford result mean
                workspace_size += pArg_->invariant_length * sizeof(MeanVarDataType) + 64;

                // workspace for welford result inv_variance
                workspace_size += pArg_->invariant_length * sizeof(MeanVarDataType) + 64;
            };
        }

        return (workspace_size);
    };

    void SetWorkSpacePointer(BaseArgument* pArg, void* p_workspace) const override
    {
        Argument* pArg_ = dynamic_cast<Argument*>(pArg);

        pArg_->p_workspace_ = p_workspace;

        index_t space_sz;

        // setup buffer for the partial reduced result for dscale
        pArg_->workspace_reduce_dscale = pArg_->p_workspace_;

        space_sz = pArg_->invariant_length * pArg_->blkGroupSize * sizeof(ScaleDataType);
        space_sz = math::integer_least_multiple(space_sz, 64);

        // setup buffer for the partial reduced result for dbias
        pArg_->workspace_reduce_dbias =
            reinterpret_cast<char*>(pArg_->workspace_reduce_dscale) + space_sz;

        if(UseMultiblockInK && pArg_->blkGroupSize > 1)
        {
            space_sz = pArg_->invariant_length * pArg_->blkGroupSize * sizeof(BiasDataType);
            space_sz = math::integer_least_multiple(space_sz, 64);

            // setup buffer for welford intermediate mean
            pArg_->workspace_mean =
                reinterpret_cast<char*>(pArg_->workspace_reduce_dbias) + space_sz;

            space_sz = pArg_->invariant_length * pArg_->blkGroupSize * sizeof(MeanVarDataType);
            space_sz = math::integer_least_multiple(space_sz, 64);

            // setup buffer for welford intermediate variance
            pArg_->workspace_variance = reinterpret_cast<char*>(pArg_->workspace_mean) + space_sz;

            space_sz = pArg_->invariant_length * pArg_->blkGroupSize * sizeof(MeanVarDataType);
            space_sz = math::integer_least_multiple(space_sz, 64);

            // setup buffer for welford intermediate count
            pArg_->workspace_count = reinterpret_cast<char*>(pArg_->workspace_variance) + space_sz;

            space_sz = pArg_->invariant_length * pArg_->blkGroupSize * sizeof(int32_t);
            space_sz = math::integer_least_multiple(space_sz, 64);

            // setup buffer for welford result mean
            pArg_->workspace_savedMean = reinterpret_cast<char*>(pArg_->workspace_count) + space_sz;

            space_sz = pArg_->invariant_length * sizeof(MeanVarDataType);
            space_sz = math::integer_least_multiple(space_sz, 64);

            // setup buffer for welford result inv_variance
            pArg_->workspace_savedInvVar =
                reinterpret_cast<char*>(pArg_->workspace_savedMean) + space_sz;
        };
    };

    struct Invoker : public BaseInvoker
    {
        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
            float avg_time = 0;

            const auto mean_var_count_grid_desc_m_g =
                DeviceBatchNormBwdImpl::MakeMultiblockFirstReduceOutputMG2dDescriptor(
                    arg.invariant_length, arg.blkGroupSize);

            const auto dscale_dbias_grid_desc_m_g =
                DeviceBatchNormBwdImpl::MakeMultiblockFirstReduceOutputMG2dDescriptor(
                    arg.invariant_length, arg.blkGroupSize);

            const auto mean_var_count_grid_desc_m_k =
                DeviceBatchNormBwdImpl::MakeMultiblockFinalReduceInputMK2dDescriptor(
                    arg.invariant_length, arg.blkGroupSize);

            const auto dscale_dbias_grid_desc_m_k =
                DeviceBatchNormBwdImpl::MakeMultiblockFinalReduceInputMK2dDescriptor(
                    arg.invariant_length, arg.blkGroupSize);

            using MeanVarCountGridDesc_M_G = decltype(mean_var_count_grid_desc_m_g);
            using MeanVarCountGridDesc_M_K = decltype(mean_var_count_grid_desc_m_k);
            using DscaleDbiasGridDesc_M_G  = decltype(dscale_dbias_grid_desc_m_g);
            using DscaleDbiasGridDesc_M_K  = decltype(dscale_dbias_grid_desc_m_k);

            using GridwiseWelfordSecondHalfReduceFirstHalf_ =
                GridwiseWelfordSecondHalfReduceFirstHalf<
                    XDataType, DyDataType, AccDataType, ScaleDataType, BiasDataType,
                    MeanVarDataType, DyElementwiseOp, XYGridDesc_M_K, MeanVarGridDesc_M,
                    MeanVarCountGridDesc_M_K, DscaleDbiasGridDesc_M_G, BlockSize,
                    MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize,
                    XDyDxVectorDim, XSrcVectorSize, DySrcVectorSize, MeanVarSrcVectorSize>;

            using GridwiseReduceSecondHalfBatchNormBwdFinal_ =
                GridwiseReduceSecondHalfBatchNormBackwardFinal<
                    XDataType, DyDataType, DxDataType, AccDataType, ScaleDataType, BiasDataType,
                    MeanVarDataType, DyElementwiseOp, XYGridDesc_M_K, DscaleDbiasGridDesc_M_K,
                    MeanVarGridDesc_M, ScaleBiasGridDesc_M, BlockSize, MThreadClusterSize,
                    KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XDyDxVectorDim,
                    XSrcVectorSize, DySrcVectorSize, DxDstVectorSize, ScaleSrcDstVectorSize,
                    BiasDstVectorSize, MeanVarSrcVectorSize>;

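            // comment added for clarity: the multi-block path runs up to three kernels:
            // (1) optional multiblock Welford first half producing per-block-group partial
            //     mean/variance/count, (2) Welford second half fused with the first half of
            //     the dscale/dbias reduction, (3) final reduction fused with the batchnorm
            //     backward dx computation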
            if(UseMultiblockInK && arg.blkGroupSize > 1)
            {
                using GetReduceCountPerThreadFunctor =
                    GetReduceCountPerThreadForMultiblockWelford<K_BlockTileSize, KThreadSliceSize>;

                GetReduceCountPerThreadFunctor get_reduce_count_per_thread(
                    arg.blkGroupSize, arg.numBlockTileIteration, arg.reduce_length);

                if(!arg.haveSavedMeanInvVar_)
                {
                    using GridwiseMultiblockWelfordFirstHalf_ = GridwiseMultiblockWelfordFirstHalf<
                        XDataType, AccDataType, MeanVarDataType, XYGridDesc_M_K,
                        MeanVarCountGridDesc_M_G, GetReduceCountPerThreadFunctor, BlockSize,
                        MThreadClusterSize, KThreadClusterSize, MThreadSliceSize,
                        KThreadSliceSize, XDyDxVectorDim, XSrcVectorSize>;

                    const auto kern_multiblock_welford_first_half =
                        kernel_multiblock_welford_first_half<GridwiseMultiblockWelfordFirstHalf_,
                                                             XDataType,
                                                             MeanVarDataType,
                                                             XYGridDesc_M_K,
                                                             MeanVarCountGridDesc_M_G,
                                                             GetReduceCountPerThreadFunctor>;

                    avg_time += launch_and_time_kernel(
                        stream_config,
                        kern_multiblock_welford_first_half,
                        dim3(arg.gridSize),
                        dim3(BlockSize),
                        0,
                        arg.x_grid_desc_m_k,
                        mean_var_count_grid_desc_m_g,
                        get_reduce_count_per_thread,
                        arg.numBlockTileIteration,
                        arg.p_x_,
                        static_cast<MeanVarDataType*>(arg.workspace_mean),
                        static_cast<MeanVarDataType*>(arg.workspace_variance),
                        static_cast<int32_t*>(arg.workspace_count));
                };

                const auto kern_welford_second_half_reduce_first_half =
                    kernel_welford_second_half_reduce_first_half<
                        GridwiseWelfordSecondHalfReduceFirstHalf_,
                        XDataType,
                        DyDataType,
                        AccDataType,
                        ScaleDataType,
                        BiasDataType,
                        MeanVarDataType,
                        DyElementwiseOp,
                        XYGridDesc_M_K,
                        MeanVarGridDesc_M,
                        MeanVarCountGridDesc_M_K,
                        DscaleDbiasGridDesc_M_G>;

                const auto kern_reduce_second_half_batchnorm_backward_final =
                    kernel_reduce_second_half_batchnorm_backward_final<
                        GridwiseReduceSecondHalfBatchNormBwdFinal_,
                        XDataType,
                        DyDataType,
                        DxDataType,
                        ScaleDataType,
                        BiasDataType,
                        MeanVarDataType,
                        DyElementwiseOp,
                        XYGridDesc_M_K,
                        DscaleDbiasGridDesc_M_K,
                        MeanVarGridDesc_M,
                        ScaleBiasGridDesc_M>;

                index_t numDscaleDbiasBlockTileIteration =
                    (arg.blkGroupSize + KThreadClusterSize - 1) / KThreadClusterSize;

                avg_time += launch_and_time_kernel(
                    stream_config,
                    kern_welford_second_half_reduce_first_half,
                    dim3(arg.gridSize),
                    dim3(BlockSize),
                    0,
                    arg.x_grid_desc_m_k,
                    arg.dy_grid_desc_m_k,
                    arg.mean_var_grid_desc_m,
                    mean_var_count_grid_desc_m_k,
                    dscale_dbias_grid_desc_m_g,
                    arg.blkGroupSize,
                    arg.numBlockTileIteration,
                    numDscaleDbiasBlockTileIteration,
                    arg.epsilon_,
                    arg.haveSavedMeanInvVar_,
                    arg.haveSavedMeanInvVar_ ? arg.p_savedMean_ : nullptr,
                    arg.haveSavedMeanInvVar_ ? arg.p_savedInvVar_ : nullptr,
                    arg.haveSavedMeanInvVar_
                        ? nullptr
                        : static_cast<const MeanVarDataType*>(arg.workspace_mean),
                    arg.haveSavedMeanInvVar_
                        ? nullptr
                        : static_cast<const MeanVarDataType*>(arg.workspace_variance),
                    arg.haveSavedMeanInvVar_ ? nullptr
                                             : static_cast<const int32_t*>(arg.workspace_count),
                    arg.dy_elementwise_op_,
                    arg.haveSavedMeanInvVar_
                        ? nullptr
                        : static_cast<MeanVarDataType*>(arg.workspace_savedMean),
                    arg.haveSavedMeanInvVar_
                        ? nullptr
                        : static_cast<MeanVarDataType*>(arg.workspace_savedInvVar),
                    arg.p_x_,
                    arg.p_dy_,
                    static_cast<ScaleDataType*>(arg.workspace_reduce_dscale),
                    static_cast<BiasDataType*>(arg.workspace_reduce_dbias));

                avg_time += launch_and_time_kernel(
                    stream_config,
                    kern_reduce_second_half_batchnorm_backward_final,
                    dim3(arg.gridSize),
                    dim3(BlockSize),
                    0,
                    arg.x_grid_desc_m_k,
                    arg.dy_grid_desc_m_k,
                    arg.dx_grid_desc_m_k,
                    dscale_dbias_grid_desc_m_k,
                    arg.mean_var_grid_desc_m,
                    arg.scale_grid_desc_m,
                    arg.bias_grid_desc_m,
                    arg.blkGroupSize,
                    arg.reduce_length,
                    arg.numBlockTileIteration,
                    numDscaleDbiasBlockTileIteration,
                    static_cast<const ScaleDataType*>(arg.workspace_reduce_dscale),
                    static_cast<const BiasDataType*>(arg.workspace_reduce_dbias),
                    arg.haveSavedMeanInvVar_
                        ? arg.p_savedMean_
                        : static_cast<const MeanVarDataType*>(arg.workspace_savedMean),
                    arg.haveSavedMeanInvVar_
                        ? arg.p_savedInvVar_
                        : static_cast<const MeanVarDataType*>(arg.workspace_savedInvVar),
                    arg.p_x_,
                    arg.p_dy_,
                    arg.p_scale_,
                    arg.dy_elementwise_op_,
                    arg.p_dx_,
                    arg.p_dscale_,
                    arg.p_dbias_);
            }
            else
            {
                using GetReduceCountPerThreadFunctor =
                    GetReduceCountPerThreadForBlockwiseWelford<K_BlockTileSize, KThreadSliceSize>;

                GetReduceCountPerThreadFunctor get_reduce_count_per_thread(
                    arg.numBlockTileIteration, arg.reduce_length);

                using GridwiseBatchNormBackwardWithBlockwiseWelford_ =
                    GridwiseBatchNormBackwardWithBlockwiseWelford<
                        XDataType, DyDataType, DxDataType, AccDataType, ScaleDataType,
                        BiasDataType, MeanVarDataType, DyElementwiseOp, XYGridDesc_M_K,
                        ScaleBiasGridDesc_M, MeanVarGridDesc_M, GetReduceCountPerThreadFunctor,
                        BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize,
                        KThreadSliceSize, XDyDxVectorDim, XSrcVectorSize, DySrcVectorSize,
                        DxDstVectorSize, ScaleSrcDstVectorSize, BiasDstVectorSize,
                        MeanVarSrcVectorSize>;

                const auto kern_batchnorm_bwd = kernel_batchnorm_backward_with_blockwise_welford<
                    GridwiseBatchNormBackwardWithBlockwiseWelford_,
                    XDataType,
                    DyDataType,
                    DxDataType,
                    AccDataType,
                    ScaleDataType,
                    BiasDataType,
                    MeanVarDataType,
                    DyElementwiseOp,
                    XYGridDesc_M_K,
                    ScaleBiasGridDesc_M,
                    MeanVarGridDesc_M,
                    GetReduceCountPerThreadFunctor>;

                avg_time += launch_and_time_kernel(stream_config,
                                                   kern_batchnorm_bwd,
                                                   dim3(arg.gridSize),
                                                   dim3(BlockSize),
                                                   0,
                                                   arg.x_grid_desc_m_k,
                                                   arg.dy_grid_desc_m_k,
                                                   arg.dx_grid_desc_m_k,
                                                   arg.scale_grid_desc_m,
                                                   arg.bias_grid_desc_m,
                                                   arg.mean_var_grid_desc_m,
                                                   get_reduce_count_per_thread,
                                                   arg.reduce_length,
                                                   arg.numBlockTileIteration,
                                                   arg.epsilon_,
                                                   arg.p_x_,
                                                   arg.p_dy_,
                                                   arg.p_scale_,
                                                   arg.haveSavedMeanInvVar_,
                                                   arg.p_savedMean_,
                                                   arg.p_savedInvVar_,
                                                   arg.dy_elementwise_op_,
                                                   arg.p_dx_,
                                                   arg.p_dscale_,
                                                   arg.p_dbias_);
            };

            return (avg_time);
        };

        float Run(const BaseArgument* pArg,
                  const StreamConfig& stream_config = StreamConfig{}) override
        {
            return Run(*dynamic_cast<const Argument*>(pArg), stream_config);
        };
    };

    bool IsSupportedArgument(const BaseArgument* pArg) override
    {
        const Argument* pArg_ = dynamic_cast<const Argument*>(pArg);

        if constexpr(XDyDxVectorDim == 0)
        {
            if(pArg_->xStrides_[NumInvariantDim - 1] != 1 ||
               pArg_->dyStrides_[NumInvariantDim - 1] != 1 ||
               pArg_->dxStrides_[NumInvariantDim - 1] != 1)
                return false;

            if(pArg_->xyLengths_[NumInvariantDim - 1] % XSrcVectorSize != 0 ||
               pArg_->xyLengths_[NumInvariantDim - 1] % DySrcVectorSize != 0 ||
               pArg_->xyLengths_[NumInvariantDim - 1] % DxDstVectorSize != 0)
                return false;
        }
        else
        {
            if(pArg_->xStrides_[Rank - 1] != 1 || pArg_->dyStrides_[Rank - 1] != 1 ||
               pArg_->dxStrides_[Rank - 1] != 1)
                return false;

            if(pArg_->xyLengths_[Rank - 1] % XSrcVectorSize != 0 ||
               pArg_->xyLengths_[Rank - 1] % DySrcVectorSize != 0 ||
               pArg_->xyLengths_[Rank - 1] % DxDstVectorSize != 0)
                return false;
        };

        if(pArg_->bnScaleStrides_[NumInvariantDim - 1] != 1 && ScaleSrcDstVectorSize != 1)
            return false;

        if(pArg_->bnBiasStrides_[NumInvariantDim - 1] != 1 && BiasDstVectorSize != 1)
            return false;

        if(pArg_->bnScaleBiasMeanVarLengths_[NumInvariantDim - 1] % ScaleSrcDstVectorSize != 0)
            return false;

        if(pArg_->bnScaleBiasMeanVarLengths_[NumInvariantDim - 1] % BiasDstVectorSize != 0)
            return false;

        if(pArg_->haveSavedMeanInvVar_)
        {
            if(pArg_->bnMeanVarStrides_[NumInvariantDim - 1] != 1 && MeanVarSrcVectorSize != 1)
                return false;

            if(pArg_->bnScaleBiasMeanVarLengths_[NumInvariantDim - 1] % MeanVarSrcVectorSize != 0)
                return false;
        };

        bool is_valid = true;

        static_for<0, NumInvariantDim, 1>{}([&](auto I) {
            if(pArg_->xyLengths_[I] != pArg_->bnScaleBiasMeanVarLengths_[I])
                is_valid = false;
        });

        if(!is_valid)
            return false;

        return true;
    };

    std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const std::array<index_t, Rank> xyLengths,
                        const std::array<index_t, Rank> xStrides,
                        const std::array<index_t, Rank> dyStrides,
                        const std::array<index_t, Rank> dxStrides,
                        const std::array<int, NumBatchNormReduceDim> reduceDims,
                        const std::array<ck::index_t, NumInvariantDim> bnScaleBiasMeanVarLengths,
                        const std::array<ck::index_t, NumInvariantDim> bnScaleStrides,
                        const std::array<ck::index_t, NumInvariantDim> bnBiasStrides,
                        const std::array<ck::index_t, NumInvariantDim> bnMeanVarStrides,
                        const void* p_x,
                        const void* p_dy,
                        const void* p_scale,
                        const void* p_savedMean,
                        const void* p_savedInvVar,
                        double epsilon,
                        const DyElementwiseOp dy_elementwise_op,
                        void* p_dx,
                        void* p_dscale,
                        void* p_dbias) override
    {
        return std::make_unique<Argument>(xyLengths,
                                          xStrides,
                                          dyStrides,
                                          dxStrides,
                                          reduceDims,
                                          bnScaleBiasMeanVarLengths,
                                          bnScaleStrides,
                                          bnBiasStrides,
                                          bnMeanVarStrides,
                                          static_cast<const XDataType*>(p_x),
                                          static_cast<const DyDataType*>(p_dy),
                                          static_cast<const ScaleDataType*>(p_scale),
                                          static_cast<const MeanVarDataType*>(p_savedMean),
                                          static_cast<const MeanVarDataType*>(p_savedInvVar),
                                          dy_elementwise_op,
                                          epsilon,
                                          static_cast<DxDataType*>(p_dx),
                                          static_cast<ScaleDataType*>(p_dscale),
                                          static_cast<BiasDataType*>(p_dbias));
    };

    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
    {
        return std::make_unique<Invoker>();
    };

    std::string GetTypeString() const override
    {
        auto str = std::stringstream();

        // clang-format off
        str << "DeviceBatchNormBwdImpl<" << BlockSize << ",";
        str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
        str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
        str << "XDyDxVectorDim_" << XDyDxVectorDim << ",";
        str << "VectorSize_X" << XSrcVectorSize << "_scale_" << ScaleSrcDstVectorSize << "_bias_" << BiasDstVectorSize << "_mean_var_" << MeanVarSrcVectorSize << "_Dx_" << DxDstVectorSize << ">";
        // clang-format on

        return str.str();
    }
};

} // namespace device
} // namespace tensor_operation
} // namespace ck
@@ -0,0 +1,534 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/utility/data_type.hpp"
#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

namespace ck {

template <typename GridwiseReduceSecondHalfBatchNormBackwardFinal_,
          typename XDataType,
          typename DyDataType,
          typename DxDataType,
          typename ScaleDataType,
          typename BiasDataType,
          typename MeanVarDataType,
          typename DyElementwiseOp,
          typename XYGridDesc_M_K,
          typename DscaleDbiasGridDesc_M_K,
          typename MeanVarGridDesc_M,
          typename ScaleBiasGridDesc_M>
__global__ void kernel_reduce_second_half_batchnorm_backward_final(
    const XYGridDesc_M_K x_grid_desc_m_k,
    const XYGridDesc_M_K dy_grid_desc_m_k,
    const XYGridDesc_M_K dx_grid_desc_m_k,
    const DscaleDbiasGridDesc_M_K dscale_dbias_grid_desc_m_k,
    const MeanVarGridDesc_M mean_var_grid_desc_m,
    const ScaleBiasGridDesc_M scale_grid_desc_m,
    const ScaleBiasGridDesc_M bias_grid_desc_m,
    index_t blkgroup_size,
    long_index_t reduce_size,
    index_t num_xy_k_block_tile_iteration,
    index_t num_dscale_dbias_k_block_tile_iteration,
    const ScaleDataType* const __restrict__ p_reduce_dscale,
    const BiasDataType* const __restrict__ p_reduce_dbias,
    const MeanVarDataType* const __restrict__ p_mean,
    const MeanVarDataType* const __restrict__ p_inv_var,
    const XDataType* const __restrict__ p_x,
    const DyDataType* const __restrict__ p_dy,
    const ScaleDataType* const __restrict__ p_scale,
    const DyElementwiseOp dy_elementwise_op,
    DxDataType* const __restrict__ p_dx,
    ScaleDataType* const __restrict__ p_dscale,
    BiasDataType* const __restrict__ p_dbias)
{
    GridwiseReduceSecondHalfBatchNormBackwardFinal_::Run(x_grid_desc_m_k,
                                                         dy_grid_desc_m_k,
                                                         dx_grid_desc_m_k,
                                                         dscale_dbias_grid_desc_m_k,
                                                         mean_var_grid_desc_m,
                                                         scale_grid_desc_m,
                                                         bias_grid_desc_m,
                                                         blkgroup_size,
                                                         reduce_size,
                                                         num_xy_k_block_tile_iteration,
                                                         num_dscale_dbias_k_block_tile_iteration,
                                                         p_reduce_dscale,
                                                         p_reduce_dbias,
                                                         p_mean,
                                                         p_inv_var,
                                                         p_x,
                                                         p_dy,
                                                         p_scale,
                                                         dy_elementwise_op,
                                                         p_dx,
                                                         p_dscale,
                                                         p_dbias);
};

template <typename XDataType,
          typename DyDataType,
          typename DxDataType,
          typename AccDataType,
          typename ScaleDataType,
          typename BiasDataType,
          typename MeanVarDataType,
          typename DyElementwiseOp,
          typename XYGridDesc_M_K,
          typename DscaleDbiasGridDesc_M_K,
          typename MeanVarGridDesc_M,
          typename ScaleBiasGridDesc_M,
          index_t BlockSize,
          index_t MThreadClusterSize,
          index_t KThreadClusterSize,
          index_t MThreadSliceSize,
          index_t KThreadSliceSize,
          index_t XDyDxVectorDim,
          index_t XSrcVectorSize,
          index_t DySrcVectorSize,
          index_t DxDstVectorSize,
          index_t ScaleSrcDstVectorSize,
          index_t BiasDstVectorSize,
          index_t MeanVarSrcVectorSize>
struct GridwiseReduceSecondHalfBatchNormBackwardFinal
{
    static_assert((XDyDxVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0 &&
                   MThreadSliceSize % DySrcVectorSize == 0 &&
                   MThreadSliceSize % DxDstVectorSize == 0) ||
                      (XDyDxVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0 &&
                       KThreadSliceSize % DySrcVectorSize == 0 &&
                       KThreadSliceSize % DxDstVectorSize == 0),
                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");

    static constexpr bool reorder_thread_cluster = (XDyDxVectorDim == 0);
|
||||
using ThreadClusterLengths_M_K = Sequence<MThreadClusterSize, KThreadClusterSize>;
|
||||
|
||||
using ThreadBufferDimAccessOrder =
|
||||
typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;
|
||||
|
||||
using ThreadClusterArrangeOrder =
|
||||
typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;
|
||||
|
||||
static constexpr auto thread_cluster_desc =
|
||||
make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
|
||||
|
||||
using ThreadReduceSrcDesc_M_1 = decltype(
|
||||
make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}, Number<1>{})));
|
||||
using ThreadReduceDstDesc_M =
|
||||
decltype(make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{})));
|
||||
|
||||
using BlockwiseReduce = PartitionedBlockwiseReduction<AccDataType,
|
||||
BlockSize,
|
||||
ThreadClusterLengths_M_K,
|
||||
ThreadClusterArrangeOrder,
|
||||
ck::reduce::Add,
|
||||
false>;
|
||||
|
||||
using ThreadwiseReduce = ThreadwiseReduction<AccDataType,
|
||||
ThreadReduceSrcDesc_M_1,
|
||||
ThreadReduceDstDesc_M,
|
||||
ck::reduce::Add,
|
||||
false>;
|
||||
|
||||
using PassThroughOp = tensor_operation::element_wise::PassThrough;
|
||||
|
||||
static constexpr auto I0 = Number<0>{};
|
||||
static constexpr auto I1 = Number<1>{};
|
||||
|
||||
static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
|
||||
static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
|
||||
|
||||
// clang-format off
|
||||
// Two of the steps of Multiblock BatchNorm Backward
|
||||
// Step 1: Second half of Reduction: dbias = sum(dy), dscale = sum(dy * (x-mean) * inv-variance)
|
||||
// Step 2: calculating dx = 1/reduce_size * inv-variance * scale * (reduce_size * dy - dbias - dscale * (x - mean) * inv-variance)) elementwise-ly
|
||||
// clang-format on
|
||||
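    // Editor's note (sketch, assuming the standard batchnorm backward derivation):
    // differentiating y = scale * (x - mean) * inv-variance + bias over a reduction of
    // size N, with mean and variance themselves functions of x and norm_x denoting
    // (x - mean) * inv-variance, gives
    //
    //   dx = (scale * inv-variance / N) * (N * dy - sum(dy) - norm_x * sum(dy * norm_x))
    //      = (scale * inv-variance / N) * (N * dy - dbias - dscale * norm_x)
    //
    // which is exactly the Step-2 expression evaluated below from the Step-1 reductions.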
    __device__ static void Run(const XYGridDesc_M_K& x_grid_desc_m_k,
                               const XYGridDesc_M_K& dy_grid_desc_m_k,
                               const XYGridDesc_M_K& dx_grid_desc_m_k,
                               const DscaleDbiasGridDesc_M_K& dscale_dbias_grid_desc_m_k,
                               const MeanVarGridDesc_M& mean_var_grid_desc_m,
                               const ScaleBiasGridDesc_M& scale_grid_desc_m,
                               const ScaleBiasGridDesc_M& bias_grid_desc_m,
                               index_t blkgroup_size,
                               long_index_t reduce_size,
                               index_t num_xy_k_block_tile_iteration,
                               index_t num_dscale_dbias_k_block_tile_iteration,
                               const ScaleDataType* const __restrict__ p_reduce_dscale,
                               const BiasDataType* const __restrict__ p_reduce_dbias,
                               const MeanVarDataType* const __restrict__ p_mean,
                               const MeanVarDataType* const __restrict__ p_inv_var,
                               const XDataType* const __restrict__ p_x,
                               const DyDataType* const __restrict__ p_dy,
                               const ScaleDataType* const __restrict__ p_scale,
                               const DyElementwiseOp dy_elementwise_op,
                               DxDataType* const __restrict__ p_dx,
                               ScaleDataType* const __restrict__ p_dscale,
                               BiasDataType* const __restrict__ p_dbias)
    {
        __shared__ AccDataType p_reduce_work_buffer[BlockSize];

        auto reduce_work_buf =
            make_dynamic_buffer<AddressSpaceEnum::Lds>(p_reduce_work_buffer, BlockSize);

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * 1, true>
            reduce_dscale_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * 1, true>
            reduce_dbias_thread_buf;

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> dscale_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> dbias_thread_buf;

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
            x_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
            dy_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
            dx_thread_buf;

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> mean_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true>
            inv_var_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> scale_thread_buf;

        const index_t thread_local_id = get_thread_local_1d_id();
        const index_t block_global_id = get_block_1d_id();
        const index_t blkgroup_id     = block_global_id / blkgroup_size;
        const index_t block_local_id  = block_global_id % blkgroup_size;

        const auto thread_cluster_idx =
            thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id));

        const auto thread_m_cluster_id = thread_cluster_idx[I0];
        const auto thread_k_cluster_id = thread_cluster_idx[I1];

        using ThreadBufferLengths_M_K = Sequence<MThreadSliceSize, KThreadSliceSize>;
        using ThreadBufferLengths_M   = Sequence<MThreadSliceSize>;
        using ThreadBufferLengths_M_1 = Sequence<MThreadSliceSize, 1>;
        constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed(
            make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));
        constexpr auto thread_buffer_desc_m =
            make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}));
        constexpr auto thread_buffer_desc_m_1 = make_naive_tensor_descriptor_packed(
            make_tuple(Number<MThreadSliceSize>{}, Number<1>{}));

        // clang-format off
        // Step 1: do final reduction of dbias = sum(dy), dscale = sum(dy * (x-mean) * inv-variance)
        // clang-format on

        auto threadwise_dscale_load_m_k =
            ThreadwiseTensorSliceTransfer_v2<ScaleDataType,
                                             AccDataType,
                                             DscaleDbiasGridDesc_M_K,
                                             decltype(thread_buffer_desc_m_1),
                                             ThreadBufferLengths_M_1,
                                             Sequence<0, 1>,
                                             1,
                                             1,
                                             1,
                                             true>(
                dscale_dbias_grid_desc_m_k,
                make_multi_index(blkgroup_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
                                 thread_k_cluster_id * 1));

        auto threadwise_dbias_load_m_k =
            ThreadwiseTensorSliceTransfer_v2<BiasDataType,
                                             AccDataType,
                                             DscaleDbiasGridDesc_M_K,
                                             decltype(thread_buffer_desc_m_1),
                                             ThreadBufferLengths_M_1,
                                             Sequence<0, 1>,
                                             1,
                                             1,
                                             1,
                                             true>(
                dscale_dbias_grid_desc_m_k,
                make_multi_index(blkgroup_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
                                 thread_k_cluster_id * 1));

        auto threadwise_dscale_store_m =
            ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
                                               ScaleDataType,
                                               decltype(thread_buffer_desc_m),
                                               ScaleBiasGridDesc_M,
                                               PassThroughOp,
                                               ThreadBufferLengths_M,
                                               Sequence<0>,
                                               0,
                                               ScaleSrcDstVectorSize,
                                               InMemoryDataOperationEnum::Set,
                                               1,
                                               true>(
                scale_grid_desc_m,
                make_multi_index(blkgroup_id * M_BlockTileSize +
                                 thread_m_cluster_id * MThreadSliceSize),
                PassThroughOp{});

        auto threadwise_dbias_store_m =
            ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
                                               BiasDataType,
                                               decltype(thread_buffer_desc_m),
                                               ScaleBiasGridDesc_M,
                                               PassThroughOp,
                                               ThreadBufferLengths_M,
                                               Sequence<0>,
                                               0,
                                               BiasDstVectorSize,
                                               InMemoryDataOperationEnum::Set,
                                               1,
                                               true>(
                bias_grid_desc_m,
                make_multi_index(blkgroup_id * M_BlockTileSize +
                                 thread_m_cluster_id * MThreadSliceSize),
                PassThroughOp{});

        const auto reduce_dscale_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_reduce_dscale, dscale_dbias_grid_desc_m_k.GetElementSpaceSize());

        const auto reduce_dbias_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_reduce_dbias, dscale_dbias_grid_desc_m_k.GetElementSpaceSize());

        auto dscale_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_dscale, scale_grid_desc_m.GetElementSpaceSize());

        auto dbias_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_dbias, bias_grid_desc_m.GetElementSpaceSize());

        constexpr auto dscale_dbias_thread_copy_step_m_k =
            make_multi_index(0, KThreadClusterSize * 1);

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            dscale_thread_buf(I) = type_convert<AccDataType>(0.0f);
            dbias_thread_buf(I)  = type_convert<AccDataType>(0.0f);
        });

        for(index_t reducedTiles = 0; reducedTiles < num_dscale_dbias_k_block_tile_iteration;
            ++reducedTiles)
        {
            threadwise_dscale_load_m_k.Run(dscale_dbias_grid_desc_m_k,
                                           reduce_dscale_global_buf,
                                           thread_buffer_desc_m_1,
                                           make_tuple(I0, I0),
                                           reduce_dscale_thread_buf);

            threadwise_dbias_load_m_k.Run(dscale_dbias_grid_desc_m_k,
                                          reduce_dbias_global_buf,
                                          thread_buffer_desc_m_1,
                                          make_tuple(I0, I0),
                                          reduce_dbias_thread_buf);

            ThreadwiseReduce::Reduce(reduce_dscale_thread_buf, dscale_thread_buf);
            ThreadwiseReduce::Reduce(reduce_dbias_thread_buf, dbias_thread_buf);

            threadwise_dscale_load_m_k.MoveSrcSliceWindow(dscale_dbias_grid_desc_m_k,
                                                          dscale_dbias_thread_copy_step_m_k);
            threadwise_dbias_load_m_k.MoveSrcSliceWindow(dscale_dbias_grid_desc_m_k,
                                                         dscale_dbias_thread_copy_step_m_k);
        }

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            if constexpr(I > 0)
                block_sync_lds();

            BlockwiseReduce::Reduce(reduce_work_buf, dscale_thread_buf(I));
            block_sync_lds();
            BlockwiseReduce::Reduce(reduce_work_buf, dbias_thread_buf(I));
        });
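        // Editor's note: the barriers around the two BlockwiseReduce calls are needed
        // because both reductions share the same LDS workspace (reduce_work_buf); without
        // block_sync_lds() a fast thread could overwrite partial sums of the first
        // reduction before a slower thread has finished reading them.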

        threadwise_dscale_store_m.Run(thread_buffer_desc_m,
                                      make_tuple(I0),
                                      dscale_thread_buf,
                                      scale_grid_desc_m,
                                      dscale_global_buf);

        threadwise_dbias_store_m.Run(thread_buffer_desc_m,
                                     make_tuple(I0),
                                     dbias_thread_buf,
                                     bias_grid_desc_m,
                                     dbias_global_buf);

        // clang-format off
        // Step 2: calculate dx = 1/N * inv-variance * scale * (N * dy - dbias - dscale * (x - mean) * inv-variance)
        // clang-format on
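        // Editor's sketch of the per-element Step-2 computation done below:
        //   norm_x     = (x - mean) * inv_var;
        //   multiplier = inv_var * scale / reduce_size;   // per-M factor, hoisted
        //   dx         = multiplier * (reduce_size * dy - dbias - dscale * norm_x);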

        const index_t workSizePerBlock = K_BlockTileSize * num_xy_k_block_tile_iteration;

        auto threadwise_x_load =
            ThreadwiseTensorSliceTransfer_v2<XDataType,
                                             AccDataType,
                                             XYGridDesc_M_K,
                                             decltype(thread_buffer_desc_m_k),
                                             ThreadBufferLengths_M_K,
                                             ThreadBufferDimAccessOrder,
                                             XDyDxVectorDim,
                                             XSrcVectorSize,
                                             1,
                                             true>(
                x_grid_desc_m_k,
                make_multi_index(blkgroup_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
                                 workSizePerBlock * block_local_id +
                                     thread_k_cluster_id * KThreadSliceSize));

        auto threadwise_dy_load =
            ThreadwiseTensorSliceTransfer_v2<DyDataType,
                                             AccDataType,
                                             XYGridDesc_M_K,
                                             decltype(thread_buffer_desc_m_k),
                                             ThreadBufferLengths_M_K,
                                             ThreadBufferDimAccessOrder,
                                             XDyDxVectorDim,
                                             DySrcVectorSize,
                                             1,
                                             true>(
                dy_grid_desc_m_k,
                make_multi_index(blkgroup_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
                                 workSizePerBlock * block_local_id +
                                     thread_k_cluster_id * KThreadSliceSize));

        auto threadwise_dx_store =
            ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
                                               DxDataType,
                                               decltype(thread_buffer_desc_m_k),
                                               XYGridDesc_M_K,
                                               PassThroughOp,
                                               ThreadBufferLengths_M_K,
                                               ThreadBufferDimAccessOrder,
                                               XDyDxVectorDim,
                                               DxDstVectorSize,
                                               InMemoryDataOperationEnum::Set,
                                               1,
                                               true>(
                dx_grid_desc_m_k,
                make_multi_index(blkgroup_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
                                 workSizePerBlock * block_local_id +
                                     thread_k_cluster_id * KThreadSliceSize),
                PassThroughOp{});

        auto threadwise_scale_load =
            ThreadwiseTensorSliceTransfer_v2<ScaleDataType,
                                             AccDataType,
                                             ScaleBiasGridDesc_M,
                                             decltype(thread_buffer_desc_m),
                                             ThreadBufferLengths_M,
                                             Sequence<0>,
                                             0,
                                             ScaleSrcDstVectorSize,
                                             1,
                                             true>(
                scale_grid_desc_m,
                make_multi_index(blkgroup_id * M_BlockTileSize +
                                 thread_m_cluster_id * MThreadSliceSize));

        auto threadwise_mean_var_load =
            ThreadwiseTensorSliceTransfer_v2<MeanVarDataType,
                                             AccDataType,
                                             MeanVarGridDesc_M,
                                             decltype(thread_buffer_desc_m),
                                             ThreadBufferLengths_M,
                                             Sequence<0>,
                                             0,
                                             MeanVarSrcVectorSize,
                                             1,
                                             true>(
                mean_var_grid_desc_m,
                make_multi_index(blkgroup_id * M_BlockTileSize +
                                 thread_m_cluster_id * MThreadSliceSize));

        const auto x_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_x, x_grid_desc_m_k.GetElementSpaceSize());

        const auto dy_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_dy, dy_grid_desc_m_k.GetElementSpaceSize());

        auto dx_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_dx, dx_grid_desc_m_k.GetElementSpaceSize());

        const auto scale_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_scale, scale_grid_desc_m.GetElementSpaceSize());

        const auto mean_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_mean, mean_var_grid_desc_m.GetElementSpaceSize());

        const auto inv_var_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_inv_var, mean_var_grid_desc_m.GetElementSpaceSize());

        threadwise_scale_load.Run(scale_grid_desc_m,
                                  scale_global_buf,
                                  thread_buffer_desc_m,
                                  make_tuple(I0),
                                  scale_thread_buf);

        threadwise_mean_var_load.Run(mean_var_grid_desc_m,
                                     mean_global_buf,
                                     thread_buffer_desc_m,
                                     make_tuple(I0),
                                     mean_thread_buf);

        threadwise_mean_var_load.Run(mean_var_grid_desc_m,
                                     inv_var_global_buf,
                                     thread_buffer_desc_m,
                                     make_tuple(I0),
                                     inv_var_thread_buf);

        constexpr auto xy_thread_copy_step_m_k = make_multi_index(0, K_BlockTileSize);

        AccDataType inv_reduce_size =
            type_convert<AccDataType>(1.0) / type_convert<AccDataType>(reduce_size);
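        // Editor's note: 1/reduce_size is computed once here and folded into the per-M
        // "multiplier" below, so the inner static_for loops do a multiply per element
        // instead of a divide.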

        for(index_t reducedTiles = 0; reducedTiles < num_xy_k_block_tile_iteration; ++reducedTiles)
        {
            threadwise_x_load.Run(x_grid_desc_m_k,
                                  x_global_buf,
                                  thread_buffer_desc_m_k,
                                  make_tuple(I0, I0),
                                  x_thread_buf);

            threadwise_dy_load.Run(dy_grid_desc_m_k,
                                   dy_global_buf,
                                   thread_buffer_desc_m_k,
                                   make_tuple(I0, I0),
                                   dy_thread_buf);

            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
                AccDataType multiplier =
                    inv_reduce_size * inv_var_thread_buf[iM] * scale_thread_buf[iM];

                static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
                    constexpr auto offset =
                        thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK));

                    dy_elementwise_op(dy_thread_buf(Number<offset>{}),
                                      dy_thread_buf[Number<offset>{}]);

                    AccDataType norm_x = (x_thread_buf[Number<offset>{}] - mean_thread_buf[iM]) *
                                         inv_var_thread_buf[iM];

                    AccDataType tmpVal = norm_x * dscale_thread_buf[iM];

                    dx_thread_buf(Number<offset>{}) =
                        multiplier *
                        (type_convert<AccDataType>(reduce_size) * dy_thread_buf[Number<offset>{}] -
                         dbias_thread_buf[iM] - tmpVal);
                });
            });

            threadwise_dx_store.Run(thread_buffer_desc_m_k,
                                    make_tuple(I0, I0),
                                    dx_thread_buf,
                                    dx_grid_desc_m_k,
                                    dx_global_buf);

            threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, xy_thread_copy_step_m_k);
            threadwise_dy_load.MoveSrcSliceWindow(dy_grid_desc_m_k, xy_thread_copy_step_m_k);
            threadwise_dx_store.MoveDstSliceWindow(dx_grid_desc_m_k, xy_thread_copy_step_m_k);
        }
    };
};

} // namespace ck
@@ -93,6 +93,9 @@ struct GridwiseMultiblockWelfordFirstHalf
    static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
    static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;

    // clang-format off
    // First half of the Multiblock Welford method to calculate mean and variance, used by both batchnorm-forward and batchnorm-backward.
    // clang-format on
    __device__ static void Run(const XGridDesc_M_K& x_grid_desc_m_k,
                               const MeanVarCountGridDesc_M_G& mean_var_count_grid_desc_m_g,
                               const GetReduceCountPerThreadFunctor& get_reduce_count_per_thread,

@@ -529,6 +529,7 @@ struct GridwiseWelfordSecondHalfBatchNormForwardFinal
        auto result_inv_var_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            resultSaveInvVariance, mean_var_grid_desc_m.GetElementSpaceSize());

        // calculate inv-variance as 1/sqrt(epsilon+variance)
        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            welford_var_thread_buf(I) =
                type_convert<AccDataType>(1.0f) / sqrt(epsilon + welford_var_thread_buf[I]);

@@ -0,0 +1,575 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/utility/data_type.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_welford.hpp"
#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

namespace ck {

template <typename GridwiseWelfordSecondHalfReduceFirstHalf_,
          typename XDataType,
          typename DyDataType,
          typename AccDataType,
          typename ScaleDataType,
          typename BiasDataType,
          typename MeanVarDataType,
          typename DyElementwiseOp,
          typename XYGridDesc_M_K,
          typename MeanVarGridDesc_M,
          typename MeanVarCountGridDesc_M_K,
          typename DscaleDbiasGridDesc_M_G>
__global__ void kernel_welford_second_half_reduce_first_half(
    const XYGridDesc_M_K x_grid_desc_m_k,
    const XYGridDesc_M_K dy_grid_desc_m_k,
    const MeanVarGridDesc_M mean_var_grid_desc_m,
    const MeanVarCountGridDesc_M_K mean_var_count_grid_desc_m_k,
    const DscaleDbiasGridDesc_M_G dscale_dbias_grid_desc_m_g,
    index_t blkgroup_size,
    index_t num_xy_k_block_tile_iteration,
    index_t num_mean_var_count_k_block_tile_iteration,
    AccDataType epsilon,
    bool haveSavedMeanInvVar,
    const MeanVarDataType* const __restrict__ p_savedMean,
    const MeanVarDataType* const __restrict__ p_savedInvVar,
    const MeanVarDataType* const __restrict__ p_in_welford_mean,
    const MeanVarDataType* const __restrict__ p_in_welford_variance,
    const int32_t* const __restrict__ p_in_welford_count,
    const DyElementwiseOp dy_elementwise_op,
    MeanVarDataType* const __restrict__ p_out_welford_mean,
    MeanVarDataType* const __restrict__ p_out_welford_inv_variance,
    const XDataType* const __restrict__ p_x,
    const DyDataType* const __restrict__ p_dy,
    ScaleDataType* const __restrict__ p_reduce_dscale,
    BiasDataType* const __restrict__ p_reduce_dbias)
{
    GridwiseWelfordSecondHalfReduceFirstHalf_::Run(
        x_grid_desc_m_k,
        dy_grid_desc_m_k,
        mean_var_grid_desc_m,
        mean_var_count_grid_desc_m_k,
        dscale_dbias_grid_desc_m_g,
        blkgroup_size,
        num_xy_k_block_tile_iteration,
        num_mean_var_count_k_block_tile_iteration,
        epsilon,
        haveSavedMeanInvVar,
        p_savedMean,
        p_savedInvVar,
        p_in_welford_mean,
        p_in_welford_variance,
        p_in_welford_count,
        dy_elementwise_op,
        p_out_welford_mean,
        p_out_welford_inv_variance,
        p_x,
        p_dy,
        p_reduce_dscale,
        p_reduce_dbias);
};

template <typename XDataType,
          typename DyDataType,
          typename AccDataType,
          typename ScaleDataType,
          typename BiasDataType,
          typename MeanVarDataType,
          typename DyElementwiseOp,
          typename XYGridDesc_M_K,
          typename MeanVarGridDesc_M,
          typename MeanVarCountGridDesc_M_K,
          typename DscaleDbiasGridDesc_M_G,
          index_t BlockSize,
          index_t MThreadClusterSize,
          index_t KThreadClusterSize,
          index_t MThreadSliceSize,
          index_t KThreadSliceSize,
          index_t XDyVectorDim,
          index_t XSrcVectorSize,
          index_t DySrcVectorSize,
          index_t MeanVarSrcVectorSize>
struct GridwiseWelfordSecondHalfReduceFirstHalf
{
    static_assert((XDyVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0 &&
                   MThreadSliceSize % DySrcVectorSize == 0) ||
                      (XDyVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0 &&
                       KThreadSliceSize % DySrcVectorSize == 0),
                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");

    static constexpr bool reorder_thread_cluster = (XDyVectorDim == 0);

    using ThreadClusterLengths_M_K = Sequence<MThreadClusterSize, KThreadClusterSize>;

    using ThreadBufferDimAccessOrder =
        typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;

    using ThreadClusterArrangeOrder =
        typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;

    static constexpr auto thread_cluster_desc =
        make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});

    using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed(
        make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{})));
    using ThreadReduceSrcDesc_M_1 = decltype(
        make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}, Number<1>{})));
    using ThreadReduceDstDesc_M =
        decltype(make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{})));

    using ThreadwiseWelford =
        ThreadwiseWelfordMerge<AccDataType, ThreadReduceSrcDesc_M_1, ThreadReduceDstDesc_M>;

    using BlockwiseWelford = BlockwiseWelford<AccDataType,
                                              BlockSize,
                                              ThreadClusterLengths_M_K,
                                              ThreadClusterArrangeOrder>;

    using BlockwiseReduce = PartitionedBlockwiseReduction<AccDataType,
                                                          BlockSize,
                                                          ThreadClusterLengths_M_K,
                                                          ThreadClusterArrangeOrder,
                                                          ck::reduce::Add,
                                                          false>;

    using ThreadwiseReduce = ThreadwiseReduction<AccDataType,
                                                 ThreadReduceSrcDesc_M_K,
                                                 ThreadReduceDstDesc_M,
                                                 ck::reduce::Add,
                                                 false>;

    using PassThroughOp = tensor_operation::element_wise::PassThrough;

    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};

    static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
    static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;

    // clang-format off
    // Two of the steps of Multiblock BatchNorm Backward
    // Step 1: Second half of Welford method to calculate mean and variance, as well as getting inv-variance = 1/sqrt(epsilon+variance)
    // Step 2: First half of Reduction: dbias = sum(dy), dscale = sum(dy * (x-mean) * inv-variance)
    // clang-format on
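    // Editor's note (sketch): ThreadwiseWelfordMerge/BlockwiseWelford are assumed to
    // combine partial (mean, var, count) aggregates with the usual parallel-Welford
    // merge (Chan et al.):
    //
    //   count = count_a + count_b
    //   delta = mean_b - mean_a
    //   mean  = mean_a + delta * count_b / count
    //   var   = (var_a * count_a + var_b * count_b + delta * delta * count_a * count_b / count) / count
    //
    // so the per-block partial results written by the first-half kernel reduce to the
    // exact mean and variance of the whole reduction.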
    __device__ static void Run(const XYGridDesc_M_K& x_grid_desc_m_k,
                               const XYGridDesc_M_K& dy_grid_desc_m_k,
                               const MeanVarGridDesc_M& mean_var_grid_desc_m,
                               const MeanVarCountGridDesc_M_K& mean_var_count_grid_desc_m_k,
                               const DscaleDbiasGridDesc_M_G& dscale_dbias_grid_desc_m_g,
                               index_t blkgroup_size,
                               index_t num_xy_k_block_tile_iteration,
                               index_t num_mean_var_count_k_block_tile_iteration,
                               AccDataType epsilon,
                               bool haveSavedMeanInvVar,
                               const MeanVarDataType* const __restrict__ p_savedMean,
                               const MeanVarDataType* const __restrict__ p_savedInvVar,
                               const MeanVarDataType* const __restrict__ p_in_welford_mean,
                               const MeanVarDataType* const __restrict__ p_in_welford_variance,
                               const int32_t* const __restrict__ p_in_welford_count,
                               const DyElementwiseOp dy_elementwise_op,
                               MeanVarDataType* const __restrict__ p_out_welford_mean,
                               MeanVarDataType* const __restrict__ p_out_welford_inv_variance,
                               const XDataType* const __restrict__ p_x,
                               const DyDataType* const __restrict__ p_dy,
                               ScaleDataType* const __restrict__ p_reduce_dscale,
                               BiasDataType* const __restrict__ p_reduce_dbias)
    {
        __shared__ AccDataType p_reduce_work_buffer[BlockSize];

        auto reduce_work_buf =
            make_dynamic_buffer<AddressSpaceEnum::Lds>(p_reduce_work_buffer, BlockSize);

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * 1, true>
            in_welford_mean_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * 1, true>
            in_welford_var_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, int32_t, MThreadSliceSize * 1, true>
            in_welford_count_thread_buf;

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true>
            welford_mean_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true>
            welford_var_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, int32_t, MThreadSliceSize, true>
            welford_count_thread_buf;

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true>& mean_thread_buf =
            welford_mean_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true>&
            inv_var_thread_buf = welford_var_thread_buf;

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
            x_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
            dy_thread_buf;

        // buffer of values of dy * (x-mean) * inv-variance, used as input of Blockwise reduction
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
            tmp1_thread_buf;

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true>
            reduce_dscale_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true>
            reduce_dbias_thread_buf;

        const index_t thread_local_id = get_thread_local_1d_id();
        const index_t block_global_id = get_block_1d_id();
        const index_t blkgroup_id     = block_global_id / blkgroup_size;
        const index_t block_local_id  = block_global_id % blkgroup_size;

        const auto thread_cluster_idx =
            thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id));

        const auto thread_m_cluster_id = thread_cluster_idx[I0];
        const auto thread_k_cluster_id = thread_cluster_idx[I1];

        using ThreadBufferLengths_M_K = Sequence<MThreadSliceSize, KThreadSliceSize>;
        using ThreadBufferLengths_M   = Sequence<MThreadSliceSize>;
        using ThreadBufferLengths_M_1 = Sequence<MThreadSliceSize, 1>;
        constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed(
            make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));
        constexpr auto thread_buffer_desc_m =
            make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}));
        constexpr auto thread_buffer_desc_m_1 = make_naive_tensor_descriptor_packed(
            make_tuple(Number<MThreadSliceSize>{}, Number<1>{}));

        // clang-format off
        // Step 1: load existing mean and inv-variance, or do final welford reduction on mean and variance as well as get inv-variance = 1/sqrt(epsilon+variance)
        // clang-format on

        if(haveSavedMeanInvVar)
        {
            const auto mean_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
                p_savedMean, mean_var_grid_desc_m.GetElementSpaceSize());

            const auto inv_var_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
                p_savedInvVar, mean_var_grid_desc_m.GetElementSpaceSize());

            auto threadwise_mean_inv_var_load =
                ThreadwiseTensorSliceTransfer_v2<MeanVarDataType,
                                                 AccDataType,
                                                 MeanVarGridDesc_M,
                                                 decltype(thread_buffer_desc_m),
                                                 ThreadBufferLengths_M,
                                                 Sequence<0>,
                                                 0,
                                                 MeanVarSrcVectorSize,
                                                 1,
                                                 true>(
                    mean_var_grid_desc_m,
                    make_multi_index(blkgroup_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize));

            threadwise_mean_inv_var_load.Run(mean_var_grid_desc_m,
                                             mean_global_buf,
                                             thread_buffer_desc_m,
                                             make_tuple(I0),
                                             mean_thread_buf);

            threadwise_mean_inv_var_load.Run(mean_var_grid_desc_m,
                                             inv_var_global_buf,
                                             thread_buffer_desc_m,
                                             make_tuple(I0),
                                             inv_var_thread_buf);
        }
        else
        {
            const auto welford_mean_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
                p_in_welford_mean, mean_var_count_grid_desc_m_k.GetElementSpaceSize());

            const auto welford_var_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
                p_in_welford_variance, mean_var_count_grid_desc_m_k.GetElementSpaceSize());

            const auto welford_count_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
                p_in_welford_count, mean_var_count_grid_desc_m_k.GetElementSpaceSize());

            auto threadwise_mean_var_load_m_k =
                ThreadwiseTensorSliceTransfer_v2<AccDataType,
                                                 AccDataType,
                                                 MeanVarCountGridDesc_M_K,
                                                 decltype(thread_buffer_desc_m_1),
                                                 ThreadBufferLengths_M_1,
                                                 Sequence<0, 1>,
                                                 1,
                                                 1,
                                                 1,
                                                 true>(
                    mean_var_count_grid_desc_m_k,
                    make_multi_index(blkgroup_id * M_BlockTileSize +
                                         thread_m_cluster_id * MThreadSliceSize,
                                     thread_k_cluster_id * 1));

            auto threadwise_count_load_m_k =
                ThreadwiseTensorSliceTransfer_v2<int32_t,
                                                 int32_t,
                                                 MeanVarCountGridDesc_M_K,
                                                 decltype(thread_buffer_desc_m_1),
                                                 ThreadBufferLengths_M_1,
                                                 Sequence<0, 1>,
                                                 1,
                                                 1,
                                                 1,
                                                 true>(
                    mean_var_count_grid_desc_m_k,
                    make_multi_index(blkgroup_id * M_BlockTileSize +
                                         thread_m_cluster_id * MThreadSliceSize,
                                     thread_k_cluster_id * 1));

            constexpr auto mean_var_count_thread_copy_step_m_k =
                make_multi_index(0, KThreadClusterSize * 1);

            static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
                welford_mean_thread_buf(I)  = type_convert<AccDataType>(0.0f);
                welford_var_thread_buf(I)   = type_convert<AccDataType>(0.0f);
                welford_count_thread_buf(I) = 0;
            });

            for(index_t reducedTiles = 0; reducedTiles < num_mean_var_count_k_block_tile_iteration;
                ++reducedTiles)
            {
                threadwise_mean_var_load_m_k.Run(mean_var_count_grid_desc_m_k,
                                                 welford_mean_global_buf,
                                                 thread_buffer_desc_m_1,
                                                 make_tuple(I0, I0),
                                                 in_welford_mean_thread_buf);

                threadwise_mean_var_load_m_k.Run(mean_var_count_grid_desc_m_k,
                                                 welford_var_global_buf,
                                                 thread_buffer_desc_m_1,
                                                 make_tuple(I0, I0),
                                                 in_welford_var_thread_buf);

                threadwise_count_load_m_k.Run(mean_var_count_grid_desc_m_k,
                                              welford_count_global_buf,
                                              thread_buffer_desc_m_1,
                                              make_tuple(I0, I0),
                                              in_welford_count_thread_buf);

                ThreadwiseWelford::Run(in_welford_mean_thread_buf,
                                       in_welford_var_thread_buf,
                                       in_welford_count_thread_buf,
                                       welford_mean_thread_buf,
                                       welford_var_thread_buf,
                                       welford_count_thread_buf);

                threadwise_mean_var_load_m_k.MoveSrcSliceWindow(
                    mean_var_count_grid_desc_m_k, mean_var_count_thread_copy_step_m_k);
                threadwise_count_load_m_k.MoveSrcSliceWindow(mean_var_count_grid_desc_m_k,
                                                             mean_var_count_thread_copy_step_m_k);
            }

            static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
                if constexpr(I > 0)
                    block_sync_lds();

                BlockwiseWelford::Run(welford_mean_thread_buf(I),
                                      welford_var_thread_buf(I),
                                      welford_count_thread_buf(I));
            });

            // calculate inv-variance as 1/sqrt(epsilon+variance), stored in place of variance
            static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
                welford_var_thread_buf(I) =
                    type_convert<AccDataType>(1.0) / sqrt(welford_var_thread_buf[I] + epsilon);
            });
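            // Editor's note: only the first block of each block-group (block_local_id == 0)
            // and the first thread column (thread_k_cluster_id == 0) store the final
            // mean/inv-variance; every block in the group holds identical per-M results,
            // so this avoids duplicate global writes.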

            if(block_local_id == 0 && thread_k_cluster_id == 0)
            {

                auto threadwise_mean_inv_var_store =
                    ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
                                                       MeanVarDataType,
                                                       decltype(thread_buffer_desc_m),
                                                       MeanVarGridDesc_M,
                                                       PassThroughOp,
                                                       ThreadBufferLengths_M,
                                                       Sequence<0>,
                                                       0,
                                                       1,
                                                       InMemoryDataOperationEnum::Set,
                                                       1,
                                                       true>(
                        mean_var_grid_desc_m,
                        make_multi_index(blkgroup_id * M_BlockTileSize +
                                         thread_m_cluster_id * MThreadSliceSize),
                        PassThroughOp{});

                auto mean_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
                    p_out_welford_mean, mean_var_grid_desc_m.GetElementSpaceSize());

                auto inv_var_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
                    p_out_welford_inv_variance, mean_var_grid_desc_m.GetElementSpaceSize());

                threadwise_mean_inv_var_store.Run(thread_buffer_desc_m,
                                                  make_tuple(I0),
                                                  mean_thread_buf,
                                                  mean_var_grid_desc_m,
                                                  mean_global_buf);

                threadwise_mean_inv_var_store.Run(thread_buffer_desc_m,
                                                  make_tuple(I0),
                                                  inv_var_thread_buf,
                                                  mean_var_grid_desc_m,
                                                  inv_var_global_buf);
            };
        };

        const index_t workSizePerBlock = K_BlockTileSize * num_xy_k_block_tile_iteration;

        auto threadwise_x_load =
            ThreadwiseTensorSliceTransfer_v2<XDataType,
                                             AccDataType,
                                             XYGridDesc_M_K,
                                             decltype(thread_buffer_desc_m_k),
                                             ThreadBufferLengths_M_K,
                                             ThreadBufferDimAccessOrder,
                                             XDyVectorDim,
                                             XSrcVectorSize,
                                             1,
                                             true>(
                x_grid_desc_m_k,
                make_multi_index(blkgroup_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
                                 workSizePerBlock * block_local_id +
                                     thread_k_cluster_id * KThreadSliceSize));

        auto threadwise_dy_load =
            ThreadwiseTensorSliceTransfer_v2<DyDataType,
                                             AccDataType,
                                             XYGridDesc_M_K,
                                             decltype(thread_buffer_desc_m_k),
                                             ThreadBufferLengths_M_K,
                                             ThreadBufferDimAccessOrder,
                                             XDyVectorDim,
                                             DySrcVectorSize,
                                             1,
                                             true>(
                dy_grid_desc_m_k,
                make_multi_index(blkgroup_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
                                 workSizePerBlock * block_local_id +
                                     thread_k_cluster_id * KThreadSliceSize));

        const auto x_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_x, x_grid_desc_m_k.GetElementSpaceSize());

        const auto dy_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_dy, dy_grid_desc_m_k.GetElementSpaceSize());

        constexpr auto xy_thread_copy_step_m_k = make_multi_index(0, K_BlockTileSize);

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            reduce_dscale_thread_buf(I) = type_convert<AccDataType>(0);
            reduce_dbias_thread_buf(I)  = type_convert<AccDataType>(0);
        });

        // clang-format off
        // Step 2: first-half of reduction: dbias = sum(dy), dscale = sum(dy * (x-mean) * inv-variance)
        // clang-format on

        for(index_t reducedTiles = 0; reducedTiles < num_xy_k_block_tile_iteration; ++reducedTiles)
        {
            threadwise_x_load.Run(x_grid_desc_m_k,
                                  x_global_buf,
                                  thread_buffer_desc_m_k,
                                  make_tuple(I0, I0),
                                  x_thread_buf);

            threadwise_dy_load.Run(dy_grid_desc_m_k,
                                   dy_global_buf,
                                   thread_buffer_desc_m_k,
                                   make_tuple(I0, I0),
                                   dy_thread_buf);

            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
                static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
                    constexpr auto offset =
                        thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK));

                    dy_elementwise_op(dy_thread_buf(Number<offset>{}),
                                      dy_thread_buf[Number<offset>{}]);

                    AccDataType norm_x = (x_thread_buf[Number<offset>{}] - mean_thread_buf[iM]) *
                                         inv_var_thread_buf[iM];

                    tmp1_thread_buf(Number<offset>{}) = norm_x * dy_thread_buf[Number<offset>{}];
                });
            });

            ThreadwiseReduce::Reduce(tmp1_thread_buf, reduce_dscale_thread_buf);
            ThreadwiseReduce::Reduce(dy_thread_buf, reduce_dbias_thread_buf);

            threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, xy_thread_copy_step_m_k);
            threadwise_dy_load.MoveSrcSliceWindow(dy_grid_desc_m_k, xy_thread_copy_step_m_k);
        };

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            if constexpr(I > 0)
                block_sync_lds();

            BlockwiseReduce::Reduce(reduce_work_buf, reduce_dscale_thread_buf(I));
            block_sync_lds();
            BlockwiseReduce::Reduce(reduce_work_buf, reduce_dbias_thread_buf(I));
        });

        auto threadwise_dscale_store =
            ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
                                               ScaleDataType,
                                               decltype(thread_buffer_desc_m_1),
                                               DscaleDbiasGridDesc_M_G,
                                               PassThroughOp,
                                               ThreadBufferLengths_M_1,
                                               Sequence<0, 1>,
                                               1,
                                               1,
                                               InMemoryDataOperationEnum::Set,
                                               1,
                                               true>(
                dscale_dbias_grid_desc_m_g,
                make_multi_index(blkgroup_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
                                 block_local_id),
                PassThroughOp{});

        auto threadwise_dbias_store =
            ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
                                               BiasDataType,
                                               decltype(thread_buffer_desc_m_1),
                                               DscaleDbiasGridDesc_M_G,
                                               PassThroughOp,
                                               ThreadBufferLengths_M_1,
                                               Sequence<0, 1>,
                                               1,
                                               1,
                                               InMemoryDataOperationEnum::Set,
                                               1,
                                               true>(
                dscale_dbias_grid_desc_m_g,
                make_multi_index(blkgroup_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
                                 block_local_id),
                PassThroughOp{});

        auto reduce_dscale_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_reduce_dscale, dscale_dbias_grid_desc_m_g.GetElementSpaceSize());

        auto reduce_dbias_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_reduce_dbias, dscale_dbias_grid_desc_m_g.GetElementSpaceSize());

        if(thread_k_cluster_id == 0)
        {
            threadwise_dscale_store.Run(thread_buffer_desc_m_1,
                                        make_tuple(I0, I0),
                                        reduce_dscale_thread_buf,
                                        dscale_dbias_grid_desc_m_g,
                                        reduce_dscale_global_buf);

            threadwise_dbias_store.Run(thread_buffer_desc_m_1,
                                       make_tuple(I0, I0),
                                       reduce_dbias_thread_buf,
                                       dscale_dbias_grid_desc_m_g,
                                       reduce_dbias_global_buf);
        };
    };
};

} // namespace ck
@@ -0,0 +1,572 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/utility/data_type.hpp"
#include "ck/utility/math_v2.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_welford.hpp"
#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp"
#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

namespace ck {

template <typename GridwiseBatchNormBackwardWithBlockwiseWelford_,
          typename XDataType,
          typename DyDataType,
          typename DxDataType,
          typename AccDataType,
          typename ScaleDataType,
          typename BiasDataType,
          typename MeanVarDataType,
          typename DyElementwiseOp,
          typename XYGridDesc_M_K,
          typename ScaleBiasGridDesc_M,
          typename MeanVarGridDesc_M,
          typename GetReduceCountPerThreadFunctor>
__global__ void kernel_batchnorm_backward_with_blockwise_welford(
    const XYGridDesc_M_K x_grid_desc_m_k,
    const XYGridDesc_M_K dy_grid_desc_m_k,
    const XYGridDesc_M_K dx_grid_desc_m_k,
    const ScaleBiasGridDesc_M scale_grid_desc_m,
    const ScaleBiasGridDesc_M bias_grid_desc_m,
    const MeanVarGridDesc_M mean_var_grid_desc_m,
    const GetReduceCountPerThreadFunctor get_reduce_count_per_thread,
    long_index_t reduce_size,
    index_t num_k_block_tile_iteration,
    AccDataType epsilon,
    const XDataType* const __restrict__ p_x,
    const DyDataType* const __restrict__ p_dy,
    const ScaleDataType* const __restrict__ p_scale,
    bool haveSavedMeanInvVar,
    const MeanVarDataType* const __restrict__ p_savedMean,
    const MeanVarDataType* const __restrict__ p_savedInvVar,
    const DyElementwiseOp dy_elementwise_op,
    DxDataType* const __restrict__ p_dx,
    ScaleDataType* const __restrict__ p_dscale,
    BiasDataType* const __restrict__ p_dbias)
{
    GridwiseBatchNormBackwardWithBlockwiseWelford_::Run(
        x_grid_desc_m_k,
        dy_grid_desc_m_k,
        dx_grid_desc_m_k,
        scale_grid_desc_m,
        bias_grid_desc_m,
        mean_var_grid_desc_m,
        get_reduce_count_per_thread,
        reduce_size,
        num_k_block_tile_iteration,
        epsilon,
        p_x,
        p_dy,
        p_scale,
        haveSavedMeanInvVar,
        p_savedMean,
        p_savedInvVar,
        dy_elementwise_op,
        p_dx,
        p_dscale,
        p_dbias);
};

template <typename XDataType,
          typename DyDataType,
          typename DxDataType,
          typename AccDataType,
          typename ScaleDataType,
          typename BiasDataType,
          typename MeanVarDataType,
          typename DyElementwiseOp,
          typename XYGridDesc_M_K,
          typename ScaleBiasGridDesc_M,
          typename MeanVarGridDesc_M,
          typename GetReduceCountPerThreadFunctor,
          index_t BlockSize,
          index_t MThreadClusterSize,
          index_t KThreadClusterSize,
          index_t MThreadSliceSize,
          index_t KThreadSliceSize,
          index_t XDyDxVectorDim,
          index_t XSrcVectorSize,
          index_t DySrcVectorSize,
          index_t DxDstVectorSize,
          index_t ScaleSrcDstVectorSize,
          index_t BiasDstVectorSize,
          index_t MeanVarSrcVectorSize>
struct GridwiseBatchNormBackwardWithBlockwiseWelford
{
    static_assert((XDyDxVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0 &&
                   MThreadSliceSize % DySrcVectorSize == 0 &&
                   MThreadSliceSize % DxDstVectorSize == 0) ||
                      (XDyDxVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0 &&
                       KThreadSliceSize % DySrcVectorSize == 0 &&
                       KThreadSliceSize % DxDstVectorSize == 0),
                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");

    static constexpr bool reorder_thread_cluster = (XDyDxVectorDim == 0);

    using ThreadClusterLengths_M_K = Sequence<MThreadClusterSize, KThreadClusterSize>;

    using ThreadBufferDimAccessOrder =
        typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;

    using ThreadClusterArrangeOrder =
        typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;

    static constexpr auto thread_cluster_desc =
        make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});

    using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed(
        make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{})));
    using ThreadReduceDstDesc_M =
        decltype(make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{})));

    using ThreadwiseWelford =
        ThreadwiseWelford<AccDataType, ThreadReduceSrcDesc_M_K, ThreadReduceDstDesc_M>;

    using BlockwiseWelford = BlockwiseWelford<AccDataType,
                                              BlockSize,
                                              ThreadClusterLengths_M_K,
                                              ThreadClusterArrangeOrder>;

    using BlockwiseReduce = PartitionedBlockwiseReduction<AccDataType,
                                                          BlockSize,
                                                          ThreadClusterLengths_M_K,
                                                          ThreadClusterArrangeOrder,
                                                          ck::reduce::Add,
                                                          false>;

    using ThreadwiseReduce = ThreadwiseReduction<AccDataType,
                                                 ThreadReduceSrcDesc_M_K,
                                                 ThreadReduceDstDesc_M,
                                                 ck::reduce::Add,
                                                 false>;

    using PassThroughOp = tensor_operation::element_wise::PassThrough;

    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};

    static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
    static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;

    // clang-format off
    // Blockwise BatchNorm Backward
    // Input: x, dy, scale, savedMean and savedInvVar (optional), reduce_size
    // Output: dx, dscale, dbias
    // Step 1: calculating mean and inv-variance using welford method (if savedMean/savedInvVar not available), where inv-variance = 1/sqrt(epsilon+variance)
    // Step 2: reduction: dbias = sum(dy), dscale = sum(dy * (x-mean) * inv-variance)
    // Step 3: calculating dx = 1/reduce_size * inv-variance * scale * (reduce_size * dy - dbias - dscale * (x - mean) * inv-variance) element-wise
    // clang-format on
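    // Editor's reference sketch (host-side scalar equivalent of Steps 1-3 for one
    // channel m over a reduction of size N; names are illustrative only):
    //
    //   mean     = sum_k(x[m][k]) / N;
    //   var      = sum_k((x[m][k] - mean)^2) / N;
    //   inv_var  = 1 / sqrt(var + epsilon);
    //   dbias    = sum_k(dy[m][k]);
    //   dscale   = sum_k(dy[m][k] * (x[m][k] - mean) * inv_var);
    //   dx[m][k] = (scale[m] * inv_var / N) *
    //              (N * dy[m][k] - dbias - dscale * (x[m][k] - mean) * inv_var);
    //
    // The kernel below computes the same quantities with threadwise/blockwise Welford
    // and blockwise reductions in place of the naive sums.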
__device__ static void Run(const XYGridDesc_M_K x_grid_desc_m_k,
|
||||
const XYGridDesc_M_K dy_grid_desc_m_k,
|
||||
const XYGridDesc_M_K dx_grid_desc_m_k,
|
||||
const ScaleBiasGridDesc_M scale_grid_desc_m,
|
||||
const ScaleBiasGridDesc_M bias_grid_desc_m,
|
||||
const MeanVarGridDesc_M mean_var_grid_desc_m,
|
||||
const GetReduceCountPerThreadFunctor get_reduce_count_per_thread,
|
||||
long_index_t reduce_size,
|
||||
index_t num_k_block_tile_iteration,
|
||||
AccDataType epsilon,
|
||||
const XDataType* const __restrict__ p_x,
|
||||
const DyDataType* const __restrict__ p_dy,
|
||||
const ScaleDataType* const __restrict__ p_scale,
|
||||
bool haveSavedMeanInvVar,
|
||||
const MeanVarDataType* const __restrict__ p_savedMean,
|
||||
const MeanVarDataType* const __restrict__ p_savedInvVar,
|
||||
const DyElementwiseOp dy_elementwise_op,
|
||||
DxDataType* const __restrict__ p_dx,
|
||||
ScaleDataType* const __restrict__ p_dscale,
|
||||
BiasDataType* const __restrict__ p_dbias)
|
||||
{
|
||||
using ck::math::sqrt;
|
||||
|
||||
__shared__ AccDataType p_reduce_work_buffer[BlockSize];
|
||||
|
||||
auto reduce_work_buf =
|
||||
make_dynamic_buffer<AddressSpaceEnum::Lds>(p_reduce_work_buffer, BlockSize);
|
||||
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
|
||||
x_thread_buf;
|
||||
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
|
||||
dy_thread_buf;
|
||||
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
|
||||
dx_thread_buf;
|
||||
|
||||
// buffer of values of dy * (x-mean) * invVariance, used as input of Blockwise reduction
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
|
||||
tmp1_thread_buf;
|
||||
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> scale_thread_buf;
|
||||
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> mean_thread_buf;
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> var_thread_buf;
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true>&
|
||||
inv_var_thread_buf = var_thread_buf;
|
||||
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> dscale_thread_buf;
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> dbias_thread_buf;
|
||||
|
||||
const index_t thread_local_id = get_thread_local_1d_id();
|
||||
const index_t block_global_id = get_block_1d_id();
|
||||
|
||||
const auto thread_cluster_idx =
|
||||
thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id));
|
||||
|
||||
const auto thread_m_cluster_id = thread_cluster_idx[I0];
|
||||
const auto thread_k_cluster_id = thread_cluster_idx[I1];
|
||||
|
||||
using ThreadBufferLengths_M_K = Sequence<MThreadSliceSize, KThreadSliceSize>;
|
||||
using ThreadBufferLengths_M = Sequence<MThreadSliceSize>;
|
||||
constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed(
|
||||
make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));
|
||||
constexpr auto thread_buffer_desc_m =
|
||||
make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}));
|
||||
|
||||
auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2<XDataType,
|
||||
AccDataType,
|
||||
XYGridDesc_M_K,
|
||||
decltype(thread_buffer_desc_m_k),
|
||||
ThreadBufferLengths_M_K,
|
||||
ThreadBufferDimAccessOrder,
|
||||
XDyDxVectorDim,
|
||||
XSrcVectorSize,
|
||||
1,
|
||||
true>(
|
||||
x_grid_desc_m_k,
|
||||
make_multi_index(block_global_id * M_BlockTileSize +
|
||||
thread_m_cluster_id * MThreadSliceSize,
|
||||
thread_k_cluster_id * KThreadSliceSize));
|
||||
|
||||
auto threadwise_dy_load = ThreadwiseTensorSliceTransfer_v2<DyDataType,
|
||||
AccDataType,
|
||||
XYGridDesc_M_K,
|
||||
decltype(thread_buffer_desc_m_k),
|
||||
ThreadBufferLengths_M_K,
|
||||
ThreadBufferDimAccessOrder,
|
||||
XDyDxVectorDim,
|
||||
XSrcVectorSize,
|
||||
1,
|
||||
true>(
|
||||
x_grid_desc_m_k,
|
||||
make_multi_index(block_global_id * M_BlockTileSize +
|
||||
thread_m_cluster_id * MThreadSliceSize,
|
||||
thread_k_cluster_id * KThreadSliceSize));
|
||||
|
||||
auto threadwise_dx_store =
|
||||
ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
|
||||
DxDataType,
|
||||
decltype(thread_buffer_desc_m_k),
|
||||
XYGridDesc_M_K,
|
||||
PassThroughOp,
|
||||
ThreadBufferLengths_M_K,
|
||||
ThreadBufferDimAccessOrder,
|
||||
XDyDxVectorDim,
|
||||
DxDstVectorSize,
|
||||
InMemoryDataOperationEnum::Set,
|
||||
1,
|
||||
true>(
|
||||
dy_grid_desc_m_k,
|
||||
make_multi_index(block_global_id * M_BlockTileSize +
|
||||
thread_m_cluster_id * MThreadSliceSize,
|
||||
thread_k_cluster_id * KThreadSliceSize),
|
||||
PassThroughOp{});
|
||||
|
||||
auto threadwise_scale_load =
|
||||
ThreadwiseTensorSliceTransfer_v2<ScaleDataType,
|
||||
AccDataType,
|
||||
ScaleBiasGridDesc_M,
|
||||
decltype(thread_buffer_desc_m),
|
||||
ThreadBufferLengths_M,
|
||||
Sequence<0>,
|
||||
0,
|
||||
ScaleSrcDstVectorSize,
|
||||
1,
|
||||
true>(
|
||||
scale_grid_desc_m,
|
||||
make_multi_index(block_global_id * M_BlockTileSize +
|
||||
thread_m_cluster_id * MThreadSliceSize));
|
||||
|
||||
auto threadwise_dscale_store =
|
||||
ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
|
||||
ScaleDataType,
|
||||
decltype(thread_buffer_desc_m),
|
||||
ScaleBiasGridDesc_M,
|
||||
PassThroughOp,
|
||||
ThreadBufferLengths_M,
|
||||
Sequence<0>,
|
||||
0,
|
||||
ScaleSrcDstVectorSize,
|
||||
InMemoryDataOperationEnum::Set,
|
||||
1,
|
||||
true>(
|
||||
scale_grid_desc_m,
|
||||
make_multi_index(block_global_id * M_BlockTileSize +
|
||||
thread_m_cluster_id * MThreadSliceSize),
|
||||
PassThroughOp{});
|
||||
|
||||
auto threadwise_dbias_store =
|
||||
ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
|
||||
BiasDataType,
|
||||
decltype(thread_buffer_desc_m),
|
||||
ScaleBiasGridDesc_M,
|
||||
PassThroughOp,
|
||||
ThreadBufferLengths_M,
|
||||
Sequence<0>,
|
||||
0,
|
||||
BiasDstVectorSize,
|
||||
InMemoryDataOperationEnum::Set,
|
||||
1,
|
||||
true>(
|
||||
bias_grid_desc_m,
|
||||
make_multi_index(block_global_id * M_BlockTileSize +
|
||||
thread_m_cluster_id * MThreadSliceSize),
|
||||
PassThroughOp{});
|
||||
|
||||
constexpr auto thread_copy_fwd_step_m_k = make_multi_index(0, K_BlockTileSize);
|
||||
constexpr auto thread_copy_bwd_step_m_k = make_multi_index(0, -K_BlockTileSize);
|
||||
|
||||
const auto x_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_x, x_grid_desc_m_k.GetElementSpaceSize());
|
||||
|
||||
const auto dy_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_dy, dy_grid_desc_m_k.GetElementSpaceSize());
|
||||
|
||||
auto dx_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_dx, dx_grid_desc_m_k.GetElementSpaceSize());
|
||||
|
||||
const auto scale_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_scale, scale_grid_desc_m.GetElementSpaceSize());
|
||||
|
||||
auto dscale_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_dscale, scale_grid_desc_m.GetElementSpaceSize());
|
||||
|
||||
auto dbias_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_dbias, bias_grid_desc_m.GetElementSpaceSize());
|
||||
|
||||
// clang-format off
|
||||
// Step 1: calculating mean and inv-variance using welford method (if savedMean/savedInvVar not available), where inv-variance = 1/sqrt(epsilon+variance)
|
||||
// clang-format on
|
||||

        if(haveSavedMeanInvVar)
        {
            const auto mean_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
                p_savedMean, mean_var_grid_desc_m.GetElementSpaceSize());

            const auto inv_var_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
                p_savedInvVar, mean_var_grid_desc_m.GetElementSpaceSize());

            auto threadwise_mean_inv_var_load =
                ThreadwiseTensorSliceTransfer_v2<MeanVarDataType,
                                                 AccDataType,
                                                 MeanVarGridDesc_M,
                                                 decltype(thread_buffer_desc_m),
                                                 ThreadBufferLengths_M,
                                                 Sequence<0>,
                                                 0,
                                                 MeanVarSrcVectorSize,
                                                 1,
                                                 true>(
                    mean_var_grid_desc_m,
                    make_multi_index(block_global_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize));

            threadwise_mean_inv_var_load.Run(mean_var_grid_desc_m,
                                             mean_global_buf,
                                             thread_buffer_desc_m,
                                             make_tuple(I0),
                                             mean_thread_buf);

            threadwise_mean_inv_var_load.Run(mean_var_grid_desc_m,
                                             inv_var_global_buf,
                                             thread_buffer_desc_m,
                                             make_tuple(I0),
                                             inv_var_thread_buf);
        }
        else
        {
            auto threadwise_welford       = ThreadwiseWelford();
            threadwise_welford.max_count_ = get_reduce_count_per_thread(thread_k_cluster_id);

            static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
                mean_thread_buf(I) = type_convert<AccDataType>(0.0f);
                var_thread_buf(I)  = type_convert<AccDataType>(0.0f);
            });

            for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles)
            {
                threadwise_x_load.Run(x_grid_desc_m_k,
                                      x_global_buf,
                                      thread_buffer_desc_m_k,
                                      make_tuple(I0, I0),
                                      x_thread_buf);

                threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k);
                threadwise_welford.Run(x_thread_buf, mean_thread_buf, var_thread_buf);
            }

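            // Merge each thread's partial (mean, var, count) across the K thread cluster;
            // BlockwiseWelford exchanges the partials through LDS, hence the sync before
            // each reuse of that workspace.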
            static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
                if constexpr(I > 0)
                    block_sync_lds();

                int count = threadwise_welford.cur_count_;
                BlockwiseWelford::Run(mean_thread_buf(I), var_thread_buf(I), count);
            });

            // calculate inv-variance as 1/sqrt(epsilon+variance)
            static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
                inv_var_thread_buf(I) =
                    type_convert<AccDataType>(1.0) / sqrt(var_thread_buf[I] + epsilon);
            });
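            // Keeping inv-variance rather than variance lets Step 2 and Step 3 normalize x
            // with a multiply instead of a divide per element; it also matches the
            // savedInvVariance produced by forward training. The x window is then rewound
            // to its origin so Step 2 can stream the same tiles again.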

            threadwise_x_load.SetSrcSliceOrigin(
                x_grid_desc_m_k,
                make_multi_index(block_global_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
                                 thread_k_cluster_id * KThreadSliceSize));
        };

        // clang-format off
        // Step 2: reduction: dbias = sum(dy), dscale = sum(dy * (x - mean) * inv-variance)
        // clang-format on

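        // For y = scale * (x - mean) * inv-variance + bias, the parameter gradients are
        //     dbias  = sum over reduce dims of dy
        //     dscale = sum over reduce dims of dy * norm_x, with norm_x = (x - mean) * inv-variance
        // Each thread first accumulates its own slice (ThreadwiseReduce); the partial sums
        // are then combined across the block (BlockwiseReduce) below.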
        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            dscale_thread_buf(I) = type_convert<AccDataType>(0);
            dbias_thread_buf(I)  = type_convert<AccDataType>(0);
        });

        for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles)
        {
            threadwise_x_load.Run(x_grid_desc_m_k,
                                  x_global_buf,
                                  thread_buffer_desc_m_k,
                                  make_tuple(I0, I0),
                                  x_thread_buf);

            threadwise_dy_load.Run(dy_grid_desc_m_k,
                                   dy_global_buf,
                                   thread_buffer_desc_m_k,
                                   make_tuple(I0, I0),
                                   dy_thread_buf);

            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
                static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
                    constexpr auto offset =
                        thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK));

                    dy_elementwise_op(dy_thread_buf(Number<offset>{}),
                                      dy_thread_buf[Number<offset>{}]);

                    AccDataType norm_x = (x_thread_buf[Number<offset>{}] - mean_thread_buf[iM]) *
                                         inv_var_thread_buf[iM];

                    tmp1_thread_buf(Number<offset>{}) = norm_x * dy_thread_buf[Number<offset>{}];
                });
            });

            ThreadwiseReduce::Reduce(tmp1_thread_buf, dscale_thread_buf);
            ThreadwiseReduce::Reduce(dy_thread_buf, dbias_thread_buf);

            threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k);
            threadwise_dy_load.MoveSrcSliceWindow(dy_grid_desc_m_k, thread_copy_fwd_step_m_k);
        };

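        // The two consecutive blockwise reductions share the same LDS work buffer, so a
        // block_sync_lds() is required between them (and before reusing the buffer on the
        // next iM slice).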
        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            if constexpr(I > 0)
                block_sync_lds();
            BlockwiseReduce::Reduce(reduce_work_buf, dscale_thread_buf(I));
            block_sync_lds();
            BlockwiseReduce::Reduce(reduce_work_buf, dbias_thread_buf(I));
        });

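        // Only the thread_k_cluster_id == 0 lane of each M position writes the reduced
        // dscale/dbias to global memory.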
        if(thread_k_cluster_id == 0)
        {
            threadwise_dscale_store.Run(thread_buffer_desc_m,
                                        make_tuple(I0),
                                        dscale_thread_buf,
                                        scale_grid_desc_m,
                                        dscale_global_buf);

            threadwise_dbias_store.Run(thread_buffer_desc_m,
                                       make_tuple(I0),
                                       dbias_thread_buf,
                                       bias_grid_desc_m,
                                       dbias_global_buf);
        };

        // clang-format off
        // Step 3: calculating dx = 1/reduce_size * inv-variance * scale * (reduce_size * dy - dbias - dscale * (x - mean) * inv-variance) elementwise
        // clang-format on

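        // This is the standard batchnorm input gradient, with N = reduce_size and
        // norm_x = (x - mean) * inv-variance:
        //     dx = scale * inv-variance * (dy - dbias/N - norm_x * dscale/N)
        //        = 1/N * inv-variance * scale * (N * dy - dbias - norm_x * dscale)
        // The second form is what the loop below evaluates, factoring
        // 1/N * inv-variance * scale into a per-channel multiplier.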
        threadwise_scale_load.Run(scale_grid_desc_m,
                                  scale_global_buf,
                                  thread_buffer_desc_m,
                                  make_tuple(I0),
                                  scale_thread_buf);

        auto thread_copy_tail_m_k = (num_k_block_tile_iteration - 1) * thread_copy_fwd_step_m_k;

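        // After Step 2 the x/dy windows sit one tile past the end, so a single backward
        // step puts them on the last tile; the dx window is moved straight to the tail.
        // Step 3 then walks the K tiles in reverse, presumably so the most recently
        // touched tiles are revisited while still cached.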
        threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_bwd_step_m_k);
        threadwise_dy_load.MoveSrcSliceWindow(dy_grid_desc_m_k, thread_copy_bwd_step_m_k);
        threadwise_dx_store.MoveDstSliceWindow(dx_grid_desc_m_k, thread_copy_tail_m_k);

        AccDataType inv_reduce_size =
            type_convert<AccDataType>(1.0) / type_convert<AccDataType>(reduce_size);

        for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles)
        {
            threadwise_x_load.Run(x_grid_desc_m_k,
                                  x_global_buf,
                                  thread_buffer_desc_m_k,
                                  make_tuple(I0, I0),
                                  x_thread_buf);

            threadwise_dy_load.Run(dy_grid_desc_m_k,
                                   dy_global_buf,
                                   thread_buffer_desc_m_k,
                                   make_tuple(I0, I0),
                                   dy_thread_buf);

            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
                AccDataType multiplier =
                    inv_reduce_size * inv_var_thread_buf[iM] * scale_thread_buf[iM];

                static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
                    constexpr auto offset =
                        thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK));

                    dy_elementwise_op(dy_thread_buf(Number<offset>{}),
                                      dy_thread_buf[Number<offset>{}]);

                    AccDataType norm_x = (x_thread_buf[Number<offset>{}] - mean_thread_buf[iM]) *
                                         inv_var_thread_buf[iM];

                    AccDataType tmpVal = norm_x * dscale_thread_buf[iM];

                    dx_thread_buf(Number<offset>{}) =
                        multiplier *
                        (type_convert<AccDataType>(reduce_size) * dy_thread_buf[Number<offset>{}] -
                         dbias_thread_buf[iM] - tmpVal);
                });
            });

            threadwise_dx_store.Run(thread_buffer_desc_m_k,
                                    make_tuple(I0, I0),
                                    dx_thread_buf,
                                    dx_grid_desc_m_k,
                                    dx_global_buf);

            threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_bwd_step_m_k);
            threadwise_dy_load.MoveSrcSliceWindow(dy_grid_desc_m_k, thread_copy_bwd_step_m_k);
            threadwise_dx_store.MoveDstSliceWindow(dx_grid_desc_m_k, thread_copy_bwd_step_m_k);
        }
    }
};

} // namespace ck

@@ -441,6 +441,7 @@ struct GridwiseBatchNormForwardWithBlockwiseWelford
        auto result_inv_var_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            resultSaveInvVariance, mean_var_grid_desc_m.GetElementSpaceSize());

        // calculate inv-variance as 1/sqrt(epsilon+variance), stored in place of variance
        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            var_thread_buf(I) =
                type_convert<AccDataType>(1.0f) / sqrt(epsilon + var_thread_buf[I]);

@@ -0,0 +1,258 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/utility/data_type.hpp"
#include "ck/utility/math.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_welford.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

namespace ck {

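// First half of the two-pass multiblock Welford: each block reduces its slice of the
// K (reduce) dimension and writes one partial (mean, variance, count) triple per M
// position into an M x G workspace, where G is the number of blocks cooperating on the
// same M tile. A second-half kernel later merges the G partials per M position into
// the final mean and variance.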
template <typename GridwiseMultiblockWelfordFirstHalf_,
          typename XDataType,
          typename MeanVarDataType,
          typename XGridDesc_M_K,
          typename MeanVarCountGridDesc_M_G,
          typename GetReduceCountPerThreadFunctor>
__global__ void kernel_multiblock_welford_first_half(
    const XGridDesc_M_K x_grid_desc_m_k,
    const MeanVarCountGridDesc_M_G mean_var_count_grid_desc_m_g,
    const GetReduceCountPerThreadFunctor get_reduce_count_per_thread,
    index_t num_k_block_tile_iteration,
    const XDataType* const __restrict__ p_x,
    MeanVarDataType* const p_welford_mean,
    MeanVarDataType* const p_welford_variance,
    int32_t* const p_welford_count)
{
    GridwiseMultiblockWelfordFirstHalf_::Run(x_grid_desc_m_k,
                                             mean_var_count_grid_desc_m_g,
                                             get_reduce_count_per_thread,
                                             num_k_block_tile_iteration,
                                             p_x,
                                             p_welford_mean,
                                             p_welford_variance,
                                             p_welford_count);
};

template <typename XDataType,
          typename AccDataType,
          typename MeanVarDataType,
          typename XGridDesc_M_K,
          typename MeanVarCountGridDesc_M_G,
          typename GetReduceCountPerThreadFunctor,
          index_t BlockSize,
          index_t MThreadClusterSize,
          index_t KThreadClusterSize,
          index_t MThreadSliceSize,
          index_t KThreadSliceSize,
          index_t XSrcCountSrcVectorDim,
          index_t XSrcCountSrcVectorSize>
struct GridwiseMultiblockWelfordFirstHalf
{
    static_assert((XSrcCountSrcVectorDim == 0 && MThreadSliceSize % XSrcCountSrcVectorSize == 0) ||
                      (XSrcCountSrcVectorDim == 1 &&
                       KThreadSliceSize % XSrcCountSrcVectorSize == 0),
                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");

    static constexpr bool reorder_thread_cluster = (XSrcCountSrcVectorDim == 0);

    using ThreadClusterLengths_M_K = Sequence<MThreadClusterSize, KThreadClusterSize>;

    using ThreadBufferDimAccessOrder =
        typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;

    using ThreadClusterArrangeOrder =
        typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;

    static constexpr auto thread_cluster_desc =
        make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});

    using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed(
        make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{})));
    using ThreadReduceDstDesc_M =
        decltype(make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{})));

    using ThreadwiseWelford =
        ThreadwiseWelford<AccDataType, ThreadReduceSrcDesc_M_K, ThreadReduceDstDesc_M>;

    using BlockwiseWelford = BlockwiseWelford<AccDataType,
                                              BlockSize,
                                              ThreadClusterLengths_M_K,
                                              ThreadClusterArrangeOrder,
                                              false>;

    using PassThroughOp = tensor_operation::element_wise::PassThrough;

    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};

    static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
    static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;

    __device__ static void Run(const XGridDesc_M_K& x_grid_desc_m_k,
                               const MeanVarCountGridDesc_M_G& mean_var_count_grid_desc_m_g,
                               const GetReduceCountPerThreadFunctor& get_reduce_count_per_thread,
                               index_t num_k_block_tile_iteration,
                               const XDataType* const __restrict__ p_x,
                               MeanVarDataType* const p_welford_mean,
                               MeanVarDataType* const p_welford_variance,
                               int32_t* const p_welford_count)
    {
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
            x_thread_buf;

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true>
            welford_mean_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true>
            welford_var_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, int32_t, MThreadSliceSize, true>
            welford_count_thread_buf;

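        // Grid decomposition: blkgroup_size blocks cooperate on one M tile, splitting the
        // K (reduce) dimension among them; blkgroup_id selects the M tile, and
        // block_local_id selects this block's K segment, which is also the G column it
        // writes in the (mean, var, count) workspace.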
        const index_t blkgroup_size = mean_var_count_grid_desc_m_g.GetLength(I1);

        const index_t thread_local_id = get_thread_local_1d_id();
        const index_t block_global_id = get_block_1d_id();
        const index_t blkgroup_id     = block_global_id / blkgroup_size;
        const index_t block_local_id  = block_global_id % blkgroup_size;

        const auto thread_cluster_idx =
            thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id));

        const auto thread_m_cluster_id = thread_cluster_idx[I0];
        const auto thread_k_cluster_id = thread_cluster_idx[I1];

        using ThreadBufferLengths_M_K = Sequence<MThreadSliceSize, KThreadSliceSize>;
        using ThreadBufferLengths_M_1 = Sequence<MThreadSliceSize, 1>;

        constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed(
            make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));
        constexpr auto thread_buffer_desc_m_1 = make_naive_tensor_descriptor_packed(
            make_tuple(Number<MThreadSliceSize>{}, Number<1>{}));

        const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration;

        auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2<XDataType,
                                                                  AccDataType,
                                                                  XGridDesc_M_K,
                                                                  decltype(thread_buffer_desc_m_k),
                                                                  ThreadBufferLengths_M_K,
                                                                  ThreadBufferDimAccessOrder,
                                                                  XSrcCountSrcVectorDim,
                                                                  XSrcCountSrcVectorSize,
                                                                  1,
                                                                  true>(
            x_grid_desc_m_k,
            make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize,
                             block_local_id * reduceSizePerBlock +
                                 thread_k_cluster_id * KThreadSliceSize));

        auto threadwise_welford_mean_var_store =
            ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
                                               MeanVarDataType,
                                               decltype(thread_buffer_desc_m_1),
                                               MeanVarCountGridDesc_M_G,
                                               PassThroughOp,
                                               ThreadBufferLengths_M_1,
                                               Sequence<0, 1>,
                                               1,
                                               1,
                                               InMemoryDataOperationEnum::Set,
                                               1,
                                               true>(
                mean_var_count_grid_desc_m_g,
                make_multi_index(blkgroup_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
                                 block_local_id),
                PassThroughOp{});

        auto threadwise_welford_count_store =
            ThreadwiseTensorSliceTransfer_v1r3<int32_t,
                                               int32_t,
                                               decltype(thread_buffer_desc_m_1),
                                               MeanVarCountGridDesc_M_G,
                                               PassThroughOp,
                                               ThreadBufferLengths_M_1,
                                               Sequence<0, 1>,
                                               1,
                                               1,
                                               InMemoryDataOperationEnum::Set,
                                               1,
                                               true>(
                mean_var_count_grid_desc_m_g,
                make_multi_index(blkgroup_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
                                 block_local_id),
                PassThroughOp{});

        constexpr auto thread_copy_fwd_step_m_k = make_multi_index(0, K_BlockTileSize);

        const auto x_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_x, x_grid_desc_m_k.GetElementSpaceSize());

        auto welford_mean_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_welford_mean, mean_var_count_grid_desc_m_g.GetElementSpaceSize());

        auto welford_var_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_welford_variance, mean_var_count_grid_desc_m_g.GetElementSpaceSize());

        auto welford_count_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_welford_count, mean_var_count_grid_desc_m_g.GetElementSpaceSize());

        auto threadwise_welford = ThreadwiseWelford();
        threadwise_welford.max_count_ =
            get_reduce_count_per_thread(block_local_id, thread_k_cluster_id);

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            welford_mean_thread_buf(I) = type_convert<AccDataType>(0.0f);
            welford_var_thread_buf(I)  = type_convert<AccDataType>(0.0f);
        });

        for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles)
        {
            threadwise_x_load.Run(x_grid_desc_m_k,
                                  x_global_val_buf,
                                  thread_buffer_desc_m_k,
                                  make_tuple(I0, I0),
                                  x_thread_buf);

            threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k);
            threadwise_welford.Run(x_thread_buf, welford_mean_thread_buf, welford_var_thread_buf);
        }

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            if constexpr(I > 0)
                block_sync_lds();

            welford_count_thread_buf(I) = threadwise_welford.cur_count_;
            BlockwiseWelford::Run(
                welford_mean_thread_buf(I), welford_var_thread_buf(I), welford_count_thread_buf(I));
        });

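        // Each block emits exactly one (mean, variance, count) partial per M position,
        // written by its thread_k_cluster_id == 0 lanes into column block_local_id of the
        // M x G workspace for the second-half kernel to merge.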
        if(thread_k_cluster_id == 0)
        {
            threadwise_welford_mean_var_store.Run(thread_buffer_desc_m_1,
                                                  make_tuple(I0, I0),
                                                  welford_mean_thread_buf,
                                                  mean_var_count_grid_desc_m_g,
                                                  welford_mean_global_val_buf);

            threadwise_welford_mean_var_store.Run(thread_buffer_desc_m_1,
                                                  make_tuple(I0, I0),
                                                  welford_var_thread_buf,
                                                  mean_var_count_grid_desc_m_g,
                                                  welford_var_global_val_buf);

            threadwise_welford_count_store.Run(thread_buffer_desc_m_1,
                                               make_tuple(I0, I0),
                                               welford_count_thread_buf,
                                               mean_var_count_grid_desc_m_g,
                                               welford_count_global_val_buf);
        };
    }
};

} // namespace ck

@@ -0,0 +1,319 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iostream>
#include <sstream>
#include <algorithm>
#include <thread>

#include "ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp"

namespace ck {
namespace tensor_operation {
namespace host {

template <typename XDataType,
          typename DyDataType,
          typename DxDataType,
          typename AccDataType,
          typename ScaleDataType,
          typename BiasDataType,
          typename MeanVarDataType,
          typename DyElementwiseOp>
struct ReferenceBatchNormBwd_Input_N_H_W_C_Output_C
    : public device::DeviceBatchNormBwd<4, 3, DyElementwiseOp>
{
    struct Argument : public device::BaseArgument
    {
        Argument(const std::array<index_t, 4> xyLengths,
                 const std::array<index_t, 4> xStrides,
                 const std::array<index_t, 4> dyStrides,
                 const std::array<index_t, 4> dxStrides,
                 const std::array<int, 3> reduceDims,
                 const std::array<ck::index_t, 1> bnScaleBiasMeanVarLengths,
                 const std::array<ck::index_t, 1> bnScaleStrides,
                 const std::array<ck::index_t, 1> bnBiasStrides,
                 const std::array<ck::index_t, 1> bnMeanVarStrides,
                 const XDataType* p_x,
                 const DyDataType* p_dy,
                 const ScaleDataType* p_scale,
                 const MeanVarDataType* p_savedMean,
                 const MeanVarDataType* p_savedInvVar,
                 double epsilon,
                 const DyElementwiseOp dy_elementwise_op,
                 DxDataType* p_dx,
                 ScaleDataType* p_dscale,
                 BiasDataType* p_dbias)
            : p_x_(p_x),
              p_dy_(p_dy),
              p_scale_(p_scale),
              p_savedMean_(p_savedMean),
              p_savedInvVar_(p_savedInvVar),
              epsilon_(epsilon),
              dy_elementwise_op_(dy_elementwise_op),
              p_dx_(p_dx),
              p_dscale_(p_dscale),
              p_dbias_(p_dbias)
        {
            ignore = xStrides;
            ignore = dyStrides;
            ignore = dxStrides;
            ignore = bnScaleStrides;
            ignore = bnBiasStrides;
            ignore = bnMeanVarStrides;

            if(xyLengths.size() != 4 || bnScaleBiasMeanVarLengths.size() != 1 ||
               bnScaleBiasMeanVarLengths[0] != xyLengths[3])
                throw std::runtime_error("Invalid tensor dimensions!");

            if(reduceDims[0] != 0 || reduceDims[1] != 1 || reduceDims[2] != 2)
                throw std::runtime_error("Invalid reduce dimensions!");

            n_ = xyLengths[0];
            h_ = xyLengths[1];
            w_ = xyLengths[2];
            c_ = xyLengths[3];

            haveSavedMeanInvVar_ = (p_savedMean != nullptr && p_savedInvVar != nullptr);
        }

        const XDataType* p_x_;
        const DyDataType* p_dy_;
        const ScaleDataType* p_scale_;
        const MeanVarDataType* p_savedMean_;
        const MeanVarDataType* p_savedInvVar_;

        double epsilon_;
        const DyElementwiseOp dy_elementwise_op_;

        DxDataType* p_dx_;
        ScaleDataType* p_dscale_;
        BiasDataType* p_dbias_;

        bool haveSavedMeanInvVar_;

        index_t n_, h_, w_, c_;
    };

    struct Invoker : public device::BaseInvoker
    {
        float Run(const Argument& arg)
        {
            auto thread_reduce_func = [&](auto iC) {
                AccDataType reduceSize = type_convert<AccDataType>(arg.n_) *
                                         type_convert<AccDataType>(arg.h_) *
                                         type_convert<AccDataType>(arg.w_);
                index_t offset_C = iC;
                AccDataType mean;
                AccDataType invVar;

                if(arg.haveSavedMeanInvVar_)
                {
                    mean   = arg.p_savedMean_[offset_C];
                    invVar = arg.p_savedInvVar_[offset_C];
                }
                else
                {
                    AccDataType meansquare;

                    meansquare = type_convert<AccDataType>(0.0f);
                    mean       = type_convert<AccDataType>(0.0f);

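                    // Note: the reference computes variance with the two-moment formula
                    // var = E[x^2] - E[x]^2 (accumulated in AccDataType) instead of the
                    // Welford method used by the kernels.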
                    // compute mean, meansquare, variance, inv-variance
                    for(index_t iN = 0; iN < arg.n_; iN++)
                    {
                        index_t offset_N = iN * arg.h_ * arg.w_ * arg.c_;
                        for(index_t iH = 0; iH < arg.h_; iH++)
                        {
                            index_t offset_H = iH * arg.w_ * arg.c_;
                            for(index_t iW = 0; iW < arg.w_; iW++)
                            {
                                index_t offset_W = iW * arg.c_;

                                auto offset = offset_N + offset_H + offset_W + offset_C;

                                AccDataType x = type_convert<AccDataType>(arg.p_x_[offset]);

                                mean += x;
                                meansquare += x * x;
                            };
                        }
                    };

                    mean       = mean / reduceSize;
                    meansquare = meansquare / reduceSize;

                    AccDataType variance = meansquare - mean * mean;
                    invVar               = type_convert<AccDataType>(1.0f) /
                             std::sqrt(type_convert<AccDataType>(arg.epsilon_) + variance);
                };

                AccDataType dbias  = type_convert<AccDataType>(0.0f); // Sum on NHW of dy
                AccDataType dscale = type_convert<AccDataType>(0.0f); // Sum on NHW of dy * norm_x

                // 1) calculate dy * (x - mean) * inv-variance
                // 2) calculate sum(dy) on NHW dimensions
                // 3) calculate sum(dy * norm_x) on NHW dimensions
                for(index_t iN = 0; iN < arg.n_; iN++)
                {
                    index_t offset_N = iN * arg.h_ * arg.w_ * arg.c_;
                    for(index_t iH = 0; iH < arg.h_; iH++)
                    {
                        index_t offset_H = iH * arg.w_ * arg.c_;
                        for(index_t iW = 0; iW < arg.w_; iW++)
                        {
                            index_t offset_W = iW * arg.c_;

                            auto offset = offset_N + offset_H + offset_W + offset_C;

                            AccDataType x = type_convert<AccDataType>(arg.p_x_[offset]);

                            AccDataType norm_x = (x - mean) * invVar;
                            AccDataType dy     = type_convert<AccDataType>(arg.p_dy_[offset]);

                            arg.dy_elementwise_op_(dy, dy);

                            dbias += dy;
                            dscale += norm_x * dy;
                        };
                    }
                };

                arg.p_dscale_[offset_C] = type_convert<ScaleDataType>(dscale);
                arg.p_dbias_[offset_C]  = type_convert<BiasDataType>(dbias);

                AccDataType scale = type_convert<AccDataType>(arg.p_scale_[offset_C]);
                AccDataType multiplier =
                    type_convert<AccDataType>(1.0f) / reduceSize * invVar * scale;

                // 1) calculate tmp = dscale * (x - mean) * inv-variance
                // 2) calculate dx = 1/nhw * inv-variance * scale * (nhw * dy - dbias - tmp)
                for(index_t iN = 0; iN < arg.n_; iN++)
                {
                    index_t offset_N = iN * arg.h_ * arg.w_ * arg.c_;
                    for(index_t iH = 0; iH < arg.h_; iH++)
                    {
                        index_t offset_H = iH * arg.w_ * arg.c_;
                        for(index_t iW = 0; iW < arg.w_; iW++)
                        {
                            index_t offset_W = iW * arg.c_;

                            auto offset = offset_N + offset_H + offset_W + offset_C;

                            AccDataType x = type_convert<AccDataType>(arg.p_x_[offset]);

                            AccDataType norm_x = (x - mean) * invVar;
                            AccDataType dy     = type_convert<AccDataType>(arg.p_dy_[offset]);

                            arg.dy_elementwise_op_(dy, dy);

                            AccDataType tmpVal = norm_x * dscale;

                            AccDataType dx = multiplier * (reduceSize * dy - dbias - tmpVal);

                            arg.p_dx_[offset] = type_convert<DxDataType>(dx);
                        };
                    }
                };
            };

            std::size_t num_thread      = std::thread::hardware_concurrency();
            std::size_t work_per_thread = (arg.c_ + num_thread - 1) / num_thread;

            std::vector<joinable_thread> threads(num_thread);

            for(std::size_t it = 0; it < num_thread; ++it)
            {
                std::size_t ic_begin = it * work_per_thread;
                std::size_t ic_end = std::min(static_cast<int>((it + 1) * work_per_thread), arg.c_);

                auto f = [=] {
                    for(std::size_t ic = ic_begin; ic < ic_end; ++ic)
                    {
                        thread_reduce_func(ic);
                    }
                };

                threads[it] = joinable_thread(f);
            }

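            // The threads vector goes out of scope here; assuming joinable_thread joins
            // in its destructor (an RAII wrapper around std::thread), all per-channel
            // work has completed before Run returns.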
            return (0.0f);
        };

        float Run(const device::BaseArgument* p_arg,
                  const StreamConfig& /*stream_config*/ = StreamConfig{}) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg));
        };
    };

    bool IsSupportedArgument(const device::BaseArgument* p_arg) override
    {
        (void)p_arg;

        return (true);
    };

    std::unique_ptr<device::BaseArgument>
    MakeArgumentPointer(const std::array<index_t, 4> xyLengths,
                        const std::array<index_t, 4> xStrides,
                        const std::array<index_t, 4> dyStrides,
                        const std::array<index_t, 4> dxStrides,
                        const std::array<int, 3> reduceDims,
                        const std::array<ck::index_t, 1> bnScaleBiasMeanVarLengths,
                        const std::array<ck::index_t, 1> bnScaleStrides,
                        const std::array<ck::index_t, 1> bnBiasStrides,
                        const std::array<ck::index_t, 1> bnMeanVarStrides,
                        const void* p_x,
                        const void* p_dy,
                        const void* p_scale,
                        const void* p_savedMean,
                        const void* p_savedInvVar,
                        double epsilon,
                        const DyElementwiseOp dy_elementwise_op,
                        void* p_dx,
                        void* p_dscale,
                        void* p_dbias) override
    {
        return std::make_unique<Argument>(xyLengths,
                                          xStrides,
                                          dyStrides,
                                          dxStrides,
                                          reduceDims,
                                          bnScaleBiasMeanVarLengths,
                                          bnScaleStrides,
                                          bnBiasStrides,
                                          bnMeanVarStrides,
                                          static_cast<const XDataType*>(p_x),
                                          static_cast<const DyDataType*>(p_dy),
                                          static_cast<const ScaleDataType*>(p_scale),
                                          static_cast<const MeanVarDataType*>(p_savedMean),
                                          static_cast<const MeanVarDataType*>(p_savedInvVar),
                                          epsilon,
                                          dy_elementwise_op,
                                          static_cast<DxDataType*>(p_dx),
                                          static_cast<ScaleDataType*>(p_dscale),
                                          static_cast<BiasDataType*>(p_dbias));
    };

    std::unique_ptr<device::BaseInvoker> MakeInvokerPointer() override
    {
        return std::make_unique<Invoker>();
    };

    std::string GetTypeString() const override
    {
        auto str = std::stringstream();

        // clang-format off
        str << "Reference_BatchNorm_Backward_NHWC_C<" << std::endl;
        // clang-format on

        return str.str();
    }
};

} // namespace host
} // namespace tensor_operation
} // namespace ck