From 0ef7876035a5f9848b455cd3a2afbb5cb99cd6c9 Mon Sep 17 00:00:00 2001
From: Qianfeng
Date: Tue, 29 Nov 2022 10:51:10 +0800
Subject: [PATCH] BatchNorm backward implementation (#461)

* Implemented batchnorm-backward Blockwise and Multiblock kernels
* Add batchnorm-backward device op
* Add batchnorm-backward host-reference op
* Add batchnorm-backward example
* Parameters renaming in batchnorm backward kernels and device op
* Change in the example to loosen the threshold for ScaleDiff checking
* Add comments to explain the implementation of batchnorm-backward
* Parameters renaming again in batchnorm backward kernels
* Improve the expression calculation for performance
* Add batchnorm backward to README
* Add comments to explain inv-variance in batchnorm forward and backward
* Renaming the batchnorm forward training and inferring examples
* Add/update the comments for batchnorm-backward kernels
* Renaming again
* Add block_sync_lds between two consecutive blockwise reductions
* Move common expression 1/N out of the static_for loops
* Add dy_elementwise_op
* Renaming in backward example again
* Add checking for reduceDims in reference_batchnorm_backward
* Update to comments and code format
* Rename in the comments
* Remove common expression out of the loop in reference_batchnorm_backward_nhwc_c
* Add block_sync_lds() between blockwise reduction again
* Fix comments again
* Remove int8 from batchnorm-forward instances since it is not needed for forward training and could fail test

[ROCm/composable_kernel commit: 44789d992add03ae064b9b0f89c0542734d487b2]
---
 example/34_batchnorm/CMakeLists.txt | 5 +-
 example/34_batchnorm/README.md | 25 +
 .../34_batchnorm/batchnorm_backward_nhwc.cpp | 502 ++++++++++
 ...p => batchnorm_forward_inferring_nhwc.cpp} | 0
 ...pp => batchnorm_forward_training_nhwc.cpp} | 0
 .../gpu/device/device_batchnorm_backward.hpp | 51 ++
 .../impl/device_batchnorm_backward_impl.hpp | 866 ++++++++++++++++++
 ...e_second_half_batchnorm_backward_final.hpp | 534 +++++++++++
 ...gridwise_multiblock_welford_first_half.hpp | 3 +
 ...rd_second_half_batchnorm_forward_final.hpp | 1 +
 ...cond_half_multiblock_reduce_first_half.hpp | 575 ++++++++++++
 ...e_batchnorm_backward_blockwise_welford.hpp | 572 ++++++++++++
 ...se_batchnorm_forward_blockwise_welford.hpp | 1 +
 ...gridwise_multiblock_welford_first_half.hpp | 258 ++++++
 .../reference_batchnorm_backward_nhwc_c.hpp | 319 +++++++
 15 files changed, 3710 insertions(+), 2 deletions(-)
 create mode 100644 example/34_batchnorm/batchnorm_backward_nhwc.cpp
 rename example/34_batchnorm/{batchnorm_infer_nhwc.cpp => batchnorm_forward_inferring_nhwc.cpp} (100%)
 rename example/34_batchnorm/{batchnorm_forward_nhwc.cpp => batchnorm_forward_training_nhwc.cpp} (100%)
 create mode 100644 include/ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp
 create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp
 create mode 100644 include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp
 create mode 100644 include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_multiblock_reduce_first_half.hpp
 create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_backward_blockwise_welford.hpp
 create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_multiblock_welford_first_half.hpp
 create mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward_nhwc_c.hpp

diff --git a/example/34_batchnorm/CMakeLists.txt b/example/34_batchnorm/CMakeLists.txt
index 827435fed8..d964f40d87 100644
--- a/example/34_batchnorm/CMakeLists.txt
+++ b/example/34_batchnorm/CMakeLists.txt
@@ -1,2 +1,3 @@
-add_example_executable(example_batchnorm_forward batchnorm_forward_nhwc.cpp)
-add_example_executable(example_batchnorm_infer batchnorm_infer_nhwc.cpp)
+add_example_executable(example_batchnorm_forward_training batchnorm_forward_training_nhwc.cpp)
+add_example_executable(example_batchnorm_forward_inferring batchnorm_forward_inferring_nhwc.cpp)
+add_example_executable(example_batchnorm_backward batchnorm_backward_nhwc.cpp)
diff --git a/example/34_batchnorm/README.md b/example/34_batchnorm/README.md
index afee4ac670..294e32b998 100644
--- a/example/34_batchnorm/README.md
+++ b/example/34_batchnorm/README.md
@@ -53,4 +53,29 @@
 Start running 10 times...
 Perf: 1.28235 ms, 523.329 GB/s
 ```
+## Run ```batchnorm backward nhwc```
+```bash
+# -D : input 4-d tensor lengths
+# -v : verification (0=no, 1=yes)
+Arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64)
+Arg2 -- 1/0 to indicate whether to use saved mean and invVariance
+Arg3 -- init method used for dy and bnScale (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
+Arg4 -- time kernel (0=no, 1=yes)
+Arg5: use multi-block welford (0=no, 1=yes)
+./bin/example_batchnorm_backward -D 128,16,3,1024 -v 1 0 0 3 1 1
+```
+Result
+```
+./bin/example_batchnorm_backward -D 128,16,3,1024 -v 1 0 0 3 1 1
+launch_and_time_kernel: grid_dim {6144, 1, 1}, block_dim {256, 1, 1}
+Warm up 1 time
+Start running 10 times...
+launch_and_time_kernel: grid_dim {6144, 1, 1}, block_dim {256, 1, 1}
+Warm up 1 time
+Start running 10 times...
+launch_and_time_kernel: grid_dim {6144, 1, 1}, block_dim {256, 1, 1}
+Warm up 1 time
+Start running 10 times...
+Perf: 0.411026 ms, 91.8702 GB/s
+```
diff --git a/example/34_batchnorm/batchnorm_backward_nhwc.cpp b/example/34_batchnorm/batchnorm_backward_nhwc.cpp
new file mode 100644
index 0000000000..90e3718441
--- /dev/null
+++ b/example/34_batchnorm/batchnorm_backward_nhwc.cpp
@@ -0,0 +1,502 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
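The example that follows checks the GPU kernels against a host reference. As a rough scalar picture of the per-channel NHWC equations being verified (taken from the Step 1 / Step 2 comments in the kernels later in this patch), here is a minimal sketch; it is illustrative only, not the CK reference implementation, and the function name is made up for this sketch:

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// Illustrative per-channel batchnorm backward (one channel, reduce_size = N*H*W).
// Any dy element-wise op (PassThrough in the example) is assumed already applied to dy.
void bnorm_bwd_one_channel(const std::vector<float>& x,
                           const std::vector<float>& dy,
                           float scale,
                           float epsilon,
                           std::vector<float>& dx,
                           float& dscale,
                           float& dbias)
{
    const std::size_t N = x.size();

    // Recompute mean and inv-variance when saved values are not provided.
    float mean = 0.0f;
    for(std::size_t i = 0; i < N; ++i)
        mean += x[i];
    mean /= N;

    float variance = 0.0f;
    for(std::size_t i = 0; i < N; ++i)
        variance += (x[i] - mean) * (x[i] - mean);
    variance /= N;

    const float inv_var = 1.0f / std::sqrt(variance + epsilon);

    // Step 1: dbias = sum(dy), dscale = sum(dy * (x - mean) * inv_var)
    dbias  = 0.0f;
    dscale = 0.0f;
    for(std::size_t i = 0; i < N; ++i)
    {
        dbias  += dy[i];
        dscale += dy[i] * (x[i] - mean) * inv_var;
    }

    // Step 2: dx = 1/N * inv_var * scale * (N * dy - dbias - dscale * (x - mean) * inv_var)
    dx.resize(N);
    for(std::size_t i = 0; i < N; ++i)
        dx[i] = (1.0f / N) * inv_var * scale *
                (N * dy[i] - dbias - dscale * (x[i] - mean) * inv_var);
}
```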
+ +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/host_common_util.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward_nhwc_c.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp" + +static struct option long_options[] = {{"inOutLengths", required_argument, nullptr, 'D'}, + {"verify", required_argument, nullptr, 'v'}, + {"help", no_argument, nullptr, '?'}, + {nullptr, 0, nullptr, 0}}; + +class BatchNormBwdArg +{ + private: + int option_index = 0; + + public: + std::vector inOutLengths; + + bool do_verification = false; + + bool haveSavedMeanInvVar; + + int data_type = 0; + int init_method = 3; + bool time_kernel = false; + bool use_multiblock_welford = false; + + public: + void show_usage(const char* cmd) + { + // clang-format off + std::cout << "Usage of " << cmd << std::endl; + std::cout << "--inOutLengths or -D, comma separated list of input tensor dimension lengths, must have 4 integers for nhwc" << std::endl; + std::cout << "--verify or -v, 1/0 to indicate whether to verify the result by comparing with the host-based batch-normalization" << std::endl; + std::cout << "Arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64)" << std::endl; + std::cout << "Arg2 -- 1/0 to indicate whether to use saved mean and invVariance" << std::endl; + std::cout << "Arg3 -- init method used for dy and bnScale (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)" << std::endl; + std::cout << "Arg4 -- time kernel (0=no, 1=yes)" << std::endl; + std::cout << "Arg5: use multi-block welford (0=n0, 1=yes)" << std::endl; + // clang-format on + }; + + int processArgs(int argc, char* argv[]) + { + using ck::host_common::getTypeValuesFromString; + + int ch; + + while(1) + { + ch = getopt_long(argc, argv, "D:v:", long_options, &option_index); + if(ch == -1) + break; + switch(ch) + { + case 'D': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + inOutLengths = getTypeValuesFromString(optarg); + + if(inOutLengths.size() != 4) + throw std::runtime_error( + "NHWC tensor layout should have 4 length values specified!"); + break; + case 'v': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + do_verification = static_cast(std::atoi(optarg)); + break; + case '?': + if(std::string(long_options[option_index].name) == "help") + { + show_usage(argv[0]); + return (-1); + }; + break; + default: show_usage(argv[0]); return (-1); + }; + }; + + if(optind + 5 > argc) + throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!"); + + data_type = std::atoi(argv[optind++]); + haveSavedMeanInvVar = std::atoi(argv[optind++]); + init_method = std::atoi(argv[optind++]); + time_kernel = static_cast(std::atoi(argv[optind++])); + use_multiblock_welford = static_cast(std::atoi(argv[optind])); + + return (0); + }; +}; + +using namespace ck; + +template +bool bnorm_bwd_nhwc_test(bool do_verification, + int init_method, + bool time_kernel, + const std::vector inOutLengths, + bool haveSavedMeanInvVar, + double epsilon) +{ + // for NHWC BatchNorm calculation of mean and meansquare + constexpr index_t Rank = 4; + constexpr index_t NumReduceDim = 3; + + const std::vector scaleBiasMeanVarLengths = {inOutLengths[3]}; + + // input data of the batchnorm backward 
algorithm + Tensor x(inOutLengths); + Tensor dy(inOutLengths); + + Tensor bnScale(scaleBiasMeanVarLengths); + + Tensor savedMean(scaleBiasMeanVarLengths); + Tensor savedInvVar(scaleBiasMeanVarLengths); + // savedVariance is only used for initializing savedInvVar + Tensor savedVariance(scaleBiasMeanVarLengths); + + // output data of the batchnorm backward algorithm + Tensor dx_ref(inOutLengths); + Tensor dx(inOutLengths); + + Tensor dscale(scaleBiasMeanVarLengths); + Tensor dbias(scaleBiasMeanVarLengths); + + Tensor dscale_ref(scaleBiasMeanVarLengths); + Tensor dbias_ref(scaleBiasMeanVarLengths); + + auto inOutStrides = dy.mDesc.GetStrides(); + auto scaleBiasMeanVarStrides = dscale.mDesc.GetStrides(); + + std::size_t num_thread = std::thread::hardware_concurrency(); + + if(haveSavedMeanInvVar) + { + const float x_mean = 0.0f; + const float x_stddev = 1.0f; + const float noise_stddev = 0.0001f; + + // input data in normal distribution + x.GenerateTensorValue(GeneratorTensor_4{x_mean, x_stddev}, num_thread); + + // initialize the savedMean to be values with tiny variation to the mean of the x values + savedMean.GenerateTensorValue(GeneratorTensor_4{x_mean, noise_stddev}, + num_thread); + + // initialize the variance to be values with tiny variation to the variance of the x values + savedVariance.GenerateTensorValue( + GeneratorTensor_4{x_stddev * x_stddev, noise_stddev}, num_thread); + + auto it_src = savedVariance.mData.begin(); + auto it_dst = savedInvVar.mData.begin(); + float tmp_epsilon = std::numeric_limits::epsilon(); + + while(it_src != savedVariance.mData.end()) + { + *it_dst = type_convert( + 1.0f / std::sqrtf(type_convert(*it_src) + tmp_epsilon)); + + it_src++; + it_dst++; + }; + } + else + { + const float x_mean = 0.0f; + const float x_stddev = 1.0f; + + // input data in normal distribution + x.GenerateTensorValue(GeneratorTensor_4{x_mean, x_stddev}, num_thread); + }; + + if(do_verification) + { + switch(init_method) + { + case 0: + dy.GenerateTensorValue(GeneratorTensor_0{}, num_thread); + bnScale.GenerateTensorValue(GeneratorTensor_0{}, num_thread); + break; + case 1: + dy.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + bnScale.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + break; + case 2: + dy.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + bnScale.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + default: + dy.GenerateTensorValue(GeneratorTensor_3{-0.2f, 0.2f}, num_thread); + bnScale.GenerateTensorValue(GeneratorTensor_3{-0.5f, 0.5f}, num_thread); + } + }; + + // input data of the batchnorm backward algorithm + DeviceMem x_dev(sizeof(InOutDataType) * x.mDesc.GetElementSpaceSize()); + DeviceMem dy_dev(sizeof(InOutDataType) * dy.mDesc.GetElementSpaceSize()); + + DeviceMem bnScale_dev(sizeof(AccDataType) * bnScale.mDesc.GetElementSpaceSize()); + + DeviceMem savedMean_dev(sizeof(AccDataType) * savedMean.mDesc.GetElementSpaceSize()); + DeviceMem savedInvVar_dev(sizeof(AccDataType) * savedInvVar.mDesc.GetElementSpaceSize()); + + // output data of the batchnorm backward algorithm + DeviceMem dx_dev(sizeof(InOutDataType) * dx.mDesc.GetElementSpaceSize()); + + DeviceMem dscale_dev(sizeof(AccDataType) * dscale.mDesc.GetElementSpaceSize()); + DeviceMem dbias_dev(sizeof(AccDataType) * dbias.mDesc.GetElementSpaceSize()); + + x_dev.ToDevice(x.mData.data()); + dy_dev.ToDevice(dy.mData.data()); + bnScale_dev.ToDevice(bnScale.mData.data()); + + if(haveSavedMeanInvVar) + { + savedMean_dev.ToDevice(savedMean.mData.data()); + 
savedInvVar_dev.ToDevice(savedInvVar.mData.data()); + }; + + std::array i_inOutLengths; + std::array i_inOutStrides; + std::array i_scaleBiasMeanVarLengths; + std::array i_scaleBiasMeanVarStrides; + + std::copy(inOutLengths.begin(), inOutLengths.end(), i_inOutLengths.begin()); + std::copy(inOutStrides.begin(), inOutStrides.end(), i_inOutStrides.begin()); + std::copy(scaleBiasMeanVarLengths.begin(), + scaleBiasMeanVarLengths.end(), + i_scaleBiasMeanVarLengths.begin()); + std::copy(scaleBiasMeanVarStrides.begin(), + scaleBiasMeanVarStrides.end(), + i_scaleBiasMeanVarStrides.begin()); + + using PassThroughOp = ck::tensor_operation::element_wise::PassThrough; + + using DeviceBatchNormBwdInstance = + ck::tensor_operation::device::DeviceBatchNormBwdImpl; // MeanVarSrcVectorSize + + auto batchnorm_bwd = DeviceBatchNormBwdInstance{}; + + auto argument_ptr = batchnorm_bwd.MakeArgumentPointer( + i_inOutLengths, + i_inOutStrides, + i_inOutStrides, + i_inOutStrides, + {0, 1, 2}, + i_scaleBiasMeanVarLengths, + i_scaleBiasMeanVarStrides, + i_scaleBiasMeanVarStrides, + i_scaleBiasMeanVarStrides, + x_dev.GetDeviceBuffer(), + dy_dev.GetDeviceBuffer(), + bnScale_dev.GetDeviceBuffer(), + haveSavedMeanInvVar ? savedMean_dev.GetDeviceBuffer() : nullptr, + haveSavedMeanInvVar ? savedInvVar_dev.GetDeviceBuffer() : nullptr, + epsilon, + PassThroughOp{}, + dx_dev.GetDeviceBuffer(), + dscale_dev.GetDeviceBuffer(), + dbias_dev.GetDeviceBuffer()); + + if(!batchnorm_bwd.IsSupportedArgument(argument_ptr.get())) + { + std::cout << "The runtime parameters seems not supported by the BatchNorm device instance, " + "exiting!" + << std::endl; + return (false); + }; + + size_t workspace_sz = batchnorm_bwd.GetWorkSpaceSize(argument_ptr.get()); + + DeviceMem workspace_dev(workspace_sz); + + batchnorm_bwd.SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer()); + + auto invoker_ptr = batchnorm_bwd.MakeInvokerPointer(); + + if(time_kernel) + { + float avg_time = 0.0f; + size_t num_bytes = 0; + + size_t total_length = inOutLengths[0] * inOutLengths[1] * inOutLengths[2] * inOutLengths[3]; + size_t invariant_length = inOutLengths[3]; + + avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + // inputing of x, dy, scale, outputing of dx, dscale, dbias + num_bytes += + total_length * sizeof(InOutDataType) * 3 + invariant_length * sizeof(AccDataType) * 3; + + // outputing of mean, inv-variance + num_bytes += haveSavedMeanInvVar ? invariant_length * sizeof(AccDataType) * 2 : 0; + + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s" << std::endl; + } + else + (void)invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + bool pass = true; + + if(do_verification) + { + using ReferenceBatchNormBwdInstance = + ck::tensor_operation::host::ReferenceBatchNormBwd_Input_N_H_W_C_Output_C; + + auto batchNormBwd_ref = ReferenceBatchNormBwdInstance{}; + + auto argument_ptr_ref = batchNormBwd_ref.MakeArgumentPointer( + i_inOutLengths, + i_inOutStrides, + i_inOutStrides, + i_inOutStrides, + {0, 1, 2}, + i_scaleBiasMeanVarLengths, + i_scaleBiasMeanVarStrides, + i_scaleBiasMeanVarStrides, + i_scaleBiasMeanVarStrides, + x.mData.data(), + dy.mData.data(), + bnScale.mData.data(), + haveSavedMeanInvVar ? savedMean.mData.data() : nullptr, + haveSavedMeanInvVar ? 
savedInvVar.mData.data() : nullptr, + epsilon, + PassThroughOp{}, + dx_ref.mData.data(), + dscale_ref.mData.data(), + dbias_ref.mData.data()); + + if(!batchNormBwd_ref.IsSupportedArgument(argument_ptr_ref.get())) + { + std::cout + << "The runtime parameters seems not supported by the device instance, exiting!" + << std::endl; + return (false); + }; + + auto invoker_ptr_ref = batchNormBwd_ref.MakeInvokerPointer(); + + (void)invoker_ptr_ref->Run(argument_ptr_ref.get()); + + dx_dev.FromDevice(dx.mData.data()); + dscale_dev.FromDevice(dscale.data()); + dbias_dev.FromDevice(dbias.data()); + + // clang-format off + pass = pass && ck::utils::check_err(dbias.mData, dbias_ref.mData, "dBias result:", 1e-5, 1e-5); + pass = pass && ck::utils::check_err(dscale.mData, dscale_ref.mData, "dScale result:", 1e-5, 2e-4); + pass = pass && ck::utils::check_err(dx.mData, dx_ref.mData, "dx result:"); + // clang-format on + }; + + return (pass); +}; + +static const double epsilon = std::numeric_limits::epsilon(); + +int main(int argc, char* argv[]) +{ + bool pass = true; + + if(argc > 1) + { + BatchNormBwdArg arg; + + if(arg.processArgs(argc, argv) < 0) + return (-1); + + if(arg.data_type == 0) + { + if(arg.use_multiblock_welford) + pass = bnorm_bwd_nhwc_test(arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inOutLengths, + arg.haveSavedMeanInvVar, + epsilon); + else + pass = bnorm_bwd_nhwc_test(arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inOutLengths, + arg.haveSavedMeanInvVar, + epsilon); + } + else if(arg.data_type == 1) + { + if(arg.use_multiblock_welford) + pass = bnorm_bwd_nhwc_test(arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inOutLengths, + arg.haveSavedMeanInvVar, + epsilon); + else + pass = bnorm_bwd_nhwc_test(arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inOutLengths, + arg.haveSavedMeanInvVar, + epsilon); + } + else if(arg.data_type == 5) + { + if(arg.use_multiblock_welford) + pass = bnorm_bwd_nhwc_test(arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inOutLengths, + arg.haveSavedMeanInvVar, + epsilon); + else + pass = bnorm_bwd_nhwc_test(arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inOutLengths, + arg.haveSavedMeanInvVar, + epsilon); + } + else if(arg.data_type == 6) + { + if(arg.use_multiblock_welford) + pass = bnorm_bwd_nhwc_test(arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inOutLengths, + arg.haveSavedMeanInvVar, + epsilon); + else + pass = bnorm_bwd_nhwc_test(arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inOutLengths, + arg.haveSavedMeanInvVar, + epsilon); + } + } + else + { + pass = bnorm_bwd_nhwc_test(true, + 3, + false, // don't time kernel + {128, 16, 6, 512}, + false, + epsilon); + + pass = pass && bnorm_bwd_nhwc_test(true, + 3, + false, // don't time kernel + {128, 16, 3, 1024}, + false, + epsilon); + }; + + return (pass ? 
0 : 1); +} diff --git a/example/34_batchnorm/batchnorm_infer_nhwc.cpp b/example/34_batchnorm/batchnorm_forward_inferring_nhwc.cpp similarity index 100% rename from example/34_batchnorm/batchnorm_infer_nhwc.cpp rename to example/34_batchnorm/batchnorm_forward_inferring_nhwc.cpp diff --git a/example/34_batchnorm/batchnorm_forward_nhwc.cpp b/example/34_batchnorm/batchnorm_forward_training_nhwc.cpp similarity index 100% rename from example/34_batchnorm/batchnorm_forward_nhwc.cpp rename to example/34_batchnorm/batchnorm_forward_training_nhwc.cpp diff --git a/include/ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp b/include/ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp new file mode 100644 index 0000000000..e969fd0be7 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceBatchNormBwd : public BaseOperator +{ + static constexpr index_t NumInvariantDim = Rank - NumBatchNormReduceDim; + + virtual std::unique_ptr + MakeArgumentPointer(const std::array xyLengths, + const std::array xStrides, + const std::array dyStrides, + const std::array dxStrides, + const std::array reduceDims, + const std::array bnScaleBiasMeanVarLengths, + const std::array bnScaleStrides, + const std::array bnBiasStrides, + const std::array bnMeanVarStrides, + const void* p_x, + const void* p_dy, + const void* p_scale, + const void* p_savedMean, + const void* p_savedInvVar, + double epsilon, + const DyElementwiseOp dy_elementwise_op, + void* p_dx, + void* p_dscale, + void* p_dbias) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +template +using DeviceBatchNormBwdPtr = + std::unique_ptr>; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp new file mode 100644 index 0000000000..d61dbd0010 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp @@ -0,0 +1,866 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
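One design choice in this implementation file is how the multiblock path splits the reduction dimension: the argument constructor searches for the smallest per-block tile-iteration count that keeps the number of blocks cooperating on one M-tile (blkGroupSize) at or below 128. A minimal sketch of that selection, assuming the same rounding as the loop further down; the free function and the sample K_BlockTileSize value are illustrative, not part of the patch:

```cpp
#include <cstdio>

// Re-statement of the blkGroupSize / numBlockTileIteration choice made by the
// multiblock path of DeviceBatchNormBwdImpl.
static void choose_multiblock_split(long reduce_length,
                                    int k_block_tile_size, // KThreadClusterSize * KThreadSliceSize
                                    int& blk_group_size,
                                    int& num_block_tile_iteration)
{
    int iterations = 1;

    // Grow the per-block work until no more than 128 blocks share one M-tile.
    while(true)
    {
        long test = (reduce_length + static_cast<long>(k_block_tile_size) * iterations - 1) /
                    (static_cast<long>(k_block_tile_size) * iterations);
        if(test <= 128)
            break;
        ++iterations;
    }

    blk_group_size = static_cast<int>(
        (reduce_length + static_cast<long>(k_block_tile_size) * iterations - 1) /
        (static_cast<long>(k_block_tile_size) * iterations));
    num_block_tile_iteration = iterations;
}

int main()
{
    int blkGroupSize = 0, numIter = 0;
    // e.g. -D 128,16,3,1024: reduce_length = 128*16*3 = 6144; K_BlockTileSize = 256 is an assumed value.
    choose_multiblock_split(6144, 256, blkGroupSize, numIter);
    std::printf("blkGroupSize=%d numBlockTileIteration=%d\n", blkGroupSize, numIter);
}
```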
+ +#pragma once + +#include +#include + +#include "ck/utility/reduction_operator.hpp" +#include "ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_batchnorm_backward_blockwise_welford.hpp" +#include "ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_first_half.hpp" +#include "ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_multiblock_reduce_first_half.hpp" +#include "ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/welford_helper.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceBatchNormBwdImpl + : public DeviceBatchNormBwd +{ + static_assert(Rank <= 6, "Bigger Rank size is not supported!"); + static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize, + "Invalid thread cluster size assignments!"); + + static_assert((XDyDxVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0 && + MThreadSliceSize % DySrcVectorSize == 0 && + MThreadSliceSize % DxDstVectorSize == 0) || + (XDyDxVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0 && + KThreadSliceSize % DySrcVectorSize == 0 && + KThreadSliceSize % DxDstVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static constexpr index_t NumInvariantDim = Rank - NumBatchNormReduceDim; + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + static auto MakeXY2dDescriptor(const std::array& xyLengths, + const std::array& xyStrides, + int blkGroupSize, + int numBlockTileIteration) + { + const auto tupleXYLengths = + generate_tuple([&](auto I) { return xyLengths[I]; }, Number{}); + const auto tupleXYStrides = + generate_tuple([&](auto I) { return xyStrides[I]; }, Number{}); + + const auto raw_grid_desc = make_naive_tensor_descriptor(tupleXYLengths, tupleXYStrides); + + const auto grid_desc_m_k = [&]() { + using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type; + using ReduceDims = typename arithmetic_sequence_gen::type; + + const auto reduceDimLengths = + generate_tuple([&](auto I) { return xyLengths[NumInvariantDim + I]; }, + Number{}); + const auto invariantDimLengths = + generate_tuple([&](auto I) { return xyLengths[I]; }, Number{}); + + return transform_tensor_descriptor(raw_grid_desc, + make_tuple(make_merge_transform(invariantDimLengths), + make_merge_transform(reduceDimLengths)), + make_tuple(InvariantDims{}, ReduceDims{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + }(); + + const auto invariantLength = grid_desc_m_k.GetLength(Number<0>{}); + const auto reduceLength = grid_desc_m_k.GetLength(Number<1>{}); + + const int workSizePerBlock = K_BlockTileSize * numBlockTileIteration; + const auto mPad = + math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; + const auto kPad = workSizePerBlock * blkGroupSize - reduceLength; + + auto grid_desc_m_k_padded = + transform_tensor_descriptor(grid_desc_m_k, + make_tuple(make_right_pad_transform(invariantLength, mPad), + 
make_right_pad_transform(reduceLength, kPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return (grid_desc_m_k_padded); + }; + + static auto MakeMultiblockFirstReduceOutputMG2dDescriptor(int invariantLength, int blkGroupSize) + { + const auto grid_desc_m_g = + make_naive_tensor_descriptor_packed(make_tuple(invariantLength, blkGroupSize)); + + const auto mPad = + math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; + + auto grid_desc_m_g_padded = + transform_tensor_descriptor(grid_desc_m_g, + make_tuple(make_right_pad_transform(invariantLength, mPad), + make_pass_through_transform(blkGroupSize)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return (grid_desc_m_g_padded); + }; + + static auto MakeMultiblockFinalReduceInputMK2dDescriptor(int invariantLength, int blkGroupSize) + { + const auto reduceLength = blkGroupSize; + const auto grid_desc_m_k = + make_naive_tensor_descriptor_packed(make_tuple(invariantLength, reduceLength)); + + const auto mPad = + math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; + const auto kPad = + math::integer_least_multiple(reduceLength, KThreadClusterSize) - reduceLength; + + auto grid_desc_m_k_padded = + transform_tensor_descriptor(grid_desc_m_k, + make_tuple(make_right_pad_transform(invariantLength, mPad), + make_right_pad_transform(reduceLength, kPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return (grid_desc_m_k_padded); + }; + + static auto + MakeScaleBiasMeanVar1dDescriptor(const std::array& lengths, + const std::array& strides) + { + const auto tupleLengths = + generate_tuple([&](auto I) { return lengths[I]; }, Number{}); + const auto tupleStrides = + generate_tuple([&](auto I) { return strides[I]; }, Number{}); + + auto raw_grid_desc = make_naive_tensor_descriptor(tupleLengths, tupleStrides); + + auto grid_desc_m = transform_tensor_descriptor( + raw_grid_desc, + make_tuple(make_merge_transform(tupleLengths)), + make_tuple(typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type{}), + make_tuple(Sequence<0>{})); + + const auto invariantLength = grid_desc_m.GetLength(Number<0>{}); + + const auto mPad = + math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; + + auto grid_desc_m_padded = + transform_tensor_descriptor(grid_desc_m, + make_tuple(make_right_pad_transform(invariantLength, mPad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + return (grid_desc_m_padded); + }; + + using XYGridDesc_M_K = decltype(MakeXY2dDescriptor({1}, {1}, 1, 1)); + using ScaleBiasGridDesc_M = decltype(MakeScaleBiasMeanVar1dDescriptor({1}, {1})); + using MeanVarGridDesc_M = ScaleBiasGridDesc_M; + + struct Argument : public BaseArgument + { + Argument(const std::array xyLengths, + const std::array xStrides, + const std::array dyStrides, + const std::array dxStrides, + const std::array reduceDims, + const std::array bnScaleBiasMeanVarLengths, + const std::array bnScaleStrides, + const std::array bnBiasStrides, + const std::array bnMeanVarStrides, + const XDataType* p_x, + const DyDataType* p_dy, + const ScaleDataType* p_scale, + const MeanVarDataType* p_savedMean, + const MeanVarDataType* p_savedInvVar, + const DyElementwiseOp dy_elementwise_op, + double epsilon, + DxDataType* p_dx, + ScaleDataType* p_dscale, + BiasDataType* p_dbias) + : bnScaleBiasMeanVarLengths_(bnScaleBiasMeanVarLengths), + bnScaleStrides_(bnScaleStrides), 
+ bnBiasStrides_(bnBiasStrides), + bnMeanVarStrides_(bnMeanVarStrides), + p_x_(p_x), + p_dy_(p_dy), + p_scale_(p_scale), + p_savedMean_(p_savedMean), + p_savedInvVar_(p_savedInvVar), + dy_elementwise_op_(dy_elementwise_op), + p_dx_(p_dx), + p_dscale_(p_dscale), + p_dbias_(p_dbias) + { + xyLengths_ = + shuffle_tensor_dimensions(xyLengths, reduceDims); + xStrides_ = + shuffle_tensor_dimensions(xStrides, reduceDims); + dyStrides_ = + shuffle_tensor_dimensions(dyStrides, reduceDims); + dxStrides_ = + shuffle_tensor_dimensions(dxStrides, reduceDims); + + std::tie(invariant_length, reduce_length) = + get_2d_lengths(xyLengths_); + + epsilon_ = type_convert(epsilon); + + haveSavedMeanInvVar_ = (p_savedMean_ != nullptr && p_savedInvVar_ != nullptr); + + if(UseMultiblockInK) + { + int iterations = 1; + while(true) + { + int testBlkGroupSize = (reduce_length + (K_BlockTileSize * iterations) - 1) / + (K_BlockTileSize * iterations); + + // we want the blkGroupSize be not more than 128 + if(testBlkGroupSize <= 128) + break; + + iterations++; + }; + + blkGroupSize = (reduce_length + (K_BlockTileSize * iterations) - 1) / + (K_BlockTileSize * iterations); + + numBlockTileIteration = iterations; + } + else + { + blkGroupSize = 1; + numBlockTileIteration = (reduce_length + K_BlockTileSize - 1) / K_BlockTileSize; + }; + + gridSize = (invariant_length + M_BlockTileSize - 1) / M_BlockTileSize * blkGroupSize; + + x_grid_desc_m_k = + MakeXY2dDescriptor(xyLengths_, xStrides_, blkGroupSize, numBlockTileIteration); + dy_grid_desc_m_k = + MakeXY2dDescriptor(xyLengths_, dyStrides_, blkGroupSize, numBlockTileIteration); + dx_grid_desc_m_k = + MakeXY2dDescriptor(xyLengths_, dxStrides_, blkGroupSize, numBlockTileIteration); + scale_grid_desc_m = + MakeScaleBiasMeanVar1dDescriptor(bnScaleBiasMeanVarLengths, bnScaleStrides); + bias_grid_desc_m = + MakeScaleBiasMeanVar1dDescriptor(bnScaleBiasMeanVarLengths, bnBiasStrides); + mean_var_grid_desc_m = + MakeScaleBiasMeanVar1dDescriptor(bnScaleBiasMeanVarLengths, bnMeanVarStrides); + } + + AccDataType epsilon_; + + bool haveSavedMeanInvVar_; + + std::array xyLengths_; + std::array xStrides_; + std::array dyStrides_; + std::array dxStrides_; + + std::array bnScaleBiasMeanVarLengths_; + std::array bnScaleStrides_; + std::array bnBiasStrides_; + std::array bnMeanVarStrides_; + + const XDataType* p_x_; + const DyDataType* p_dy_; + const ScaleDataType* p_scale_; + const MeanVarDataType* p_savedMean_; + const MeanVarDataType* p_savedInvVar_; + const DyElementwiseOp dy_elementwise_op_; + DxDataType* p_dx_; + ScaleDataType* p_dscale_; + BiasDataType* p_dbias_; + + long_index_t invariant_length; + long_index_t reduce_length; + + int blkGroupSize; + int numBlockTileIteration; + size_t gridSize; + + XYGridDesc_M_K x_grid_desc_m_k; + XYGridDesc_M_K dy_grid_desc_m_k; + XYGridDesc_M_K dx_grid_desc_m_k; + ScaleBiasGridDesc_M scale_grid_desc_m; + ScaleBiasGridDesc_M bias_grid_desc_m; + MeanVarGridDesc_M mean_var_grid_desc_m; + + void* workspace_mean; + void* workspace_variance; + void* workspace_count; + + void* workspace_savedMean; + void* workspace_savedInvVar; + + void* workspace_reduce_dscale; + void* workspace_reduce_dbias; + }; + + size_t GetWorkSpaceSize(const BaseArgument* pArg) const override + { + const Argument* pArg_ = dynamic_cast(pArg); + + size_t workspace_size = 0; + + if(UseMultiblockInK && pArg_->blkGroupSize > 1) + { + // workspace for the partial reduced result for dscale + workspace_size += + pArg_->invariant_length * pArg_->blkGroupSize * sizeof(ScaleDataType) + 64; + + // 
workspace for the partial reduced result for dbias + workspace_size += + pArg_->invariant_length * pArg_->blkGroupSize * sizeof(BiasDataType) + 64; + + if(!pArg_->haveSavedMeanInvVar_) + { + // workspace for welford intermediate mean + workspace_size += + pArg_->invariant_length * pArg_->blkGroupSize * sizeof(MeanVarDataType) + 64; + + // workspace for welford intermediate variance + workspace_size += + pArg_->invariant_length * pArg_->blkGroupSize * sizeof(MeanVarDataType) + 64; + + // workspace for welford intermediate count + workspace_size += + pArg_->invariant_length * pArg_->blkGroupSize * sizeof(int32_t) + 64; + + // workspace for welford result mean + workspace_size += pArg_->invariant_length * sizeof(MeanVarDataType) + 64; + + // workspace for welford result inv_variance + workspace_size += pArg_->invariant_length * sizeof(MeanVarDataType) + 64; + }; + } + + return (workspace_size); + }; + + void SetWorkSpacePointer(BaseArgument* pArg, void* p_workspace) const override + { + Argument* pArg_ = dynamic_cast(pArg); + + pArg_->p_workspace_ = p_workspace; + + index_t space_sz; + + // setup buffer for the partial reduced result for dscale + pArg_->workspace_reduce_dscale = pArg_->p_workspace_; + + space_sz = pArg_->invariant_length * pArg_->blkGroupSize * sizeof(ScaleDataType); + space_sz = math::integer_least_multiple(space_sz, 64); + + // setup buffer for the partial reduced result for dbias + pArg_->workspace_reduce_dbias = + reinterpret_cast(pArg_->workspace_reduce_dscale) + space_sz; + + if(UseMultiblockInK && pArg_->blkGroupSize > 1) + { + space_sz = pArg_->invariant_length * pArg_->blkGroupSize * sizeof(BiasDataType); + space_sz = math::integer_least_multiple(space_sz, 64); + + // setup buffer for welford intermediate mean + pArg_->workspace_mean = + reinterpret_cast(pArg_->workspace_reduce_dbias) + space_sz; + + space_sz = pArg_->invariant_length * pArg_->blkGroupSize * sizeof(MeanVarDataType); + space_sz = math::integer_least_multiple(space_sz, 64); + + // setup buffer for welford intermediate varirance + pArg_->workspace_variance = reinterpret_cast(pArg_->workspace_mean) + space_sz; + + space_sz = pArg_->invariant_length * pArg_->blkGroupSize * sizeof(MeanVarDataType); + space_sz = math::integer_least_multiple(space_sz, 64); + + // setup buffer for welford intermediate count + pArg_->workspace_count = reinterpret_cast(pArg_->workspace_variance) + space_sz; + + space_sz = pArg_->invariant_length * pArg_->blkGroupSize * sizeof(int32_t); + space_sz = math::integer_least_multiple(space_sz, 64); + + // setup buffer for welford result mean + pArg_->workspace_savedMean = reinterpret_cast(pArg_->workspace_count) + space_sz; + + space_sz = pArg_->invariant_length * sizeof(MeanVarDataType); + space_sz = math::integer_least_multiple(space_sz, 64); + + // setup buffer for welford result inv_variance + pArg_->workspace_savedInvVar = + reinterpret_cast(pArg_->workspace_savedMean) + space_sz; + }; + }; + + struct Invoker : public BaseInvoker + { + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + float avg_time = 0; + + const auto mean_var_count_grid_desc_m_g = + DeviceBatchNormBwdImpl::MakeMultiblockFirstReduceOutputMG2dDescriptor( + arg.invariant_length, arg.blkGroupSize); + + const auto dscale_dbias_grid_desc_m_g = + DeviceBatchNormBwdImpl::MakeMultiblockFirstReduceOutputMG2dDescriptor( + arg.invariant_length, arg.blkGroupSize); + + const auto mean_var_count_grid_desc_m_k = + DeviceBatchNormBwdImpl::MakeMultiblockFinalReduceInputMK2dDescriptor( + 
arg.invariant_length, arg.blkGroupSize); + + const auto dscale_dbias_grid_desc_m_k = + DeviceBatchNormBwdImpl::MakeMultiblockFinalReduceInputMK2dDescriptor( + arg.invariant_length, arg.blkGroupSize); + + using MeanVarCountGridDesc_M_G = decltype(mean_var_count_grid_desc_m_g); + using MeanVarCountGridDesc_M_K = decltype(mean_var_count_grid_desc_m_k); + using DscaleDbiasGridDesc_M_G = decltype(dscale_dbias_grid_desc_m_g); + using DscaleDbiasGridDesc_M_K = decltype(dscale_dbias_grid_desc_m_k); + + using GridwiseWelfordSecondHalfReduceFirstHalf_ = + GridwiseWelfordSecondHalfReduceFirstHalf; + + using GridwiseReduceSecondHalfBatchNormBwdFinal_ = + GridwiseReduceSecondHalfBatchNormBackwardFinal; + + if(UseMultiblockInK && arg.blkGroupSize > 1) + { + using GetReduceCountPerThreadFunctor = + GetReduceCountPerThreadForMultiblockWelford; + + GetReduceCountPerThreadFunctor get_reduce_count_per_thread( + arg.blkGroupSize, arg.numBlockTileIteration, arg.reduce_length); + + if(!arg.haveSavedMeanInvVar_) + { + using GridwiseMultiblockWelfordFirstHalf_ = + GridwiseMultiblockWelfordFirstHalf; + + const auto kern_multiblock_welford_first_half = + kernel_multiblock_welford_first_half; + + avg_time += launch_and_time_kernel( + stream_config, + kern_multiblock_welford_first_half, + dim3(arg.gridSize), + dim3(BlockSize), + 0, + arg.x_grid_desc_m_k, + mean_var_count_grid_desc_m_g, + get_reduce_count_per_thread, + arg.numBlockTileIteration, + arg.p_x_, + static_cast(arg.workspace_mean), + static_cast(arg.workspace_variance), + static_cast(arg.workspace_count)); + }; + + const auto kern_welford_second_half_reduce_first_half = + kernel_welford_second_half_reduce_first_half< + GridwiseWelfordSecondHalfReduceFirstHalf_, + XDataType, + DyDataType, + AccDataType, + ScaleDataType, + BiasDataType, + MeanVarDataType, + DyElementwiseOp, + XYGridDesc_M_K, + MeanVarGridDesc_M, + MeanVarCountGridDesc_M_K, + DscaleDbiasGridDesc_M_G>; + + const auto kern_reduce_second_half_batchnorm_backward_final = + kernel_reduce_second_half_batchnorm_backward_final< + GridwiseReduceSecondHalfBatchNormBwdFinal_, + XDataType, + DyDataType, + DxDataType, + ScaleDataType, + BiasDataType, + MeanVarDataType, + DyElementwiseOp, + XYGridDesc_M_K, + DscaleDbiasGridDesc_M_K, + MeanVarGridDesc_M, + ScaleBiasGridDesc_M>; + + index_t numDscaleDbiasBlockTileIteration = + (arg.blkGroupSize + KThreadClusterSize - 1) / KThreadClusterSize; + + avg_time += launch_and_time_kernel( + stream_config, + kern_welford_second_half_reduce_first_half, + dim3(arg.gridSize), + dim3(BlockSize), + 0, + arg.x_grid_desc_m_k, + arg.dy_grid_desc_m_k, + arg.mean_var_grid_desc_m, + mean_var_count_grid_desc_m_k, + dscale_dbias_grid_desc_m_g, + arg.blkGroupSize, + arg.numBlockTileIteration, + numDscaleDbiasBlockTileIteration, + arg.epsilon_, + arg.haveSavedMeanInvVar_, + arg.haveSavedMeanInvVar_ ? arg.p_savedMean_ : nullptr, + arg.haveSavedMeanInvVar_ ? arg.p_savedInvVar_ : nullptr, + arg.haveSavedMeanInvVar_ + ? nullptr + : static_cast(arg.workspace_mean), + arg.haveSavedMeanInvVar_ + ? nullptr + : static_cast(arg.workspace_variance), + arg.haveSavedMeanInvVar_ ? nullptr + : static_cast(arg.workspace_count), + arg.dy_elementwise_op_, + arg.haveSavedMeanInvVar_ + ? nullptr + : static_cast(arg.workspace_savedMean), + arg.haveSavedMeanInvVar_ + ? 
nullptr + : static_cast(arg.workspace_savedInvVar), + arg.p_x_, + arg.p_dy_, + static_cast(arg.workspace_reduce_dscale), + static_cast(arg.workspace_reduce_dbias)); + + avg_time += launch_and_time_kernel( + stream_config, + kern_reduce_second_half_batchnorm_backward_final, + dim3(arg.gridSize), + dim3(BlockSize), + 0, + arg.x_grid_desc_m_k, + arg.dy_grid_desc_m_k, + arg.dx_grid_desc_m_k, + dscale_dbias_grid_desc_m_k, + arg.mean_var_grid_desc_m, + arg.scale_grid_desc_m, + arg.bias_grid_desc_m, + arg.blkGroupSize, + arg.reduce_length, + arg.numBlockTileIteration, + numDscaleDbiasBlockTileIteration, + static_cast(arg.workspace_reduce_dscale), + static_cast(arg.workspace_reduce_dbias), + arg.haveSavedMeanInvVar_ + ? arg.p_savedMean_ + : static_cast(arg.workspace_savedMean), + arg.haveSavedMeanInvVar_ + ? arg.p_savedInvVar_ + : static_cast(arg.workspace_savedInvVar), + arg.p_x_, + arg.p_dy_, + arg.p_scale_, + arg.dy_elementwise_op_, + arg.p_dx_, + arg.p_dscale_, + arg.p_dbias_); + } + else + { + using GetReduceCountPerThreadFunctor = + GetReduceCountPerThreadForBlockwiseWelford; + + GetReduceCountPerThreadFunctor get_reduce_count_per_thread( + arg.numBlockTileIteration, arg.reduce_length); + + using GridwiseBatchNormBackwardWithBlockwiseWelford_ = + GridwiseBatchNormBackwardWithBlockwiseWelford; + + const auto kern_batchnorm_bwd = kernel_batchnorm_backward_with_blockwise_welford< + GridwiseBatchNormBackwardWithBlockwiseWelford_, + XDataType, + DyDataType, + DxDataType, + AccDataType, + ScaleDataType, + BiasDataType, + MeanVarDataType, + DyElementwiseOp, + XYGridDesc_M_K, + ScaleBiasGridDesc_M, + MeanVarGridDesc_M, + GetReduceCountPerThreadFunctor>; + + avg_time += launch_and_time_kernel(stream_config, + kern_batchnorm_bwd, + dim3(arg.gridSize), + dim3(BlockSize), + 0, + arg.x_grid_desc_m_k, + arg.dy_grid_desc_m_k, + arg.dx_grid_desc_m_k, + arg.scale_grid_desc_m, + arg.bias_grid_desc_m, + arg.mean_var_grid_desc_m, + get_reduce_count_per_thread, + arg.reduce_length, + arg.numBlockTileIteration, + arg.epsilon_, + arg.p_x_, + arg.p_dy_, + arg.p_scale_, + arg.haveSavedMeanInvVar_, + arg.p_savedMean_, + arg.p_savedInvVar_, + arg.dy_elementwise_op_, + arg.p_dx_, + arg.p_dscale_, + arg.p_dbias_); + }; + + return (avg_time); + }; + + float Run(const BaseArgument* pArg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(pArg), stream_config); + }; + }; + + bool IsSupportedArgument(const BaseArgument* pArg) override + { + const Argument* pArg_ = dynamic_cast(pArg); + + if constexpr(XDyDxVectorDim == 0) + { + if(pArg_->xStrides_[NumInvariantDim - 1] != 1 || + pArg_->dyStrides_[NumInvariantDim - 1] != 1 || + pArg_->dxStrides_[NumInvariantDim - 1] != 1) + return false; + + if(pArg_->xyLengths_[NumInvariantDim - 1] % XSrcVectorSize != 0 || + pArg_->xyLengths_[NumInvariantDim - 1] % DySrcVectorSize != 0 || + pArg_->xyLengths_[NumInvariantDim - 1] % DxDstVectorSize != 0) + return false; + } + else + { + if(pArg_->xStrides_[Rank - 1] != 1 || pArg_->dyStrides_[Rank - 1] != 1 || + pArg_->dxStrides_[Rank - 1] != 1) + return false; + + if(pArg_->xyLengths_[Rank - 1] % XSrcVectorSize != 0 || + pArg_->xyLengths_[Rank - 1] % DySrcVectorSize != 0 || + pArg_->xyLengths_[Rank - 1] % DxDstVectorSize != 0) + return false; + }; + + if(pArg_->bnScaleStrides_[NumInvariantDim - 1] != 1 && ScaleSrcDstVectorSize != 1) + return false; + + if(pArg_->bnBiasStrides_[NumInvariantDim - 1] != 1 && BiasDstVectorSize != 1) + return false; + + if(pArg_->bnScaleBiasMeanVarLengths_[NumInvariantDim - 
1] % ScaleSrcDstVectorSize != 0) + return false; + + if(pArg_->bnScaleBiasMeanVarLengths_[NumInvariantDim - 1] % BiasDstVectorSize != 0) + return false; + + if(pArg_->haveSavedMeanInvVar_) + { + if(pArg_->bnMeanVarStrides_[NumInvariantDim - 1] != 1 && MeanVarSrcVectorSize != 1) + return false; + + if(pArg_->bnScaleBiasMeanVarLengths_[NumInvariantDim - 1] % MeanVarSrcVectorSize != 0) + return false; + }; + + bool is_valid = true; + + static_for<0, NumInvariantDim, 1>{}([&](auto I) { + if(pArg_->xyLengths_[I] != pArg_->bnScaleBiasMeanVarLengths_[I]) + is_valid = false; + }); + + if(!is_valid) + return false; + + return true; + }; + + std::unique_ptr + MakeArgumentPointer(const std::array xyLengths, + const std::array xStrides, + const std::array dyStrides, + const std::array dxStrides, + const std::array reduceDims, + const std::array bnScaleBiasMeanVarLengths, + const std::array bnScaleStrides, + const std::array bnBiasStrides, + const std::array bnMeanVarStrides, + const void* p_x, + const void* p_dy, + const void* p_scale, + const void* p_savedMean, + const void* p_savedInvVar, + double epsilon, + const DyElementwiseOp dy_elementwise_op, + void* p_dx, + void* p_dscale, + void* p_dbias) override + { + return std::make_unique(xyLengths, + xStrides, + dyStrides, + dxStrides, + reduceDims, + bnScaleBiasMeanVarLengths, + bnScaleStrides, + bnBiasStrides, + bnMeanVarStrides, + static_cast(p_x), + static_cast(p_dy), + static_cast(p_scale), + static_cast(p_savedMean), + static_cast(p_savedInvVar), + dy_elementwise_op, + epsilon, + static_cast(p_dx), + static_cast(p_dscale), + static_cast(p_dbias)); + }; + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(); + }; + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceBatchNormBwdImpl<" << BlockSize << ","; + str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ","; + str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ","; + str << "XDyDxVectorDim_" << XDyDxVectorDim << ","; + str << "VectorSize_X" << XSrcVectorSize << "_scale_" << ScaleSrcDstVectorSize << "_bias_" << BiasDstVectorSize << "_mean_var_" << MeanVarSrcVectorSize << "_Dx_" << DxDstVectorSize << ">"; + // clang-format on + + return str.str(); + } +}; // namespace device + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp new file mode 100644 index 0000000000..9638755565 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp @@ -0,0 +1,534 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
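The kernel defined in this header finishes the dscale/dbias reduction started by the first-half kernel: each M-tile reads the blkGroupSize partial sums and adds them up before computing dx. A minimal scalar sketch of that second-half reduction for one channel, assuming the packed invariant_length x blkGroupSize layout produced by MakeMultiblockFirstReduceOutputMG2dDescriptor (buffer and function names are illustrative):

```cpp
#include <cstddef>
#include <vector>

// For channel m, sum the blkGroupSize partial dscale/dbias values written by the
// first-half kernel; dx is then computed element-wise as in Step 2 below.
void finish_dscale_dbias(const std::vector<float>& reduce_dscale, // invariant_length * blk_group_size
                         const std::vector<float>& reduce_dbias,  // same layout
                         int blk_group_size,
                         std::size_t m,
                         float& dscale,
                         float& dbias)
{
    dscale = 0.0f;
    dbias  = 0.0f;
    for(int g = 0; g < blk_group_size; ++g)
    {
        dscale += reduce_dscale[m * blk_group_size + g];
        dbias  += reduce_dbias[m * blk_group_size + g];
    }
}
```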
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void kernel_reduce_second_half_batchnorm_backward_final( + const XYGridDesc_M_K x_grid_desc_m_k, + const XYGridDesc_M_K dy_grid_desc_m_k, + const XYGridDesc_M_K dx_grid_desc_m_k, + const DscaleDbiasGridDesc_M_K dscale_dbias_grid_desc_m_k, + const MeanVarGridDesc_M mean_var_grid_desc_m, + const ScaleBiasGridDesc_M scale_grid_desc_m, + const ScaleBiasGridDesc_M bias_grid_desc_m, + index_t blkgroup_size, + long_index_t reduce_size, + index_t num_xy_k_block_tile_iteration, + index_t num_dscale_dbias_k_block_tile_iteration, + const ScaleDataType* const __restrict__ p_reduce_dscale, + const BiasDataType* const __restrict__ p_reduce_dbias, + const MeanVarDataType* const __restrict__ p_mean, + const MeanVarDataType* const __restrict__ p_inv_var, + const XDataType* const __restrict__ p_x, + const DyDataType* const __restrict__ p_dy, + const ScaleDataType* const __restrict__ p_scale, + const DyElementwiseOp dy_elementwise_op, + DxDataType* const __restrict__ p_dx, + ScaleDataType* const __restrict__ p_dscale, + BiasDataType* const __restrict__ p_dbias) +{ + GridwiseReduceSecondHalfBatchNormBackwardFinal_::Run(x_grid_desc_m_k, + dy_grid_desc_m_k, + dx_grid_desc_m_k, + dscale_dbias_grid_desc_m_k, + mean_var_grid_desc_m, + scale_grid_desc_m, + bias_grid_desc_m, + blkgroup_size, + reduce_size, + num_xy_k_block_tile_iteration, + num_dscale_dbias_k_block_tile_iteration, + p_reduce_dscale, + p_reduce_dbias, + p_mean, + p_inv_var, + p_x, + p_dy, + p_scale, + dy_elementwise_op, + p_dx, + p_dscale, + p_dbias); +}; + +template +struct GridwiseReduceSecondHalfBatchNormBackwardFinal +{ + static_assert((XDyDxVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0 && + MThreadSliceSize % DySrcVectorSize == 0 && + MThreadSliceSize % DxDstVectorSize == 0) || + (XDyDxVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0 && + KThreadSliceSize % DySrcVectorSize == 0 && + KThreadSliceSize % DxDstVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static constexpr bool reorder_thread_cluster = (XDyDxVectorDim == 0); + + using ThreadClusterLengths_M_K = Sequence; + + using ThreadBufferDimAccessOrder = + typename conditional, Sequence<0, 1>>::type; + + using ThreadClusterArrangeOrder = + typename conditional, Sequence<0, 1>>::type; + + static constexpr auto thread_cluster_desc = + make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); + + using ThreadReduceSrcDesc_M_1 = decltype( + make_naive_tensor_descriptor_packed(make_tuple(Number{}, Number<1>{}))); + using ThreadReduceDstDesc_M = + decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); + + using BlockwiseReduce = PartitionedBlockwiseReduction; + + using ThreadwiseReduce = ThreadwiseReduction; + + using PassThroughOp = tensor_operation::element_wise::PassThrough; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + // clang-format off + // Two of the steps of Multiblock BatchNorm Backward 
+ // Step 1: Second half of Reduction: dbias = sum(dy), dscale = sum(dy * (x-mean) * inv-variance) + // Step 2: calculating dx = 1/reduce_size * inv-variance * scale * (reduce_size * dy - dbias - dscale * (x - mean) * inv-variance)) elementwise-ly + // clang-format on + __device__ static void Run(const XYGridDesc_M_K& x_grid_desc_m_k, + const XYGridDesc_M_K& dy_grid_desc_m_k, + const XYGridDesc_M_K& dx_grid_desc_m_k, + const DscaleDbiasGridDesc_M_K& dscale_dbias_grid_desc_m_k, + const MeanVarGridDesc_M& mean_var_grid_desc_m, + const ScaleBiasGridDesc_M& scale_grid_desc_m, + const ScaleBiasGridDesc_M& bias_grid_desc_m, + index_t blkgroup_size, + long_index_t reduce_size, + index_t num_xy_k_block_tile_iteration, + index_t num_dscale_dbias_k_block_tile_iteration, + const ScaleDataType* const __restrict__ p_reduce_dscale, + const BiasDataType* const __restrict__ p_reduce_dbias, + const MeanVarDataType* const __restrict__ p_mean, + const MeanVarDataType* const __restrict__ p_inv_var, + const XDataType* const __restrict__ p_x, + const DyDataType* const __restrict__ p_dy, + const ScaleDataType* const __restrict__ p_scale, + const DyElementwiseOp dy_elementwise_op, + DxDataType* const __restrict__ p_dx, + ScaleDataType* const __restrict__ p_dscale, + BiasDataType* const __restrict__ p_dbias) + { + __shared__ AccDataType p_reduce_work_buffer[BlockSize]; + + auto reduce_work_buf = + make_dynamic_buffer(p_reduce_work_buffer, BlockSize); + + StaticBuffer + reduce_dscale_thread_buf; + StaticBuffer + reduce_dbias_thread_buf; + + StaticBuffer dscale_thread_buf; + StaticBuffer dbias_thread_buf; + + StaticBuffer + x_thread_buf; + StaticBuffer + dy_thread_buf; + StaticBuffer + dx_thread_buf; + + StaticBuffer mean_thread_buf; + StaticBuffer + inv_var_thread_buf; + StaticBuffer scale_thread_buf; + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_id = get_block_1d_id(); + const index_t blkgroup_id = block_global_id / blkgroup_size; + const index_t block_local_id = block_global_id % blkgroup_size; + + const auto thread_cluster_idx = + thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); + + const auto thread_m_cluster_id = thread_cluster_idx[I0]; + const auto thread_k_cluster_id = thread_cluster_idx[I1]; + + using ThreadBufferLengths_M_K = Sequence; + using ThreadBufferLengths_M = Sequence; + using ThreadBufferLengths_M_1 = Sequence; + constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + constexpr auto thread_buffer_desc_m = + make_naive_tensor_descriptor_packed(make_tuple(Number{})); + constexpr auto thread_buffer_desc_m_1 = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number<1>{})); + + // clang-format off + // Step 1: do final reduction of dbias = sum(dy), dscale = sum(dy * (x-mean) * inv-variance) + // clang-format on + + auto threadwise_dscale_load_m_k = + ThreadwiseTensorSliceTransfer_v2, + 1, + 1, + 1, + true>( + dscale_dbias_grid_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * 1)); + + auto threadwise_dbias_load_m_k = + ThreadwiseTensorSliceTransfer_v2, + 1, + 1, + 1, + true>( + dscale_dbias_grid_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * 1)); + + auto threadwise_dscale_store_m = + ThreadwiseTensorSliceTransfer_v1r3, + 0, + ScaleSrcDstVectorSize, + InMemoryDataOperationEnum::Set, + 1, + true>( + 
scale_grid_desc_m, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize), + PassThroughOp{}); + + auto threadwise_dbias_store_m = + ThreadwiseTensorSliceTransfer_v1r3, + 0, + BiasDstVectorSize, + InMemoryDataOperationEnum::Set, + 1, + true>( + bias_grid_desc_m, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize), + PassThroughOp{}); + + const auto reduce_dscale_global_buf = make_dynamic_buffer( + p_reduce_dscale, dscale_dbias_grid_desc_m_k.GetElementSpaceSize()); + + const auto reduce_dbias_global_buf = make_dynamic_buffer( + p_reduce_dbias, dscale_dbias_grid_desc_m_k.GetElementSpaceSize()); + + auto dscale_global_buf = make_dynamic_buffer( + p_dscale, scale_grid_desc_m.GetElementSpaceSize()); + + auto dbias_global_buf = make_dynamic_buffer( + p_dbias, bias_grid_desc_m.GetElementSpaceSize()); + + constexpr auto dscale_dbias_thread_copy_step_m_k = + make_multi_index(0, KThreadClusterSize * 1); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + dscale_thread_buf(I) = type_convert(0.0f); + dbias_thread_buf(I) = type_convert(0.0f); + }); + + for(index_t reducedTiles = 0; reducedTiles < num_dscale_dbias_k_block_tile_iteration; + ++reducedTiles) + { + threadwise_dscale_load_m_k.Run(dscale_dbias_grid_desc_m_k, + reduce_dscale_global_buf, + thread_buffer_desc_m_1, + make_tuple(I0, I0), + reduce_dscale_thread_buf); + + threadwise_dbias_load_m_k.Run(dscale_dbias_grid_desc_m_k, + reduce_dbias_global_buf, + thread_buffer_desc_m_1, + make_tuple(I0, I0), + reduce_dbias_thread_buf); + + ThreadwiseReduce::Reduce(reduce_dscale_thread_buf, dscale_thread_buf); + ThreadwiseReduce::Reduce(reduce_dbias_thread_buf, dbias_thread_buf); + + threadwise_dscale_load_m_k.MoveSrcSliceWindow(dscale_dbias_grid_desc_m_k, + dscale_dbias_thread_copy_step_m_k); + threadwise_dbias_load_m_k.MoveSrcSliceWindow(dscale_dbias_grid_desc_m_k, + dscale_dbias_thread_copy_step_m_k); + } + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if constexpr(I > 0) + block_sync_lds(); + + BlockwiseReduce::Reduce(reduce_work_buf, dscale_thread_buf(I)); + block_sync_lds(); + BlockwiseReduce::Reduce(reduce_work_buf, dbias_thread_buf(I)); + }); + + threadwise_dscale_store_m.Run(thread_buffer_desc_m, + make_tuple(I0), + dscale_thread_buf, + scale_grid_desc_m, + dscale_global_buf); + + threadwise_dbias_store_m.Run(thread_buffer_desc_m, + make_tuple(I0), + dbias_thread_buf, + bias_grid_desc_m, + dbias_global_buf); + + // clang-format off + // Step 2: calculate dx = 1/N * inv-variance * scale * (N * dy - dbias - dscale * (x - mean) * inv-variance) + // clang-format on + + const index_t workSizePerBlock = K_BlockTileSize * num_xy_k_block_tile_iteration; + + auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2( + x_grid_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, + workSizePerBlock * block_local_id + + thread_k_cluster_id * KThreadSliceSize)); + + auto threadwise_dy_load = ThreadwiseTensorSliceTransfer_v2( + dy_grid_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, + workSizePerBlock * block_local_id + + thread_k_cluster_id * KThreadSliceSize)); + + auto threadwise_dx_store = + ThreadwiseTensorSliceTransfer_v1r3( + dx_grid_desc_m_k, + make_multi_index( + blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, + workSizePerBlock * block_local_id + thread_k_cluster_id * KThreadSliceSize), + PassThroughOp{}); + + auto threadwise_scale_load = 
+ ThreadwiseTensorSliceTransfer_v2, + 0, + ScaleSrcDstVectorSize, + 1, + true>( + scale_grid_desc_m, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize)); + + auto threadwise_mean_var_load = + ThreadwiseTensorSliceTransfer_v2, + 0, + MeanVarSrcVectorSize, + 1, + true>( + mean_var_grid_desc_m, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize)); + + const auto x_global_buf = make_dynamic_buffer( + p_x, x_grid_desc_m_k.GetElementSpaceSize()); + + const auto dy_global_buf = make_dynamic_buffer( + p_dy, dy_grid_desc_m_k.GetElementSpaceSize()); + + auto dx_global_buf = make_dynamic_buffer( + p_dx, dx_grid_desc_m_k.GetElementSpaceSize()); + + const auto scale_global_buf = make_dynamic_buffer( + p_scale, scale_grid_desc_m.GetElementSpaceSize()); + + const auto mean_global_buf = make_dynamic_buffer( + p_mean, mean_var_grid_desc_m.GetElementSpaceSize()); + + const auto inv_var_global_buf = make_dynamic_buffer( + p_inv_var, mean_var_grid_desc_m.GetElementSpaceSize()); + + threadwise_scale_load.Run(scale_grid_desc_m, + scale_global_buf, + thread_buffer_desc_m, + make_tuple(I0), + scale_thread_buf); + + threadwise_mean_var_load.Run(mean_var_grid_desc_m, + mean_global_buf, + thread_buffer_desc_m, + make_tuple(I0), + mean_thread_buf); + + threadwise_mean_var_load.Run(mean_var_grid_desc_m, + inv_var_global_buf, + thread_buffer_desc_m, + make_tuple(I0), + inv_var_thread_buf); + + constexpr auto xy_thread_copy_step_m_k = make_multi_index(0, K_BlockTileSize); + + AccDataType inv_reduce_size = + type_convert(1.0) / type_convert(reduce_size); + + for(index_t reducedTiles = 0; reducedTiles < num_xy_k_block_tile_iteration; ++reducedTiles) + { + threadwise_x_load.Run(x_grid_desc_m_k, + x_global_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + x_thread_buf); + + threadwise_dy_load.Run(dy_grid_desc_m_k, + dy_global_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + dy_thread_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + AccDataType multiplier = + inv_reduce_size * inv_var_thread_buf[iM] * scale_thread_buf[iM]; + + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = + thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK)); + + dy_elementwise_op(dy_thread_buf(Number{}), + dy_thread_buf[Number{}]); + + AccDataType norm_x = (x_thread_buf[Number{}] - mean_thread_buf[iM]) * + inv_var_thread_buf[iM]; + + AccDataType tmpVal = norm_x * dscale_thread_buf[iM]; + + dx_thread_buf(Number{}) = + multiplier * + (type_convert(reduce_size) * dy_thread_buf[Number{}] - + dbias_thread_buf[iM] - tmpVal); + }); + }); + + threadwise_dx_store.Run(thread_buffer_desc_m_k, + make_tuple(I0, I0), + dx_thread_buf, + dx_grid_desc_m_k, + dx_global_buf); + + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, xy_thread_copy_step_m_k); + threadwise_dy_load.MoveSrcSliceWindow(dy_grid_desc_m_k, xy_thread_copy_step_m_k); + threadwise_dx_store.MoveDstSliceWindow(dx_grid_desc_m_k, xy_thread_copy_step_m_k); + } + }; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_first_half.hpp b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_first_half.hpp index 1afe9f9752..08cb0dd191 100644 --- a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_first_half.hpp +++ b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_first_half.hpp @@ -93,6 +93,9 
@@ struct GridwiseMultiblockWelfordFirstHalf static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + // clang-format off + // First half of the Multiblock Welford method to calculate mean and variance, used by both batchnorm-forward and batchnorm-backward. + // clang-format on __device__ static void Run(const XGridDesc_M_K& x_grid_desc_m_k, const MeanVarCountGridDesc_M_G& mean_var_count_grid_desc_m_g, const GetReduceCountPerThreadFunctor& get_reduce_count_per_thread, diff --git a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_batchnorm_forward_final.hpp b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_batchnorm_forward_final.hpp index 53d3e8aee7..548d7fd40a 100644 --- a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_batchnorm_forward_final.hpp +++ b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_batchnorm_forward_final.hpp @@ -529,6 +529,7 @@ struct GridwiseWelfordSecondHalfBatchNormForwardFinal auto result_inv_var_global_buf = make_dynamic_buffer( resultSaveInvVariance, mean_var_grid_desc_m.GetElementSpaceSize()); + // calculate inv-variance as 1/sqrt(epsilon+variance) static_for<0, MThreadSliceSize, 1>{}([&](auto I) { welford_var_thread_buf(I) = type_convert(1.0f) / sqrt(epsilon + welford_var_thread_buf[I]); diff --git a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_multiblock_reduce_first_half.hpp b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_multiblock_reduce_first_half.hpp new file mode 100644 index 0000000000..a4de9b7e6c --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_multiblock_reduce_first_half.hpp @@ -0,0 +1,575 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
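+// Note (illustrative, not authoritative): the "second half" kernel below merges the
+// per-block partial (mean, variance, count) triples produced by the first-half kernel.
+// A sketch of the standard parallel Welford/Chan merge of two partial results is shown
+// here for orientation; the exact buffer layout and update order used by
+// ThreadwiseWelfordMerge / BlockwiseWelford may differ:
+//
+//   count = count_a + count_b
+//   delta = mean_b - mean_a
+//   mean  = mean_a + delta * count_b / count
+//   var   = (count_a * var_a + count_b * var_b
+//            + delta * delta * count_a * count_b / count) / count
+//
+// followed by inv_var = 1 / sqrt(var + epsilon).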
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_welford.hpp" +#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void kernel_welford_second_half_reduce_first_half( + const XYGridDesc_M_K x_grid_desc_m_k, + const XYGridDesc_M_K dy_grid_desc_m_k, + const MeanVarGridDesc_M mean_var_grid_desc_m, + const MeanVarCountGridDesc_M_K mean_var_count_grid_desc_m_k, + const DscaleDbiasGridDesc_M_G dscale_dbias_grid_desc_m_g, + index_t blkgroup_size, + index_t num_xy_k_block_tile_iteration, + index_t num_mean_var_count_k_block_tile_iteration, + AccDataType epsilon, + bool haveSavedMeanInvVar, + const MeanVarDataType* const __restrict__ p_savedMean, + const MeanVarDataType* const __restrict__ p_savedInvVar, + const MeanVarDataType* const __restrict__ p_in_welford_mean, + const MeanVarDataType* const __restrict__ p_in_welford_variance, + const int32_t* const __restrict__ p_in_welford_count, + const DyElementwiseOp dy_elementwise_op, + MeanVarDataType* const __restrict__ p_out_welford_mean, + MeanVarDataType* const __restrict__ p_out_welford_inv_variance, + const XDataType* const __restrict__ p_x, + const DyDataType* const __restrict__ p_dy, + ScaleDataType* const __restrict__ p_reduce_dscale, + BiasDataType* const __restrict__ p_reduce_dbias) +{ + GridwiseWelfordSecondHalfReduceFirstHalf_::Run(x_grid_desc_m_k, + dy_grid_desc_m_k, + mean_var_grid_desc_m, + mean_var_count_grid_desc_m_k, + dscale_dbias_grid_desc_m_g, + blkgroup_size, + num_xy_k_block_tile_iteration, + num_mean_var_count_k_block_tile_iteration, + epsilon, + haveSavedMeanInvVar, + p_savedMean, + p_savedInvVar, + p_in_welford_mean, + p_in_welford_variance, + p_in_welford_count, + dy_elementwise_op, + p_out_welford_mean, + p_out_welford_inv_variance, + p_x, + p_dy, + p_reduce_dscale, + p_reduce_dbias); +}; + +template +struct GridwiseWelfordSecondHalfReduceFirstHalf +{ + static_assert((XDyVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0 && + MThreadSliceSize % DySrcVectorSize == 0) || + (XDyVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0 && + KThreadSliceSize % DySrcVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static constexpr bool reorder_thread_cluster = (XDyVectorDim == 0); + + using ThreadClusterLengths_M_K = Sequence; + + using ThreadBufferDimAccessOrder = + typename conditional, Sequence<0, 1>>::type; + + using ThreadClusterArrangeOrder = + typename conditional, Sequence<0, 1>>::type; + + static constexpr auto thread_cluster_desc = + make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); + + using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}))); + using ThreadReduceSrcDesc_M_1 = decltype( + make_naive_tensor_descriptor_packed(make_tuple(Number{}, Number<1>{}))); + using ThreadReduceDstDesc_M = + decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); + + using ThreadwiseWelford = + ThreadwiseWelfordMerge; + + using BlockwiseWelford = BlockwiseWelford; + + using BlockwiseReduce = PartitionedBlockwiseReduction; + + using ThreadwiseReduce = ThreadwiseReduction; + + using PassThroughOp = tensor_operation::element_wise::PassThrough; + + 
static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + // clang-format off + // Two of the steps of Multiblock BatchNorm Backward + // Step 1: Second half of Welford method to calculate mean and variance, as well as getting inv-variance = 1/sqrt(epsilon+variance) + // Step 2: First half of Reduction: dbias = sum(dy), dscale = sum(dy * (x-mean) * inv-variance) + // clang-format on + __device__ static void Run(const XYGridDesc_M_K& x_grid_desc_m_k, + const XYGridDesc_M_K& dy_grid_desc_m_k, + const MeanVarGridDesc_M& mean_var_grid_desc_m, + const MeanVarCountGridDesc_M_K& mean_var_count_grid_desc_m_k, + const DscaleDbiasGridDesc_M_G& dscale_dbias_grid_desc_m_g, + index_t blkgroup_size, + index_t num_xy_k_block_tile_iteration, + index_t num_mean_var_count_k_block_tile_iteration, + AccDataType epsilon, + bool haveSavedMeanInvVar, + const MeanVarDataType* const __restrict__ p_savedMean, + const MeanVarDataType* const __restrict__ p_savedInvVar, + const MeanVarDataType* const __restrict__ p_in_welford_mean, + const MeanVarDataType* const __restrict__ p_in_welford_variance, + const int32_t* const __restrict__ p_in_welford_count, + const DyElementwiseOp dy_elementwise_op, + MeanVarDataType* const __restrict__ p_out_welford_mean, + MeanVarDataType* const __restrict__ p_out_welford_inv_variance, + const XDataType* const __restrict__ p_x, + const DyDataType* const __restrict__ p_dy, + ScaleDataType* const __restrict__ p_reduce_dscale, + BiasDataType* const __restrict__ p_reduce_dbias) + { + __shared__ AccDataType p_reduce_work_buffer[BlockSize]; + + auto reduce_work_buf = + make_dynamic_buffer(p_reduce_work_buffer, BlockSize); + + StaticBuffer + in_welford_mean_thread_buf; + StaticBuffer + in_welford_var_thread_buf; + StaticBuffer + in_welford_count_thread_buf; + + StaticBuffer + welford_mean_thread_buf; + StaticBuffer + welford_var_thread_buf; + StaticBuffer + welford_count_thread_buf; + + StaticBuffer& mean_thread_buf = + welford_mean_thread_buf; + StaticBuffer& + inv_var_thread_buf = welford_var_thread_buf; + + StaticBuffer + x_thread_buf; + StaticBuffer + dy_thread_buf; + + // buffer of values of dy * (x-mean) * inv-variance, used as input of Blockwise reduction + StaticBuffer + tmp1_thread_buf; + + StaticBuffer + reduce_dscale_thread_buf; + StaticBuffer + reduce_dbias_thread_buf; + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_id = get_block_1d_id(); + const index_t blkgroup_id = block_global_id / blkgroup_size; + const index_t block_local_id = block_global_id % blkgroup_size; + + const auto thread_cluster_idx = + thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); + + const auto thread_m_cluster_id = thread_cluster_idx[I0]; + const auto thread_k_cluster_id = thread_cluster_idx[I1]; + + using ThreadBufferLengths_M_K = Sequence; + using ThreadBufferLengths_M = Sequence; + using ThreadBufferLengths_M_1 = Sequence; + constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + constexpr auto thread_buffer_desc_m = + make_naive_tensor_descriptor_packed(make_tuple(Number{})); + constexpr auto thread_buffer_desc_m_1 = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number<1>{})); + + // clang-format off + // Step 1: load existing mean and inv-variance, or do final welford 
reduction on mean and variance as well as get inv-variance = 1/sqrt(epsilon+variance) + // clang-format on + + if(haveSavedMeanInvVar) + { + const auto mean_global_buf = make_dynamic_buffer( + p_savedMean, mean_var_grid_desc_m.GetElementSpaceSize()); + + const auto inv_var_global_buf = make_dynamic_buffer( + p_savedInvVar, mean_var_grid_desc_m.GetElementSpaceSize()); + + auto threadwise_mean_inv_var_load = + ThreadwiseTensorSliceTransfer_v2, + 0, + MeanVarSrcVectorSize, + 1, + true>( + mean_var_grid_desc_m, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize)); + + threadwise_mean_inv_var_load.Run(mean_var_grid_desc_m, + mean_global_buf, + thread_buffer_desc_m, + make_tuple(I0), + mean_thread_buf); + + threadwise_mean_inv_var_load.Run(mean_var_grid_desc_m, + inv_var_global_buf, + thread_buffer_desc_m, + make_tuple(I0), + inv_var_thread_buf); + } + else + { + const auto welford_mean_global_buf = make_dynamic_buffer( + p_in_welford_mean, mean_var_count_grid_desc_m_k.GetElementSpaceSize()); + + const auto welford_var_global_buf = make_dynamic_buffer( + p_in_welford_variance, mean_var_count_grid_desc_m_k.GetElementSpaceSize()); + + const auto welford_count_global_buf = make_dynamic_buffer( + p_in_welford_count, mean_var_count_grid_desc_m_k.GetElementSpaceSize()); + + auto threadwise_mean_var_load_m_k = + ThreadwiseTensorSliceTransfer_v2, + 1, + 1, + 1, + true>( + mean_var_count_grid_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * 1)); + + auto threadwise_count_load_m_k = + ThreadwiseTensorSliceTransfer_v2, + 1, + 1, + 1, + true>( + mean_var_count_grid_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * 1)); + + constexpr auto mean_var_count_thread_copy_step_m_k = + make_multi_index(0, KThreadClusterSize * 1); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + welford_mean_thread_buf(I) = type_convert(0.0f); + welford_var_thread_buf(I) = type_convert(0.0f); + welford_count_thread_buf(I) = 0; + }); + + for(index_t reducedTiles = 0; reducedTiles < num_mean_var_count_k_block_tile_iteration; + ++reducedTiles) + { + threadwise_mean_var_load_m_k.Run(mean_var_count_grid_desc_m_k, + welford_mean_global_buf, + thread_buffer_desc_m_1, + make_tuple(I0, I0), + in_welford_mean_thread_buf); + + threadwise_mean_var_load_m_k.Run(mean_var_count_grid_desc_m_k, + welford_var_global_buf, + thread_buffer_desc_m_1, + make_tuple(I0, I0), + in_welford_var_thread_buf); + + threadwise_count_load_m_k.Run(mean_var_count_grid_desc_m_k, + welford_count_global_buf, + thread_buffer_desc_m_1, + make_tuple(I0, I0), + in_welford_count_thread_buf); + + ThreadwiseWelford::Run(in_welford_mean_thread_buf, + in_welford_var_thread_buf, + in_welford_count_thread_buf, + welford_mean_thread_buf, + welford_var_thread_buf, + welford_count_thread_buf); + + threadwise_mean_var_load_m_k.MoveSrcSliceWindow( + mean_var_count_grid_desc_m_k, mean_var_count_thread_copy_step_m_k); + threadwise_count_load_m_k.MoveSrcSliceWindow(mean_var_count_grid_desc_m_k, + mean_var_count_thread_copy_step_m_k); + } + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if constexpr(I > 0) + block_sync_lds(); + + BlockwiseWelford::Run(welford_mean_thread_buf(I), + welford_var_thread_buf(I), + welford_count_thread_buf(I)); + }); + + // calculate inv-variance as 1/sqrt(epsilon+variance), stored in place of variance + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + 
welford_var_thread_buf(I) = + type_convert(1.0) / sqrt(welford_var_thread_buf[I] + epsilon); + }); + + if(block_local_id == 0 && thread_k_cluster_id == 0) + { + + auto threadwise_mean_inv_var_store = + ThreadwiseTensorSliceTransfer_v1r3, + 0, + 1, + InMemoryDataOperationEnum::Set, + 1, + true>( + mean_var_grid_desc_m, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize), + PassThroughOp{}); + + auto mean_global_buf = make_dynamic_buffer( + p_out_welford_mean, mean_var_grid_desc_m.GetElementSpaceSize()); + + auto inv_var_global_buf = make_dynamic_buffer( + p_out_welford_inv_variance, mean_var_grid_desc_m.GetElementSpaceSize()); + + threadwise_mean_inv_var_store.Run(thread_buffer_desc_m, + make_tuple(I0), + mean_thread_buf, + mean_var_grid_desc_m, + mean_global_buf); + + threadwise_mean_inv_var_store.Run(thread_buffer_desc_m, + make_tuple(I0), + inv_var_thread_buf, + mean_var_grid_desc_m, + inv_var_global_buf); + }; + }; + + const index_t workSizePerBlock = K_BlockTileSize * num_xy_k_block_tile_iteration; + + auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2( + x_grid_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, + workSizePerBlock * block_local_id + + thread_k_cluster_id * KThreadSliceSize)); + + auto threadwise_dy_load = ThreadwiseTensorSliceTransfer_v2( + dy_grid_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, + workSizePerBlock * block_local_id + + thread_k_cluster_id * KThreadSliceSize)); + + const auto x_global_buf = make_dynamic_buffer( + p_x, x_grid_desc_m_k.GetElementSpaceSize()); + + const auto dy_global_buf = make_dynamic_buffer( + p_dy, dy_grid_desc_m_k.GetElementSpaceSize()); + + constexpr auto xy_thread_copy_step_m_k = make_multi_index(0, K_BlockTileSize); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + reduce_dscale_thread_buf(I) = type_convert(0); + reduce_dbias_thread_buf(I) = type_convert(0); + }); + + // clang-format off + // Step 2: first-half of reduction: dbias = sum(dy), dscale = sum(dy * (x-mean) * inv-variance) + // clang-format on + + for(index_t reducedTiles = 0; reducedTiles < num_xy_k_block_tile_iteration; ++reducedTiles) + { + threadwise_x_load.Run(x_grid_desc_m_k, + x_global_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + x_thread_buf); + + threadwise_dy_load.Run(dy_grid_desc_m_k, + dy_global_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + dy_thread_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = + thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK)); + + dy_elementwise_op(dy_thread_buf(Number{}), + dy_thread_buf[Number{}]); + + AccDataType norm_x = (x_thread_buf[Number{}] - mean_thread_buf[iM]) * + inv_var_thread_buf[iM]; + + tmp1_thread_buf(Number{}) = norm_x * dy_thread_buf[Number{}]; + }); + }); + + ThreadwiseReduce::Reduce(tmp1_thread_buf, reduce_dscale_thread_buf); + ThreadwiseReduce::Reduce(dy_thread_buf, reduce_dbias_thread_buf); + + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, xy_thread_copy_step_m_k); + threadwise_dy_load.MoveSrcSliceWindow(dy_grid_desc_m_k, xy_thread_copy_step_m_k); + }; + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if constexpr(I > 0) + block_sync_lds(); + + BlockwiseReduce::Reduce(reduce_work_buf, reduce_dscale_thread_buf(I)); + block_sync_lds(); + BlockwiseReduce::Reduce(reduce_work_buf, reduce_dbias_thread_buf(I)); + }); + + auto 
threadwise_dscale_store = + ThreadwiseTensorSliceTransfer_v1r3, + 1, + 1, + InMemoryDataOperationEnum::Set, + 1, + true>( + dscale_dbias_grid_desc_m_g, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + block_local_id), + PassThroughOp{}); + + auto threadwise_dbias_store = + ThreadwiseTensorSliceTransfer_v1r3, + 1, + 1, + InMemoryDataOperationEnum::Set, + 1, + true>( + dscale_dbias_grid_desc_m_g, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + block_local_id), + PassThroughOp{}); + + auto reduce_dscale_global_buf = make_dynamic_buffer( + p_reduce_dscale, dscale_dbias_grid_desc_m_g.GetElementSpaceSize()); + + auto reduce_dbias_global_buf = make_dynamic_buffer( + p_reduce_dbias, dscale_dbias_grid_desc_m_g.GetElementSpaceSize()); + + if(thread_k_cluster_id == 0) + { + threadwise_dscale_store.Run(thread_buffer_desc_m_1, + make_tuple(I0, I0), + reduce_dscale_thread_buf, + dscale_dbias_grid_desc_m_g, + reduce_dscale_global_buf); + + threadwise_dbias_store.Run(thread_buffer_desc_m_1, + make_tuple(I0, I0), + reduce_dbias_thread_buf, + dscale_dbias_grid_desc_m_g, + reduce_dbias_global_buf); + }; + }; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_backward_blockwise_welford.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_backward_blockwise_welford.hpp new file mode 100644 index 0000000000..c210d17d09 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_backward_blockwise_welford.hpp @@ -0,0 +1,572 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/math_v2.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_welford.hpp" +#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp" +#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void kernel_batchnorm_backward_with_blockwise_welford( + const XYGridDesc_M_K x_grid_desc_m_k, + const XYGridDesc_M_K dy_grid_desc_m_k, + const XYGridDesc_M_K dx_grid_desc_m_k, + const ScaleBiasGridDesc_M scale_grid_desc_m, + const ScaleBiasGridDesc_M bias_grid_desc_m, + const MeanVarGridDesc_M mean_var_grid_desc_m, + const GetReduceCountPerThreadFunctor get_reduce_count_per_thread, + long_index_t reduce_size, + index_t num_k_block_tile_iteration, + AccDataType epsilon, + const XDataType* const __restrict__ p_x, + const DyDataType* const __restrict__ p_dy, + const ScaleDataType* const __restrict__ p_scale, + bool haveSavedMeanInvVar, + const MeanVarDataType* const __restrict__ p_savedMean, + const MeanVarDataType* const __restrict__ p_savedInvVar, + const DyElementwiseOp dy_elementwise_op, + DxDataType* const __restrict__ p_dx, + ScaleDataType* const __restrict__ p_dscale, + BiasDataType* const __restrict__ p_dbias) +{ + GridwiseBatchrNormBackwardWithBlockwiseWelford_::Run(x_grid_desc_m_k, + dy_grid_desc_m_k, + dx_grid_desc_m_k, + scale_grid_desc_m, + bias_grid_desc_m, + mean_var_grid_desc_m, + get_reduce_count_per_thread, + reduce_size, + num_k_block_tile_iteration, + epsilon, + p_x, + p_dy, + p_scale, + 
haveSavedMeanInvVar, + p_savedMean, + p_savedInvVar, + dy_elementwise_op, + p_dx, + p_dscale, + p_dbias); +}; + +template +struct GridwiseBatchNormBackwardWithBlockwiseWelford +{ + static_assert((XDyDxVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0 && + MThreadSliceSize % DySrcVectorSize == 0 && + MThreadSliceSize % DxDstVectorSize == 0) || + (XDyDxVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0 && + KThreadSliceSize % DySrcVectorSize == 0 && + KThreadSliceSize % DxDstVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static constexpr bool reorder_thread_cluster = (XDyDxVectorDim == 0); + + using ThreadClusterLengths_M_K = Sequence; + + using ThreadBufferDimAccessOrder = + typename conditional, Sequence<0, 1>>::type; + + using ThreadClusterArrangeOrder = + typename conditional, Sequence<0, 1>>::type; + + static constexpr auto thread_cluster_desc = + make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); + + using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}))); + using ThreadReduceDstDesc_M = + decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); + + using ThreadwiseWelford = + ThreadwiseWelford; + + using BlockwiseWelford = BlockwiseWelford; + + using BlockwiseReduce = PartitionedBlockwiseReduction; + + using ThreadwiseReduce = ThreadwiseReduction; + + using PassThroughOp = tensor_operation::element_wise::PassThrough; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + // clang-format off + // Blockwise BatchNorm Backward + // Input: x, dy, scale, savedMean and savedInvVar (optional), reduce_size + // Output: dx, dscale, dbias + // Step 1: calculating mean and inv-variance using welford method (if savedMean/savedInvVar not available), where inv-variance = 1/sqrt(epsilon+variance) + // Step 2: reduction: dbias = sum(dy), dscale = sum(dy *(x-mean) * inv-variance) + // Step 3: calculating dx = 1/reduce_size * inv-variance * scale * (reduce_size * dy - dbias - dscale * (x - mean) * inv-variance)) elementwise-ly + // clang-format on + __device__ static void Run(const XYGridDesc_M_K x_grid_desc_m_k, + const XYGridDesc_M_K dy_grid_desc_m_k, + const XYGridDesc_M_K dx_grid_desc_m_k, + const ScaleBiasGridDesc_M scale_grid_desc_m, + const ScaleBiasGridDesc_M bias_grid_desc_m, + const MeanVarGridDesc_M mean_var_grid_desc_m, + const GetReduceCountPerThreadFunctor get_reduce_count_per_thread, + long_index_t reduce_size, + index_t num_k_block_tile_iteration, + AccDataType epsilon, + const XDataType* const __restrict__ p_x, + const DyDataType* const __restrict__ p_dy, + const ScaleDataType* const __restrict__ p_scale, + bool haveSavedMeanInvVar, + const MeanVarDataType* const __restrict__ p_savedMean, + const MeanVarDataType* const __restrict__ p_savedInvVar, + const DyElementwiseOp dy_elementwise_op, + DxDataType* const __restrict__ p_dx, + ScaleDataType* const __restrict__ p_dscale, + BiasDataType* const __restrict__ p_dbias) + { + using ck::math::sqrt; + + __shared__ AccDataType p_reduce_work_buffer[BlockSize]; + + auto reduce_work_buf = + make_dynamic_buffer(p_reduce_work_buffer, BlockSize); + + StaticBuffer + x_thread_buf; + + StaticBuffer + dy_thread_buf; + + StaticBuffer + dx_thread_buf; + + // buffer of values of dy * 
(x-mean) * invVariance, used as input of Blockwise reduction + StaticBuffer + tmp1_thread_buf; + + StaticBuffer scale_thread_buf; + + StaticBuffer mean_thread_buf; + StaticBuffer var_thread_buf; + StaticBuffer& + inv_var_thread_buf = var_thread_buf; + + StaticBuffer dscale_thread_buf; + StaticBuffer dbias_thread_buf; + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_id = get_block_1d_id(); + + const auto thread_cluster_idx = + thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); + + const auto thread_m_cluster_id = thread_cluster_idx[I0]; + const auto thread_k_cluster_id = thread_cluster_idx[I1]; + + using ThreadBufferLengths_M_K = Sequence; + using ThreadBufferLengths_M = Sequence; + constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + constexpr auto thread_buffer_desc_m = + make_naive_tensor_descriptor_packed(make_tuple(Number{})); + + auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2( + x_grid_desc_m_k, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * KThreadSliceSize)); + + auto threadwise_dy_load = ThreadwiseTensorSliceTransfer_v2( + x_grid_desc_m_k, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * KThreadSliceSize)); + + auto threadwise_dx_store = + ThreadwiseTensorSliceTransfer_v1r3( + dy_grid_desc_m_k, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * KThreadSliceSize), + PassThroughOp{}); + + auto threadwise_scale_load = + ThreadwiseTensorSliceTransfer_v2, + 0, + ScaleSrcDstVectorSize, + 1, + true>( + scale_grid_desc_m, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize)); + + auto threadwise_dscale_store = + ThreadwiseTensorSliceTransfer_v1r3, + 0, + ScaleSrcDstVectorSize, + InMemoryDataOperationEnum::Set, + 1, + true>( + scale_grid_desc_m, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize), + PassThroughOp{}); + + auto threadwise_dbias_store = + ThreadwiseTensorSliceTransfer_v1r3, + 0, + BiasDstVectorSize, + InMemoryDataOperationEnum::Set, + 1, + true>( + bias_grid_desc_m, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize), + PassThroughOp{}); + + constexpr auto thread_copy_fwd_step_m_k = make_multi_index(0, K_BlockTileSize); + constexpr auto thread_copy_bwd_step_m_k = make_multi_index(0, -K_BlockTileSize); + + const auto x_global_buf = make_dynamic_buffer( + p_x, x_grid_desc_m_k.GetElementSpaceSize()); + + const auto dy_global_buf = make_dynamic_buffer( + p_dy, dy_grid_desc_m_k.GetElementSpaceSize()); + + auto dx_global_buf = make_dynamic_buffer( + p_dx, dx_grid_desc_m_k.GetElementSpaceSize()); + + const auto scale_global_buf = make_dynamic_buffer( + p_scale, scale_grid_desc_m.GetElementSpaceSize()); + + auto dscale_global_buf = make_dynamic_buffer( + p_dscale, scale_grid_desc_m.GetElementSpaceSize()); + + auto dbias_global_buf = make_dynamic_buffer( + p_dbias, bias_grid_desc_m.GetElementSpaceSize()); + + // clang-format off + // Step 1: calculating mean and inv-variance using welford method (if savedMean/savedInvVar not available), where inv-variance = 1/sqrt(epsilon+variance) + // clang-format on + + if(haveSavedMeanInvVar) + { + const auto mean_global_buf = make_dynamic_buffer( + p_savedMean, 
mean_var_grid_desc_m.GetElementSpaceSize()); + + const auto inv_var_global_buf = make_dynamic_buffer( + p_savedInvVar, mean_var_grid_desc_m.GetElementSpaceSize()); + + auto threadwise_mean_inv_var_load = + ThreadwiseTensorSliceTransfer_v2, + 0, + MeanVarSrcVectorSize, + 1, + true>( + mean_var_grid_desc_m, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize)); + + threadwise_mean_inv_var_load.Run(mean_var_grid_desc_m, + mean_global_buf, + thread_buffer_desc_m, + make_tuple(I0), + mean_thread_buf); + + threadwise_mean_inv_var_load.Run(mean_var_grid_desc_m, + inv_var_global_buf, + thread_buffer_desc_m, + make_tuple(I0), + inv_var_thread_buf); + } + else + { + auto threadwise_welford = ThreadwiseWelford(); + threadwise_welford.max_count_ = get_reduce_count_per_thread(thread_k_cluster_id); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + mean_thread_buf(I) = type_convert(0.0f); + var_thread_buf(I) = type_convert(0.0f); + }); + + for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles) + { + + threadwise_x_load.Run(x_grid_desc_m_k, + x_global_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + x_thread_buf); + + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k); + threadwise_welford.Run(x_thread_buf, mean_thread_buf, var_thread_buf); + } + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if constexpr(I > 0) + block_sync_lds(); + + int count = threadwise_welford.cur_count_; + BlockwiseWelford::Run(mean_thread_buf(I), var_thread_buf(I), count); + }); + + // calculate inv-variance as 1/sqrt(epsilon+variance) + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + inv_var_thread_buf(I) = + type_convert(1.0) / sqrt(var_thread_buf[I] + epsilon); + }); + + threadwise_x_load.SetSrcSliceOrigin( + x_grid_desc_m_k, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * KThreadSliceSize)); + }; + + // clang-format off + // Step 2: reduction: dbias = sum(dy), dscale = sum(dy *(x-mean) * inv-variance) + // clang-format on + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + dscale_thread_buf(I) = type_convert(0); + dbias_thread_buf(I) = type_convert(0); + }); + + for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles) + { + threadwise_x_load.Run(x_grid_desc_m_k, + x_global_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + x_thread_buf); + + threadwise_dy_load.Run(dx_grid_desc_m_k, + dy_global_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + dy_thread_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = + thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK)); + + dy_elementwise_op(dy_thread_buf(Number{}), + dy_thread_buf[Number{}]); + + AccDataType norm_x = (x_thread_buf[Number{}] - mean_thread_buf[iM]) * + inv_var_thread_buf[iM]; + + tmp1_thread_buf(Number{}) = norm_x * dy_thread_buf[Number{}]; + }); + }); + + ThreadwiseReduce::Reduce(tmp1_thread_buf, dscale_thread_buf); + ThreadwiseReduce::Reduce(dy_thread_buf, dbias_thread_buf); + + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k); + threadwise_dy_load.MoveSrcSliceWindow(dy_grid_desc_m_k, thread_copy_fwd_step_m_k); + }; + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if constexpr(I > 0) + block_sync_lds(); + BlockwiseReduce::Reduce(reduce_work_buf, dscale_thread_buf(I)); + block_sync_lds(); + 
BlockwiseReduce::Reduce(reduce_work_buf, dbias_thread_buf(I)); + }); + + if(thread_k_cluster_id == 0) + { + threadwise_dscale_store.Run(thread_buffer_desc_m, + make_tuple(I0), + dscale_thread_buf, + scale_grid_desc_m, + dscale_global_buf); + + threadwise_dbias_store.Run(thread_buffer_desc_m, + make_tuple(I0), + dbias_thread_buf, + bias_grid_desc_m, + dbias_global_buf); + }; + + // clang-format off + // Step 3: calculating dx = 1/reduce_size * inv-variance * scale * (reduce_size * dy - dbias - dscale * (x - mean) * inv-variance)) elementwise-ly + // clang-format on + + threadwise_scale_load.Run(scale_grid_desc_m, + scale_global_buf, + thread_buffer_desc_m, + make_tuple(I0), + scale_thread_buf); + + auto thread_copy_tail_m_k = (num_k_block_tile_iteration - 1) * thread_copy_fwd_step_m_k; + + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_bwd_step_m_k); + threadwise_dy_load.MoveSrcSliceWindow(dy_grid_desc_m_k, thread_copy_bwd_step_m_k); + threadwise_dx_store.MoveDstSliceWindow(dx_grid_desc_m_k, thread_copy_tail_m_k); + + AccDataType inv_reduce_size = + type_convert(1.0) / type_convert(reduce_size); + + for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles) + { + threadwise_x_load.Run(x_grid_desc_m_k, + x_global_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + x_thread_buf); + + threadwise_dy_load.Run(dy_grid_desc_m_k, + dy_global_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + dy_thread_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + AccDataType multiplier = + inv_reduce_size * inv_var_thread_buf[iM] * scale_thread_buf[iM]; + + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = + thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK)); + + dy_elementwise_op(dy_thread_buf(Number{}), + dy_thread_buf[Number{}]); + + AccDataType norm_x = (x_thread_buf[Number{}] - mean_thread_buf[iM]) * + inv_var_thread_buf[iM]; + + AccDataType tmpVal = norm_x * dscale_thread_buf[iM]; + + dx_thread_buf(Number{}) = + multiplier * + (type_convert(reduce_size) * dy_thread_buf[Number{}] - + dbias_thread_buf[iM] - tmpVal); + }); + }); + + threadwise_dx_store.Run(thread_buffer_desc_m_k, + make_tuple(I0, I0), + dx_thread_buf, + dx_grid_desc_m_k, + dx_global_buf); + + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_bwd_step_m_k); + threadwise_dy_load.MoveSrcSliceWindow(dy_grid_desc_m_k, thread_copy_bwd_step_m_k); + threadwise_dx_store.MoveDstSliceWindow(dx_grid_desc_m_k, thread_copy_bwd_step_m_k); + } + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_forward_blockwise_welford.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_forward_blockwise_welford.hpp index b0c9ceb3da..33c45a0f03 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_forward_blockwise_welford.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_forward_blockwise_welford.hpp @@ -441,6 +441,7 @@ struct GridwiseBatchNormForwardWithBlockwiseWelford auto result_inv_var_global_buf = make_dynamic_buffer( resultSaveInvVariance, mean_var_grid_desc_m.GetElementSpaceSize()); + // calculate inv-variance as 1/sqrt(epsilon+variance), stored in place of variance static_for<0, MThreadSliceSize, 1>{}([&](auto I) { var_thread_buf(I) = type_convert(1.0f) / sqrt(epsilon + var_thread_buf[I]); diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_multiblock_welford_first_half.hpp 
b/include/ck/tensor_operation/gpu/grid/gridwise_multiblock_welford_first_half.hpp new file mode 100644 index 0000000000..1afe9f9752 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_multiblock_welford_first_half.hpp @@ -0,0 +1,258 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/math.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_welford.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void kernel_multiblock_welford_first_half( + const XGridDesc_M_K x_grid_desc_m_k, + const MeanVarCountGridDesc_M_G mean_var_count_grid_desc_m_g, + const GetReduceCountPerThreadFunctor get_reduce_count_per_thread, + index_t num_k_block_tile_iteration, + const XDataType* const __restrict__ p_x, + MeanVarDataType* const p_welford_mean, + MeanVarDataType* const p_welford_variance, + int32_t* const p_welford_count) +{ + GridwiseMultiblockWelfordFirstHalf_::Run(x_grid_desc_m_k, + mean_var_count_grid_desc_m_g, + get_reduce_count_per_thread, + num_k_block_tile_iteration, + p_x, + p_welford_mean, + p_welford_variance, + p_welford_count); +}; + +template +struct GridwiseMultiblockWelfordFirstHalf +{ + static_assert((XSrcCountSrcVectorDim == 0 && MThreadSliceSize % XSrcCountSrcVectorSize == 0) || + (XSrcCountSrcVectorDim == 1 && + KThreadSliceSize % XSrcCountSrcVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static constexpr bool reorder_thread_cluster = (XSrcCountSrcVectorDim == 0); + + using ThreadClusterLengths_M_K = Sequence; + + using ThreadBufferDimAccessOrder = + typename conditional, Sequence<0, 1>>::type; + + using ThreadClusterArrangeOrder = + typename conditional, Sequence<0, 1>>::type; + + static constexpr auto thread_cluster_desc = + make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); + + using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}))); + using ThreadReduceDstDesc_M = + decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); + + using ThreadwiseWelford = + ThreadwiseWelford; + + using BlockwiseWelford = BlockwiseWelford; + + using PassThroughOp = tensor_operation::element_wise::PassThrough; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + __device__ static void Run(const XGridDesc_M_K& x_grid_desc_m_k, + const MeanVarCountGridDesc_M_G& mean_var_count_grid_desc_m_g, + const GetReduceCountPerThreadFunctor& get_reduce_count_per_thread, + index_t num_k_block_tile_iteration, + const XDataType* const __restrict__ p_x, + MeanVarDataType* const p_welford_mean, + MeanVarDataType* const p_welford_variance, + int32_t* const p_welford_count) + { + StaticBuffer + x_thread_buf; + + StaticBuffer + welford_mean_thread_buf; + StaticBuffer + welford_var_thread_buf; + StaticBuffer + welford_count_thread_buf; + + const index_t blkgroup_size = mean_var_count_grid_desc_m_g.GetLength(I1); + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_id = 
get_block_1d_id(); + const index_t blkgroup_id = block_global_id / blkgroup_size; + const index_t block_local_id = block_global_id % blkgroup_size; + + const auto thread_cluster_idx = + thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); + + const auto thread_m_cluster_id = thread_cluster_idx[I0]; + const auto thread_k_cluster_id = thread_cluster_idx[I1]; + + using ThreadBufferLengths_M_K = Sequence; + using ThreadBufferLengths_M_1 = Sequence; + + constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + constexpr auto thread_buffer_desc_m_1 = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number<1>{})); + + const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration; + + auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2( + x_grid_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, + block_local_id * reduceSizePerBlock + + thread_k_cluster_id * KThreadSliceSize)); + + auto threadwise_welford_mean_var_store = + ThreadwiseTensorSliceTransfer_v1r3, + 1, + 1, + InMemoryDataOperationEnum::Set, + 1, + true>( + mean_var_count_grid_desc_m_g, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + block_local_id), + PassThroughOp{}); + + auto threadwise_welford_count_store = + ThreadwiseTensorSliceTransfer_v1r3, + 1, + 1, + InMemoryDataOperationEnum::Set, + 1, + true>( + mean_var_count_grid_desc_m_g, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + block_local_id), + PassThroughOp{}); + + constexpr auto thread_copy_fwd_step_m_k = make_multi_index(0, K_BlockTileSize); + + const auto x_global_val_buf = make_dynamic_buffer( + p_x, x_grid_desc_m_k.GetElementSpaceSize()); + + auto welford_mean_global_val_buf = make_dynamic_buffer( + p_welford_mean, mean_var_count_grid_desc_m_g.GetElementSpaceSize()); + + auto welford_var_global_val_buf = make_dynamic_buffer( + p_welford_variance, mean_var_count_grid_desc_m_g.GetElementSpaceSize()); + + auto welford_count_global_val_buf = make_dynamic_buffer( + p_welford_count, mean_var_count_grid_desc_m_g.GetElementSpaceSize()); + + auto threadwise_welford = ThreadwiseWelford(); + threadwise_welford.max_count_ = + get_reduce_count_per_thread(block_local_id, thread_k_cluster_id); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + welford_mean_thread_buf(I) = type_convert(0.0f); + welford_var_thread_buf(I) = type_convert(0.0f); + }); + + for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles) + { + threadwise_x_load.Run(x_grid_desc_m_k, + x_global_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + x_thread_buf); + + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k); + threadwise_welford.Run(x_thread_buf, welford_mean_thread_buf, welford_var_thread_buf); + } + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if constexpr(I > 0) + block_sync_lds(); + + welford_count_thread_buf(I) = threadwise_welford.cur_count_; + BlockwiseWelford::Run( + welford_mean_thread_buf(I), welford_var_thread_buf(I), welford_count_thread_buf(I)); + }); + + if(thread_k_cluster_id == 0) + { + threadwise_welford_mean_var_store.Run(thread_buffer_desc_m_1, + make_tuple(I0, I0), + welford_mean_thread_buf, + mean_var_count_grid_desc_m_g, + welford_mean_global_val_buf); + + threadwise_welford_mean_var_store.Run(thread_buffer_desc_m_1, + make_tuple(I0, I0), + 
welford_var_thread_buf, + mean_var_count_grid_desc_m_g, + welford_var_global_val_buf); + + threadwise_welford_count_store.Run(thread_buffer_desc_m_1, + make_tuple(I0, I0), + welford_count_thread_buf, + mean_var_count_grid_desc_m_g, + welford_count_global_val_buf); + }; + } +}; + +} // namespace ck diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward_nhwc_c.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward_nhwc_c.hpp new file mode 100644 index 0000000000..64eb06a441 --- /dev/null +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward_nhwc_c.hpp @@ -0,0 +1,319 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include + +#include "ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp" + +namespace ck { +namespace tensor_operation { +namespace host { + +template +struct ReferenceBatchNormBwd_Input_N_H_W_C_Output_C + : public device::DeviceBatchNormBwd<4, 3, DyElementwiseOp> +{ + struct Argument : public device::BaseArgument + { + Argument(const std::array xyLengths, + const std::array xStrides, + const std::array dyStrides, + const std::array dxStrides, + const std::array reduceDims, + const std::array bnScaleBiasMeanVarLengths, + const std::array bnScaleStrides, + const std::array bnBiasStrides, + const std::array bnMeanVarStrides, + const XDataType* p_x, + const DyDataType* p_dy, + const ScaleDataType* p_scale, + const MeanVarDataType* p_savedMean, + const MeanVarDataType* p_savedInvVar, + double epsilon, + const DyElementwiseOp dy_elementwise_op, + DxDataType* p_dx, + ScaleDataType* p_dscale, + BiasDataType* p_dbias) + : p_x_(p_x), + p_dy_(p_dy), + p_scale_(p_scale), + p_savedMean_(p_savedMean), + p_savedInvVar_(p_savedInvVar), + epsilon_(epsilon), + dy_elementwise_op_(dy_elementwise_op), + p_dx_(p_dx), + p_dscale_(p_dscale), + p_dbias_(p_dbias) + { + ignore = xStrides; + ignore = dyStrides; + ignore = dxStrides; + ignore = bnScaleStrides; + ignore = bnBiasStrides; + ignore = bnMeanVarStrides; + + if(xyLengths.size() != 4 || bnScaleBiasMeanVarLengths.size() != 1 || + bnScaleBiasMeanVarLengths[0] != xyLengths[3]) + throw std::runtime_error("Invalid tensor dimensions!"); + + if(reduceDims[0] != 0 || reduceDims[1] != 1 || reduceDims[2] != 2) + throw std::runtime_error("Invalid reduce dimensions!"); + + n_ = xyLengths[0]; + h_ = xyLengths[1]; + w_ = xyLengths[2]; + c_ = xyLengths[3]; + + haveSavedMeanInvVar_ = (p_savedMean != nullptr && p_savedInvVar != nullptr); + } + + const XDataType* p_x_; + const DyDataType* p_dy_; + const ScaleDataType* p_scale_; + const MeanVarDataType* p_savedMean_; + const MeanVarDataType* p_savedInvVar_; + + double epsilon_; + const DyElementwiseOp dy_elementwise_op_; + + DxDataType* p_dx_; + ScaleDataType* p_dscale_; + BiasDataType* p_dbias_; + + bool haveSavedMeanInvVar_; + + index_t n_, h_, w_, c_; + }; + + struct Invoker : public device::BaseInvoker + { + float Run(const Argument& arg) + { + auto thread_reduce_func = [&](auto iC) { + AccDataType reduceSize = type_convert(arg.n_) * + type_convert(arg.h_) * + type_convert(arg.w_); + index_t offset_C = iC; + AccDataType mean; + AccDataType invVar; + + if(arg.haveSavedMeanInvVar_) + { + mean = arg.p_savedMean_[offset_C]; + invVar = arg.p_savedInvVar_[offset_C]; + } + else + { + AccDataType meansquare; + + meansquare = type_convert(0.0f); + mean = type_convert(0.0f); + + // compute 
mean, meanquare, variance, inv-variance + for(index_t iN = 0; iN < arg.n_; iN++) + { + index_t offset_N = iN * arg.h_ * arg.w_ * arg.c_; + for(index_t iH = 0; iH < arg.h_; iH++) + { + index_t offset_H = iH * arg.w_ * arg.c_; + for(index_t iW = 0; iW < arg.w_; iW++) + { + index_t offset_W = iW * arg.c_; + + auto offset = offset_N + offset_H + offset_W + offset_C; + + AccDataType x = type_convert(arg.p_x_[offset]); + + mean += x; + meansquare += x * x; + }; + } + }; + + mean = mean / reduceSize; + meansquare = meansquare / reduceSize; + + AccDataType variance = meansquare - mean * mean; + invVar = type_convert(1.0f) / + std::sqrt(type_convert(arg.epsilon_) + variance); + }; + + AccDataType dbias = type_convert(0.0f); // Sum on NHW of dy + AccDataType dscale = type_convert(0.0f); // Sum on NHW of dy * norm_x + + // 1) calculate dy * (x - mean) * inv-variance + // 2) calculate sum(dy) on NHW dimensions + // 3) calculate sum(dy * norm_x) on NHW dimensions + for(index_t iN = 0; iN < arg.n_; iN++) + { + index_t offset_N = iN * arg.h_ * arg.w_ * arg.c_; + for(index_t iH = 0; iH < arg.h_; iH++) + { + index_t offset_H = iH * arg.w_ * arg.c_; + for(index_t iW = 0; iW < arg.w_; iW++) + { + index_t offset_W = iW * arg.c_; + + auto offset = offset_N + offset_H + offset_W + offset_C; + + AccDataType x = type_convert(arg.p_x_[offset]); + + AccDataType norm_x = (x - mean) * invVar; + AccDataType dy = type_convert(arg.p_dy_[offset]); + + arg.dy_elementwise_op_(dy, dy); + + dbias += dy; + dscale += norm_x * dy; + }; + } + }; + + arg.p_dscale_[offset_C] = type_convert(dscale); + arg.p_dbias_[offset_C] = type_convert(dbias); + + AccDataType scale = type_convert(arg.p_scale_[offset_C]); + AccDataType multiplier = + type_convert(1.0f) / reduceSize * invVar * scale; + + // 1) calculate tmp = dscale * (x - mean) * inv-variance + // 2) calculate dx = 1/nhw * inv-variance * scale * (nhw * dy - dbias - tmp) + for(index_t iN = 0; iN < arg.n_; iN++) + { + index_t offset_N = iN * arg.h_ * arg.w_ * arg.c_; + for(index_t iH = 0; iH < arg.h_; iH++) + { + index_t offset_H = iH * arg.w_ * arg.c_; + for(index_t iW = 0; iW < arg.w_; iW++) + { + index_t offset_W = iW * arg.c_; + + auto offset = offset_N + offset_H + offset_W + offset_C; + + AccDataType x = type_convert(arg.p_x_[offset]); + + AccDataType norm_x = (x - mean) * invVar; + AccDataType dy = type_convert(arg.p_dy_[offset]); + + arg.dy_elementwise_op_(dy, dy); + + AccDataType tmpVal = norm_x * dscale; + + AccDataType dx = multiplier * (reduceSize * dy - dbias - tmpVal); + + arg.p_dx_[offset] = type_convert(dx); + }; + } + }; + }; + + std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t work_per_thread = (arg.c_ + num_thread - 1) / num_thread; + + std::vector threads(num_thread); + + for(std::size_t it = 0; it < num_thread; ++it) + { + std::size_t ic_begin = it * work_per_thread; + std::size_t ic_end = std::min(static_cast((it + 1) * work_per_thread), arg.c_); + + auto f = [=] { + for(std::size_t ic = ic_begin; ic < ic_end; ++ic) + { + thread_reduce_func(ic); + } + }; + + threads[it] = joinable_thread(f); + } + + return (0.0f); + }; + + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /*stream_config*/ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + }; + }; + + bool IsSupportedArgument(const device::BaseArgument* p_arg) override + { + (void)p_arg; + + return (true); + }; + + std::unique_ptr + MakeArgumentPointer(const std::array xyLengths, + const std::array xStrides, + const std::array dyStrides, + const 
std::array dxStrides, + const std::array reduceDims, + const std::array bnScaleBiasMeanVarLengths, + const std::array bnScaleStrides, + const std::array bnBiasStrides, + const std::array bnMeanVarStrides, + const void* p_x, + const void* p_dy, + const void* p_scale, + const void* p_savedMean, + const void* p_savedInvVar, + double epsilon, + const DyElementwiseOp dy_elementwise_op, + void* p_dx, + void* p_dscale, + void* p_dbias) override + { + return std::make_unique(xyLengths, + xStrides, + dyStrides, + dxStrides, + reduceDims, + bnScaleBiasMeanVarLengths, + bnScaleStrides, + bnBiasStrides, + bnMeanVarStrides, + static_cast(p_x), + static_cast(p_dy), + static_cast(p_scale), + static_cast(p_savedMean), + static_cast(p_savedInvVar), + epsilon, + dy_elementwise_op, + static_cast(p_dx), + static_cast(p_dscale), + static_cast(p_dbias)); + }; + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(); + }; + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "Reference_BatchNorm_Backward_NHWC_C<" << std::endl; + // clang-format on + + return str.str(); + } +}; + +} // namespace host +} // namespace tensor_operation +} // namespace ck
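For reference, the per-channel math implemented by the GPU kernels and by the reference op above reduces to the following minimal host-side sketch. This is a simplification, assuming fp32, a single channel of length N, and a pass-through dy elementwise op; the function and type names are illustrative only and are not part of the patch:

```cpp
// Minimal per-channel batchnorm-backward sketch (single channel, fp32).
#include <cmath>
#include <cstddef>
#include <vector>

struct BNBwdResult
{
    std::vector<float> dx;
    float dscale;
    float dbias;
};

BNBwdResult batchnorm_bwd_single_channel(const std::vector<float>& x,
                                          const std::vector<float>& dy,
                                          float scale,
                                          float epsilon)
{
    const std::size_t N = x.size();
    const float inv_N   = 1.0f / static_cast<float>(N);

    // mean and (biased) variance over the reduced dimensions,
    // then inv-variance = 1 / sqrt(variance + epsilon)
    float mean = 0.0f, meansquare = 0.0f;
    for(std::size_t i = 0; i < N; ++i)
    {
        mean += x[i];
        meansquare += x[i] * x[i];
    }
    mean *= inv_N;
    meansquare *= inv_N;
    const float variance = meansquare - mean * mean;
    const float inv_var  = 1.0f / std::sqrt(variance + epsilon);

    // dbias = sum(dy), dscale = sum(dy * (x - mean) * inv_var)
    float dbias = 0.0f, dscale = 0.0f;
    for(std::size_t i = 0; i < N; ++i)
    {
        const float norm_x = (x[i] - mean) * inv_var;
        dbias += dy[i];
        dscale += dy[i] * norm_x;
    }

    // dx = 1/N * inv_var * scale * (N * dy - dbias - dscale * (x - mean) * inv_var)
    BNBwdResult res{std::vector<float>(N), dscale, dbias};
    const float multiplier = inv_N * inv_var * scale;
    for(std::size_t i = 0; i < N; ++i)
    {
        const float norm_x = (x[i] - mean) * inv_var;
        res.dx[i] = multiplier * (static_cast<float>(N) * dy[i] - dbias - dscale * norm_x);
    }
    return res;
}
```

The blockwise kernel computes these sums within a single workgroup per channel group, while the multiblock path splits the reductions over K into per-block partial results (mean/variance/count and dscale/dbias) that are merged in the second-half and final kernels.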