mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 10:09:41 +00:00
* Update to the batchnorm-forward API and base class
* Fix leaked header include in gridwise_set_buffer_value.hpp
* Add kernels and device file for batchnorm-forward welford supporting both blockwise and multi-block reduction
* Update to the batchnorm-forward example to use the new batchnorm-forward device interface
* Change the batchnorm-forward reference to use sequential welford method
* Change to assign the workspace into four buffers in the host layer
* Use GetReduceCountPerThread functor to replace the initial count for Blockwise and Multiblock welford
* Tiny correction and remove un-used file under example/34_batchnorm
* Renaming in the kernel arguments
* Explicitly use ck::math::sqrt in batchnorm-forward kernels
* Add some comments to some kernels
* Tiny fix
* Generalize the data types in reference_batchnorm_forward_nhwc_c
* Use ck::ignore to mark un-used parameters
* Move GetReduceCountPerThread functor codes from kernel to device
* Remove some un-used codes in device_batchnorm_forward_impl.hpp
* Tiny fix in batchnorm_forward example
* Move GetReduceCountPerThread() to welford_helper.hpp
* Use separate data type for Scale and Bias
* Renaming in device Op
* Tiny fix in forward example
* Update to batchnorm-infer (type splitting, renaming)
* Add time and bandwidth measurement to the batchnorm-forward example
* Add support of elementwise operation for batchnorm forward output
* Reduce object copying by passing object as reference type
* Tiny change for performance
* Updates for performance again
* Some Renamings
* Add GetActualVariance template parameter for ThreadwiseWelfordMerge
* Tiny update in reference batchnorm forward nhwc/c
* Move batchnorm multiblock kernel files to grid/batchnorm_multiblock sub-directory
* Fuse mean and bias in the normalization calculation
Co-authored-by: root <root@dc-smc-18.amd.com>
Co-authored-by: rocking5566 <ChunYu.Lai@amd.com>
[ROCm/composable_kernel commit: 7fa892e63e]
69 lines
1.9 KiB
C++
69 lines
1.9 KiB
C++
// SPDX-License-Identifier: MIT
|
|
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
#pragma once
|
|
|
|
#include <cassert>
|
|
#include <vector>
|
|
#include <array>
|
|
#include <type_traits>
|
|
|
|
#include "ck/utility/data_type.hpp"
|
|
|
|
struct NormalizeInInfer
|
|
{
|
|
NormalizeInInfer(double epsilon = 1e-4) : epsilon_(epsilon) {}
|
|
|
|
template <typename T1, typename T2, typename T3, typename T4>
|
|
__host__ __device__ constexpr void operator()(T1& y,
|
|
const T1& x,
|
|
const T2& mean,
|
|
const T2& variance,
|
|
const T3& gamma,
|
|
const T4& beta) const
|
|
{
|
|
static_assert(std::is_same<T2, float>::value || std::is_same<T2, double>::value,
|
|
"Data type is not supported by this operation!");
|
|
|
|
using ck::type_convert;
|
|
using ck::math::sqrt;
|
|
|
|
T2 tmp_x, tmp_y;
|
|
|
|
tmp_x = type_convert<T2>(x);
|
|
|
|
tmp_y = ((tmp_x - mean) / sqrt(variance + type_convert<T2>(epsilon_))) *
|
|
type_convert<T2>(gamma) +
|
|
type_convert<T2>(beta);
|
|
y = type_convert<T1>(tmp_y);
|
|
};
|
|
|
|
double epsilon_;
|
|
};
|
|
|
|
// Compute the invariant (non-reduced) dimension indices of a
// Rank-dimensional tensor, given the indices that are being reduced.
// The invariant indices are returned in ascending order.
template <int Rank, int NumReduceDim>
static inline std::array<int, Rank - NumReduceDim>
get_invariant_dims(const std::array<int, NumReduceDim>& reduceDims)
{
    // Build a bitmask with one bit set per reduced dimension.
    unsigned reduceMask = 0;

    for(const int d : reduceDims)
        reduceMask |= 1u << d;

    // Collect every dimension index whose bit is not set.
    std::array<int, Rank - NumReduceDim> invariantDims;

    int pos = 0;
    for(int i = 0; i < Rank; i++)
    {
        if((reduceMask & (1u << i)) == 0)
            invariantDims[pos++] = i;
    }

    return invariantDims;
}
|