mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-02 12:41:26 +00:00
* Update to the batchnorm-forward API and base class * Fix leaked header including in gridwise_set_buffer_value.hpp * Add kernels and device file for batchnorm-forward welford supporting both blockwise and multi-block reduction * Update to the batchnorm-forward example to use the new batchnorm-forward device interface * Change the batchnorm-forward reference to use sequential welford method * Change to assign the workspace into four buffers in the host layer * Use GetReduceCountPerThread functor to replace the initial count for Blockwise and Multiblock welford * Tiny correction and remove unused file under example/34_batchnorm * Renaming in the kernel arguments * Explicitly use ck::math::sqrt in batchnorm-forward kernels * Add some comments to some kernels * Tiny fix * Generalize the data types in reference_batchnorm_forward_nhwc_c * Use ck::ignore to mark unused parameters * Move GetReduceCountPerThread functor codes from kernel to device * Remove some unused codes in device_batchnorm_forward_impl.hpp * Tiny fix in batchnorm_forward example * Move GetReduceCountPerThread() to welford_helper.hpp * Use separate data type for Scale and Bias * Renaming in device Op * Tiny fix in forward example * Update to batchnorm-infer (type splitting, renaming) * Add time and bandwidth measurement to the batchnorm-forward example * Add support of elementwise operation for batchnorm forward output * Reduce object copying by passing object as reference type * Tiny change for performance * Updates for performance again * Some Renamings * Add GetActualVariance template parameter for ThreadwiseWelfordMerge * Tiny update in reference batchnorm forward nhwc/c * Move batchnorm multiblock kernel files to grid/batchnorm_multiblock sub-directory * Fuse mean and bias in the normalization calculation Co-authored-by: root <root@dc-smc-18.amd.com> Co-authored-by: rocking5566 <ChunYu.Lai@amd.com>
69 lines
1.9 KiB
C++
69 lines
1.9 KiB
C++
// SPDX-License-Identifier: MIT
|
|
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
#pragma once
|
|
|
|
#include <cassert>
|
|
#include <vector>
|
|
#include <array>
|
|
#include <type_traits>
|
|
|
|
#include "ck/utility/data_type.hpp"
|
|
|
|
struct NormalizeInInfer
|
|
{
|
|
NormalizeInInfer(double epsilon = 1e-4) : epsilon_(epsilon) {}
|
|
|
|
template <typename T1, typename T2, typename T3, typename T4>
|
|
__host__ __device__ constexpr void operator()(T1& y,
|
|
const T1& x,
|
|
const T2& mean,
|
|
const T2& variance,
|
|
const T3& gamma,
|
|
const T4& beta) const
|
|
{
|
|
static_assert(std::is_same<T2, float>::value || std::is_same<T2, double>::value,
|
|
"Data type is not supported by this operation!");
|
|
|
|
using ck::type_convert;
|
|
using ck::math::sqrt;
|
|
|
|
T2 tmp_x, tmp_y;
|
|
|
|
tmp_x = type_convert<T2>(x);
|
|
|
|
tmp_y = ((tmp_x - mean) / sqrt(variance + type_convert<T2>(epsilon_))) *
|
|
type_convert<T2>(gamma) +
|
|
type_convert<T2>(beta);
|
|
y = type_convert<T1>(tmp_y);
|
|
};
|
|
|
|
double epsilon_;
|
|
};
|
|
|
|
// Return the indices of the non-reduced (invariant) dimensions of a
// Rank-dimensional tensor, in ascending order.
//
// Rank         : total number of tensor dimensions
// NumReduceDim : number of dimensions being reduced
// reduceDims   : indices (each in [0, Rank)) of the reduced dimensions
template <int Rank, int NumReduceDim>
static inline std::array<int, Rank - NumReduceDim>
get_invariant_dims(const std::array<int, NumReduceDim>& reduceDims)
{
    static_assert(NumReduceDim >= 0 && Rank >= NumReduceDim,
                  "NumReduceDim must be non-negative and no greater than Rank!");
    // Guard the bit mask: shifting by >= 31 on a signed/32-bit operand is UB.
    static_assert(Rank < 32, "Rank must fit into the 32-bit dimension mask!");

    // Build a bit mask with one bit set per reduced dimension.
    unsigned reduceFlag = 0;

    for(int i = 0; i < NumReduceDim; i++)
    {
        assert(reduceDims[i] >= 0 && reduceDims[i] < Rank);
        reduceFlag |= 1u << reduceDims[i];
    }

    // Collect the dimensions whose bit is not set in the mask.
    std::array<int, Rank - NumReduceDim> invariantDims;

    int dim = 0;
    for(int i = 0; i < Rank; i++)
    {
        if((reduceFlag & (1u << i)) == 0)
        {
            invariantDims[dim] = i;
            dim++;
        }
    }

    return invariantDims;
}
|