mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-04 13:41:24 +00:00
* Use dim 0 as faster dim for writing mean/var/count workspace in batchnorm multiblock method [performance]
* Add CountDataType as template parameter in blockwise_welford
* Add utility/get_shift.hpp
* Add BatchNorm multiblock single-kernel implementation
* Add smem inline assembly based implementation of gms_init/gms_barrier/gms_reset for gfx90a
* Renaming in device_batchnorm_forward_impl.hpp
* Tiny fix in the batchnorm_fwd profiler
* Revert "Add smem inline assembly based implementation of gms_init/gms_barrier/gms_reset for gfx90a"
This reverts commit d16d00919c.
* Use the old two-kernel batchnorm multiblock method for gfx1030
* Use the old two-kernel batchnorm multiblock method for gfx908
* use the single-kernel batchnorm multiblock method only for gfx90a
* Remove get_wave_id() from utility/get_id.hpp since it is not used
* Set true for testing running mean/variance and saving mean/invvariance in the examples
* Fix the copyright wording
* Remove unneeded include in utility/get_id.hpp
* Add comments to workgroup_synchronization.hpp
* Remove unused code in gridwise_multiblock_batchnorm_forward.hpp
* Renaming in the kernels
* Remove unused kernel file
29 lines
581 B
C++
29 lines
581 B
C++
// SPDX-License-Identifier: MIT
|
|
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
#pragma once
|
|
|
|
#include "ck/utility/reduction_enums.hpp"
|
|
|
|
namespace ck {
|
|
|
|
// Stateless predicate functor: tests whether a value equals one.
struct float_equal_one
{
    // Returns true when `x` compares both <= and >= to static_cast<T>(1.0f),
    // i.e. when it is equal to one in T's own representation.
    //
    // The equality is spelled as a pair of ordered comparisons rather than
    // `==` (presumably to keep -Wfloat-equal quiet -- confirm against the
    // project's warning flags). For IEEE floating-point T a NaN input yields
    // false, since both ordered comparisons fail.
    //
    // T must be constructible from float via static_cast and support the
    // <= and >= operators.
    //
    // The call operator is const so the functor can be invoked through const
    // instances and const references; it reads no state.
    template <class T>
    __host__ __device__ inline bool operator()(T x) const
    {
        const T one = static_cast<T>(1.0f);

        return x <= one && x >= one;
    }
};
|
|
|
|
// Stateless predicate functor: tests whether a value equals zero.
struct float_equal_zero
{
    // Returns true when `x` compares both <= and >= to static_cast<T>(0.0f),
    // i.e. when it is equal to zero in T's own representation. For IEEE
    // floating-point T this accepts both +0.0 and -0.0 (they compare equal)
    // and rejects NaN (both ordered comparisons fail).
    //
    // The equality is spelled as a pair of ordered comparisons rather than
    // `==` (presumably to keep -Wfloat-equal quiet -- confirm against the
    // project's warning flags).
    //
    // T must be constructible from float via static_cast and support the
    // <= and >= operators.
    //
    // The call operator is const so the functor can be invoked through const
    // instances and const references; it reads no state.
    template <class T>
    __host__ __device__ inline bool operator()(T x) const
    {
        const T zero = static_cast<T>(0.0f);

        return x <= zero && x >= zero;
    }
};
|
|
|
|
} // namespace ck
|