mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-07-01 12:17:00 +00:00
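Use fast rcpf: replace floating-point divisions with multiplication by the hardware reciprocal (__builtin_amdgcn_rcpf)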
This commit is contained in:
@@ -90,7 +90,7 @@ struct Layernorm2dFwdPipelineOnePass
|
||||
// compute inv-std
|
||||
auto inv_std = tile_elementwise_in(
|
||||
[&](const auto& v_) {
|
||||
return type_convert<ComputeDataType>(1.0f) / (sqrt(v_ + epsilon));
|
||||
return type_convert<ComputeDataType>(1.0f) * __builtin_amdgcn_rcpf(sqrt(v_ + epsilon));
|
||||
},
|
||||
var);
|
||||
|
||||
|
||||
@@ -356,7 +356,7 @@ CK_TILE_DEVICE constexpr void block_tile_welford_post_scale_var(VarDistributedTe
|
||||
int count)
|
||||
{
|
||||
using DataType = typename VarDistributedTensor_::DataType;
|
||||
tile_elementwise_inout([&count](auto& x) { x = x / type_convert<DataType>(count); },
|
||||
tile_elementwise_inout([&count](auto& x) { x = x * __builtin_amdgcn_rcpf(type_convert<DataType>(count)); },
|
||||
var_tensor);
|
||||
}
|
||||
} // namespace ck_tile
|
||||
|
||||
@@ -12,7 +12,7 @@ CK_TILE_DEVICE void welford_update(T& mean, T& var, T x, int count)
|
||||
{
|
||||
// TODO: check nan? maybe no
|
||||
T delta = x - mean;
|
||||
mean += delta / count;
|
||||
mean += delta * __builtin_amdgcn_rcpf(count);
|
||||
T delta2 = x - mean;
|
||||
var += delta * delta2;
|
||||
}
|
||||
@@ -25,7 +25,7 @@ welford_merge(T& mean_a, T& var_a, int& count_a, T mean_b, T var_b, int count_b)
|
||||
T count_ = type_convert<T>(count);
|
||||
T count_a_ = type_convert<T>(count_a);
|
||||
T count_b_ = type_convert<T>(count_b);
|
||||
T count_b_over_count = count == 0 ? type_convert<T>(0) : count_b_ / count_;
|
||||
T count_b_over_count = count == 0 ? type_convert<T>(0) : count_b_ * __builtin_amdgcn_rcpf(count_);
|
||||
|
||||
T delta = mean_b - mean_a;
|
||||
mean_a += delta * count_b_over_count;
|
||||
|
||||
Reference in New Issue
Block a user