From 6bdb0fcb226d46b048717b09260a74332a8ffd8a Mon Sep 17 00:00:00 2001 From: dummycoderfe Date: Mon, 4 Nov 2024 03:59:42 +0000 Subject: [PATCH] use fast rcpf --- .../pipeline/layernorm2d_fwd_pipeline_one_pass.hpp | 2 +- include/ck_tile/ops/welford/block/block_welford.hpp | 2 +- include/ck_tile/ops/welford/thread/thread_welford.hpp | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp index bf002141b8..4693617ee2 100644 --- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp @@ -90,7 +90,7 @@ struct Layernorm2dFwdPipelineOnePass // compute inv-std auto inv_std = tile_elementwise_in( [&](const auto& v_) { - return type_convert(1.0f) / (sqrt(v_ + epsilon)); + return type_convert(1.0f) * __builtin_amdgcn_rcpf(sqrt(v_ + epsilon)); }, var); diff --git a/include/ck_tile/ops/welford/block/block_welford.hpp b/include/ck_tile/ops/welford/block/block_welford.hpp index 55d55402d8..e061af040c 100644 --- a/include/ck_tile/ops/welford/block/block_welford.hpp +++ b/include/ck_tile/ops/welford/block/block_welford.hpp @@ -356,7 +356,7 @@ CK_TILE_DEVICE constexpr void block_tile_welford_post_scale_var(VarDistributedTe int count) { using DataType = typename VarDistributedTensor_::DataType; - tile_elementwise_inout([&count](auto& x) { x = x / type_convert(count); }, + tile_elementwise_inout([&count](auto& x) { x = x * __builtin_amdgcn_rcpf(type_convert(count)); }, var_tensor); } } // namespace ck_tile diff --git a/include/ck_tile/ops/welford/thread/thread_welford.hpp b/include/ck_tile/ops/welford/thread/thread_welford.hpp index 4c61cdcf4b..9ec95d1caa 100644 --- a/include/ck_tile/ops/welford/thread/thread_welford.hpp +++ b/include/ck_tile/ops/welford/thread/thread_welford.hpp @@ -12,7 +12,7 @@ CK_TILE_DEVICE void welford_update(T& mean, T& var, T x, int count) { // TODO: check nan? maybe no T delta = x - mean; - mean += delta / count; + mean += delta * __builtin_amdgcn_rcpf(count); T delta2 = x - mean; var += delta * delta2; } @@ -25,7 +25,7 @@ welford_merge(T& mean_a, T& var_a, int& count_a, T mean_b, T var_b, int count_b) T count_ = type_convert(count); T count_a_ = type_convert(count_a); T count_b_ = type_convert(count_b); - T count_b_over_count = count == 0 ? type_convert(0) : count_b_ / count_; + T count_b_over_count = count == 0 ? type_convert(0) : count_b_ * __builtin_amdgcn_rcpf(count_); T delta = mean_b - mean_a; mean_a += delta * count_b_over_count;