From 27ff3dec9fef082cbebb6c77685d3bb880167d6f Mon Sep 17 00:00:00 2001 From: dummycoderfe Date: Mon, 4 Nov 2024 08:50:09 +0000 Subject: [PATCH] optimize small N case using vec io and using rcp div --- example/ck_tile/02_layernorm2d/generate.py | 9 ++++++--- .../pipeline/layernorm2d_fwd_pipeline_one_pass.hpp | 3 ++- include/ck_tile/ops/welford/block/block_welford.hpp | 5 +++-- .../ck_tile/ops/welford/thread/thread_welford.hpp | 13 +++++++------ 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/example/ck_tile/02_layernorm2d/generate.py b/example/ck_tile/02_layernorm2d/generate.py index bf576db97e..154deae042 100644 --- a/example/ck_tile/02_layernorm2d/generate.py +++ b/example/ck_tile/02_layernorm2d/generate.py @@ -114,7 +114,7 @@ struct layernorm2d_fwd_traits_ using WarpTile = ck_tile::sequence; using Vector = ck_tile::sequence<1, Vector_N_>; - using Shape = ck_tile::Generic2dBlockShape; + using Shape = ck_tile::Generic2dBlockShape; static constexpr bool kPadN = kPadN_; static constexpr bool kSaveMeanInvStd = kSaveMeanInvStd_; @@ -484,8 +484,11 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t, fused_sweep_list = [0, 1] # NOTE: only single pass can use fused dynamic quant # rm rn tm tn vn pd mv 2p add sweep - h_trait_dict = {'64' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 1, True, False, False, 0, 0)], - '128' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 2, True, False, False, 0, 0), + h_trait_dict = {'64' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 8, 8, 8, True, False, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 16, 4, True, False, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 1, True, False, False, 0, 0)], + '128' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 16, 8, True, False, False, 0, 0), + h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 2, True, False, False, 0, 0), h_traits('x', 'y', 'xs', 'ys', 1, 2, 4, 64, 1, True, False, False, 0, 0)], '256' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 4, True, False, False, 0, 
0), h_traits('x', 'y', 'xs', 'ys', 1, 2, 4, 64, 2, True, False, False, 0, 0), diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp index 83cdab428e..2ba4a2f9f3 100644 --- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp +++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp @@ -125,7 +125,8 @@ struct Layernorm2dFwdPipelineOnePass // compute inv-std auto inv_std = tile_elementwise_in( [&](const auto& v_) { - return type_convert(1.0f) / (sqrt(v_ + epsilon)); + return type_convert(1.0f) * + __builtin_amdgcn_rcpf(sqrt(v_ + epsilon)); }, var); diff --git a/include/ck_tile/ops/welford/block/block_welford.hpp b/include/ck_tile/ops/welford/block/block_welford.hpp index ce73c183e1..e196998a0e 100644 --- a/include/ck_tile/ops/welford/block/block_welford.hpp +++ b/include/ck_tile/ops/welford/block/block_welford.hpp @@ -356,7 +356,8 @@ CK_TILE_DEVICE constexpr void block_tile_welford_post_scale_var(VarDistributedTe int count) { using DataType = typename VarDistributedTensor_::DataType; - tile_elementwise_inout([&count](auto& x) { x = x / type_convert(count); }, - var_tensor); + tile_elementwise_inout( + [&count](auto& x) { x = x * __builtin_amdgcn_rcpf(type_convert(count)); }, + var_tensor); } } // namespace ck_tile diff --git a/include/ck_tile/ops/welford/thread/thread_welford.hpp b/include/ck_tile/ops/welford/thread/thread_welford.hpp index 4c61cdcf4b..81c5c3b383 100644 --- a/include/ck_tile/ops/welford/thread/thread_welford.hpp +++ b/include/ck_tile/ops/welford/thread/thread_welford.hpp @@ -12,7 +12,7 @@ CK_TILE_DEVICE void welford_update(T& mean, T& var, T x, int count) { // TODO: check nan? 
maybe no T delta = x - mean; - mean += delta / count; + mean += delta * __builtin_amdgcn_rcpf(count); T delta2 = x - mean; var += delta * delta2; } @@ -21,11 +21,12 @@ template CK_TILE_DEVICE static void welford_merge(T& mean_a, T& var_a, int& count_a, T mean_b, T var_b, int count_b) { - int count = count_a + count_b; - T count_ = type_convert(count); - T count_a_ = type_convert(count_a); - T count_b_ = type_convert(count_b); - T count_b_over_count = count == 0 ? type_convert(0) : count_b_ / count_; + int count = count_a + count_b; + T count_ = type_convert(count); + T count_a_ = type_convert(count_a); + T count_b_ = type_convert(count_b); + T count_b_over_count = + count == 0 ? type_convert(0) : count_b_ * __builtin_amdgcn_rcpf(count_); T delta = mean_b - mean_a; mean_a += delta * count_b_over_count;