From 27ff3dec9fef082cbebb6c77685d3bb880167d6f Mon Sep 17 00:00:00 2001
From: dummycoderfe <noplydummmycoder@163.com>
Date: Mon, 4 Nov 2024 08:50:09 +0000
Subject: [PATCH] optimize small N case using vectorized IO and reciprocal (rcp) division

---
 example/ck_tile/02_layernorm2d/generate.py          |  9 ++++++---
 .../pipeline/layernorm2d_fwd_pipeline_one_pass.hpp  |  3 ++-
 include/ck_tile/ops/welford/block/block_welford.hpp |  5 +++--
 .../ck_tile/ops/welford/thread/thread_welford.hpp   | 13 +++++++------
 4 files changed, 18 insertions(+), 12 deletions(-)
diff --git a/example/ck_tile/02_layernorm2d/generate.py b/example/ck_tile/02_layernorm2d/generate.py
index bf576db97e..154deae042 100644
--- a/example/ck_tile/02_layernorm2d/generate.py
+++ b/example/ck_tile/02_layernorm2d/generate.py
@@ -114,7 +114,7 @@ struct layernorm2d_fwd_traits_
     using WarpTile   = ck_tile::sequence<Warp_M, Warp_N>;
     using Vector     = ck_tile::sequence<1, Vector_N_>;
 
-    using Shape = ck_tile::Generic2dBlockShape<BlockTile, BlockWarps, WarpTile, Vector>;
+    using Shape = ck_tile::Generic2dBlockShape<BlockTile, BlockWarps, WarpTile, Vector, ThreadPerBlock_M_ * ThreadPerBlock_N_>;
 
     static constexpr bool kPadN           = kPadN_;
     static constexpr bool kSaveMeanInvStd = kSaveMeanInvStd_;
@@ -484,8 +484,11 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
         fused_sweep_list = [0, 1] # NOTE: only single pass can use fused dynamic quant
 
         #                                                       rm  rn  tm   tn  vn  pd     mv     2p     add    sweep
-        h_trait_dict = {'64'  : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 1,  True,  False, False,   0,    0)],
-                        '128' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 2,  True,  False, False,   0,    0),
+        h_trait_dict = {'64'  : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  8,  8,  8,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  16, 4,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 1,  True,  False, False,   0,    0)],
+                        '128' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  16, 8,  True,  False, False,   0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 2,  True,  False, False,   0,    0),
                                   h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 1,  True,  False, False,   0,    0)],
                         '256' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 4,  True,  False, False,   0,    0),
                                   h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 2,  True,  False, False,   0,    0),
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
index 83cdab428e..2ba4a2f9f3 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
@@ -125,7 +125,8 @@ struct Layernorm2dFwdPipelineOnePass
         // compute inv-std
         auto inv_std = tile_elementwise_in(
             [&](const auto& v_) {
-                return type_convert<ComputeDataType>(1.0f) / (sqrt(v_ + epsilon));
+                return type_convert<ComputeDataType>(1.0f) *
+                       __builtin_amdgcn_rcpf(sqrt(v_ + epsilon));
             },
             var);
 
diff --git a/include/ck_tile/ops/welford/block/block_welford.hpp b/include/ck_tile/ops/welford/block/block_welford.hpp
index ce73c183e1..e196998a0e 100644
--- a/include/ck_tile/ops/welford/block/block_welford.hpp
+++ b/include/ck_tile/ops/welford/block/block_welford.hpp
@@ -356,7 +356,8 @@ CK_TILE_DEVICE constexpr void block_tile_welford_post_scale_var(VarDistributedTe
                                                                 int count)
 {
     using DataType = typename VarDistributedTensor_::DataType;
-    tile_elementwise_inout([&count](auto& x) { x = x / type_convert<DataType>(count); },
-                           var_tensor);
+    tile_elementwise_inout(
+        [&count](auto& x) { x = x * __builtin_amdgcn_rcpf(type_convert<DataType>(count)); },
+        var_tensor);
 }
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/welford/thread/thread_welford.hpp b/include/ck_tile/ops/welford/thread/thread_welford.hpp
index 4c61cdcf4b..81c5c3b383 100644
--- a/include/ck_tile/ops/welford/thread/thread_welford.hpp
+++ b/include/ck_tile/ops/welford/thread/thread_welford.hpp
@@ -12,7 +12,7 @@ CK_TILE_DEVICE void welford_update(T& mean, T& var, T x, int count)
 {
     // TODO: check nan? maybe no
     T delta = x - mean;
-    mean += delta / count;
+    mean += delta * __builtin_amdgcn_rcpf(count);
     T delta2 = x - mean;
     var += delta * delta2;
 }
@@ -21,11 +21,12 @@ template <typename T>
 CK_TILE_DEVICE static void
 welford_merge(T& mean_a, T& var_a, int& count_a, T mean_b, T var_b, int count_b)
 {
-    int count            = count_a + count_b;
-    T count_             = type_convert<T>(count);
-    T count_a_           = type_convert<T>(count_a);
-    T count_b_           = type_convert<T>(count_b);
-    T count_b_over_count = count == 0 ? type_convert<T>(0) : count_b_ / count_;
+    int count  = count_a + count_b;
+    T count_   = type_convert<T>(count);
+    T count_a_ = type_convert<T>(count_a);
+    T count_b_ = type_convert<T>(count_b);
+    T count_b_over_count =
+        count == 0 ? type_convert<T>(0) : count_b_ * __builtin_amdgcn_rcpf(count_);
 
     T delta = mean_b - mean_a;
     mean_a += delta * count_b_over_count;