[Ck tile] support rmsnorm and related fusion (#1605)

* Add reduce2d new api * Prevent users from using cross warp reduction * Fix bug of std calculation * Add rmsnorm2d * Add rmsnorm small example * Remove static assert to prevent compile fail * Add script to test performance and correctness * Add missing cmake change * refine naming * refine example of rmsnorm * Fix bug of rmsnorm * Refine naming * Fix cmake * clang format * Refine pipeline name * Add add_rmsnorm2d_rdquant kernel * Add reduce op * host verification * Fix bug of one pass pipeline * Refine tile size * Add two pass pipeline * Rename two pass to three pass * Fix bug of kSaveX == false * Add instance library * Add test script * Fix bug of x verification * Add save_x to trait * Add README * Move reduce2d into reduce folder * Fix bug of welford when number of m warp > 1 * remove redundant comment * 1. move 06_rmsnorm2d to 10_rmsnorm2d 2. move 07_add_rmsnorm2d_rdquant to 11_add_rmsnorm2d_rdquant * clang format and add missing header * Add host validation of add + layernorm2d + rsquant * Revert "Add host validation of add + layernorm2d + rsquant" This reverts commit 936cb45797. * Remove deprecated flag
2026-05-02 04:31:25 +00:00 · 2024-10-30 15:22:56 +08:00
parent 8632221814
commit 3d60953477
90 changed files with 4674 additions and 128 deletions
--- a/include/ck_tile/ops/reduce/block/block_reduce.hpp
+++ b/include/ck_tile/ops/reduce/block/block_reduce.hpp
@@ -6,6 +6,7 @@
 #include "ck_tile/core.hpp"
 #include <tuple>

+// The reductions in this file do not support cross-warp reduction
 namespace ck_tile {

 /*
@@ -15,8 +16,8 @@ namespace ck_tile {
 // synchronize reduce result (cross lane reduction and broadcast on replicated dimension)
 template <typename AccDistributedTensor_, typename ReduceFunc, bool WithBroadcast = true>
 CK_TILE_DEVICE void block_tile_reduce_sync(AccDistributedTensor_& acc_tensor,
-                                           const ReduceFunc& reduce_func,
-                                           bool_constant<WithBroadcast> = {})
+                                                          const ReduceFunc& reduce_func,
+                                                          bool_constant<WithBroadcast> = {})
 {
    using Dstr             = typename AccDistributedTensor_::StaticTileDistribution;
    using DstrEncode       = typename Dstr::DstrEncode;
@@ -115,7 +116,7 @@ CK_TILE_DEVICE void block_tile_reduce_sync(AccDistributedTensor_& acc_tensor,
 */
 template <typename AccDistributedTensor_, typename ReduceFunc>
 CK_TILE_DEVICE void block_tile_reduce_xor_sync(AccDistributedTensor_& acc_tensor,
-                                               const ReduceFunc& reduce_func)
+                                                              const ReduceFunc& reduce_func)
 {
    using Dstr             = typename AccDistributedTensor_::StaticTileDistribution;
    using DstrEncode       = typename Dstr::DstrEncode;
@@ -174,9 +175,9 @@ template <typename AccDistributedTensor_,
          index_t... InReduceDims,
          typename ReduceFunc>
 CK_TILE_DEVICE void block_tile_reduce(AccDistributedTensor_& acc_tensor,
-                                      const InDistributedTensor_& in_tensor,
-                                      sequence<InReduceDims...>,
-                                      const ReduceFunc& reduce_func)
+                                                     const InDistributedTensor_& in_tensor,
+                                                     sequence<InReduceDims...>,
+                                                     const ReduceFunc& reduce_func)
 {
    constexpr auto I0 = number<0>{};
    constexpr auto I1 = number<1>{};
@@ -249,9 +250,9 @@ template <typename AccDataType_,
          typename ReduceFunc,
          typename InDataType_>
 CK_TILE_DEVICE auto block_tile_reduce(const InDistributedTensor_& in_tensor,
-                                      sequence<InReduceDims...> in_reduce_dims,
-                                      const ReduceFunc& reduce_func,
-                                      const InDataType_& reduce_init)
+                                                     sequence<InReduceDims...> in_reduce_dims,
+                                                     const ReduceFunc& reduce_func,
+                                                     const InDataType_& reduce_init)
 {
    using InDataType  = typename InDistributedTensor_::DataType;
    using AccDataType = remove_cvref_t<AccDataType_>;