[CK_TILE] Improve RMS/Layer Normalization 2 Pass Pipeline Performance (#1861)

* 50ms -> 28ms * Fix bug in non fuse_add_store cases * Fine tuned setting for 2 pass pipeline * adjust workload * remove unnecessary change * add layernorm * Adding output quant and unquant results at the same time. * fix test * fix format * tune for cases 128x640 and 128x1024 * bug fix
2026-04-20 06:49:15 +00:00 · 2025-03-25 20:09:45 +08:00
parent d2eab23958
commit d49abdaa87
15 changed files with 492 additions and 135 deletions
--- a/include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp
+++ b/include/ck_tile/host/reference/reference_rmsnorm2d_fwd.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -35,11 +35,13 @@ template <typename XDataType,
          typename ComputeDataType,
          typename YDataType,
          typename InvRmsDataType,
+          typename UnquantYDataType,
          typename Epilogue = reference_rmsnorm2d_default_epilogue>
 void reference_rmsnorm2d_fwd(const HostTensor<XDataType>& x_m_n,
                             const HostTensor<GammaDataType>& gamma_n,
                             HostTensor<YDataType>& y_m_n,
                             HostTensor<InvRmsDataType>& invRms_m,
+                             HostTensor<UnquantYDataType>& unquant_y_m_n,
                             ComputeDataType epsilon,
                             Epilogue epilogue_functor = {})
 {
@@ -69,7 +71,14 @@ void reference_rmsnorm2d_fwd(const HostTensor<XDataType>& x_m_n,
            acc(m, n)             = x * divisor * gamma;
        }

-        epilogue_functor(m, y_m_n, acc);
+        if constexpr(!std::is_same_v<UnquantYDataType, ck_tile::null_type>)
+        {
+            epilogue_functor(m, unquant_y_m_n, y_m_n, acc);
+        }
+        else
+        {
+            epilogue_functor(m, y_m_n, acc);
+        }
    };

    make_ParallelTensorFunctor(rmsnorm2d_fwd_func, invRms_m.mDesc.get_lengths()[0])(