opt valid and change set_value buf to 256MB

2026-06-29 11:16:59 +00:00 · 2024-11-04 09:34:41 +00:00
parent 0475a3272d
commit 5791c2ae43
3 changed files with 50 additions and 64 deletions
--- a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
+++ b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
@@ -185,7 +185,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
                              stride};

    float ave_time = layernorm2d_fwd(
-        traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat, true, true, 1024 * 1024 * 1024});
+        traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat, true, true, 256 * 1024 * 1024});

    if(ave_time < 0)
    {
@@ -230,46 +230,46 @@ bool run(const ck_tile::ArgParser& arg_parser)

        if(fused_quant != 0)
        {
-            auto dquant_functor = [&](int m_, auto& o_, auto& acc_) {
-                int N_ = acc_.mDesc.get_lengths()[1];
-                if(fused_quant == 1)
-                {
-                    for(int n_ = 0; n_ < N_; n_++)
-                    {
-                        // input smooth outlier
-                        acc_(m_, n_) =
-                            acc_(m_, n_) * ck_tile::type_convert<ComputeDataType>(x_scale_host(n_));
-                    }
-                }
-                ComputeDataType absmax = static_cast<ComputeDataType>(0);
-                for(int n_ = 0; n_ < N_; n_++)
-                {
-                    const auto a = ck_tile::abs(acc_(m_, n_));
-                    absmax       = a > absmax ? a : absmax;
-                }
-                // printf("cpu:absmax:%f\n", absmax);
-                ComputeDataType y_scale = absmax / static_cast<ComputeDataType>(127.0);
-                y_scale_host_ref(m_)    = ck_tile::type_convert<YScaleDataType>(y_scale);
-                for(int n_ = 0; n_ < N_; n_++)
-                {
-                    o_(m_, n_) = ck_tile::type_convert<YDataType>(acc_(m_, n_) / y_scale);
-                }
-            };
+            // auto dquant_functor = [&](int m_, auto& o_, auto& acc_) {
+            //     int N_ = acc_.mDesc.get_lengths()[1];
+            //     if(fused_quant == 1)
+            //     {
+            //         for(int n_ = 0; n_ < N_; n_++)
+            //         {
+            //             // input smooth outlier
+            //             acc_(m_, n_) =
+            //                 acc_(m_, n_) * ck_tile::type_convert<ComputeDataType>(x_scale_host(n_));
+            //         }
+            //     }
+            //     ComputeDataType absmax = static_cast<ComputeDataType>(0);
+            //     for(int n_ = 0; n_ < N_; n_++)
+            //     {
+            //         const auto a = ck_tile::abs(acc_(m_, n_));
+            //         absmax       = a > absmax ? a : absmax;
+            //     }
+            //     // printf("cpu:absmax:%f\n", absmax);
+            //     ComputeDataType y_scale = absmax / static_cast<ComputeDataType>(127.0);
+            //     y_scale_host_ref(m_)    = ck_tile::type_convert<YScaleDataType>(y_scale);
+            //     for(int n_ = 0; n_ < N_; n_++)
+            //     {
+            //         o_(m_, n_) = ck_tile::type_convert<YDataType>(acc_(m_, n_) / y_scale);
+            //     }
+            // };

-            ck_tile::reference_layernorm2d_fwd<XDataType,
-                                               GammaDataType,
-                                               BetaDataType,
-                                               ComputeDataType,
-                                               YDataType,
-                                               MeanDataType,
-                                               InvStdDataType>(x_host,
-                                                               gamma_host,
-                                                               beta_host,
-                                                               y_host_ref,
-                                                               mean_host_ref,
-                                                               invStd_host_ref,
-                                                               epsilon,
-                                                               dquant_functor);
+            // ck_tile::reference_layernorm2d_fwd<XDataType,
+            //                                    GammaDataType,
+            //                                    BetaDataType,
+            //                                    ComputeDataType,
+            //                                    YDataType,
+            //                                    MeanDataType,
+            //                                    InvStdDataType>(x_host,
+            //                                                    gamma_host,
+            //                                                    beta_host,
+            //                                                    y_host_ref,
+            //                                                    mean_host_ref,
+            //                                                    invStd_host_ref,
+            //                                                    epsilon,
+            //                                                    dquant_functor);
        }
        else
        {
--- a/include/ck_tile/host/kernel_launch.hpp
+++ b/include/ck_tile/host/kernel_launch.hpp
@@ -82,9 +82,12 @@ CK_TILE_HOST float launch_kernel(const stream_config& s, Callables... callables)
        // warmup
        for(int i = 0; i < s.cold_niters_; i++) { (callables(s),...); } HIP_CHECK_ERROR(hipGetLastError());

+        if (s.clear_cache) {
+            printf("setvalue to clear_cache, bufsize %lu\n", s.buf_size);
+        }
        for(int i = 0; i < s.nrepeat_; i++) { 
            if (s.clear_cache) {
-                s.cache_buf.SetValue<int>(i);
+                s.cache_buf.SetValue<char>(0);
            }
            timer.start(s.stream_id_);
            (callables(s),...);
--- a/include/ck_tile/host/reference/reference_layernorm2d_fwd.hpp
+++ b/include/ck_tile/host/reference/reference_layernorm2d_fwd.hpp
@@ -8,25 +8,11 @@

 namespace ck_tile {

-// Note: for simplicity, each functor only care about single M
 struct reference_layernorm2d_default_epilogue
 {
-    template <typename OutDataType, typename AccDataType>
-    void operator()(int m, HostTensor<OutDataType>& o, const HostTensor<AccDataType>& acc)
+    void operator()()
    {
-        const int N = acc.mDesc.get_lengths()[1];
-        for(int n = 0; n < N; ++n)
-        {
-            o(m, n) = ck_tile::type_convert<OutDataType>(acc(m, n));
-        }
-    }
-
-    template <typename OutDataType, typename AccDataType>
-    auto operator()(int m, const HostTensor<AccDataType>& acc)
-    {
-        HostTensor<OutDataType> o(acc.get_lengths(), acc.get_strides());
-        operator()(m, o, acc);
-        return o;
+        return;
    }
 };

@@ -75,21 +61,18 @@ void reference_layernorm2d_fwd(const HostTensor<XDataType>& x_m_n,
        if constexpr(!std::is_same_v<InvStdDataType, ck_tile::null_type>)
            invStd_m(m) = ck_tile::type_convert<InvStdDataType>(divisor);

-        HostTensor<ComputeDataType> acc(x_m_n.get_lengths(), x_m_n.get_strides());
        for(int n = 0; n < N; ++n)
        {
            ComputeDataType x     = ck_tile::type_convert<ComputeDataType>(x_m_n(m, n));
            ComputeDataType gamma = ck_tile::type_convert<ComputeDataType>(gamma_n(n));
            ComputeDataType beta  = ck_tile::type_convert<ComputeDataType>(beta_n(n));
-            auto a_               = (x - mean) * divisor;
-            a_                    = a_ * gamma + beta;
+            auto y                = (x - mean) * divisor;
+            y                     = y * gamma + beta;

-            acc(m, n) = a_;
+            y_m_n(m, n) = ck_tile::type_convert<YDataType>(y);
        }
-
-        epilogue_functor(m, y_m_n, acc);
    };
-
+    epilogue_functor();
    make_ParallelTensorFunctor(layernorm2d_fwd_func,
                               mean_m.mDesc.get_lengths()[0])(std::thread::hardware_concurrency());
 }