diff --git a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp index 0842f70455..a562a32644 100644 --- a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp +++ b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp @@ -185,7 +185,7 @@ bool run(const ck_tile::ArgParser& arg_parser) stride}; float ave_time = layernorm2d_fwd( - traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat, true, true, 1024 * 1024 * 1024}); + traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat, true, true, 256 * 1024 * 1024}); if(ave_time < 0) { @@ -230,46 +230,46 @@ bool run(const ck_tile::ArgParser& arg_parser) if(fused_quant != 0) { - auto dquant_functor = [&](int m_, auto& o_, auto& acc_) { - int N_ = acc_.mDesc.get_lengths()[1]; - if(fused_quant == 1) - { - for(int n_ = 0; n_ < N_; n_++) - { - // input smooth outlier - acc_(m_, n_) = - acc_(m_, n_) * ck_tile::type_convert(x_scale_host(n_)); - } - } - ComputeDataType absmax = static_cast(0); - for(int n_ = 0; n_ < N_; n_++) - { - const auto a = ck_tile::abs(acc_(m_, n_)); - absmax = a > absmax ? a : absmax; - } - // printf("cpu:absmax:%f\n", absmax); - ComputeDataType y_scale = absmax / static_cast(127.0); - y_scale_host_ref(m_) = ck_tile::type_convert(y_scale); - for(int n_ = 0; n_ < N_; n_++) - { - o_(m_, n_) = ck_tile::type_convert(acc_(m_, n_) / y_scale); - } - }; + // auto dquant_functor = [&](int m_, auto& o_, auto& acc_) { + // int N_ = acc_.mDesc.get_lengths()[1]; + // if(fused_quant == 1) + // { + // for(int n_ = 0; n_ < N_; n_++) + // { + // // input smooth outlier + // acc_(m_, n_) = + // acc_(m_, n_) * ck_tile::type_convert(x_scale_host(n_)); + // } + // } + // ComputeDataType absmax = static_cast(0); + // for(int n_ = 0; n_ < N_; n_++) + // { + // const auto a = ck_tile::abs(acc_(m_, n_)); + // absmax = a > absmax ? a : absmax; + // } + // // printf("cpu:absmax:%f\n", absmax); + // ComputeDataType y_scale = absmax / static_cast(127.0); + // y_scale_host_ref(m_) = ck_tile::type_convert(y_scale); + // for(int n_ = 0; n_ < N_; n_++) + // { + // o_(m_, n_) = ck_tile::type_convert(acc_(m_, n_) / y_scale); + // } + // }; - ck_tile::reference_layernorm2d_fwd(x_host, - gamma_host, - beta_host, - y_host_ref, - mean_host_ref, - invStd_host_ref, - epsilon, - dquant_functor); + // ck_tile::reference_layernorm2d_fwd(x_host, + // gamma_host, + // beta_host, + // y_host_ref, + // mean_host_ref, + // invStd_host_ref, + // epsilon, + // dquant_functor); } else { diff --git a/include/ck_tile/host/kernel_launch.hpp b/include/ck_tile/host/kernel_launch.hpp index c2bae890c4..ed30e893a8 100644 --- a/include/ck_tile/host/kernel_launch.hpp +++ b/include/ck_tile/host/kernel_launch.hpp @@ -82,9 +82,12 @@ CK_TILE_HOST float launch_kernel(const stream_config& s, Callables... callables) // warmup for(int i = 0; i < s.cold_niters_; i++) { (callables(s),...); } HIP_CHECK_ERROR(hipGetLastError()); + if (s.clear_cache) { + printf("setvalue to clear_cache, bufsize %lu\n", s.buf_size); + } for(int i = 0; i < s.nrepeat_; i++) { if (s.clear_cache) { - s.cache_buf.SetValue(i); + s.cache_buf.SetValue(0); } timer.start(s.stream_id_); (callables(s),...); diff --git a/include/ck_tile/host/reference/reference_layernorm2d_fwd.hpp b/include/ck_tile/host/reference/reference_layernorm2d_fwd.hpp index 62cd26b6ab..6306ba29b5 100644 --- a/include/ck_tile/host/reference/reference_layernorm2d_fwd.hpp +++ b/include/ck_tile/host/reference/reference_layernorm2d_fwd.hpp @@ -8,25 +8,11 @@ namespace ck_tile { -// Note: for simplicity, each functor only care about single M struct reference_layernorm2d_default_epilogue { - template - void operator()(int m, HostTensor& o, const HostTensor& acc) + void operator()() { - const int N = acc.mDesc.get_lengths()[1]; - for(int n = 0; n < N; ++n) - { - o(m, n) = ck_tile::type_convert(acc(m, n)); - } - } - - template - auto operator()(int m, const HostTensor& acc) - { - HostTensor o(acc.get_lengths(), acc.get_strides()); - operator()(m, o, acc); - return o; + return; } }; @@ -75,21 +61,18 @@ void reference_layernorm2d_fwd(const HostTensor& x_m_n, if constexpr(!std::is_same_v) invStd_m(m) = ck_tile::type_convert(divisor); - HostTensor acc(x_m_n.get_lengths(), x_m_n.get_strides()); for(int n = 0; n < N; ++n) { ComputeDataType x = ck_tile::type_convert(x_m_n(m, n)); ComputeDataType gamma = ck_tile::type_convert(gamma_n(n)); ComputeDataType beta = ck_tile::type_convert(beta_n(n)); - auto a_ = (x - mean) * divisor; - a_ = a_ * gamma + beta; + auto y = (x - mean) * divisor; + y = y * gamma + beta; - acc(m, n) = a_; + y_m_n(m, n) = ck_tile::type_convert(y); } - - epilogue_functor(m, y_m_n, acc); }; - + epilogue_functor(); make_ParallelTensorFunctor(layernorm2d_fwd_func, mean_m.mDesc.get_lengths()[0])(std::thread::hardware_concurrency()); }