Use the in-place version of block_tile_reduce() to avoid the m_local temporary

2026-05-18 03:49:41 +00:00 · 2026-03-05 16:27:41 +00:00
parent 2be2c3cd11
commit 73d6e0eb67
2 changed files with 4 additions and 12 deletions
--- a/example/ck_tile/18_hstu_attention/hstu_attention_with_softmax_fwd_pipeline.hpp
+++ b/example/ck_tile/18_hstu_attention/hstu_attention_with_softmax_fwd_pipeline.hpp
@@ -454,14 +454,10 @@ struct HstuAttentionWithSoftmaxFwdPipelineQRKSVS

            __builtin_amdgcn_sched_barrier(0x00000001);

-            auto m_local = block_tile_reduce<CompDataType>(
-                pcomp_tile, sequence<1>{}, f_max, -numeric<CompDataType>::infinity());
-            block_tile_reduce_sync(m_local, f_max, bool_constant<false>{});
-
            const auto m_old = m;

-            tile_elementwise_inout(
-                [](auto& e0, auto e1, auto e2) { e0 = max(e1, e2); }, m, m_old, m_local);
+            block_tile_reduce(m, pcomp_tile, sequence<1>{}, f_max);
+            block_tile_reduce_sync(m, f_max, bool_constant<false>{});

            constexpr auto p_spans = decltype(pcomp_tile)::get_distributed_spans();
            sweep_tile_span(p_spans[number<0>{}], [&](auto idx0) {
--- a/example/ck_tile/18_hstu_attention/hstu_attention_with_softmax_fwd_trload_pipeline.hpp
+++ b/example/ck_tile/18_hstu_attention/hstu_attention_with_softmax_fwd_trload_pipeline.hpp
@@ -451,14 +451,10 @@ struct HstuAttentionWithSoftmaxFwdPipelineQRKSVSTrLoad

            __builtin_amdgcn_sched_barrier(0x00000001);

-            auto m_local = block_tile_reduce<CompDataType>(
-                pcomp_tile, sequence<1>{}, f_max, -numeric<CompDataType>::infinity());
-            block_tile_reduce_sync(m_local, f_max, bool_constant<false>{});
-
            const auto m_old = m;

-            tile_elementwise_inout(
-                [](auto& e0, auto e1, auto e2) { e0 = max(e1, e2); }, m, m_old, m_local);
+            block_tile_reduce(m, pcomp_tile, sequence<1>{}, f_max);
+            block_tile_reduce_sync(m, f_max, bool_constant<false>{});

            __builtin_amdgcn_sched_barrier(0x00000001);