From 902b1c645c27f7cc98619d2212ac18b378fd6666 Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Mon, 19 May 2025 10:16:09 +0000 Subject: [PATCH] Move k_tile loading in the loop earlier --- .../hstu_attention_fwd_pipeline.hpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/example/ck_tile/18_hstu_attention/hstu_attention_fwd_pipeline.hpp b/example/ck_tile/18_hstu_attention/hstu_attention_fwd_pipeline.hpp index c9802b46c2..998fdb595b 100644 --- a/example/ck_tile/18_hstu_attention/hstu_attention_fwd_pipeline.hpp +++ b/example/ck_tile/18_hstu_attention/hstu_attention_fwd_pipeline.hpp @@ -337,6 +337,14 @@ struct HstuAttentionFwdPipelineQRKSVS tile_elementwise_inout([&scale_s](auto& x) { x = x * scale_s; }, sacc_tile); } + __builtin_amdgcn_sched_barrier(0); + + // for i_k1 = k1_loop-1, the loading is for next iteration + k_tile = load_tile(k_dram_window); + move_tile_window(k_dram_window, {kK1, 0}); + + __builtin_amdgcn_sched_barrier(0); + if constexpr(HstuMask::IsMasking) { if constexpr(HstuMask::kUseLocal) @@ -428,10 +436,6 @@ struct HstuAttentionFwdPipelineQRKSVS tile_elementwise_in(v_element_func, v_tile)); // store the prefetch }; - // for i_k1 = k1_loop-1, the loading is for next iteration - k_tile = load_tile(k_dram_window); - move_tile_window(k_dram_window, {kK1, 0}); - __builtin_amdgcn_sched_barrier(0); tile_elementwise_inout(f_silu, pcomp_tile);