From e8587b86c21a2bec11a45cb5daec2b19e61a6db1 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 1 Apr 2026 23:04:07 +0000 Subject: [PATCH] Fix CK-UA pipeline: s_waitcnt_vmcnt<0> in fmha_post_process The final V tile's async load was not properly waited on before reading from LDS: s_waitcnt_vmcnt allowed V_inst outstanding loads (a no-op when K_inst == V_inst). The last loop iteration never prefetches K, so only V is outstanding. Use s_waitcnt_vmcnt<0> unconditionally. This partially fixes the BS32 race condition for production workloads (maxk >= 256). A deeper pipeline race remains for very short KV sequences (maxk < ~165, 2-5 pages) with block_size=32 at high batch. Made-with: Cursor --- .../pipeline/unified_attention_pipeline.hpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/include/ck_tile/ops/unified_attention/pipeline/unified_attention_pipeline.hpp b/include/ck_tile/ops/unified_attention/pipeline/unified_attention_pipeline.hpp index 551dab5242..3f60dff312 100644 --- a/include/ck_tile/ops/unified_attention/pipeline/unified_attention_pipeline.hpp +++ b/include/ck_tile/ops/unified_attention/pipeline/unified_attention_pipeline.hpp @@ -904,14 +904,13 @@ struct UnifiedAttentionPipeline auto ps_pi = number<1>{} - d; auto V_lds_rd_idx = ps_pi; - if(1 < num_total_loop) - { - s_waitcnt_vmcnt(); - } - else - { - s_waitcnt_vmcnt<0>(); - } + // Wait for the last V tile's async load to complete before reading from LDS. + // The main loop's final iteration never prefetches K (i_total_loops+1 == + // num_total_loop), so only V loads are outstanding here. The original + // s_waitcnt_vmcnt was a no-op when V_su_ld_insts == + // K_su_ld_insts (e.g. both 2 for kPageBlockSize=32), causing a race where + // V_lds_load read stale LDS before the async V load finished. + s_waitcnt_vmcnt<0>(); __builtin_amdgcn_s_barrier(); V_lds_load(V_lds_rd_idx);