From e8587b86c21a2bec11a45cb5daec2b19e61a6db1 Mon Sep 17 00:00:00 2001
From: root <root@cv350-ccs-aus-h15-18.cs-aus.dcgpu>
Date: Wed, 1 Apr 2026 23:04:07 +0000
Subject: [PATCH] Fix CK-UA pipeline: s_waitcnt_vmcnt<0> in fmha_post_process

The final V tile's async load was not properly waited on before reading
from LDS: s_waitcnt_vmcnt<K_inst> allowed V_inst outstanding loads
(a no-op when K_inst == V_inst). The last loop iteration never prefetches
K, so only V is outstanding. Use s_waitcnt_vmcnt<0> unconditionally.

This partially fixes the BS32 race condition for production workloads
(maxk >= 256). A deeper pipeline race remains for very short KV
sequences (maxk < ~165, 2-5 pages) with block_size=32 at high batch.

Made-with: Cursor
---
 .../pipeline/unified_attention_pipeline.hpp       | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/include/ck_tile/ops/unified_attention/pipeline/unified_attention_pipeline.hpp b/include/ck_tile/ops/unified_attention/pipeline/unified_attention_pipeline.hpp
index 551dab5242..3f60dff312 100644
--- a/include/ck_tile/ops/unified_attention/pipeline/unified_attention_pipeline.hpp
+++ b/include/ck_tile/ops/unified_attention/pipeline/unified_attention_pipeline.hpp
@@ -904,14 +904,13 @@ struct UnifiedAttentionPipeline
             auto ps_pi        = number<1>{} - d;
             auto V_lds_rd_idx = ps_pi;
 
-            if(1 < num_total_loop)
-            {
-                s_waitcnt_vmcnt<K_mem_su_ld_insts>();
-            }
-            else
-            {
-                s_waitcnt_vmcnt<0>();
-            }
+            // Wait for the last V tile's async load to complete before reading from LDS.
+            // The main loop's final iteration never prefetches K (i_total_loops+1 ==
+            // num_total_loop), so only V loads are outstanding here.  The original
+            // s_waitcnt_vmcnt<K_mem_su_ld_insts> was a no-op when V_su_ld_insts ==
+            // K_su_ld_insts (e.g. both 2 for kPageBlockSize=32), causing a race where
+            // V_lds_load read stale LDS before the async V load finished.
+            s_waitcnt_vmcnt<0>();
             __builtin_amdgcn_s_barrier();
 
             V_lds_load(V_lds_rd_idx);