fix(sparse_attn): backport PR #4742 LDS s_barrier

Add s_barrier after sched_barrier when K-tail and V share LDS buffer,
mirroring upstream PR #4742. Applies to both async_vsa and async_jenga pipelines.

Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
This commit is contained in:
Gino Lu
2026-05-17 02:30:48 -04:00
parent b00e5449c8
commit 668e107282
2 changed files with 12 additions and 0 deletions

View File

@@ -430,6 +430,12 @@ struct BlockFmhaPipelineQRKSVSAsyncJenga
s.get_tile_distribution()); // Pcompute{j}
__builtin_amdgcn_sched_barrier(0x7F);
// Ensure gemm_0's LDS reads (K tile) from all threads are completed before V store
// Only needed when K tail and V use the same LDS buffer
if constexpr(LdsSeq.at(number<k0_loops - 1>{}) == LdsSeq.at(number<k0_loops>{}))
{
__builtin_amdgcn_s_barrier();
}
// store & prefetch next v, after the max reduction
auto v_shuffle_tmp = make_static_distributed_tensor<VDataType>(
Policy::template MakeShuffledVRegBlockDescriptor<Problem>());

View File

@@ -387,6 +387,12 @@ struct BlockFmhaPipelineQRKSVSAsyncVSA
s.get_tile_distribution()); // Pcompute{j}
__builtin_amdgcn_sched_barrier(0x7F);
// Ensure gemm_0's LDS reads (K tile) from all threads are completed before V store
// Only needed when K tail and V use the same LDS buffer
if constexpr(LdsSeq.at(number<k0_loops - 1>{}) == LdsSeq.at(number<k0_loops>{}))
{
__builtin_amdgcn_s_barrier();
}
// store & prefetch next v, after the max reduction
if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
{