mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-20 21:09:08 +00:00
fix(sparse_attn): backport PR #4742 LDS s_barrier
Add s_barrier after sched_barrier when K-tail and V share LDS buffer, mirroring upstream PR #4742. Applies to both async_vsa and async_jenga pipelines. Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
This commit is contained in:
@@ -430,6 +430,12 @@ struct BlockFmhaPipelineQRKSVSAsyncJenga
|
||||
s.get_tile_distribution()); // Pcompute{j}
|
||||
|
||||
__builtin_amdgcn_sched_barrier(0x7F);
|
||||
// Ensure gemm_0's LDS reads (K tile) from all threads are completed before V store
|
||||
// Only needed when K tail and V use the same LDS buffer
|
||||
if constexpr(LdsSeq.at(number<k0_loops - 1>{}) == LdsSeq.at(number<k0_loops>{}))
|
||||
{
|
||||
__builtin_amdgcn_s_barrier();
|
||||
}
|
||||
// store & prefetch next v, after the max reduction
|
||||
auto v_shuffle_tmp = make_static_distributed_tensor<VDataType>(
|
||||
Policy::template MakeShuffledVRegBlockDescriptor<Problem>());
|
||||
|
||||
@@ -387,6 +387,12 @@ struct BlockFmhaPipelineQRKSVSAsyncVSA
|
||||
s.get_tile_distribution()); // Pcompute{j}
|
||||
|
||||
__builtin_amdgcn_sched_barrier(0x7F);
|
||||
// Ensure gemm_0's LDS reads (K tile) from all threads are completed before V store
|
||||
// Only needed when K tail and V use the same LDS buffer
|
||||
if constexpr(LdsSeq.at(number<k0_loops - 1>{}) == LdsSeq.at(number<k0_loops>{}))
|
||||
{
|
||||
__builtin_amdgcn_s_barrier();
|
||||
}
|
||||
// store & prefetch next v, after the max reduction
|
||||
if constexpr(std::is_same_v<VLayout, ck_tile::tensor_layout::gemm::RowMajor>)
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user