update s_barrier's logic in gfx12 architecture (#3003)

change s_waitcnt's logic in gfx1250

change s_waitcnt's logic in gfx1250

update comment
This commit is contained in:
joyeamd
2025-10-14 23:49:34 +08:00
committed by GitHub
parent e4298e55c7
commit b9d74e7746
2 changed files with 55 additions and 2 deletions

View File

@@ -797,7 +797,7 @@ struct MoeSortingKernel
else
smem_tokens(curr_token_id, eid)++;
}
__builtin_amdgcn_s_waitcnt(0xc07f);
s_waitcnt<waitcnt_arg::kMaxVmCnt, waitcnt_arg::kMaxExpCnt, 0>();
}
__syncthreads(); // make sure different i_token iteration not overlap by different wave
}
@@ -922,7 +922,7 @@ struct MoeSortingKernel
// NOTE: this waitcnt is required: the compiler will not generate a waitcnt lgkmcnt()
// for the write above, whereas __syncthreads would introduce a barrier with waves
// other than 0 (which is not what we want)
__builtin_amdgcn_s_waitcnt(0xc07f);
s_waitcnt<waitcnt_arg::kMaxVmCnt, waitcnt_arg::kMaxExpCnt, 0>();
}
if((lid + i_e_ - get_warp_size()) == (num_experts - 1))
{