Add the sync barrier for the persistent kernel (#2977)

This commit is contained in:
Thomas Ning
2025-10-07 11:54:04 -07:00
committed by GitHub
parent 19415d0b6f
commit ae9f29b7d5

View File

@@ -1134,6 +1134,7 @@ struct UniversalGemmKernel
while(block_id < num_work)
{
s_waitcnt_barrier();
// Get the tile index for this block
const auto tile_idx = amd_wave_read_first_lane(block_id % num_tiles);
const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(tile_idx);