From 192536597e87a2f719c09f1f7c81bca81818da85 Mon Sep 17 00:00:00 2001 From: Thomas Ning Date: Tue, 7 Oct 2025 11:54:04 -0700 Subject: [PATCH] add the sync barrier for persistent kernel (#2977) [ROCm/composable_kernel commit: ae9f29b7d514b0829256a0a3ca9ab4511e7a1e04] --- include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp index 51ad4e3dd1..e77355ed3d 100644 --- a/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp @@ -1134,6 +1134,7 @@ struct UniversalGemmKernel while(block_id < num_work) { + s_waitcnt_barrier(); // Get the tile index for this block const auto tile_idx = amd_wave_read_first_lane(block_id % num_tiles); const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(tile_idx);