From e5cc4af808456f56425dd290bb82318650240dee Mon Sep 17 00:00:00 2001
From: aska-0096
Date: Wed, 16 Jul 2025 03:54:33 +0000
Subject: [PATCH] Add block_sync_lds_direct_load utility

---
 example/ck_tile/36_copy/test_copy.hpp |  4 ++--
 include/ck_tile/core/arch/arch.hpp    | 16 +++++++++++-----
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/example/ck_tile/36_copy/test_copy.hpp b/example/ck_tile/36_copy/test_copy.hpp
index 28f99752b0..3c1c9bb9ed 100644
--- a/example/ck_tile/36_copy/test_copy.hpp
+++ b/example/ck_tile/36_copy/test_copy.hpp
@@ -130,6 +130,7 @@ struct TileCopy
             make_tuple(number{}, number{}),
             {iM, 0},
             MakeDRAMDistribution());
+        constexpr auto async_copy_fence_cnt = 0;
 
         // Output tensor
         const auto y_m = make_naive_tensor_view(
@@ -152,9 +153,8 @@ struct TileCopy
             async_load_tile(x_block_lds_write_window, x_block_window);
 
             // Wait all asyncload insts complete.
-            __builtin_amdgcn_s_waitcnt(3952);
             // Wait all waves synced
-            __builtin_amdgcn_s_barrier();
+            block_sync_lds_direct_load<async_copy_fence_cnt>();
 
             auto lds_tile = load_tile(x_block_lds_read_window);
 
diff --git a/include/ck_tile/core/arch/arch.hpp b/include/ck_tile/core/arch/arch.hpp
index 3dd9604b01..8c0b9d56d7 100644
--- a/include/ck_tile/core/arch/arch.hpp
+++ b/include/ck_tile/core/arch/arch.hpp
@@ -113,13 +113,19 @@ CK_TILE_DEVICE void block_sync_load_raw(index_t cnt = 0)
 #endif
 }
 
+// Waits until at most `vmcnt` vector-memory operations of this wave are still
+// outstanding, then executes s_barrier. expcnt and lgkmcnt stay at their
+// maximum ("no wait") values, so LDS instructions are not synchronized here.
+// NOTE(review): the immediate layout below assumes the gfx9-family s_waitcnt encoding.
+template <index_t vmcnt = 0>
 CK_TILE_DEVICE void block_sync_lds_direct_load()
 {
-    asm volatile("\
-    s_waitcnt vmcnt(0) \n \
-    s_waitcnt lgkmcnt(0) \n \
-    s_barrier \
-    " ::);
+    // 3952 == 0xF70: expcnt (imm[6:4]) = 7, lgkmcnt (imm[11:8]) = 15, vmcnt = 0.
+    // vmcnt is split across the immediate: vmcnt[3:0] -> imm[3:0], vmcnt[5:4] -> imm[15:14].
+    static_assert(vmcnt >= 0 && vmcnt < 64, "vmcnt must fit in 6 bits");
+    constexpr auto s_waitcnt_imm = 3952 + ((vmcnt >> 4) << 14) + (vmcnt & 0xf);
+    __builtin_amdgcn_s_waitcnt(s_waitcnt_imm);
+    __builtin_amdgcn_s_barrier();
 }
 
 CK_TILE_DEVICE void s_nop(index_t cnt = 0)