diff --git a/include/ck_tile/core/arch/arch.hpp b/include/ck_tile/core/arch/arch.hpp index 4213397cdf..22fb75a4c1 100644 --- a/include/ck_tile/core/arch/arch.hpp +++ b/include/ck_tile/core/arch/arch.hpp @@ -89,21 +89,6 @@ CK_TILE_DEVICE index_t get_thread_id() { return threadIdx.x; } CK_TILE_DEVICE index_t get_block_id() { return blockIdx.x; } -template -CK_TILE_DEVICE void block_sync_lds() -{ - if constexpr(lgkmcnt > 15) - { - __builtin_amdgcn_s_waitcnt(CK_TILE_S_CNT_MAX & CK_TILE_LGKMCNT(15)); - } - else - { - __builtin_amdgcn_s_waitcnt(CK_TILE_S_CNT_MAX & CK_TILE_LGKMCNT(lgkmcnt)); - } - - __builtin_amdgcn_s_barrier(); -} - CK_TILE_DEVICE void block_sync_load_raw(index_t cnt = 0) { #ifdef __gfx12__ @@ -122,13 +107,6 @@ CK_TILE_DEVICE void block_sync_load_raw(index_t cnt = 0) #endif } -template -CK_TILE_DEVICE void block_sync_lds_direct_load() -{ - // We don't sync the lds insts here. - __builtin_amdgcn_s_waitcnt(CK_TILE_S_CNT_MAX & CK_TILE_VMCNT(vmcnt)); - __builtin_amdgcn_s_barrier(); -} // https://llvm.org/docs/AMDGPU/gfx9_waitcnt.html struct waitcnt_arg { @@ -181,6 +159,18 @@ CK_TILE_DEVICE void s_waitcnt_barrier() __builtin_amdgcn_s_barrier(); } +template +CK_TILE_DEVICE void block_sync_lds() +{ + s_waitcnt_barrier(); +} + +template +CK_TILE_DEVICE void block_sync_lds_direct_load() +{ + s_waitcnt_barrier(); +} + CK_TILE_DEVICE void s_nop(index_t cnt = 0) { #if 1