Add block_sync_lds_direct_load utility

This commit is contained in:
aska-0096
2025-07-16 03:54:33 +00:00
parent eea58629cf
commit e5cc4af808
2 changed files with 7 additions and 7 deletions

View File

@@ -130,6 +130,7 @@ struct TileCopy
make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
{iM, 0},
MakeDRAMDistribution<Problem>());
constexpr auto async_copy_fence_cnt = 0;
// Output tensor
const auto y_m = make_naive_tensor_view<address_space_enum::global>(
@@ -152,9 +153,8 @@ struct TileCopy
async_load_tile(x_block_lds_write_window, x_block_window);
// Wait for all async load instructions to complete.
__builtin_amdgcn_s_waitcnt(3952);
// Wait until all waves are synchronized.
__builtin_amdgcn_s_barrier();
block_sync_lds_direct_load<async_copy_fence_cnt>();
auto lds_tile = load_tile(x_block_lds_read_window);

View File

@@ -113,13 +113,13 @@ CK_TILE_DEVICE void block_sync_load_raw(index_t cnt = 0)
#endif
}
/// Waits until at most `vmcnt` vector-memory operations issued by this wave
/// are still outstanding, then executes a workgroup barrier (`s_barrier`).
///
/// Meant to be placed between an asynchronous load that writes LDS and the
/// first read of that LDS data; `vmcnt == 0` waits for every outstanding
/// vector-memory operation.
///
/// @tparam vmcnt  Number of vector-memory operations allowed to remain in
///                flight when the wait returns. Must be in [0, 63].
///
/// NOTE(review): the immediate below assumes the gfx9-style `s_waitcnt`
/// simm16 layout (vmcnt[3:0] in bits 3:0, expcnt in bits 6:4, lgkmcnt in
/// bits 11:8, vmcnt[5:4] in bits 15:14) — confirm for other targets.
template <index_t vmcnt>
CK_TILE_DEVICE void block_sync_lds_direct_load()
{
    // The split vmcnt field is 6 bits wide; anything larger cannot be encoded.
    static_assert(vmcnt >= 0 && vmcnt < 64, "vmcnt must be in the range [0, 63]");

    // We don't sync the lds insts here: 3952 == 0xF70 leaves expcnt and
    // lgkmcnt at their maximum (i.e. not waited on) with vmcnt == 0.
    constexpr index_t waitcnt_base = 3952;

    // Low four bits go to bits 3:0. They must be masked with 0xf; a wider
    // mask would carry into the already-saturated expcnt/lgkmcnt fields.
    constexpr index_t vmcnt_lo = vmcnt & 0xf;
    // Remaining two bits go to bits 15:14.
    constexpr index_t vmcnt_hi = (vmcnt >> 4) & 0x3;

    constexpr auto s_waitcnt_imm = waitcnt_base + (vmcnt_hi << 14) + vmcnt_lo;

    __builtin_amdgcn_s_waitcnt(s_waitcnt_imm);
    __builtin_amdgcn_s_barrier();
}
CK_TILE_DEVICE void s_nop(index_t cnt = 0)