mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-03-25 09:37:42 +00:00
Add block_sync_lds_direct_load utility
This commit is contained in:
@@ -130,6 +130,7 @@ struct TileCopy
|
||||
make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
|
||||
{iM, 0},
|
||||
MakeDRAMDistribution<Problem>());
|
||||
constexpr auto async_copy_fence_cnt = 0;
|
||||
|
||||
// Output tensor
|
||||
const auto y_m = make_naive_tensor_view<address_space_enum::global>(
|
||||
@@ -152,9 +153,8 @@ struct TileCopy
|
||||
async_load_tile(x_block_lds_write_window, x_block_window);
|
||||
|
||||
// Wait all asyncload insts complete.
|
||||
__builtin_amdgcn_s_waitcnt(3952);
|
||||
// Wait all waves synced
|
||||
__builtin_amdgcn_s_barrier();
|
||||
block_sync_lds_direct_load<async_copy_fence_cnt>();
|
||||
|
||||
auto lds_tile = load_tile(x_block_lds_read_window);
|
||||
|
||||
|
||||
@@ -113,13 +113,13 @@ CK_TILE_DEVICE void block_sync_load_raw(index_t cnt = 0)
|
||||
#endif
|
||||
}
|
||||
|
||||
template <index_t vmcnt>
|
||||
CK_TILE_DEVICE void block_sync_lds_direct_load()
|
||||
{
|
||||
asm volatile("\
|
||||
s_waitcnt vmcnt(0) \n \
|
||||
s_waitcnt lgkmcnt(0) \n \
|
||||
s_barrier \
|
||||
" ::);
|
||||
// We don't sync the lds insts here.
|
||||
constexpr auto s_waitcnt_imm = 3952 + ((vmcnt >> 4) << 14) + (vmcnt & 0xff);
|
||||
__builtin_amdgcn_s_waitcnt(s_waitcnt_imm);
|
||||
__builtin_amdgcn_s_barrier();
|
||||
}
|
||||
|
||||
CK_TILE_DEVICE void s_nop(index_t cnt = 0)
|
||||
|
||||
Reference in New Issue
Block a user