mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-06-08 15:30:23 +00:00
fix the s_waitcnt_imm calculation
This commit is contained in:
@@ -18,7 +18,7 @@ constexpr const char* DataTypeToString()
|
||||
{
|
||||
return "bf8";
|
||||
}
|
||||
else if constexpr(std::is_same_v<T, ck_tile::bf16_t>)
|
||||
else if constexpr(std::is_same_v<T, ck_tile::bf16_t>)
|
||||
{
|
||||
return "bf16";
|
||||
}
|
||||
|
||||
@@ -130,6 +130,7 @@ struct TileCopy
|
||||
make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
|
||||
{iM, 0},
|
||||
MakeDRAMDistribution<Problem>());
|
||||
// There is no prefetch here, so wait for the data to arrive immediately.
|
||||
constexpr auto async_copy_fence_cnt = 0;
|
||||
|
||||
// Output tensor
|
||||
|
||||
@@ -116,8 +116,10 @@ CK_TILE_DEVICE void block_sync_load_raw(index_t cnt = 0)
|
||||
template <index_t vmcnt>
|
||||
CK_TILE_DEVICE void block_sync_lds_direct_load()
|
||||
{
|
||||
// we maximum track 64 insts back
|
||||
static_assert(vmcnt <= 63);
|
||||
// We don't sync the lds insts here.
|
||||
constexpr auto s_waitcnt_imm = 3952 + ((vmcnt >> 4) << 14) + (vmcnt & 0xff);
|
||||
constexpr auto s_waitcnt_imm = 3952 + (((vmcnt & 0xf0) << 10) | (vmcnt & 0xf));
|
||||
__builtin_amdgcn_s_waitcnt(s_waitcnt_imm);
|
||||
__builtin_amdgcn_s_barrier();
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user