mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-03 05:01:25 +00:00
update s_barrier's logic in gfx12 architecture (#3003)
change s_waitcnt's logic in gfx1250 change s_waitcnt's logic in gfx1250 update comment
This commit is contained in:
@@ -139,6 +139,34 @@ CK_TILE_DEVICE void block_sync_load_raw(index_t cnt = 0)
|
||||
// https://llvm.org/docs/AMDGPU/gfx9_waitcnt.html
|
||||
struct waitcnt_arg
|
||||
{
|
||||
#if defined(__gfx12__)
|
||||
// use s_wait_loadcnt_dscnt in this instruction; in this instruction, ds [5:0]; mem [13:8]
|
||||
CK_TILE_DEVICE static constexpr index_t MAX = 0b00'111111'00'111111;
|
||||
|
||||
CK_TILE_DEVICE static constexpr index_t kMaxVmCnt = 0b111111;
|
||||
CK_TILE_DEVICE static constexpr index_t kMaxExpCnt = 0b111;
|
||||
CK_TILE_DEVICE static constexpr index_t kMaxLgkmCnt = 0b111111;
|
||||
|
||||
template <index_t cnt>
|
||||
CK_TILE_DEVICE static constexpr index_t from_vmcnt()
|
||||
{
|
||||
static_assert(cnt >= 0 && !(cnt >> 6), "valid range is [0..63]");
|
||||
return MAX & (cnt << 8);
|
||||
}
|
||||
|
||||
template <index_t cnt>
|
||||
CK_TILE_DEVICE static constexpr index_t from_expcnt()
|
||||
{
|
||||
return 0; // no export in MI series
|
||||
}
|
||||
|
||||
template <index_t cnt>
|
||||
CK_TILE_DEVICE static constexpr index_t from_lgkmcnt()
|
||||
{
|
||||
static_assert(cnt >= 0 && !(cnt >> 6), "valid range is [0..63]");
|
||||
return MAX & cnt;
|
||||
}
|
||||
#else
|
||||
// bit numbers (hex) -------------------------> FE'DC'BA98'7'654'3210
|
||||
// [V]M [E]XP [L]GKM counters and [U]NUSED ---> VV'UU'LLLL'U'EEE'VVVV
|
||||
CK_TILE_DEVICE static constexpr index_t MAX = 0b11'00'1111'0'111'1111;
|
||||
@@ -167,6 +195,7 @@ struct waitcnt_arg
|
||||
static_assert(cnt >= 0 && !(cnt >> 4), "valid range is [0..15]");
|
||||
return MAX & (cnt << 8);
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
template <index_t vmcnt = waitcnt_arg::kMaxVmCnt,
|
||||
@@ -174,9 +203,18 @@ template <index_t vmcnt = waitcnt_arg::kMaxVmCnt,
|
||||
index_t lgkmcnt = waitcnt_arg::kMaxLgkmCnt>
|
||||
CK_TILE_DEVICE void s_waitcnt()
|
||||
{
|
||||
#if defined(__gfx12__)
|
||||
// GFX12 do't use __builtin_amdgcn_s_waitcnt
|
||||
constexpr index_t wait_mask = waitcnt_arg::from_vmcnt<vmcnt>() |
|
||||
waitcnt_arg::from_expcnt<expcnt>() |
|
||||
waitcnt_arg::from_lgkmcnt<lgkmcnt>();
|
||||
|
||||
asm volatile("s_wait_loadcnt_dscnt %0" : : "n"(wait_mask) : "memory");
|
||||
#else
|
||||
__builtin_amdgcn_s_waitcnt(waitcnt_arg::from_vmcnt<vmcnt>() |
|
||||
waitcnt_arg::from_expcnt<expcnt>() |
|
||||
waitcnt_arg::from_lgkmcnt<lgkmcnt>());
|
||||
#endif
|
||||
}
|
||||
|
||||
template <index_t vmcnt = waitcnt_arg::kMaxVmCnt,
|
||||
@@ -184,8 +222,23 @@ template <index_t vmcnt = waitcnt_arg::kMaxVmCnt,
|
||||
index_t lgkmcnt = waitcnt_arg::kMaxLgkmCnt>
|
||||
CK_TILE_DEVICE void s_waitcnt_barrier()
|
||||
{
|
||||
#if defined(__gfx12__)
|
||||
// GFX12 optimization: Manual barrier implementation avoids performance penalty
|
||||
// from __builtin_amdgcn_s_barrier which inserts extra s_wait_loadcnt_dscnt 0x0
|
||||
constexpr index_t wait_mask = waitcnt_arg::from_vmcnt<vmcnt>() |
|
||||
waitcnt_arg::from_expcnt<expcnt>() |
|
||||
waitcnt_arg::from_lgkmcnt<lgkmcnt>();
|
||||
|
||||
asm volatile("s_wait_loadcnt_dscnt %0\n"
|
||||
"s_barrier_signal -1\n"
|
||||
"s_barrier_wait -1"
|
||||
:
|
||||
: "n"(wait_mask)
|
||||
: "memory");
|
||||
#else
|
||||
s_waitcnt<vmcnt, expcnt, lgkmcnt>();
|
||||
__builtin_amdgcn_s_barrier();
|
||||
#endif
|
||||
}
|
||||
|
||||
template <index_t lgkmcnt = 0>
|
||||
|
||||
Reference in New Issue
Block a user