update s_barrier's logic in gfx12 architecture (#3003)

change s_waitcnt's logic in gfx1250

change s_waitcnt's logic in gfx1250

update comment
This commit is contained in:
joyeamd
2025-10-14 23:49:34 +08:00
committed by GitHub
parent e4298e55c7
commit b9d74e7746
2 changed files with 55 additions and 2 deletions

View File

@@ -139,6 +139,34 @@ CK_TILE_DEVICE void block_sync_load_raw(index_t cnt = 0)
// https://llvm.org/docs/AMDGPU/gfx9_waitcnt.html
struct waitcnt_arg
{
#if defined(__gfx12__)
// use s_wait_loadcnt_dscnt in this instruction; in this instruction, ds [5:0]; mem [13:8]
CK_TILE_DEVICE static constexpr index_t MAX = 0b00'111111'00'111111;
CK_TILE_DEVICE static constexpr index_t kMaxVmCnt = 0b111111;
CK_TILE_DEVICE static constexpr index_t kMaxExpCnt = 0b111;
CK_TILE_DEVICE static constexpr index_t kMaxLgkmCnt = 0b111111;
template <index_t cnt>
CK_TILE_DEVICE static constexpr index_t from_vmcnt()
{
static_assert(cnt >= 0 && !(cnt >> 6), "valid range is [0..63]");
return MAX & (cnt << 8);
}
template <index_t cnt>
CK_TILE_DEVICE static constexpr index_t from_expcnt()
{
return 0; // no export in MI series
}
template <index_t cnt>
CK_TILE_DEVICE static constexpr index_t from_lgkmcnt()
{
static_assert(cnt >= 0 && !(cnt >> 6), "valid range is [0..63]");
return MAX & cnt;
}
#else
// bit numbers (hex) -------------------------> FE'DC'BA98'7'654'3210
// [V]M [E]XP [L]GKM counters and [U]NUSED ---> VV'UU'LLLL'U'EEE'VVVV
CK_TILE_DEVICE static constexpr index_t MAX = 0b11'00'1111'0'111'1111;
@@ -167,6 +195,7 @@ struct waitcnt_arg
static_assert(cnt >= 0 && !(cnt >> 4), "valid range is [0..15]");
return MAX & (cnt << 8);
}
#endif
};
template <index_t vmcnt = waitcnt_arg::kMaxVmCnt,
@@ -174,9 +203,18 @@ template <index_t vmcnt = waitcnt_arg::kMaxVmCnt,
index_t lgkmcnt = waitcnt_arg::kMaxLgkmCnt>
CK_TILE_DEVICE void s_waitcnt()
{
#if defined(__gfx12__)
// GFX12 do't use __builtin_amdgcn_s_waitcnt
constexpr index_t wait_mask = waitcnt_arg::from_vmcnt<vmcnt>() |
waitcnt_arg::from_expcnt<expcnt>() |
waitcnt_arg::from_lgkmcnt<lgkmcnt>();
asm volatile("s_wait_loadcnt_dscnt %0" : : "n"(wait_mask) : "memory");
#else
__builtin_amdgcn_s_waitcnt(waitcnt_arg::from_vmcnt<vmcnt>() |
waitcnt_arg::from_expcnt<expcnt>() |
waitcnt_arg::from_lgkmcnt<lgkmcnt>());
#endif
}
template <index_t vmcnt = waitcnt_arg::kMaxVmCnt,
@@ -184,8 +222,23 @@ template <index_t vmcnt = waitcnt_arg::kMaxVmCnt,
index_t lgkmcnt = waitcnt_arg::kMaxLgkmCnt>
CK_TILE_DEVICE void s_waitcnt_barrier()
{
#if defined(__gfx12__)
// GFX12 optimization: Manual barrier implementation avoids performance penalty
// from __builtin_amdgcn_s_barrier which inserts extra s_wait_loadcnt_dscnt 0x0
constexpr index_t wait_mask = waitcnt_arg::from_vmcnt<vmcnt>() |
waitcnt_arg::from_expcnt<expcnt>() |
waitcnt_arg::from_lgkmcnt<lgkmcnt>();
asm volatile("s_wait_loadcnt_dscnt %0\n"
"s_barrier_signal -1\n"
"s_barrier_wait -1"
:
: "n"(wait_mask)
: "memory");
#else
s_waitcnt<vmcnt, expcnt, lgkmcnt>();
__builtin_amdgcn_s_barrier();
#endif
}
template <index_t lgkmcnt = 0>