Merge commit '0db21053e68817a50b0ed0ceea87e88228ab2475' into develop

This commit is contained in:
assistant-librarian[bot]
2025-08-22 02:40:53 +00:00
parent f2fdea3f69
commit 8a96fb5fca
10 changed files with 147 additions and 120 deletions

View File

@@ -1276,26 +1276,46 @@ llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc,
index_t offset,
index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds");
// NOTE(review): this listing looks like a rendered diff whose +/- markers were lost, so the
// lines of the older async_buffer_load_dword_v and of the newer async_buffer_load_dwordxn_v
// appear interleaved below. Confirm against the real header before editing any of it.
//
// Older form (presumably the lines this commit removes): always emits buffer_load_dword
// with the lds modifier, optionally preceded by s_nop 4.
template <bool pre_nop = false>
CK_TILE_DEVICE void async_buffer_load_dword_v(void* smem,
int32x4_t rsrc,
index_t voffset,
index_t /*soffset*/,
index_t ioffset /*max 0xFFF*/,
index_t /*flag*/ = 0,
bool_constant<pre_nop> = {})
// Newer form: emits one buffer load with the lds modifier through inline asm, picking the
// instruction mnemonic from num_dwords at compile time.
//   num_dwords - 1 selects buffer_load_dword; 3 and 4 select buffer_load_dwordx3 and
//                buffer_load_dwordx4, but only when __gfx950__ is defined; every other
//                value reaches the static_assert at the bottom.
//   pre_nop    - when true, s_nop 4 is emitted in the same asm statement ahead of the load.
//   smem       - appears only as the dummy =r output operand (%0) of the asm statement.
//   rsrc       - bound to an s-constraint operand (%2).
//   voffset    - bound to a v-constraint operand (%1), used together with offen.
//   ioffset    - bound to an n-constraint (immediate) operand (%3) and placed in the
//                offset: field; the parameter comment gives 0xFFF as its maximum.
//   The soffset and flag parameters are unnamed and not used by the body.
template <unsigned num_dwords, bool pre_nop = false>
CK_TILE_DEVICE void async_buffer_load_dwordxn_v(void* smem,
int32x4_t rsrc,
index_t voffset,
index_t /*soffset*/,
index_t ioffset /*max 0xFFF*/,
index_t /*flag*/ = 0,
bool_constant<pre_nop> = {})
{
if constexpr(pre_nop)
asm volatile("s_nop 4\n"
"buffer_load_dword %1, %2, 0 offen offset:%3 lds"
: "=r"(smem) /*dummy dependency for smem*/
: "v"(voffset), "s"(rsrc), "n"(ioffset)
// Function-local helper macro: expands to the pre_nop / no-pre_nop pair of asm statements
// for the mnemonic string passed as instr. Both variants declare a memory clobber. The
// macro is removed again by the #undef at the end of the function.
#define CK_TILE_ASYNC_LOAD_WITH_INSTR(instr) \
if constexpr(pre_nop) \
asm volatile("s_nop 4\n" instr " %1, %2, 0 offen offset:%3 lds" \
: "=r"(smem) /*dummy dependency for smem*/ \
: "v"(voffset), "s"(rsrc), "n"(ioffset) \
: "memory"); \
else \
asm volatile(instr " %1, %2, 0 offen offset:%3 lds" \
: "=r"(smem) /*dummy dependency for smem*/ \
: "v"(voffset), "s"(rsrc), "n"(ioffset) \
: "memory");
if constexpr(num_dwords == 1)
{
CK_TILE_ASYNC_LOAD_WITH_INSTR("buffer_load_dword");
}
// The 3- and 4-dword branches are compiled only when __gfx950__ is defined.
// NOTE(review): the static_assert in amd_async_buffer_load_impl accepts 1, 3 and 4 dwords
// without looking at the target, so on other targets (and presumably in any compilation
// pass where __gfx950__ is not defined) a 3- or 4-dword request falls through to the
// static_assert below -- confirm this is the intended diagnostic.
#if defined(__gfx950__)
else if constexpr(num_dwords == 3)
{
CK_TILE_ASYNC_LOAD_WITH_INSTR("buffer_load_dwordx3");
}
else if constexpr(num_dwords == 4)
{
CK_TILE_ASYNC_LOAD_WITH_INSTR("buffer_load_dwordx4");
}
#endif
else
asm volatile("buffer_load_dword %1, %2, 0 offen offset:%3 lds"
: "=r"(smem) /*dummy dependency for smem*/
: "v"(voffset), "s"(rsrc), "n"(ioffset)
: "memory");
// NOTE(review): a literal static_assert(false, ...) inside a template is only accepted by
// compilers that implement P2593 (static_assert(false) in uninstantiated/discarded code);
// older front ends reject it at definition time. Verify the minimum supported toolchain,
// or make the condition depend on num_dwords.
{
static_assert(false, "wrong! not implemented data width");
}
#undef CK_TILE_ASYNC_LOAD_WITH_INSTR
}
CK_TILE_DEVICE void async_buffer_load_fence(index_t cnt = 0)
@@ -1766,15 +1786,18 @@ CK_TILE_DEVICE void amd_async_buffer_load_impl(CK_TILE_LDS_ADDR T* smem,
index_t src_immediate_addr_offset = 0,
bool_constant<pre_nop> = {})
{
static_assert(sizeof(T) * N == 4, "wrong! not implemented vector size");
constexpr index_t num_bytes = sizeof(T) * N;
constexpr index_t num_words = num_bytes / 4;
static_assert(num_bytes % 4 == 0 && (num_words == 1 || num_words == 3 || num_words == 4),
"wrong! only support in dword, dwordx3, dwordx4");
async_buffer_load_dword_v(smem,
src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
src_immediate_addr_offset,
0,
bool_constant<pre_nop>{});
async_buffer_load_dwordxn_v<num_words>(smem,
src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
src_immediate_addr_offset,
0,
bool_constant<pre_nop>{});
}
template <typename T,

View File

@@ -1144,26 +1144,46 @@ llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc,
index_t offset,
index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds");
// NOTE(review): this listing looks like a rendered diff whose +/- markers were lost, so the
// lines of the older async_buffer_load_dword_v and of the newer async_buffer_load_dwordxn_v
// appear interleaved below. Confirm against the real header before editing any of it.
//
// Older form (presumably the lines this commit removes): always emits buffer_load_dword
// with the lds modifier, optionally preceded by s_nop 4.
template <bool pre_nop = false>
CK_TILE_DEVICE void async_buffer_load_dword_v(void* smem,
int32x4_t rsrc,
index_t voffset,
index_t /*soffset*/,
index_t ioffset /*max 0xFFF*/,
index_t /*flag*/ = 0,
bool_constant<pre_nop> = {})
// Newer form: emits one buffer load with the lds modifier through inline asm, picking the
// instruction mnemonic from num_dwords at compile time.
//   num_dwords - 1 selects buffer_load_dword; 3 and 4 select buffer_load_dwordx3 and
//                buffer_load_dwordx4, but only when __gfx950__ is defined; every other
//                value reaches the static_assert at the bottom.
//   pre_nop    - when true, s_nop 4 is emitted in the same asm statement ahead of the load.
//   smem       - appears only as the dummy =r output operand (%0) of the asm statement.
//   rsrc       - bound to an s-constraint operand (%2).
//   voffset    - bound to a v-constraint operand (%1), used together with offen.
//   ioffset    - bound to an n-constraint (immediate) operand (%3) and placed in the
//                offset: field; the parameter comment gives 0xFFF as its maximum.
//   The soffset and flag parameters are unnamed and not used by the body.
template <unsigned num_dwords, bool pre_nop = false>
CK_TILE_DEVICE void async_buffer_load_dwordxn_v(void* smem,
int32x4_t rsrc,
index_t voffset,
index_t /*soffset*/,
index_t ioffset /*max 0xFFF*/,
index_t /*flag*/ = 0,
bool_constant<pre_nop> = {})
{
if constexpr(pre_nop)
asm volatile("s_nop 4\n"
"buffer_load_dword %1, %2, 0 offen offset:%3 lds"
: "=r"(smem) /*dummy dependency for smem*/
: "v"(voffset), "s"(rsrc), "n"(ioffset)
// Function-local helper macro: expands to the pre_nop / no-pre_nop pair of asm statements
// for the mnemonic string passed as instr. Both variants declare a memory clobber. The
// macro is removed again by the #undef at the end of the function.
#define CK_TILE_ASYNC_LOAD_WITH_INSTR(instr) \
if constexpr(pre_nop) \
asm volatile("s_nop 4\n" instr " %1, %2, 0 offen offset:%3 lds" \
: "=r"(smem) /*dummy dependency for smem*/ \
: "v"(voffset), "s"(rsrc), "n"(ioffset) \
: "memory"); \
else \
asm volatile(instr " %1, %2, 0 offen offset:%3 lds" \
: "=r"(smem) /*dummy dependency for smem*/ \
: "v"(voffset), "s"(rsrc), "n"(ioffset) \
: "memory");
if constexpr(num_dwords == 1)
{
CK_TILE_ASYNC_LOAD_WITH_INSTR("buffer_load_dword");
}
// The 3- and 4-dword branches are compiled only when __gfx950__ is defined.
// NOTE(review): the static_assert in amd_async_buffer_load_impl accepts 1, 3 and 4 dwords
// without looking at the target, so on other targets (and presumably in any compilation
// pass where __gfx950__ is not defined) a 3- or 4-dword request falls through to the
// static_assert below -- confirm this is the intended diagnostic.
#if defined(__gfx950__)
else if constexpr(num_dwords == 3)
{
CK_TILE_ASYNC_LOAD_WITH_INSTR("buffer_load_dwordx3");
}
else if constexpr(num_dwords == 4)
{
CK_TILE_ASYNC_LOAD_WITH_INSTR("buffer_load_dwordx4");
}
#endif
else
asm volatile("buffer_load_dword %1, %2, 0 offen offset:%3 lds"
: "=r"(smem) /*dummy dependency for smem*/
: "v"(voffset), "s"(rsrc), "n"(ioffset)
: "memory");
// NOTE(review): a literal static_assert(false, ...) inside a template is only accepted by
// compilers that implement P2593 (static_assert(false) in uninstantiated/discarded code);
// older front ends reject it at definition time. Verify the minimum supported toolchain,
// or make the condition depend on num_dwords.
{
static_assert(false, "wrong! not implemented data width");
}
#undef CK_TILE_ASYNC_LOAD_WITH_INSTR
}
CK_TILE_DEVICE void async_buffer_load_fence(index_t cnt = 0)
@@ -1536,15 +1556,18 @@ CK_TILE_DEVICE void amd_async_buffer_load_impl(T* smem,
index_t src_immediate_addr_offset = 0,
bool_constant<pre_nop> = {})
{
static_assert(sizeof(T) * N == 4, "wrong! not implemented vector size");
constexpr index_t num_bytes = sizeof(T) * N;
constexpr index_t num_words = num_bytes / 4;
static_assert(num_bytes % 4 == 0 && (num_words == 1 || num_words == 3 || num_words == 4),
"wrong! only support in dword, dwordx3, dwordx4");
async_buffer_load_dword_v(smem,
src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
src_immediate_addr_offset,
0,
bool_constant<pre_nop>{});
async_buffer_load_dwordxn_v<num_words>(smem,
src_wave_buffer_resource,
src_thread_addr_offset,
src_wave_addr_offset,
src_immediate_addr_offset,
0,
bool_constant<pre_nop>{});
}
template <typename T,

View File

@@ -98,9 +98,18 @@ CK_TILE_DEVICE index_t get_block_1d_id() { return blockIdx.x; }
// Use these instead
// Returns the value of __lane_id() converted to index_t.
CK_TILE_DEVICE index_t get_lane_id()
{
    const index_t lane = __lane_id();
    return lane;
}
// NOTE(review): diff markers appear to have been stripped from this listing. The
// zero-argument signature directly below and the single-line return inside the body look
// like the older version that this commit replaces -- confirm against the real header.
CK_TILE_DEVICE index_t get_warp_id()
// Returns threadIdx.x / get_warp_size().
//   ReturnSgpr - when true (the default) the quotient is passed through
//                __builtin_amdgcn_readfirstlane before being returned; when false the
//                value is returned exactly as computed.
// The bool_constant parameter is unnamed and defaulted; callers choose ReturnSgpr through
// the argument type, for example by passing bool_constant<false>{}.
template <bool ReturnSgpr = true>
CK_TILE_DEVICE index_t get_warp_id(bool_constant<ReturnSgpr> = {})
{
return __builtin_amdgcn_readfirstlane(threadIdx.x / get_warp_size());
const index_t warp_id = threadIdx.x / get_warp_size();
if constexpr(ReturnSgpr)
{
// Default path: same readfirstlane wrapping as the older one-line body above.
return __builtin_amdgcn_readfirstlane(warp_id);
}
else
{
// Opt-out path: skip readfirstlane and hand back the plain quotient.
return warp_id;
}
}
// Returns threadIdx.x converted to index_t.
CK_TILE_DEVICE index_t get_thread_id()
{
    const index_t tid = threadIdx.x;
    return tid;
}

View File

@@ -288,8 +288,11 @@ struct tile_window_with_static_distribution
sizeof(LdsDataType) -
size_per_buf;
const index_t m0_init_value = size_per_buf + size_per_wave * get_warp_id();
m0_set_with_memory(m0_init_value); // This should be wave independent
// Use VALU so the compiler can optimize redundant/repeated computations
const index_t m0_init_value =
size_per_buf + size_per_wave * get_warp_id(/*ReturnSgpr=*/bool_constant<false>{});
m0_set_with_memory(
__builtin_amdgcn_readfirstlane(m0_init_value)); // This should be wave independent
using Traits = typename Base::Traits;