Merge commit '2622ff06cb2aabfd94df191083777b4caeb03966' into develop

This commit is contained in:
assistant-librarian[bot]
2025-08-06 13:25:28 +00:00
parent 6380d0b0b1
commit 69d6769ca8
2 changed files with 0 additions and 64 deletions

View File

@@ -2754,54 +2754,6 @@ CK_TILE_DEVICE void amd_buffer_atomic_max(const thread_buffer<T, N>& src_thread_
#endif
}
template <typename T, index_t NumElemsPerThread>
CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr,
const index_t global_offset,
T* lds_base_ptr,
const index_t lds_offset,
const bool is_valid,
const index_t src_element_space_size)
{
const uint32_t* global_ptr =
reinterpret_cast<uint32_t*>(reinterpret_cast<uintptr_t>(global_base_ptr));
const int32x4_t src_resource =
make_wave_buffer_resource(global_ptr, src_element_space_size * sizeof(T));
const index_t global_offset_bytes = is_valid ? global_offset * sizeof(T) : 0x80000000;
#if CK_TILE_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM
T* lds_ptr = lds_base_ptr + lds_offset;
auto const lds_ptr_sgpr =
__builtin_amdgcn_readfirstlane((reinterpret_cast<uintptr_t>(lds_ptr)));
asm volatile("s_mov_b32 m0, %0; \n\t"
"buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr),
"v"(global_offset_bytes),
"s"(src_resource)
: "memory");
#else
// Direct loads require that each thread reads and writes exactly a single DWORD.
#if defined(__gfx9__)
constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread;
#endif
// Direct loads require that each thread reads and writes a multiple of DWORDs (4 bytes).
// For gfx950: supports 1, 3, or 4 DWORDs per thread
// For gfx942: supports exactly 1 DWORD per thread
#if defined(__gfx950__)
constexpr auto dword_bytes = 4;
static_assert(bytes_per_thread == dword_bytes || bytes_per_thread == dword_bytes * 3 ||
bytes_per_thread == dword_bytes * 4);
#elif defined(__gfx9__)
constexpr auto dword_bytes = 4;
static_assert(bytes_per_thread == dword_bytes);
#endif
// LDS pointer must be attributed with the LDS address space.
as3_uint32_ptr lds_ptr =
reinterpret_cast<as3_uint32_ptr>(reinterpret_cast<uintptr_t>(lds_base_ptr + lds_offset));
llvm_amdgcn_raw_buffer_load_lds(
src_resource, lds_ptr, bytes_per_thread, global_offset_bytes, 0, 0, 0);
#endif
}
#if defined(__gfx950__)
template <typename T, index_t N, address_space_enum BufferAddressSpace>
__device__ auto amd_transpose_load_to_vgpr(const T* in_ptr)

View File

@@ -174,22 +174,6 @@ CK_TILE_DEVICE void s_waitcnt_barrier()
__builtin_amdgcn_s_barrier();
}
CK_TILE_DEVICE void block_sync_lds_direct_load()
{
#if 1
// invoke clang builtins which *should* produce the same result as the inline asm below
// difference: inline asm is being compiled to wait vmcnt(0) after the barrier
s_waitcnt_barrier<0, waitcnt_arg::kMaxExpCnt, 0>();
#else
// same content as in old CK (#999)
asm volatile("\
s_waitcnt vmcnt(0) \n \
s_waitcnt lgkmcnt(0) \n \
s_barrier \
" ::);
#endif
}
CK_TILE_DEVICE void s_nop(index_t cnt = 0)
{
#if 1