mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 18:17:44 +00:00
Merge commit '2622ff06cb2aabfd94df191083777b4caeb03966' into develop
This commit is contained in:
@@ -2754,54 +2754,6 @@ CK_TILE_DEVICE void amd_buffer_atomic_max(const thread_buffer<T, N>& src_thread_
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T, index_t NumElemsPerThread>
|
||||
CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr,
|
||||
const index_t global_offset,
|
||||
T* lds_base_ptr,
|
||||
const index_t lds_offset,
|
||||
const bool is_valid,
|
||||
const index_t src_element_space_size)
|
||||
{
|
||||
const uint32_t* global_ptr =
|
||||
reinterpret_cast<uint32_t*>(reinterpret_cast<uintptr_t>(global_base_ptr));
|
||||
const int32x4_t src_resource =
|
||||
make_wave_buffer_resource(global_ptr, src_element_space_size * sizeof(T));
|
||||
const index_t global_offset_bytes = is_valid ? global_offset * sizeof(T) : 0x80000000;
|
||||
|
||||
#if CK_TILE_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM
|
||||
T* lds_ptr = lds_base_ptr + lds_offset;
|
||||
auto const lds_ptr_sgpr =
|
||||
__builtin_amdgcn_readfirstlane((reinterpret_cast<uintptr_t>(lds_ptr)));
|
||||
asm volatile("s_mov_b32 m0, %0; \n\t"
|
||||
"buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr),
|
||||
"v"(global_offset_bytes),
|
||||
"s"(src_resource)
|
||||
: "memory");
|
||||
#else
|
||||
// Direct loads require that each thread reads and writes exactly a single DWORD.
|
||||
#if defined(__gfx9__)
|
||||
constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread;
|
||||
#endif
|
||||
// Direct loads require that each thread reads and writes a multiple of DWORDs (4 bytes).
|
||||
// For gfx950: supports 1, 3, or 4 DWORDs per thread
|
||||
// For gfx942: supports exactly 1 DWORD per thread
|
||||
#if defined(__gfx950__)
|
||||
constexpr auto dword_bytes = 4;
|
||||
static_assert(bytes_per_thread == dword_bytes || bytes_per_thread == dword_bytes * 3 ||
|
||||
bytes_per_thread == dword_bytes * 4);
|
||||
#elif defined(__gfx9__)
|
||||
constexpr auto dword_bytes = 4;
|
||||
static_assert(bytes_per_thread == dword_bytes);
|
||||
#endif
|
||||
// LDS pointer must be attributed with the LDS address space.
|
||||
as3_uint32_ptr lds_ptr =
|
||||
reinterpret_cast<as3_uint32_ptr>(reinterpret_cast<uintptr_t>(lds_base_ptr + lds_offset));
|
||||
|
||||
llvm_amdgcn_raw_buffer_load_lds(
|
||||
src_resource, lds_ptr, bytes_per_thread, global_offset_bytes, 0, 0, 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(__gfx950__)
|
||||
template <typename T, index_t N, address_space_enum BufferAddressSpace>
|
||||
__device__ auto amd_transpose_load_to_vgpr(const T* in_ptr)
|
||||
|
||||
@@ -174,22 +174,6 @@ CK_TILE_DEVICE void s_waitcnt_barrier()
|
||||
__builtin_amdgcn_s_barrier();
|
||||
}
|
||||
|
||||
CK_TILE_DEVICE void block_sync_lds_direct_load()
|
||||
{
|
||||
#if 1
|
||||
// invoke clang builtins which *should* produce the same result as the inline asm below
|
||||
// difference: inline asm is being compiled to wait vmcnt(0) after the barrier
|
||||
s_waitcnt_barrier<0, waitcnt_arg::kMaxExpCnt, 0>();
|
||||
#else
|
||||
// same content as in old CK (#999)
|
||||
asm volatile("\
|
||||
s_waitcnt vmcnt(0) \n \
|
||||
s_waitcnt lgkmcnt(0) \n \
|
||||
s_barrier \
|
||||
" ::);
|
||||
#endif
|
||||
}
|
||||
|
||||
CK_TILE_DEVICE void s_nop(index_t cnt = 0)
|
||||
{
|
||||
#if 1
|
||||
|
||||
Reference in New Issue
Block a user