mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-11 17:00:18 +00:00
Add support for double buffering in direct load GEMM kernel (#1052)
This PR adds support for double buffering in LDS to GEMM kernels that use direct load instructions. Direct loads now use inline asm instead of intrinsics: with intrinsics, the compiler inserts additional waitcnt instructions, which breaks the load/compute overlap that double buffering makes possible. Because the compiler does not know the data dependencies between the global->LDS and LDS->registers transfers when inline asm is used, sched_barrier is needed to make sure that it cannot incorrectly reschedule instructions.
This commit is contained in:
committed by
GitHub
parent
c7d5c7727b
commit
bc4bf9bd03
@@ -972,6 +972,15 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
|
||||
const int32x4_t src_resource = make_wave_buffer_resource(global_ptr, src_element_space_size);
|
||||
const index_t global_offset_bytes = is_valid ? global_offset * sizeof(T) : 0x80000000;
|
||||
|
||||
#if CK_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM
|
||||
T* lds_ptr = lds_base_ptr + lds_offset;
|
||||
auto const lds_ptr_sgpr =
|
||||
__builtin_amdgcn_readfirstlane((reinterpret_cast<uintptr_t>(lds_ptr)));
|
||||
asm volatile("s_mov_b32 m0, %0; \n\t"
|
||||
"buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr),
|
||||
"v"(global_offset_bytes),
|
||||
"s"(src_resource));
|
||||
#else
|
||||
// LDS pointer must be attributed with the LDS address space.
|
||||
__attribute__((address_space(3))) uint32_t* lds_ptr =
|
||||
reinterpret_cast<__attribute__((address_space(3))) uint32_t*>(
|
||||
@@ -979,6 +988,7 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
|
||||
|
||||
llvm_amdgcn_raw_buffer_load_lds(
|
||||
src_resource, lds_ptr, sizeof(uint32_t), global_offset_bytes, 0, 0, 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace ck
|
||||
|
||||
Reference in New Issue
Block a user