From 69d6769ca859aa97ababe33f9a598d9e8cb41f2c Mon Sep 17 00:00:00 2001 From: "assistant-librarian[bot]" Date: Wed, 6 Aug 2025 13:25:28 +0000 Subject: [PATCH] Merge commit '2622ff06cb2aabfd94df191083777b4caeb03966' into develop --- .../core/arch/amd_buffer_addressing.hpp | 48 ------------------- include/ck_tile/core/arch/arch.hpp | 16 ------- 2 files changed, 64 deletions(-) diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp index 29cc3fefe5..35da19cd3e 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp @@ -2754,54 +2754,6 @@ CK_TILE_DEVICE void amd_buffer_atomic_max(const thread_buffer& src_thread_ #endif } -template -CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr, - const index_t global_offset, - T* lds_base_ptr, - const index_t lds_offset, - const bool is_valid, - const index_t src_element_space_size) -{ - const uint32_t* global_ptr = - reinterpret_cast(reinterpret_cast(global_base_ptr)); - const int32x4_t src_resource = - make_wave_buffer_resource(global_ptr, src_element_space_size * sizeof(T)); - const index_t global_offset_bytes = is_valid ? global_offset * sizeof(T) : 0x80000000; - -#if CK_TILE_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM - T* lds_ptr = lds_base_ptr + lds_offset; - auto const lds_ptr_sgpr = - __builtin_amdgcn_readfirstlane((reinterpret_cast(lds_ptr))); - asm volatile("s_mov_b32 m0, %0; \n\t" - "buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr), - "v"(global_offset_bytes), - "s"(src_resource) - : "memory"); -#else - // Direct loads require that each thread reads and writes exactly a single DWORD. -#if defined(__gfx9__) - constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread; -#endif - // Direct loads require that each thread reads and writes a multiple of DWORDs (4 bytes). - // For gfx950: supports 1, 3, or 4 DWORDs per thread - // For gfx942: supports exactly 1 DWORD per thread -#if defined(__gfx950__) - constexpr auto dword_bytes = 4; - static_assert(bytes_per_thread == dword_bytes || bytes_per_thread == dword_bytes * 3 || - bytes_per_thread == dword_bytes * 4); -#elif defined(__gfx9__) - constexpr auto dword_bytes = 4; - static_assert(bytes_per_thread == dword_bytes); -#endif - // LDS pointer must be attributed with the LDS address space. - as3_uint32_ptr lds_ptr = - reinterpret_cast(reinterpret_cast(lds_base_ptr + lds_offset)); - - llvm_amdgcn_raw_buffer_load_lds( - src_resource, lds_ptr, bytes_per_thread, global_offset_bytes, 0, 0, 0); -#endif -} - #if defined(__gfx950__) template __device__ auto amd_transpose_load_to_vgpr(const T* in_ptr) diff --git a/include/ck_tile/core/arch/arch.hpp b/include/ck_tile/core/arch/arch.hpp index 0723026836..96df9d70f7 100644 --- a/include/ck_tile/core/arch/arch.hpp +++ b/include/ck_tile/core/arch/arch.hpp @@ -174,22 +174,6 @@ CK_TILE_DEVICE void s_waitcnt_barrier() __builtin_amdgcn_s_barrier(); } -CK_TILE_DEVICE void block_sync_lds_direct_load() -{ -#if 1 - // invoke clang builtins which *should* produce the same result as the inline asm below - // difference: inline asm is being compiled to wait vmcnt(0) after the barrier - s_waitcnt_barrier<0, waitcnt_arg::kMaxExpCnt, 0>(); -#else - // same content as in old CK (#999) - asm volatile("\ - s_waitcnt vmcnt(0) \n \ - s_waitcnt lgkmcnt(0) \n \ - s_barrier \ - " ::); -#endif -} - CK_TILE_DEVICE void s_nop(index_t cnt = 0) { #if 1