From 6be0c1d7a68d421eb072146ceffece7ffdfd2e94 Mon Sep 17 00:00:00 2001 From: Ali Nouri Date: Thu, 11 Sep 2025 20:17:09 +0000 Subject: [PATCH] Batch applied --- .../core/arch/amd_buffer_addressing.hpp | 245 +++++++----------- .../arch/amd_buffer_addressing_builtins.hpp | 213 +++++---------- 2 files changed, 160 insertions(+), 298 deletions(-) diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp index 7a9c017eb2..7111eed596 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp @@ -13,7 +13,6 @@ #include "ck_tile/core/utility/type_traits.hpp" #include "ck_tile/core/utility/bit_cast.hpp" #include "ck_tile/core/utility/functional.hpp" -#include "ck_tile/core/utility/ignore.hpp" // This attribute gives a hint to the compiler that a branch is likely to be taken. // Then, the compiler should remove if possible the associated s_cbranch_execz branch that would @@ -24,8 +23,6 @@ #define LIKELY(x) (__builtin_expect(!!(x), 1)) #endif -using as3_uint32_ptr = uint32_t __attribute__((address_space(3)))*; - namespace ck_tile { // 128 bit SGPRs to supply buffer resource in buffer instructions @@ -41,6 +38,10 @@ CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void* ptr, uint32_t siz { buffer_resource res{ptr, size, CK_TILE_BUFFER_RESOURCE_3RD_DWORD}; int32x4_t r = __builtin_bit_cast(int32x4_t, res); + r.x = __builtin_amdgcn_readfirstlane(r.x); + r.y = __builtin_amdgcn_readfirstlane(r.y); + r.z = __builtin_amdgcn_readfirstlane(r.z); + r.w = __builtin_amdgcn_readfirstlane(r.w); return r; } @@ -298,12 +299,12 @@ struct buffer_load_if<16, pre_nop> index_t v_offset, index_t /*s_offset*/, index_t i_offset /*max 0xFFF*/, - index_t flag = 0, + index_t flag = 0, bool_constant = {}) { static_assert(sizeof(T) == 16); auto saved_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = typename impl::buffer_load_trait<16, T>::payload_t; + using mbuf_t = typename impl::buffer_load_trait<16, T>::payload_t; static_assert(sizeof(mbuf_t) == sizeof(T)); if constexpr(pre_nop) asm volatile("s_nop 4\n" @@ -332,12 +333,12 @@ struct buffer_load_if<8, pre_nop> index_t v_offset, index_t /*s_offset*/, index_t i_offset /*max 0xFFF*/, - index_t flag = 0, + index_t flag = 0, bool_constant = {}) { static_assert(sizeof(T) == 8); auto saved_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = typename impl::buffer_load_trait<8, T>::payload_t; + using mbuf_t = typename impl::buffer_load_trait<8, T>::payload_t; if constexpr(pre_nop) asm volatile("s_nop 4\n" "v_cmpx_le_u32 exec, 1, %4\n" @@ -365,12 +366,12 @@ struct buffer_load_if<4, pre_nop> index_t v_offset, index_t /*s_offset*/, index_t i_offset /*max 0xFFF*/, - index_t flag = 0, + index_t flag = 0, bool_constant = {}) { static_assert(sizeof(T) == 4); auto saved_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = typename impl::buffer_load_trait<4, T>::payload_t; + using mbuf_t = typename impl::buffer_load_trait<4, T>::payload_t; if constexpr(pre_nop) asm volatile("s_nop 4\n" "v_cmpx_le_u32 exec, 1, %4\n" @@ -398,12 +399,12 @@ struct buffer_load_if<2, pre_nop> index_t v_offset, index_t /*s_offset*/, index_t i_offset /*max 0xFFF*/, - index_t flag = 0, + index_t flag = 0, bool_constant = {}) { static_assert(sizeof(T) == 4); auto saved_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = typename impl::buffer_load_trait<2, T>::payload_t; + using mbuf_t = typename impl::buffer_load_trait<2, T>::payload_t; if constexpr(pre_nop) asm volatile("s_nop 4\n" "v_cmpx_le_u32 exec, 1, %4\n" @@ -431,12 +432,12 @@ struct buffer_load_if<1, pre_nop> index_t v_offset, index_t /*s_offset*/, index_t i_offset /*max 0xFFF*/, - index_t flag = 0, + index_t flag = 0, bool_constant = {}) { static_assert(sizeof(T) == 4); auto saved_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = typename impl::buffer_load_trait<1, T>::payload_t; + using mbuf_t = typename impl::buffer_load_trait<1, T>::payload_t; if constexpr(pre_nop) asm volatile("s_nop 4\n" "v_cmpx_le_u32 exec, 1, %4\n" @@ -620,7 +621,7 @@ struct buffer_store_if<16> { static_assert(sizeof(T) == 16); auto save_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = fp32x4_t; + using mbuf_t = fp32x4_t; asm volatile("v_cmpx_le_u32 exec, 1, %4\n" "buffer_store_dwordx4 %0, %1, %2, 0 offen offset:%3\n" "s_mov_b64 exec %5" @@ -677,7 +678,7 @@ struct buffer_store_if<4> { static_assert(sizeof(T) == 4); auto save_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = float; + using mbuf_t = float; asm volatile("v_cmpx_le_u32 exec, 1, %4\n" "buffer_store_dword %0, %1, %2, 0 offen offset:%3\n" "s_mov_b64 exec %5" @@ -705,7 +706,7 @@ struct buffer_store_if<2> { static_assert(sizeof(T) == 2); auto save_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = short; + using mbuf_t = short; asm volatile("v_cmpx_le_u32 exec, 1, %4\n" "buffer_store_short %0, %1, %2, 0 offen offset:%3\n" "s_mov_b64 exec %5" @@ -733,7 +734,7 @@ struct buffer_store_if<1> { static_assert(sizeof(T) == 4); auto save_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = float; + using mbuf_t = float; asm volatile("v_cmpx_le_u32 exec, 1, %4\n" "buffer_store_byte %0, %1, %2, 0 offen offset:%3\n" "s_mov_b64 exec %5" @@ -1269,53 +1270,33 @@ llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata, // Direct loads from global to LDS. CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc, - as3_uint32_ptr lds_ptr, + __attribute__((address_space(3))) uint32_t* lds_ptr, index_t size, index_t voffset, index_t soffset, index_t offset, index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds"); -template -CK_TILE_DEVICE void async_buffer_load_dwordxn_v(void* smem, - int32x4_t rsrc, - index_t voffset, - index_t /*soffset*/, - index_t ioffset /*max 0xFFF*/, - index_t /*flag*/ = 0, - bool_constant = {}) +template +CK_TILE_DEVICE void async_buffer_load_dword_v(void* smem, + int32x4_t rsrc, + index_t voffset, + index_t /*soffset*/, + index_t ioffset /*max 0xFFF*/, + index_t /*flag*/ = 0, + bool_constant = {}) { -#define CK_TILE_ASYNC_LOAD_WITH_INSTR(instr) \ - if constexpr(pre_nop) \ - asm volatile("s_nop 4\n" instr " %1, %2, 0 offen offset:%3 lds" \ - : "=r"(smem) /*dummy dependency for smem*/ \ - : "v"(voffset), "s"(rsrc), "n"(ioffset) \ - : "memory"); \ - else \ - asm volatile(instr " %1, %2, 0 offen offset:%3 lds" \ - : "=r"(smem) /*dummy dependency for smem*/ \ - : "v"(voffset), "s"(rsrc), "n"(ioffset) \ + if constexpr(pre_nop) + asm volatile("s_nop 4\n" + "buffer_load_dword %1, %2, 0 offen offset:%3 lds" + : "=r"(smem) /*dummy dependency for smem*/ + : "v"(voffset), "s"(rsrc), "n"(ioffset) : "memory"); - - if constexpr(num_dwords == 1) - { - CK_TILE_ASYNC_LOAD_WITH_INSTR("buffer_load_dword"); - } -#if defined(__gfx950__) - else if constexpr(num_dwords == 3) - { - CK_TILE_ASYNC_LOAD_WITH_INSTR("buffer_load_dwordx3"); - } - else if constexpr(num_dwords == 4) - { - CK_TILE_ASYNC_LOAD_WITH_INSTR("buffer_load_dwordx4"); - } -#endif else - { - static_assert(false, "wrong! not implemented data width"); - } -#undef CK_TILE_ASYNC_LOAD_WITH_INSTR + asm volatile("buffer_load_dword %1, %2, 0 offen offset:%3 lds" + : "=r"(smem) /*dummy dependency for smem*/ + : "v"(voffset), "s"(rsrc), "n"(ioffset) + : "memory"); } CK_TILE_DEVICE void async_buffer_load_fence(index_t cnt = 0) @@ -1334,17 +1315,6 @@ enum struct amd_buffer_coherence_enum glc = 1, slc = 2, glc_slc = 3, - // gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 - // SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system - // NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse - WAVE_NT0 = 0, - WAVE_NT1 = 2, - GROUP_NT0 = 1, - GROUP_NT1 = 3, - DEVICE_NT0 = 8, - DEVICE_NT1 = 10, - SYSTEM_NT0 = 9, - SYSTEM_NT1 = 11, }; template -CK_TILE_DEVICE void amd_async_buffer_load_impl(CK_TILE_LDS_ADDR T* smem, +CK_TILE_DEVICE void amd_async_buffer_load_impl(T* smem, int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset, index_t src_immediate_addr_offset = 0, bool_constant = {}) { - constexpr index_t num_bytes = sizeof(T) * N; - constexpr index_t num_words = num_bytes / 4; - static_assert(num_bytes % 4 == 0 && (num_words == 1 || num_words == 3 || num_words == 4), - "wrong! only support in dword, dwordx3, dwordx4"); + static_assert(sizeof(T) * N == 4, "wrong! not implemented vector size"); - async_buffer_load_dwordxn_v(smem, - src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - src_immediate_addr_offset, - 0, - bool_constant{}); + async_buffer_load_dword_v(smem, + src_wave_buffer_resource, + src_thread_addr_offset, + src_wave_addr_offset, + src_immediate_addr_offset, + 0, + bool_constant{}); } template = {}) { - constexpr index_t bytes = sizeof(T) * N; + static_assert(sizeof(T) * N == 4, "wrong! not implemented vector size"); - // Used to catch the cases when src_immediate_addr_offset is NOT 0. - // Remove this assert once other sizes are implemented. - assert(src_immediate_addr_offset == 0 && - "wrong! not implemented src_immediate_addr_offset size, only 0 supported"); - ignore = src_immediate_addr_offset; - -#if defined(__gfx950__) - static_assert(bytes == 4 || bytes == 12 || bytes == 16, - "wrong! only support in dword, dwordx3, dwordx4"); - src_wave_addr_offset = 0; -#else - static_assert(bytes == 4, "wrong! not implemented vector size"); -#endif - - // Set up v_offset: - index_t v_offset = src_thread_addr_offset; if constexpr(oob_conditional_check) - v_offset = flag ? v_offset : src_wave_buffer_resource[2]; - -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wold-style-cast" - // Use C-style cast to change address space without dropping llvm noalias attribute - llvm_amdgcn_raw_buffer_load_lds(src_wave_buffer_resource, - (as3_uint32_ptr)(smem), - bytes, - v_offset, - src_wave_addr_offset, - /*src_immediate_addr_offset*/ 0, - static_cast(coherence)); -#pragma clang diagnostic pop + { + index_t v_offset = flag ? src_thread_addr_offset : src_wave_buffer_resource[2]; + llvm_amdgcn_raw_buffer_load_lds(src_wave_buffer_resource, + smem, + sizeof(uint32_t), + v_offset, + src_wave_addr_offset, + src_immediate_addr_offset, + static_cast(coherence)); + } + else + { + llvm_amdgcn_raw_buffer_load_lds(src_wave_buffer_resource, + smem, + sizeof(uint32_t), + src_thread_addr_offset, + src_wave_addr_offset, + src_immediate_addr_offset, + static_cast(coherence)); + } } template & src_thread_ #endif } -#if defined(__gfx950__) -template -__device__ auto amd_transpose_load_to_vgpr(const T* __restrict__ in_ptr) +template +CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr, + const index_t global_offset, + T* lds_base_ptr, + const index_t lds_offset, + const bool is_valid, + const index_t src_element_space_size) { -#define __LDS_ADDR __attribute__((address_space(3))) + // Direct loads require that each thread reads and writes exactly a single DWORD. + constexpr auto dword_bytes = 4; + constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread; + static_assert(bytes_per_thread == dword_bytes); - static_assert(__has_builtin(__builtin_amdgcn_raw_buffer_load_b32), - "We need to have the compatible compiler version to build this instruction"); + const uint32_t* global_ptr = + reinterpret_cast(reinterpret_cast(global_base_ptr)); + const int32x4_t src_resource = + make_wave_buffer_resource(global_ptr, src_element_space_size * sizeof(T)); + const index_t global_offset_bytes = is_valid ? global_offset * sizeof(T) : 0x80000000; -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wold-style-cast" - // Use C-style cast to change address space without dropping llvm noalias attribute - const auto in_ptr_ = (__LDS_ADDR T*)(const_cast(in_ptr)); -#pragma clang diagnostic pop - if constexpr(std::is_same_v, ck_tile::half_t>) - { - typedef __attribute__((__vector_size__(4 * sizeof(__fp16)))) __fp16 llvm_fp16x4_t; - auto lds_ptr = reinterpret_cast<__LDS_ADDR llvm_fp16x4_t*>(in_ptr_); - return bit_cast>(__builtin_amdgcn_ds_read_tr16_b64_v4f16(lds_ptr)); - } - else if constexpr(std::is_same_v, ck_tile::bf16_t>) - { - typedef __attribute__((__vector_size__(4 * sizeof(__bf16)))) __bf16 llvm_bf16x4_t; - auto lds_ptr = reinterpret_cast<__LDS_ADDR llvm_bf16x4_t*>(in_ptr_); - return bit_cast>(__builtin_amdgcn_ds_read_tr16_b64_v4bf16(lds_ptr)); - } - else if constexpr(std::is_same_v, ck_tile::fp8_t> || - std::is_same_v, ck_tile::bf8_t> || - std::is_same_v, ck_tile::int8_t>) - { - typedef __attribute__((__vector_size__(2 * sizeof(index_t)))) index_t llvm_i32x2_t; - auto lds_ptr = reinterpret_cast<__LDS_ADDR llvm_i32x2_t*>(in_ptr_); - return bit_cast>(__builtin_amdgcn_ds_read_tr8_b64_v2i32(lds_ptr)); - } - else - { - static_assert(false, "not implemented"); - } -#undef __LDS_ADDR -} +#if CK_TILE_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM + T* lds_ptr = lds_base_ptr + lds_offset; + auto const lds_ptr_sgpr = + __builtin_amdgcn_readfirstlane((reinterpret_cast(lds_ptr))); + asm volatile("s_mov_b32 m0, %0; \n\t" + "buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr), + "v"(global_offset_bytes), + "s"(src_resource) + : "memory"); +#else + // LDS pointer must be attributed with the LDS address space. + __attribute__((address_space(3))) uint32_t* lds_ptr = + reinterpret_cast<__attribute__((address_space(3))) uint32_t*>( + reinterpret_cast(lds_base_ptr + lds_offset)); + + llvm_amdgcn_raw_buffer_load_lds( + src_resource, lds_ptr, sizeof(uint32_t), global_offset_bytes, 0, 0, 0); #endif +} } // namespace ck_tile diff --git a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp index 4013b51479..53a344c7b0 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp @@ -13,9 +13,6 @@ #include "ck_tile/core/utility/type_traits.hpp" #include "ck_tile/core/utility/bit_cast.hpp" #include "ck_tile/core/utility/functional.hpp" -#include "ck_tile/core/utility/ignore.hpp" - -using as3_uint32_ptr = uint32_t __attribute__((address_space(3)))*; namespace ck_tile { @@ -32,6 +29,10 @@ CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void* ptr, uint32_t siz { buffer_resource res{ptr, size, CK_TILE_BUFFER_RESOURCE_3RD_DWORD}; int32x4_t r = __builtin_bit_cast(int32x4_t, res); + r.x = __builtin_amdgcn_readfirstlane(r.x); + r.y = __builtin_amdgcn_readfirstlane(r.y); + r.z = __builtin_amdgcn_readfirstlane(r.z); + r.w = __builtin_amdgcn_readfirstlane(r.w); return r; } @@ -1137,53 +1138,33 @@ llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata, // Direct loads from global to LDS. CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc, - as3_uint32_ptr lds_ptr, + __attribute__((address_space(3))) uint32_t* lds_ptr, index_t size, index_t voffset, index_t soffset, index_t offset, index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds"); -template -CK_TILE_DEVICE void async_buffer_load_dwordxn_v(void* smem, - int32x4_t rsrc, - index_t voffset, - index_t /*soffset*/, - index_t ioffset /*max 0xFFF*/, - index_t /*flag*/ = 0, - bool_constant = {}) +template +CK_TILE_DEVICE void async_buffer_load_dword_v(void* smem, + int32x4_t rsrc, + index_t voffset, + index_t /*soffset*/, + index_t ioffset /*max 0xFFF*/, + index_t /*flag*/ = 0, + bool_constant = {}) { -#define CK_TILE_ASYNC_LOAD_WITH_INSTR(instr) \ - if constexpr(pre_nop) \ - asm volatile("s_nop 4\n" instr " %1, %2, 0 offen offset:%3 lds" \ - : "=r"(smem) /*dummy dependency for smem*/ \ - : "v"(voffset), "s"(rsrc), "n"(ioffset) \ - : "memory"); \ - else \ - asm volatile(instr " %1, %2, 0 offen offset:%3 lds" \ - : "=r"(smem) /*dummy dependency for smem*/ \ - : "v"(voffset), "s"(rsrc), "n"(ioffset) \ + if constexpr(pre_nop) + asm volatile("s_nop 4\n" + "buffer_load_dword %1, %2, 0 offen offset:%3 lds" + : "=r"(smem) /*dummy dependency for smem*/ + : "v"(voffset), "s"(rsrc), "n"(ioffset) : "memory"); - - if constexpr(num_dwords == 1) - { - CK_TILE_ASYNC_LOAD_WITH_INSTR("buffer_load_dword"); - } -#if defined(__gfx950__) - else if constexpr(num_dwords == 3) - { - CK_TILE_ASYNC_LOAD_WITH_INSTR("buffer_load_dwordx3"); - } - else if constexpr(num_dwords == 4) - { - CK_TILE_ASYNC_LOAD_WITH_INSTR("buffer_load_dwordx4"); - } -#endif else - { - static_assert(false, "wrong! not implemented data width"); - } -#undef CK_TILE_ASYNC_LOAD_WITH_INSTR + asm volatile("buffer_load_dword %1, %2, 0 offen offset:%3 lds" + : "=r"(smem) /*dummy dependency for smem*/ + : "v"(voffset), "s"(rsrc), "n"(ioffset) + : "memory"); } CK_TILE_DEVICE void async_buffer_load_fence(index_t cnt = 0) @@ -1202,17 +1183,6 @@ enum struct amd_buffer_coherence_enum glc = 1, slc = 2, glc_slc = 3, - // gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 - // SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system - // NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse - WAVE_NT0 = 0, - WAVE_NT1 = 2, - GROUP_NT0 = 1, - GROUP_NT1 = 3, - DEVICE_NT0 = 8, - DEVICE_NT1 = 10, - SYSTEM_NT0 = 9, - SYSTEM_NT1 = 11, }; template = {}) { - constexpr index_t num_bytes = sizeof(T) * N; - constexpr index_t num_words = num_bytes / 4; - static_assert(num_bytes % 4 == 0 && (num_words == 1 || num_words == 3 || num_words == 4), - "wrong! only support in dword, dwordx3, dwordx4"); + static_assert(sizeof(T) * N == 4, "wrong! not implemented vector size"); - async_buffer_load_dwordxn_v(smem, - src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - src_immediate_addr_offset, - 0, - bool_constant{}); + async_buffer_load_dword_v(smem, + src_wave_buffer_resource, + src_thread_addr_offset, + src_wave_addr_offset, + src_immediate_addr_offset, + 0, + bool_constant{}); } template = {}) { - constexpr index_t bytes = sizeof(T) * N; + static_assert(sizeof(T) * N == 4, "wrong! not implemented vector size"); - // Used to catch the cases when src_immediate_addr_offset is NOT 0. - // Remove this assert once other sizes are implemented. - assert(src_immediate_addr_offset == 0 && - "wrong! not implemented src_immediate_addr_offset size, only 0 supported"); - ignore = src_immediate_addr_offset; - -#if defined(__gfx950__) - static_assert(bytes == 4 || bytes == 12 || bytes == 16, - "wrong! only support in dword, dwordx3, dwordx4"); - src_wave_addr_offset = 0; -#else - static_assert(bytes == 4, "wrong! not implemented vector size"); -#endif - - // Set up v_offset: - index_t v_offset = src_thread_addr_offset; if constexpr(oob_conditional_check) - v_offset = flag ? v_offset : src_wave_buffer_resource[2]; - -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wold-style-cast" - // Use C-style cast to change address space without dropping llvm noalias attribute - llvm_amdgcn_raw_buffer_load_lds(src_wave_buffer_resource, - (as3_uint32_ptr)(smem), - bytes, - v_offset, - src_wave_addr_offset, - /*src_immediate_addr_offset*/ 0, - static_cast(coherence)); -#pragma clang diagnostic pop + { + index_t v_offset = flag ? v_offset : src_wave_buffer_resource[2]; + llvm_amdgcn_raw_buffer_load_lds(src_wave_buffer_resource, + smem, + sizeof(uint32_t), + v_offset, + src_wave_addr_offset, + src_immediate_addr_offset, + static_cast(coherence)); + } + else + { + llvm_amdgcn_raw_buffer_load_lds(src_wave_buffer_resource, + smem, + sizeof(uint32_t), + src_thread_addr_offset, + src_wave_addr_offset, + src_immediate_addr_offset, + static_cast(coherence)); + } } template (reinterpret_cast(global_base_ptr)); const int32x4_t src_resource = @@ -2581,72 +2544,16 @@ CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr, "s"(src_resource) : "memory"); #else - // Direct loads require that each thread reads and writes exactly a single DWORD. -#if defined(__gfx9__) - constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread; -#endif - // Direct loads require that each thread reads and writes a multiple of DWORDs (4 bytes). - // For gfx950: supports 1, 3, or 4 DWORDs per thread - // For gfx942: supports exactly 1 DWORD per thread -#if defined(__gfx950__) - constexpr auto dword_bytes = 4; - static_assert(bytes_per_thread == dword_bytes || bytes_per_thread == dword_bytes * 3 || - bytes_per_thread == dword_bytes * 4); -#elif defined(__gfx9__) - constexpr auto dword_bytes = 4; - static_assert(bytes_per_thread == dword_bytes); -#endif // LDS pointer must be attributed with the LDS address space. - as3_uint32_ptr lds_ptr = - reinterpret_cast(reinterpret_cast(lds_base_ptr + lds_offset)); + __attribute__((address_space(3))) uint32_t* lds_ptr = + reinterpret_cast<__attribute__((address_space(3))) uint32_t*>( + reinterpret_cast(lds_base_ptr + lds_offset)); llvm_amdgcn_raw_buffer_load_lds( - src_resource, lds_ptr, bytes_per_thread, global_offset_bytes, 0, 0, 0); + src_resource, lds_ptr, sizeof(uint32_t), global_offset_bytes, 0, 0, 0); #endif } -#if defined(__gfx950__) -template -__device__ auto amd_transpose_load_to_vgpr(const T* __restrict__ in_ptr) -{ -#define __LDS_ADDR __attribute__((address_space(3))) - - static_assert(__has_builtin(__builtin_amdgcn_raw_buffer_load_b32), - "We need to have the compatible compiler version to build this instruction"); - -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wold-style-cast" - // Use C-style cast to change address space without dropping llvm noalias attribute - const auto in_ptr_ = (__LDS_ADDR T*)(const_cast(in_ptr)); -#pragma clang diagnostic pop - if constexpr(std::is_same_v, ck_tile::half_t>) - { - typedef __attribute__((__vector_size__(4 * sizeof(__fp16)))) __fp16 llvm_fp16x4_t; - auto lds_ptr = reinterpret_cast<__LDS_ADDR llvm_fp16x4_t*>(in_ptr_); - return bit_cast>(__builtin_amdgcn_ds_read_tr16_b64_v4f16(lds_ptr)); - } - else if constexpr(std::is_same_v, ck_tile::bf16_t>) - { - typedef __attribute__((__vector_size__(4 * sizeof(__bf16)))) __bf16 llvm_bf16x4_t; - auto lds_ptr = reinterpret_cast<__LDS_ADDR llvm_bf16x4_t*>(in_ptr_); - return bit_cast>(__builtin_amdgcn_ds_read_tr16_b64_v4bf16(lds_ptr)); - } - else if constexpr(std::is_same_v, ck_tile::fp8_t> || - std::is_same_v, ck_tile::bf8_t> || - std::is_same_v, ck_tile::int8_t>) - { - typedef __attribute__((__vector_size__(2 * sizeof(index_t)))) index_t llvm_i32x2_t; - auto lds_ptr = reinterpret_cast<__LDS_ADDR llvm_i32x2_t*>(in_ptr_); - return bit_cast>(__builtin_amdgcn_ds_read_tr8_b64_v2i32(lds_ptr)); - } - else - { - static_assert(false, "not implemented"); - } -#undef __LDS_ADDR -} -#endif - } // namespace ck_tile #endif // CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN