diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp
index 33faa3a18b..5d6d6ce348 100644
--- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp
+++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp
@@ -14,6 +14,15 @@
 #include "ck_tile/core/utility/bit_cast.hpp"
 #include "ck_tile/core/utility/functional.hpp"
 
+// This attribute gives the compiler a hint that a branch is likely to be taken.
+// The compiler should then, where possible, remove the associated s_cbranch_execz
+// branch that would otherwise have been generated.
+#if __cplusplus >= 202002L
+#define LIKELY(x) (x) [[likely]]
+#else
+#define LIKELY(x) (__builtin_expect(!!(x), 1))
+#endif
+
 namespace ck_tile {
 
 // 128 bit SGPRs to supply buffer resource in buffer instructions
@@ -58,10 +67,36 @@
 template<> struct buffer_load_trait<4 , thread_buffer<bf16_t, 2>> { using payload_t = bf16x2_t; };
 // TODO: glc/slc/...
 template <index_t bytes, bool pre_nop = false>
 struct buffer_load;
+
+template <index_t bytes, bool pre_nop = false>
+struct buffer_load_if;
+
+template <index_t bytes>
+struct buffer_store;
+
+template <index_t bytes>
+struct buffer_store_if;
+
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
 // TODO: strict aliasing rule seems fail when reinterpret_cast between vector type
 // (exp_vector_type(xxx))
+
+#define HAS_RAW_BUFFER_BUILTINS                              \
+    __has_builtin(__builtin_amdgcn_raw_buffer_load_b32) &&   \
+        __has_builtin(__builtin_amdgcn_make_buffer_rsrc) &&  \
+        __has_builtin(__builtin_amdgcn_raw_buffer_store_b32)
+
+#if HAS_RAW_BUFFER_BUILTINS
+CK_TILE_DEVICE __amdgpu_buffer_rsrc_t cast_to_amdgpu_buffer_rsrc_t(int32x4_t res)
+{
+    __amdgpu_buffer_rsrc_t as_rsrc;
+    static_assert(sizeof(res) == sizeof(as_rsrc) && "Size of buffer resource should match");
+    memcpy(&as_rsrc, &res, sizeof(res));
+    return as_rsrc;
+}
+#endif
+
 template <bool pre_nop>
 struct buffer_load<16, pre_nop>
@@ -76,6 +111,11 @@ struct buffer_load<16, pre_nop>
     {
         static_assert(sizeof(T) == 16);
         using mbuf_t = typename impl::buffer_load_trait<16, T>::payload_t;
+#if HAS_RAW_BUFFER_BUILTINS
+        index_t s_offset = i_offset;
+        reinterpret_cast<mbuf_t&>(value) = __builtin_amdgcn_raw_buffer_load_b128(
+            cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
+#else
         if constexpr(pre_nop)
             asm volatile("s_nop 4\n"
                          "buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3"
@@ -87,6 +127,7 @@ struct buffer_load<16, pre_nop>
                          : "+v"(reinterpret_cast<mbuf_t&>(value))
                          : "v"(v_offset), "s"(res), "n"(i_offset)
                          : "memory");
+#endif
     }
 };
 
@@ -104,6 +145,11 @@ struct buffer_load<8, pre_nop>
     {
         static_assert(sizeof(T) == 8);
         using mbuf_t = typename impl::buffer_load_trait<8, T>::payload_t;
+#if HAS_RAW_BUFFER_BUILTINS
+        index_t s_offset = i_offset;
+        reinterpret_cast<mbuf_t&>(value) = __builtin_amdgcn_raw_buffer_load_b64(
+            cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
+#else
         if constexpr(pre_nop)
             asm volatile("s_nop 4\n"
                          "buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3"
@@ -115,6 +161,7 @@ struct buffer_load<8, pre_nop>
                          : "+v"(reinterpret_cast<mbuf_t&>(value))
                          : "v"(v_offset), "s"(res), "n"(i_offset)
                          : "memory");
+#endif
     }
 };
 
@@ -132,6 +179,12 @@ struct buffer_load<4, pre_nop>
     {
         static_assert(sizeof(T) == 4);
         using mbuf_t = typename impl::buffer_load_trait<4, T>::payload_t;
+
+#if HAS_RAW_BUFFER_BUILTINS
+        index_t s_offset = i_offset;
+        reinterpret_cast<mbuf_t&>(value) = __builtin_amdgcn_raw_buffer_load_b32(
+            cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
+#else
         if constexpr(pre_nop)
             asm volatile("s_nop 4\n"
                          "buffer_load_dword %0, %1, %2, 0 offen offset:%3"
@@ -143,6 +196,7 @@ struct buffer_load<4, pre_nop>
                          : "+v"(reinterpret_cast<mbuf_t&>(value))
                          : "v"(v_offset), "s"(res), "n"(i_offset)
"s"(res), "n"(i_offset) : "memory"); +#endif } }; @@ -160,6 +214,12 @@ struct buffer_load<2, pre_nop> { static_assert(sizeof(T) == 4); // subdword is buggy, use dword buf and convert manually using mbuf_t = typename impl::buffer_load_trait<2, T>::payload_t; + +#if HAS_RAW_BUFFER_BUILTINS + index_t s_offset = i_offset; + reinterpret_cast(value) = __builtin_amdgcn_raw_buffer_load_b16( + cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0); +#else if constexpr(pre_nop) asm volatile("s_nop 4\n" "buffer_load_ushort %0, %1, %2, 0 offen offset:%3" @@ -171,6 +231,7 @@ struct buffer_load<2, pre_nop> : "+v"(reinterpret_cast(value)) : "v"(v_offset), "s"(res), "n"(i_offset) : "memory"); +#endif } }; @@ -188,6 +249,11 @@ struct buffer_load<1, pre_nop> { static_assert(sizeof(T) == 4); using mbuf_t = typename impl::buffer_load_trait<1, T>::payload_t; +#if HAS_RAW_BUFFER_BUILTINS + index_t s_offset = i_offset; + reinterpret_cast(value) = __builtin_amdgcn_raw_buffer_load_b16( + cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0); +#else if constexpr(pre_nop) asm volatile("s_nop 4\n" "buffer_load_ubyte %0, %1, %2, 0 offen offset:%3" @@ -199,12 +265,31 @@ struct buffer_load<1, pre_nop> : "+v"(reinterpret_cast(value)) : "v"(v_offset), "s"(res), "n"(i_offset) : "memory"); +#endif } }; -template -struct buffer_load_if; - +#if HAS_RAW_BUFFER_BUILTINS +template +struct buffer_load_if +{ + template + CK_TILE_DEVICE void operator()(T& value, + int32x4_t res /*buffer resource*/, + index_t v_offset, + index_t s_offset, + index_t i_offset /*max 0xFFF*/, + index_t flag = 0, + bool_constant = {}) + { + if LIKELY(1 <= flag) + { + buffer_load{}( + value, res, v_offset, s_offset, i_offset, flag, bool_constant{}); + } + } +}; +#else template struct buffer_load_if<16, pre_nop> { @@ -214,12 +299,12 @@ struct buffer_load_if<16, pre_nop> index_t v_offset, index_t /*s_offset*/, index_t i_offset /*max 0xFFF*/, - index_t flag = 0, + index_t flag = 0, bool_constant = {}) { static_assert(sizeof(T) == 16); auto saved_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = typename impl::buffer_load_trait<16, T>::payload_t; + using mbuf_t = typename impl::buffer_load_trait<16, T>::payload_t; static_assert(sizeof(mbuf_t) == sizeof(T)); if constexpr(pre_nop) asm volatile("s_nop 4\n" @@ -248,12 +333,12 @@ struct buffer_load_if<8, pre_nop> index_t v_offset, index_t /*s_offset*/, index_t i_offset /*max 0xFFF*/, - index_t flag = 0, + index_t flag = 0, bool_constant = {}) { static_assert(sizeof(T) == 8); auto saved_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = typename impl::buffer_load_trait<8, T>::payload_t; + using mbuf_t = typename impl::buffer_load_trait<8, T>::payload_t; if constexpr(pre_nop) asm volatile("s_nop 4\n" "v_cmpx_le_u32 exec, 1, %4\n" @@ -281,12 +366,12 @@ struct buffer_load_if<4, pre_nop> index_t v_offset, index_t /*s_offset*/, index_t i_offset /*max 0xFFF*/, - index_t flag = 0, + index_t flag = 0, bool_constant = {}) { static_assert(sizeof(T) == 4); auto saved_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = typename impl::buffer_load_trait<4, T>::payload_t; + using mbuf_t = typename impl::buffer_load_trait<4, T>::payload_t; if constexpr(pre_nop) asm volatile("s_nop 4\n" "v_cmpx_le_u32 exec, 1, %4\n" @@ -314,12 +399,12 @@ struct buffer_load_if<2, pre_nop> index_t v_offset, index_t /*s_offset*/, index_t i_offset /*max 0xFFF*/, - index_t flag = 0, + index_t flag = 0, bool_constant = {}) { static_assert(sizeof(T) == 4); auto saved_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = typename 
+        using mbuf_t    = typename impl::buffer_load_trait<2, T>::payload_t;
         if constexpr(pre_nop)
             asm volatile("s_nop 4\n"
                          "v_cmpx_le_u32 exec, 1, %4\n"
@@ -347,12 +432,12 @@ struct buffer_load_if<1, pre_nop>
                                    index_t v_offset,
                                    index_t /*s_offset*/,
                                    index_t i_offset /*max 0xFFF*/,
-                                   index_t flag = 0,
+                                   index_t flag           = 0,
                                    bool_constant<pre_nop> = {})
     {
         static_assert(sizeof(T) == 4);
         auto saved_exec = __builtin_amdgcn_read_exec();
-        using mbuf_t = typename impl::buffer_load_trait<1, T>::payload_t;
+        using mbuf_t    = typename impl::buffer_load_trait<1, T>::payload_t;
         if constexpr(pre_nop)
             asm volatile("s_nop 4\n"
                          "v_cmpx_le_u32 exec, 1, %4\n"
@@ -370,9 +455,9 @@ struct buffer_load_if<1, pre_nop>
                          : "memory");
     }
 };
+#endif
+
 #pragma clang diagnostic pop // "-Wundefined-reinterpret-cast"
-template <index_t bytes>
-struct buffer_store;
 
 template <>
 struct buffer_store<16>
@@ -387,10 +472,16 @@ struct buffer_store<16>
     {
         static_assert(sizeof(T) == 16);
         using mbuf_t = fp32x4_t;
+#if HAS_RAW_BUFFER_BUILTINS
+        index_t s_offset = i_offset;
+        __builtin_amdgcn_raw_buffer_store_b128(
+            bit_cast<mbuf_t>(value), cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
+#else
         asm volatile("buffer_store_dwordx4 %0, %1, %2, 0 offen offset:%3"
                      :
                      : "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
                      : "memory");
+#endif
     }
 };
 
@@ -407,10 +498,16 @@ struct buffer_store<8>
     {
         static_assert(sizeof(T) == 8);
         using mbuf_t = fp32x2_t;
+#if HAS_RAW_BUFFER_BUILTINS
+        index_t s_offset = i_offset;
+        __builtin_amdgcn_raw_buffer_store_b64(
+            bit_cast<mbuf_t>(value), cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
+#else
         asm volatile("buffer_store_dwordx2 %0, %1, %2, 0 offen offset:%3"
                      :
                      : "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
                      : "memory");
+#endif
     }
 };
 
@@ -427,10 +524,16 @@ struct buffer_store<4>
     {
         static_assert(sizeof(T) == 4);
         using mbuf_t = float;
+#if HAS_RAW_BUFFER_BUILTINS
+        index_t s_offset = i_offset;
+        __builtin_amdgcn_raw_buffer_store_b32(
+            bit_cast<mbuf_t>(value), cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
+#else
         asm volatile("buffer_store_dword %0, %1, %2, 0 offen offset:%3"
                      :
                      : "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
                      : "memory");
+#endif
     }
 };
 
@@ -447,10 +550,16 @@ struct buffer_store<2>
     {
         static_assert(sizeof(T) == 2);
         using mbuf_t = short;
+#if HAS_RAW_BUFFER_BUILTINS
+        index_t s_offset = i_offset;
+        __builtin_amdgcn_raw_buffer_store_b16(
+            bit_cast<mbuf_t>(value), cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
+#else
         asm volatile("buffer_store_short %0, %1, %2, 0 offen offset:%3"
                      :
                      : "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
                      : "memory");
+#endif
     }
 };
 
@@ -467,16 +576,38 @@ struct buffer_store<1>
     {
         static_assert(sizeof(T) == 4);
         using mbuf_t = float;
+#if HAS_RAW_BUFFER_BUILTINS
+        index_t s_offset = i_offset;
+        __builtin_amdgcn_raw_buffer_store_b8(
+            bit_cast<mbuf_t>(value), cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
+#else
         asm volatile("buffer_store_byte %0, %1, %2, 0 offen offset:%3"
                      :
                      : "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
                      : "memory");
+#endif
     }
 };
 
+#if HAS_RAW_BUFFER_BUILTINS
 template <index_t bytes>
-struct buffer_store_if;
-
+struct buffer_store_if
+{
+    template <typename T>
+    CK_TILE_DEVICE void operator()(const T& value,
+                                   int32x4_t res /*buffer resource*/,
+                                   index_t v_offset,
+                                   index_t s_offset,
+                                   index_t i_offset /*max 0xFFF*/,
+                                   index_t flag = 1)
+    {
+        if LIKELY(1 <= flag)
+        {
+            buffer_store<bytes>{}(value, res, v_offset, s_offset, i_offset);
+        }
+    }
+};
+#else
 template <>
 struct buffer_store_if<16>
 {
@@ -490,7 +621,7 @@ struct buffer_store_if<16>
     {
         static_assert(sizeof(T) == 16);
         auto save_exec = __builtin_amdgcn_read_exec();
-        using mbuf_t = fp32x4_t;
+        using mbuf_t   = fp32x4_t;
         asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
                      "buffer_store_dwordx4 %0, %1, %2, 0 offen offset:%3\n"
                      "s_mov_b64 exec %5"
@@ -547,7 +678,7 @@ struct buffer_store_if<4>
     {
         static_assert(sizeof(T) == 4);
         auto save_exec = __builtin_amdgcn_read_exec();
-        using mbuf_t = float;
+        using mbuf_t   = float;
         asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
                      "buffer_store_dword %0, %1, %2, 0 offen offset:%3\n"
                      "s_mov_b64 exec %5"
@@ -575,7 +706,7 @@ struct buffer_store_if<2>
     {
         static_assert(sizeof(T) == 2);
         auto save_exec = __builtin_amdgcn_read_exec();
-        using mbuf_t = short;
+        using mbuf_t   = short;
         asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
                      "buffer_store_short %0, %1, %2, 0 offen offset:%3\n"
                      "s_mov_b64 exec %5"
@@ -603,7 +734,7 @@ struct buffer_store_if<1>
     {
         static_assert(sizeof(T) == 4);
         auto save_exec = __builtin_amdgcn_read_exec();
-        using mbuf_t = float;
+        using mbuf_t   = float;
         asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
                      "buffer_store_byte %0, %1, %2, 0 offen offset:%3\n"
                      "s_mov_b64 exec %5"
@@ -617,6 +748,7 @@ struct buffer_store_if<1>
                      : "memory");
     }
 };
+#endif
 
 CK_TILE_DEVICE void buffer_load_fence(index_t cnt = 0)
 {
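
---

Reviewer sketch, not part of the patch: the three builtins probed by `HAS_RAW_BUFFER_BUILTINS`, used end to end in a hypothetical HIP kernel. The kernel name `copy_b32` and its shape are illustrative only; the header itself never calls `__builtin_amdgcn_make_buffer_rsrc` to build a descriptor, it bit-copies its existing `int32x4_t` resource via `cast_to_amdgpu_buffer_rsrc_t()`. Assumes a ROCm clang recent enough to expose `__amdgpu_buffer_rsrc_t` and that `HAS_RAW_BUFFER_BUILTINS` from this header is in scope.

```cpp
#include <hip/hip_runtime.h>

// Hypothetical kernel: copy n dwords through buffer instructions.
__global__ void copy_b32(unsigned* dst, const unsigned* src, int n)
{
#if HAS_RAW_BUFFER_BUILTINS
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i >= n)
        return;
    // Build 128-bit descriptors: base pointer, stride 0 (raw addressing),
    // num_records = byte range (out-of-range lanes read 0 and drop stores),
    // flags 0 = default data format.
    __amdgpu_buffer_rsrc_t src_rsrc = __builtin_amdgcn_make_buffer_rsrc(
        const_cast<unsigned*>(src), /*stride*/ 0, /*num_records*/ n * 4, /*flags*/ 0);
    __amdgpu_buffer_rsrc_t dst_rsrc = __builtin_amdgcn_make_buffer_rsrc(
        dst, /*stride*/ 0, /*num_records*/ n * 4, /*flags*/ 0);

    // voffset is per-lane (VGPR), soffset is uniform (SGPR); the trailing 0 is
    // the aux/cache-policy operand and must be a compile-time constant.
    unsigned v = __builtin_amdgcn_raw_buffer_load_b32(src_rsrc, i * 4, /*soffset*/ 0, 0);
    __builtin_amdgcn_raw_buffer_store_b32(v, dst_rsrc, i * 4, /*soffset*/ 0, 0);
#endif
}
```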
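
Reviewer sketch: what the `LIKELY` macro expands to. Under C++20 the guarded statement gets the `[[likely]]` attribute; earlier standards fall back to `__builtin_expect`, with `!!(x)` normalizing the condition to 0/1. Either way the backend is told the guarded buffer op is the common path, so it can keep a plain conditional branch and drop the `s_cbranch_execz` skip mentioned in the patch comment. `guarded_store` is a hypothetical name; the snippet assumes `LIKELY` and `buffer_store` from this header are in scope.

```cpp
// C++20:     if(1 <= flag) [[likely]] { ... }
// pre-C++20: if(__builtin_expect(!!(1 <= flag), 1)) { ... }
CK_TILE_DEVICE void guarded_store(float v, int32x4_t res, index_t v_offset, index_t flag)
{
    if LIKELY(1 <= flag)
    {
        // Unpredicated dword store; reached only when flag >= 1.
        buffer_store<4>{}(v, res, v_offset, /*s_offset*/ 0, /*i_offset*/ 0);
    }
}
```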
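
Reviewer sketch: how a caller would use the predicated wrappers. `store_if_in_bounds` is an illustrative helper; the argument order (value, resource, v_offset, s_offset, i_offset, flag) follows the definitions in the patch. `flag >= 1` performs the access and `flag == 0` skips it — via the LIKELY-hinted branch on the builtin path, via exec-mask predication on the inline-asm fallback.

```cpp
template <typename T>
CK_TILE_DEVICE void
store_if_in_bounds(const T& value, int32x4_t res, index_t v_offset, bool in_bounds)
{
    static_assert(sizeof(T) == 4, "this sketch covers the dword case only");
    buffer_store_if<4>{}(value, res, v_offset, /*s_offset*/ 0, /*i_offset*/ 0,
                         /*flag*/ in_bounds ? 1 : 0);
}
```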