Replace inline assembly with builtins in FHMA (#2067)

* Replace inline assembly with builtins in FHMA

---------

Co-authored-by: illsilin <Illia.Silin@amd.com>
This commit is contained in:
Juan Manuel Martinez Caamaño
2025-04-10 09:48:37 +02:00
committed by GitHub
parent 3e6d21adeb
commit f14e648e7c

View File

@@ -14,6 +14,15 @@
#include "ck_tile/core/utility/bit_cast.hpp"
#include "ck_tile/core/utility/functional.hpp"
// This attribute gives a hint to the compiler that a branch is likely to be taken.
// Then, the compiler should remove if possible the associated s_cbranch_execz branch that would
// have been generated.
#if __cplusplus >= 202002L
#define LIKELY(x) (x) [[likely]]
#else
#define LIKELY(x) (__builtin_expect(!!(x), 1))
#endif
namespace ck_tile {
// 128 bit SGPRs to supply buffer resource in buffer instructions
@@ -58,10 +67,36 @@ template<> struct buffer_load_trait<4 , thread_buffer<bf16_t, 2>> { using payloa
// TODO: glc/slc/...
template <index_t bytes, bool pre_nop = false>
struct buffer_load;
template <index_t bytes, bool pre_nop = false>
struct buffer_load_if;
template <index_t bytes>
struct buffer_store;
template <index_t bytes>
struct buffer_store_if;
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
// TODO: strict aliasing rule seems fail when reinterpret_cast between vector type
// (exp_vector_type(xxx))
#define HAS_RAW_BUFFER_BUILTINS \
__has_builtin(__builtin_amdgcn_raw_buffer_load_b32) && \
__has_builtin(__builtin_amdgcn_make_buffer_rsrc) && \
__has_builtin(__builtin_amdgcn_raw_buffer_store_b32)
#if HAS_RAW_BUFFER_BUILTINS
CK_TILE_DEVICE __amdgpu_buffer_rsrc_t cast_to_amdgpu_buffer_rsrc_t(int32x4_t res)
{
__amdgpu_buffer_rsrc_t as_rsrc;
static_assert(sizeof(res) == sizeof(as_rsrc) && "Size of buffer resource should match");
memcpy(&as_rsrc, &res, sizeof(res));
return as_rsrc;
}
#endif
template <bool pre_nop>
struct buffer_load<16, pre_nop>
{
@@ -76,6 +111,11 @@ struct buffer_load<16, pre_nop>
{
static_assert(sizeof(T) == 16);
using mbuf_t = typename impl::buffer_load_trait<16, T>::payload_t;
#if HAS_RAW_BUFFER_BUILTINS
index_t s_offset = i_offset;
reinterpret_cast<mbuf_t&>(value) = __builtin_amdgcn_raw_buffer_load_b128(
cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
#else
if constexpr(pre_nop)
asm volatile("s_nop 4\n"
"buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3"
@@ -87,6 +127,7 @@ struct buffer_load<16, pre_nop>
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
#endif
}
};
@@ -104,6 +145,11 @@ struct buffer_load<8, pre_nop>
{
static_assert(sizeof(T) == 8);
using mbuf_t = typename impl::buffer_load_trait<8, T>::payload_t;
#if HAS_RAW_BUFFER_BUILTINS
index_t s_offset = i_offset;
reinterpret_cast<mbuf_t&>(value) = __builtin_amdgcn_raw_buffer_load_b64(
cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
#else
if constexpr(pre_nop)
asm volatile("s_nop 4\n"
"buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3"
@@ -115,6 +161,7 @@ struct buffer_load<8, pre_nop>
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
#endif
}
};
@@ -132,6 +179,12 @@ struct buffer_load<4, pre_nop>
{
static_assert(sizeof(T) == 4);
using mbuf_t = typename impl::buffer_load_trait<4, T>::payload_t;
#if HAS_RAW_BUFFER_BUILTINS
index_t s_offset = i_offset;
reinterpret_cast<mbuf_t&>(value) = __builtin_amdgcn_raw_buffer_load_b32(
cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
#else
if constexpr(pre_nop)
asm volatile("s_nop 4\n"
"buffer_load_dword %0, %1, %2, 0 offen offset:%3"
@@ -143,6 +196,7 @@ struct buffer_load<4, pre_nop>
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
#endif
}
};
@@ -160,6 +214,12 @@ struct buffer_load<2, pre_nop>
{
static_assert(sizeof(T) == 4); // subdword is buggy, use dword buf and convert manually
using mbuf_t = typename impl::buffer_load_trait<2, T>::payload_t;
#if HAS_RAW_BUFFER_BUILTINS
index_t s_offset = i_offset;
reinterpret_cast<mbuf_t&>(value) = __builtin_amdgcn_raw_buffer_load_b16(
cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
#else
if constexpr(pre_nop)
asm volatile("s_nop 4\n"
"buffer_load_ushort %0, %1, %2, 0 offen offset:%3"
@@ -171,6 +231,7 @@ struct buffer_load<2, pre_nop>
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
#endif
}
};
@@ -188,6 +249,11 @@ struct buffer_load<1, pre_nop>
{
static_assert(sizeof(T) == 4);
using mbuf_t = typename impl::buffer_load_trait<1, T>::payload_t;
#if HAS_RAW_BUFFER_BUILTINS
index_t s_offset = i_offset;
reinterpret_cast<mbuf_t&>(value) = __builtin_amdgcn_raw_buffer_load_b16(
cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
#else
if constexpr(pre_nop)
asm volatile("s_nop 4\n"
"buffer_load_ubyte %0, %1, %2, 0 offen offset:%3"
@@ -199,12 +265,31 @@ struct buffer_load<1, pre_nop>
: "+v"(reinterpret_cast<mbuf_t&>(value))
: "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
#endif
}
};
template <index_t bytes, bool pre_nop = false>
struct buffer_load_if;
#if HAS_RAW_BUFFER_BUILTINS
template <index_t bytes, bool pre_nop>
struct buffer_load_if
{
template <typename T>
CK_TILE_DEVICE void operator()(T& value,
int32x4_t res /*buffer resource*/,
index_t v_offset,
index_t s_offset,
index_t i_offset /*max 0xFFF*/,
index_t flag = 0,
bool_constant<pre_nop> = {})
{
if LIKELY(1 <= flag)
{
buffer_load<bytes, pre_nop>{}(
value, res, v_offset, s_offset, i_offset, flag, bool_constant<pre_nop>{});
}
}
};
#else
template <bool pre_nop>
struct buffer_load_if<16, pre_nop>
{
@@ -214,12 +299,12 @@ struct buffer_load_if<16, pre_nop>
index_t v_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t flag = 0,
index_t flag = 0,
bool_constant<pre_nop> = {})
{
static_assert(sizeof(T) == 16);
auto saved_exec = __builtin_amdgcn_read_exec();
using mbuf_t = typename impl::buffer_load_trait<16, T>::payload_t;
using mbuf_t = typename impl::buffer_load_trait<16, T>::payload_t;
static_assert(sizeof(mbuf_t) == sizeof(T));
if constexpr(pre_nop)
asm volatile("s_nop 4\n"
@@ -248,12 +333,12 @@ struct buffer_load_if<8, pre_nop>
index_t v_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t flag = 0,
index_t flag = 0,
bool_constant<pre_nop> = {})
{
static_assert(sizeof(T) == 8);
auto saved_exec = __builtin_amdgcn_read_exec();
using mbuf_t = typename impl::buffer_load_trait<8, T>::payload_t;
using mbuf_t = typename impl::buffer_load_trait<8, T>::payload_t;
if constexpr(pre_nop)
asm volatile("s_nop 4\n"
"v_cmpx_le_u32 exec, 1, %4\n"
@@ -281,12 +366,12 @@ struct buffer_load_if<4, pre_nop>
index_t v_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t flag = 0,
index_t flag = 0,
bool_constant<pre_nop> = {})
{
static_assert(sizeof(T) == 4);
auto saved_exec = __builtin_amdgcn_read_exec();
using mbuf_t = typename impl::buffer_load_trait<4, T>::payload_t;
using mbuf_t = typename impl::buffer_load_trait<4, T>::payload_t;
if constexpr(pre_nop)
asm volatile("s_nop 4\n"
"v_cmpx_le_u32 exec, 1, %4\n"
@@ -314,12 +399,12 @@ struct buffer_load_if<2, pre_nop>
index_t v_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t flag = 0,
index_t flag = 0,
bool_constant<pre_nop> = {})
{
static_assert(sizeof(T) == 4);
auto saved_exec = __builtin_amdgcn_read_exec();
using mbuf_t = typename impl::buffer_load_trait<2, T>::payload_t;
using mbuf_t = typename impl::buffer_load_trait<2, T>::payload_t;
if constexpr(pre_nop)
asm volatile("s_nop 4\n"
"v_cmpx_le_u32 exec, 1, %4\n"
@@ -347,12 +432,12 @@ struct buffer_load_if<1, pre_nop>
index_t v_offset,
index_t /*s_offset*/,
index_t i_offset /*max 0xFFF*/,
index_t flag = 0,
index_t flag = 0,
bool_constant<pre_nop> = {})
{
static_assert(sizeof(T) == 4);
auto saved_exec = __builtin_amdgcn_read_exec();
using mbuf_t = typename impl::buffer_load_trait<1, T>::payload_t;
using mbuf_t = typename impl::buffer_load_trait<1, T>::payload_t;
if constexpr(pre_nop)
asm volatile("s_nop 4\n"
"v_cmpx_le_u32 exec, 1, %4\n"
@@ -370,9 +455,9 @@ struct buffer_load_if<1, pre_nop>
: "memory");
}
};
#endif
#pragma clang diagnostic pop // "-Wundefined-reinterpret-cast"
template <index_t bytes>
struct buffer_store;
template <>
struct buffer_store<16>
@@ -387,10 +472,16 @@ struct buffer_store<16>
{
static_assert(sizeof(T) == 16);
using mbuf_t = fp32x4_t;
#if HAS_RAW_BUFFER_BUILTINS
index_t s_offset = i_offset;
__builtin_amdgcn_raw_buffer_store_b128(
bit_cast<mbuf_t>(value), cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
#else
asm volatile("buffer_store_dwordx4 %0, %1, %2, 0 offen offset:%3"
:
: "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
#endif
}
};
@@ -407,10 +498,16 @@ struct buffer_store<8>
{
static_assert(sizeof(T) == 8);
using mbuf_t = fp32x2_t;
#if HAS_RAW_BUFFER_BUILTINS
index_t s_offset = i_offset;
__builtin_amdgcn_raw_buffer_store_b64(
bit_cast<mbuf_t>(value), cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
#else
asm volatile("buffer_store_dwordx2 %0, %1, %2, 0 offen offset:%3"
:
: "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
#endif
}
};
@@ -427,10 +524,16 @@ struct buffer_store<4>
{
static_assert(sizeof(T) == 4);
using mbuf_t = float;
#if HAS_RAW_BUFFER_BUILTINS
index_t s_offset = i_offset;
__builtin_amdgcn_raw_buffer_store_b32(
bit_cast<mbuf_t>(value), cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
#else
asm volatile("buffer_store_dword %0, %1, %2, 0 offen offset:%3"
:
: "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
#endif
}
};
@@ -447,10 +550,16 @@ struct buffer_store<2>
{
static_assert(sizeof(T) == 2);
using mbuf_t = short;
#if HAS_RAW_BUFFER_BUILTINS
index_t s_offset = i_offset;
__builtin_amdgcn_raw_buffer_store_b16(
bit_cast<mbuf_t>(value), cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
#else
asm volatile("buffer_store_short %0, %1, %2, 0 offen offset:%3"
:
: "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
#endif
}
};
@@ -467,16 +576,38 @@ struct buffer_store<1>
{
static_assert(sizeof(T) == 4);
using mbuf_t = float;
#if HAS_RAW_BUFFER_BUILTINS
index_t s_offset = i_offset;
__builtin_amdgcn_raw_buffer_store_b8(
bit_cast<mbuf_t>(value), cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
#else
asm volatile("buffer_store_byte %0, %1, %2, 0 offen offset:%3"
:
: "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
: "memory");
#endif
}
};
#if HAS_RAW_BUFFER_BUILTINS
template <index_t bytes>
struct buffer_store_if;
struct buffer_store_if
{
template <typename T>
CK_TILE_DEVICE void operator()(const T& value,
int32x4_t res /*buffer resource*/,
index_t v_offset,
index_t s_offset,
index_t i_offset /*max 0xFFF*/,
index_t flag = 1)
{
if LIKELY(1 <= flag)
{
buffer_store<bytes>{}(value, res, v_offset, s_offset, i_offset);
}
}
};
#else
template <>
struct buffer_store_if<16>
{
@@ -490,7 +621,7 @@ struct buffer_store_if<16>
{
static_assert(sizeof(T) == 16);
auto save_exec = __builtin_amdgcn_read_exec();
using mbuf_t = fp32x4_t;
using mbuf_t = fp32x4_t;
asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
"buffer_store_dwordx4 %0, %1, %2, 0 offen offset:%3\n"
"s_mov_b64 exec %5"
@@ -547,7 +678,7 @@ struct buffer_store_if<4>
{
static_assert(sizeof(T) == 4);
auto save_exec = __builtin_amdgcn_read_exec();
using mbuf_t = float;
using mbuf_t = float;
asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
"buffer_store_dword %0, %1, %2, 0 offen offset:%3\n"
"s_mov_b64 exec %5"
@@ -575,7 +706,7 @@ struct buffer_store_if<2>
{
static_assert(sizeof(T) == 2);
auto save_exec = __builtin_amdgcn_read_exec();
using mbuf_t = short;
using mbuf_t = short;
asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
"buffer_store_short %0, %1, %2, 0 offen offset:%3\n"
"s_mov_b64 exec %5"
@@ -603,7 +734,7 @@ struct buffer_store_if<1>
{
static_assert(sizeof(T) == 4);
auto save_exec = __builtin_amdgcn_read_exec();
using mbuf_t = float;
using mbuf_t = float;
asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
"buffer_store_byte %0, %1, %2, 0 offen offset:%3\n"
"s_mov_b64 exec %5"
@@ -617,6 +748,7 @@ struct buffer_store_if<1>
: "memory");
}
};
#endif
CK_TILE_DEVICE void buffer_load_fence(index_t cnt = 0)
{