mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-05 06:01:23 +00:00
Replace inline assembly with builtins in FHMA (#2067)
* Replace inline assembly with builtins in FHMA --------- Co-authored-by: illsilin <Illia.Silin@amd.com>
This commit is contained in:
committed by
GitHub
parent
3e6d21adeb
commit
f14e648e7c
@@ -14,6 +14,15 @@
|
||||
#include "ck_tile/core/utility/bit_cast.hpp"
|
||||
#include "ck_tile/core/utility/functional.hpp"
|
||||
|
||||
// This attribute gives a hint to the compiler that a branch is likely to be taken.
|
||||
// Then, the compiler should remove if possible the associated s_cbranch_execz branch that would
|
||||
// have been generated.
|
||||
#if __cplusplus >= 202002L
|
||||
#define LIKELY(x) (x) [[likely]]
|
||||
#else
|
||||
#define LIKELY(x) (__builtin_expect(!!(x), 1))
|
||||
#endif
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
// 128 bit SGPRs to supply buffer resource in buffer instructions
|
||||
@@ -58,10 +67,36 @@ template<> struct buffer_load_trait<4 , thread_buffer<bf16_t, 2>> { using payloa
|
||||
// TODO: glc/slc/...
|
||||
template <index_t bytes, bool pre_nop = false>
|
||||
struct buffer_load;
|
||||
|
||||
template <index_t bytes, bool pre_nop = false>
|
||||
struct buffer_load_if;
|
||||
|
||||
template <index_t bytes>
|
||||
struct buffer_store;
|
||||
|
||||
template <index_t bytes>
|
||||
struct buffer_store_if;
|
||||
|
||||
#pragma clang diagnostic push
|
||||
#pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
|
||||
// TODO: strict aliasing rule seems fail when reinterpret_cast between vector type
|
||||
// (exp_vector_type(xxx))
|
||||
|
||||
#define HAS_RAW_BUFFER_BUILTINS \
|
||||
__has_builtin(__builtin_amdgcn_raw_buffer_load_b32) && \
|
||||
__has_builtin(__builtin_amdgcn_make_buffer_rsrc) && \
|
||||
__has_builtin(__builtin_amdgcn_raw_buffer_store_b32)
|
||||
|
||||
#if HAS_RAW_BUFFER_BUILTINS
|
||||
CK_TILE_DEVICE __amdgpu_buffer_rsrc_t cast_to_amdgpu_buffer_rsrc_t(int32x4_t res)
|
||||
{
|
||||
__amdgpu_buffer_rsrc_t as_rsrc;
|
||||
static_assert(sizeof(res) == sizeof(as_rsrc) && "Size of buffer resource should match");
|
||||
memcpy(&as_rsrc, &res, sizeof(res));
|
||||
return as_rsrc;
|
||||
}
|
||||
#endif
|
||||
|
||||
template <bool pre_nop>
|
||||
struct buffer_load<16, pre_nop>
|
||||
{
|
||||
@@ -76,6 +111,11 @@ struct buffer_load<16, pre_nop>
|
||||
{
|
||||
static_assert(sizeof(T) == 16);
|
||||
using mbuf_t = typename impl::buffer_load_trait<16, T>::payload_t;
|
||||
#if HAS_RAW_BUFFER_BUILTINS
|
||||
index_t s_offset = i_offset;
|
||||
reinterpret_cast<mbuf_t&>(value) = __builtin_amdgcn_raw_buffer_load_b128(
|
||||
cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
|
||||
#else
|
||||
if constexpr(pre_nop)
|
||||
asm volatile("s_nop 4\n"
|
||||
"buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3"
|
||||
@@ -87,6 +127,7 @@ struct buffer_load<16, pre_nop>
|
||||
: "+v"(reinterpret_cast<mbuf_t&>(value))
|
||||
: "v"(v_offset), "s"(res), "n"(i_offset)
|
||||
: "memory");
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
@@ -104,6 +145,11 @@ struct buffer_load<8, pre_nop>
|
||||
{
|
||||
static_assert(sizeof(T) == 8);
|
||||
using mbuf_t = typename impl::buffer_load_trait<8, T>::payload_t;
|
||||
#if HAS_RAW_BUFFER_BUILTINS
|
||||
index_t s_offset = i_offset;
|
||||
reinterpret_cast<mbuf_t&>(value) = __builtin_amdgcn_raw_buffer_load_b64(
|
||||
cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
|
||||
#else
|
||||
if constexpr(pre_nop)
|
||||
asm volatile("s_nop 4\n"
|
||||
"buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3"
|
||||
@@ -115,6 +161,7 @@ struct buffer_load<8, pre_nop>
|
||||
: "+v"(reinterpret_cast<mbuf_t&>(value))
|
||||
: "v"(v_offset), "s"(res), "n"(i_offset)
|
||||
: "memory");
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
@@ -132,6 +179,12 @@ struct buffer_load<4, pre_nop>
|
||||
{
|
||||
static_assert(sizeof(T) == 4);
|
||||
using mbuf_t = typename impl::buffer_load_trait<4, T>::payload_t;
|
||||
|
||||
#if HAS_RAW_BUFFER_BUILTINS
|
||||
index_t s_offset = i_offset;
|
||||
reinterpret_cast<mbuf_t&>(value) = __builtin_amdgcn_raw_buffer_load_b32(
|
||||
cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
|
||||
#else
|
||||
if constexpr(pre_nop)
|
||||
asm volatile("s_nop 4\n"
|
||||
"buffer_load_dword %0, %1, %2, 0 offen offset:%3"
|
||||
@@ -143,6 +196,7 @@ struct buffer_load<4, pre_nop>
|
||||
: "+v"(reinterpret_cast<mbuf_t&>(value))
|
||||
: "v"(v_offset), "s"(res), "n"(i_offset)
|
||||
: "memory");
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
@@ -160,6 +214,12 @@ struct buffer_load<2, pre_nop>
|
||||
{
|
||||
static_assert(sizeof(T) == 4); // subdword is buggy, use dword buf and convert manually
|
||||
using mbuf_t = typename impl::buffer_load_trait<2, T>::payload_t;
|
||||
|
||||
#if HAS_RAW_BUFFER_BUILTINS
|
||||
index_t s_offset = i_offset;
|
||||
reinterpret_cast<mbuf_t&>(value) = __builtin_amdgcn_raw_buffer_load_b16(
|
||||
cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
|
||||
#else
|
||||
if constexpr(pre_nop)
|
||||
asm volatile("s_nop 4\n"
|
||||
"buffer_load_ushort %0, %1, %2, 0 offen offset:%3"
|
||||
@@ -171,6 +231,7 @@ struct buffer_load<2, pre_nop>
|
||||
: "+v"(reinterpret_cast<mbuf_t&>(value))
|
||||
: "v"(v_offset), "s"(res), "n"(i_offset)
|
||||
: "memory");
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
@@ -188,6 +249,11 @@ struct buffer_load<1, pre_nop>
|
||||
{
|
||||
static_assert(sizeof(T) == 4);
|
||||
using mbuf_t = typename impl::buffer_load_trait<1, T>::payload_t;
|
||||
#if HAS_RAW_BUFFER_BUILTINS
|
||||
index_t s_offset = i_offset;
|
||||
reinterpret_cast<mbuf_t&>(value) = __builtin_amdgcn_raw_buffer_load_b16(
|
||||
cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
|
||||
#else
|
||||
if constexpr(pre_nop)
|
||||
asm volatile("s_nop 4\n"
|
||||
"buffer_load_ubyte %0, %1, %2, 0 offen offset:%3"
|
||||
@@ -199,12 +265,31 @@ struct buffer_load<1, pre_nop>
|
||||
: "+v"(reinterpret_cast<mbuf_t&>(value))
|
||||
: "v"(v_offset), "s"(res), "n"(i_offset)
|
||||
: "memory");
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template <index_t bytes, bool pre_nop = false>
|
||||
struct buffer_load_if;
|
||||
|
||||
#if HAS_RAW_BUFFER_BUILTINS
|
||||
template <index_t bytes, bool pre_nop>
|
||||
struct buffer_load_if
|
||||
{
|
||||
template <typename T>
|
||||
CK_TILE_DEVICE void operator()(T& value,
|
||||
int32x4_t res /*buffer resource*/,
|
||||
index_t v_offset,
|
||||
index_t s_offset,
|
||||
index_t i_offset /*max 0xFFF*/,
|
||||
index_t flag = 0,
|
||||
bool_constant<pre_nop> = {})
|
||||
{
|
||||
if LIKELY(1 <= flag)
|
||||
{
|
||||
buffer_load<bytes, pre_nop>{}(
|
||||
value, res, v_offset, s_offset, i_offset, flag, bool_constant<pre_nop>{});
|
||||
}
|
||||
}
|
||||
};
|
||||
#else
|
||||
template <bool pre_nop>
|
||||
struct buffer_load_if<16, pre_nop>
|
||||
{
|
||||
@@ -214,12 +299,12 @@ struct buffer_load_if<16, pre_nop>
|
||||
index_t v_offset,
|
||||
index_t /*s_offset*/,
|
||||
index_t i_offset /*max 0xFFF*/,
|
||||
index_t flag = 0,
|
||||
index_t flag = 0,
|
||||
bool_constant<pre_nop> = {})
|
||||
{
|
||||
static_assert(sizeof(T) == 16);
|
||||
auto saved_exec = __builtin_amdgcn_read_exec();
|
||||
using mbuf_t = typename impl::buffer_load_trait<16, T>::payload_t;
|
||||
using mbuf_t = typename impl::buffer_load_trait<16, T>::payload_t;
|
||||
static_assert(sizeof(mbuf_t) == sizeof(T));
|
||||
if constexpr(pre_nop)
|
||||
asm volatile("s_nop 4\n"
|
||||
@@ -248,12 +333,12 @@ struct buffer_load_if<8, pre_nop>
|
||||
index_t v_offset,
|
||||
index_t /*s_offset*/,
|
||||
index_t i_offset /*max 0xFFF*/,
|
||||
index_t flag = 0,
|
||||
index_t flag = 0,
|
||||
bool_constant<pre_nop> = {})
|
||||
{
|
||||
static_assert(sizeof(T) == 8);
|
||||
auto saved_exec = __builtin_amdgcn_read_exec();
|
||||
using mbuf_t = typename impl::buffer_load_trait<8, T>::payload_t;
|
||||
using mbuf_t = typename impl::buffer_load_trait<8, T>::payload_t;
|
||||
if constexpr(pre_nop)
|
||||
asm volatile("s_nop 4\n"
|
||||
"v_cmpx_le_u32 exec, 1, %4\n"
|
||||
@@ -281,12 +366,12 @@ struct buffer_load_if<4, pre_nop>
|
||||
index_t v_offset,
|
||||
index_t /*s_offset*/,
|
||||
index_t i_offset /*max 0xFFF*/,
|
||||
index_t flag = 0,
|
||||
index_t flag = 0,
|
||||
bool_constant<pre_nop> = {})
|
||||
{
|
||||
static_assert(sizeof(T) == 4);
|
||||
auto saved_exec = __builtin_amdgcn_read_exec();
|
||||
using mbuf_t = typename impl::buffer_load_trait<4, T>::payload_t;
|
||||
using mbuf_t = typename impl::buffer_load_trait<4, T>::payload_t;
|
||||
if constexpr(pre_nop)
|
||||
asm volatile("s_nop 4\n"
|
||||
"v_cmpx_le_u32 exec, 1, %4\n"
|
||||
@@ -314,12 +399,12 @@ struct buffer_load_if<2, pre_nop>
|
||||
index_t v_offset,
|
||||
index_t /*s_offset*/,
|
||||
index_t i_offset /*max 0xFFF*/,
|
||||
index_t flag = 0,
|
||||
index_t flag = 0,
|
||||
bool_constant<pre_nop> = {})
|
||||
{
|
||||
static_assert(sizeof(T) == 4);
|
||||
auto saved_exec = __builtin_amdgcn_read_exec();
|
||||
using mbuf_t = typename impl::buffer_load_trait<2, T>::payload_t;
|
||||
using mbuf_t = typename impl::buffer_load_trait<2, T>::payload_t;
|
||||
if constexpr(pre_nop)
|
||||
asm volatile("s_nop 4\n"
|
||||
"v_cmpx_le_u32 exec, 1, %4\n"
|
||||
@@ -347,12 +432,12 @@ struct buffer_load_if<1, pre_nop>
|
||||
index_t v_offset,
|
||||
index_t /*s_offset*/,
|
||||
index_t i_offset /*max 0xFFF*/,
|
||||
index_t flag = 0,
|
||||
index_t flag = 0,
|
||||
bool_constant<pre_nop> = {})
|
||||
{
|
||||
static_assert(sizeof(T) == 4);
|
||||
auto saved_exec = __builtin_amdgcn_read_exec();
|
||||
using mbuf_t = typename impl::buffer_load_trait<1, T>::payload_t;
|
||||
using mbuf_t = typename impl::buffer_load_trait<1, T>::payload_t;
|
||||
if constexpr(pre_nop)
|
||||
asm volatile("s_nop 4\n"
|
||||
"v_cmpx_le_u32 exec, 1, %4\n"
|
||||
@@ -370,9 +455,9 @@ struct buffer_load_if<1, pre_nop>
|
||||
: "memory");
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
#pragma clang diagnostic pop // "-Wundefined-reinterpret-cast"
|
||||
template <index_t bytes>
|
||||
struct buffer_store;
|
||||
|
||||
template <>
|
||||
struct buffer_store<16>
|
||||
@@ -387,10 +472,16 @@ struct buffer_store<16>
|
||||
{
|
||||
static_assert(sizeof(T) == 16);
|
||||
using mbuf_t = fp32x4_t;
|
||||
#if HAS_RAW_BUFFER_BUILTINS
|
||||
index_t s_offset = i_offset;
|
||||
__builtin_amdgcn_raw_buffer_store_b128(
|
||||
bit_cast<mbuf_t>(value), cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
|
||||
#else
|
||||
asm volatile("buffer_store_dwordx4 %0, %1, %2, 0 offen offset:%3"
|
||||
:
|
||||
: "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
|
||||
: "memory");
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
@@ -407,10 +498,16 @@ struct buffer_store<8>
|
||||
{
|
||||
static_assert(sizeof(T) == 8);
|
||||
using mbuf_t = fp32x2_t;
|
||||
#if HAS_RAW_BUFFER_BUILTINS
|
||||
index_t s_offset = i_offset;
|
||||
__builtin_amdgcn_raw_buffer_store_b64(
|
||||
bit_cast<mbuf_t>(value), cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
|
||||
#else
|
||||
asm volatile("buffer_store_dwordx2 %0, %1, %2, 0 offen offset:%3"
|
||||
:
|
||||
: "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
|
||||
: "memory");
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
@@ -427,10 +524,16 @@ struct buffer_store<4>
|
||||
{
|
||||
static_assert(sizeof(T) == 4);
|
||||
using mbuf_t = float;
|
||||
#if HAS_RAW_BUFFER_BUILTINS
|
||||
index_t s_offset = i_offset;
|
||||
__builtin_amdgcn_raw_buffer_store_b32(
|
||||
bit_cast<mbuf_t>(value), cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
|
||||
#else
|
||||
asm volatile("buffer_store_dword %0, %1, %2, 0 offen offset:%3"
|
||||
:
|
||||
: "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
|
||||
: "memory");
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
@@ -447,10 +550,16 @@ struct buffer_store<2>
|
||||
{
|
||||
static_assert(sizeof(T) == 2);
|
||||
using mbuf_t = short;
|
||||
#if HAS_RAW_BUFFER_BUILTINS
|
||||
index_t s_offset = i_offset;
|
||||
__builtin_amdgcn_raw_buffer_store_b16(
|
||||
bit_cast<mbuf_t>(value), cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
|
||||
#else
|
||||
asm volatile("buffer_store_short %0, %1, %2, 0 offen offset:%3"
|
||||
:
|
||||
: "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
|
||||
: "memory");
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
@@ -467,16 +576,38 @@ struct buffer_store<1>
|
||||
{
|
||||
static_assert(sizeof(T) == 4);
|
||||
using mbuf_t = float;
|
||||
#if HAS_RAW_BUFFER_BUILTINS
|
||||
index_t s_offset = i_offset;
|
||||
__builtin_amdgcn_raw_buffer_store_b8(
|
||||
bit_cast<mbuf_t>(value), cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
|
||||
#else
|
||||
asm volatile("buffer_store_byte %0, %1, %2, 0 offen offset:%3"
|
||||
:
|
||||
: "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
|
||||
: "memory");
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
#if HAS_RAW_BUFFER_BUILTINS
|
||||
template <index_t bytes>
|
||||
struct buffer_store_if;
|
||||
|
||||
struct buffer_store_if
|
||||
{
|
||||
template <typename T>
|
||||
CK_TILE_DEVICE void operator()(const T& value,
|
||||
int32x4_t res /*buffer resource*/,
|
||||
index_t v_offset,
|
||||
index_t s_offset,
|
||||
index_t i_offset /*max 0xFFF*/,
|
||||
index_t flag = 1)
|
||||
{
|
||||
if LIKELY(1 <= flag)
|
||||
{
|
||||
buffer_store<bytes>{}(value, res, v_offset, s_offset, i_offset);
|
||||
}
|
||||
}
|
||||
};
|
||||
#else
|
||||
template <>
|
||||
struct buffer_store_if<16>
|
||||
{
|
||||
@@ -490,7 +621,7 @@ struct buffer_store_if<16>
|
||||
{
|
||||
static_assert(sizeof(T) == 16);
|
||||
auto save_exec = __builtin_amdgcn_read_exec();
|
||||
using mbuf_t = fp32x4_t;
|
||||
using mbuf_t = fp32x4_t;
|
||||
asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
|
||||
"buffer_store_dwordx4 %0, %1, %2, 0 offen offset:%3\n"
|
||||
"s_mov_b64 exec %5"
|
||||
@@ -547,7 +678,7 @@ struct buffer_store_if<4>
|
||||
{
|
||||
static_assert(sizeof(T) == 4);
|
||||
auto save_exec = __builtin_amdgcn_read_exec();
|
||||
using mbuf_t = float;
|
||||
using mbuf_t = float;
|
||||
asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
|
||||
"buffer_store_dword %0, %1, %2, 0 offen offset:%3\n"
|
||||
"s_mov_b64 exec %5"
|
||||
@@ -575,7 +706,7 @@ struct buffer_store_if<2>
|
||||
{
|
||||
static_assert(sizeof(T) == 2);
|
||||
auto save_exec = __builtin_amdgcn_read_exec();
|
||||
using mbuf_t = short;
|
||||
using mbuf_t = short;
|
||||
asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
|
||||
"buffer_store_short %0, %1, %2, 0 offen offset:%3\n"
|
||||
"s_mov_b64 exec %5"
|
||||
@@ -603,7 +734,7 @@ struct buffer_store_if<1>
|
||||
{
|
||||
static_assert(sizeof(T) == 4);
|
||||
auto save_exec = __builtin_amdgcn_read_exec();
|
||||
using mbuf_t = float;
|
||||
using mbuf_t = float;
|
||||
asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
|
||||
"buffer_store_byte %0, %1, %2, 0 offen offset:%3\n"
|
||||
"s_mov_b64 exec %5"
|
||||
@@ -617,6 +748,7 @@ struct buffer_store_if<1>
|
||||
: "memory");
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
CK_TILE_DEVICE void buffer_load_fence(index_t cnt = 0)
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user