mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-19 22:39:03 +00:00
[CK-Tile] Add the API to load SGPR (#2878)
* Have a workable version for SGPR * have a workable version for atomic add * Revert "have a workable version for atomic add" This reverts commit 792377a590c26cfff9c8f545d9a9e8484a7422eb. * substitute with the new sgpr read api * update the CHANGELOG * have a workable version for atomic add * Revert "have a workable version for atomic add" This reverts commit 792377a590c26cfff9c8f545d9a9e8484a7422eb. * change to static for logic * have a workable version for atomic add * Revert "have a workable version for atomic add" This reverts commit 792377a590c26cfff9c8f545d9a9e8484a7422eb.
This commit is contained in:
@@ -2829,6 +2829,60 @@ __device__ auto amd_transpose_load_to_vgpr(const T* __restrict__ in_ptr)
|
||||
}
|
||||
#endif
|
||||
|
||||
// amd_wave_read_first_lane is the SGPR function from AMD GPU device to load 1 or a series of the
|
||||
// memory to the SGPR registers.
|
||||
__device__ inline uint32_t amd_wave_read_first_lane(uint16_t v)
|
||||
{
|
||||
return __builtin_amdgcn_readfirstlane(static_cast<uint32_t>(v));
|
||||
}
|
||||
|
||||
__device__ inline uint32_t amd_wave_read_first_lane(uint8_t v)
|
||||
{
|
||||
return __builtin_amdgcn_readfirstlane(static_cast<uint32_t>(v));
|
||||
}
|
||||
|
||||
__device__ inline uint32_t amd_wave_read_first_lane(uint32_t value)
|
||||
{
|
||||
return __builtin_amdgcn_readfirstlane(value);
|
||||
}
|
||||
|
||||
__device__ inline int32_t amd_wave_read_first_lane(int32_t value)
|
||||
{
|
||||
return __builtin_amdgcn_readfirstlane(value);
|
||||
}
|
||||
|
||||
template <typename Object, std::enable_if_t<std::is_trivially_copyable_v<Object>, int> = 0>
|
||||
__device__ inline auto amd_wave_read_first_lane(const Object& obj)
|
||||
{
|
||||
constexpr size_t ObjectSize = sizeof(Object);
|
||||
constexpr size_t SGPR_size = 4;
|
||||
constexpr size_t NumFull = ObjectSize / SGPR_size;
|
||||
constexpr size_t Tail = ObjectSize % SGPR_size;
|
||||
|
||||
const unsigned char* src = reinterpret_cast<const unsigned char*>(&obj);
|
||||
alignas(Object) unsigned char dst[ObjectSize];
|
||||
|
||||
static_for<0, NumFull, 1>{}([&](auto Ic) {
|
||||
constexpr size_t offset = Ic * SGPR_size;
|
||||
uint32_t read_src;
|
||||
__builtin_memcpy(&read_src, src + offset, SGPR_size);
|
||||
read_src = __builtin_amdgcn_readfirstlane(read_src);
|
||||
__builtin_memcpy(dst + offset, &read_src, SGPR_size);
|
||||
});
|
||||
|
||||
if constexpr(Tail != 0)
|
||||
{
|
||||
constexpr size_t offset = NumFull * SGPR_size;
|
||||
uint32_t tail_loc = 0;
|
||||
__builtin_memcpy(&tail_loc, src + offset, Tail);
|
||||
tail_loc = __builtin_amdgcn_readfirstlane(tail_loc);
|
||||
__builtin_memcpy(dst + offset, &tail_loc, Tail);
|
||||
}
|
||||
Object out;
|
||||
__builtin_memcpy(&out, dst, ObjectSize);
|
||||
return out;
|
||||
}
|
||||
|
||||
} // namespace ck_tile
|
||||
|
||||
#endif // !CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN
|
||||
|
||||
@@ -2585,9 +2585,8 @@ CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr,
|
||||
const index_t global_offset_bytes = is_valid ? global_offset * sizeof(T) : 0x80000000;
|
||||
|
||||
#if CK_TILE_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM
|
||||
T* lds_ptr = lds_base_ptr + lds_offset;
|
||||
auto const lds_ptr_sgpr =
|
||||
__builtin_amdgcn_readfirstlane((reinterpret_cast<uintptr_t>(lds_ptr)));
|
||||
T* lds_ptr = lds_base_ptr + lds_offset;
|
||||
auto const lds_ptr_sgpr = amd_wave_read_first_lane((reinterpret_cast<uintptr_t>(lds_ptr)));
|
||||
asm volatile("s_mov_b32 m0, %0; \n\t"
|
||||
"buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr),
|
||||
"v"(global_offset_bytes),
|
||||
@@ -2660,6 +2659,60 @@ __device__ auto amd_transpose_load_to_vgpr(const T* __restrict__ in_ptr)
|
||||
}
|
||||
#endif
|
||||
|
||||
// amd_wave_read_first_lane is the SGPR function from AMD GPU device to load 1 or a series of the
|
||||
// memory to the SGPR registers.
|
||||
__device__ inline uint32_t amd_wave_read_first_lane(uint16_t v)
|
||||
{
|
||||
return __builtin_amdgcn_readfirstlane(static_cast<uint32_t>(v));
|
||||
}
|
||||
|
||||
__device__ inline uint32_t amd_wave_read_first_lane(uint8_t v)
|
||||
{
|
||||
return __builtin_amdgcn_readfirstlane(static_cast<uint32_t>(v));
|
||||
}
|
||||
|
||||
__device__ inline uint32_t amd_wave_read_first_lane(uint32_t value)
|
||||
{
|
||||
return __builtin_amdgcn_readfirstlane(value);
|
||||
}
|
||||
|
||||
__device__ inline int32_t amd_wave_read_first_lane(int32_t value)
|
||||
{
|
||||
return __builtin_amdgcn_readfirstlane(value);
|
||||
}
|
||||
|
||||
template <typename Object, std::enable_if_t<std::is_trivially_copyable_v<Object>, int> = 0>
|
||||
__device__ inline auto amd_wave_read_first_lane(const Object& obj)
|
||||
{
|
||||
constexpr size_t ObjectSize = sizeof(Object);
|
||||
constexpr size_t SGPR_size = 4;
|
||||
constexpr size_t NumFull = ObjectSize / SGPR_size;
|
||||
constexpr size_t Tail = ObjectSize % SGPR_size;
|
||||
|
||||
const unsigned char* src = reinterpret_cast<const unsigned char*>(&obj);
|
||||
alignas(Object) unsigned char dst[ObjectSize];
|
||||
|
||||
static_for<0, NumFull, 1>{}([&](auto Ic) {
|
||||
constexpr size_t offset = Ic * SGPR_size;
|
||||
uint32_t read_src;
|
||||
__builtin_memcpy(&read_src, src + offset, SGPR_size);
|
||||
read_src = __builtin_amdgcn_readfirstlane(read_src);
|
||||
__builtin_memcpy(dst + offset, &read_src, SGPR_size);
|
||||
});
|
||||
|
||||
if constexpr(Tail != 0)
|
||||
{
|
||||
constexpr size_t offset = NumFull * SGPR_size;
|
||||
uint32_t tail_loc = 0;
|
||||
__builtin_memcpy(&tail_loc, src + offset, Tail);
|
||||
tail_loc = __builtin_amdgcn_readfirstlane(tail_loc);
|
||||
__builtin_memcpy(dst + offset, &tail_loc, Tail);
|
||||
}
|
||||
Object out;
|
||||
__builtin_memcpy(&out, dst, ObjectSize);
|
||||
return out;
|
||||
}
|
||||
|
||||
} // namespace ck_tile
|
||||
|
||||
#endif // CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN
|
||||
|
||||
@@ -9,6 +9,8 @@
|
||||
#include "ck_tile/core/config.hpp"
|
||||
#include "ck_tile/core/numeric/integer.hpp"
|
||||
#include "ck_tile/core/numeric/integral_constant.hpp"
|
||||
#include "ck_tile/core/arch/amd_buffer_addressing_builtins.hpp"
|
||||
#include "ck_tile/core/arch/amd_buffer_addressing.hpp"
|
||||
#include "ck_tile/core/utility/ignore.hpp"
|
||||
|
||||
#define CK_TILE_S_CNT_MAX 0b1100'1111'0111'1111
|
||||
@@ -104,7 +106,7 @@ CK_TILE_DEVICE index_t get_warp_id(bool_constant<ReturnSgpr> = {})
|
||||
const index_t warp_id = threadIdx.x / get_warp_size();
|
||||
if constexpr(ReturnSgpr)
|
||||
{
|
||||
return __builtin_amdgcn_readfirstlane(warp_id);
|
||||
return amd_wave_read_first_lane(warp_id);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
@@ -402,7 +402,7 @@ struct tile_window_with_static_distribution
|
||||
const index_t m0_init_value =
|
||||
size_per_buf + size_per_wave * get_warp_id(/*ReturnSgpr=*/bool_constant<false>{});
|
||||
m0_set_with_memory(
|
||||
__builtin_amdgcn_readfirstlane(m0_init_value)); // This should be wave independent
|
||||
amd_wave_read_first_lane(m0_init_value)); // This should be wave independent
|
||||
|
||||
using Traits = typename Base::Traits;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user