mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-24 14:54:47 +00:00
experimenting global and buffer load/store
This commit is contained in:
@@ -8,21 +8,169 @@ namespace ck {
|
||||
// cast a pointer of LDS to its address
|
||||
extern "C" __attribute__((address_space(3))) __device__ void* __to_local(void* p);
|
||||
|
||||
// buffer_load and buffer_store
|
||||
// global_load and global_store
|
||||
template <typename T, index_t VectorSize>
|
||||
__device__ typename vector_type<T, VectorSize>::MemoryType
|
||||
buffer_load(const T* p_src_block, uint32_t src_thread_offset, uint32_t src_const_offset);
|
||||
__global_load(const T* p_src_block, uint32_t src_thread_offset, uint32_t src_const_offset);
|
||||
|
||||
template <typename T, index_t VectorSize>
|
||||
__device__ void buffer_store(const typename vector_type<T, VectorSize>::MemoryType& src,
|
||||
T* p_dst_block,
|
||||
uint32_t dst_thread_offset,
|
||||
uint32_t dst_const_offset);
|
||||
__device__ void __global_store(const typename vector_type<T, VectorSize>::MemoryType& src,
|
||||
T* p_dst_block,
|
||||
uint32_t dst_thread_offset,
|
||||
uint32_t dst_const_offset);
|
||||
|
||||
template <>
|
||||
__device__ float buffer_load<float, 1>(const float* p_src_block,
|
||||
uint32_t src_thread_offset,
|
||||
uint32_t src_const_offset)
|
||||
__device__ float __global_load<float, 1>(const float* p_src_block,
|
||||
uint32_t src_thread_offset,
|
||||
uint32_t src_const_offset)
|
||||
{
|
||||
#if 0 // compute on VALU
|
||||
float dst;
|
||||
|
||||
uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset + src_const_offset);
|
||||
|
||||
asm volatile("\n \
|
||||
global_load_dword %0, %1, %2, offset:0 \n \
|
||||
s_waitcnt 0 \n \
|
||||
"
|
||||
: "=v"(dst)
|
||||
: "v"(src_thread_offset_u64), "s"(p_src_block));
|
||||
|
||||
return dst;
|
||||
#else // compute on SALU
|
||||
float dst;
|
||||
|
||||
uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset);
|
||||
|
||||
const float* p_src_block_with_offset = p_src_block + src_const_offset;
|
||||
|
||||
asm volatile("\n \
|
||||
global_load_dword %0, %1, %2, offset:0 \n \
|
||||
;;s_waitcnt 0 \n \
|
||||
"
|
||||
: "=v"(dst)
|
||||
: "v"(src_thread_offset_u64), "s"(p_src_block_with_offset));
|
||||
|
||||
return dst;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ vector_type<float, 2>::MemoryType __global_load<float, 2>(const float* p_src_block,
|
||||
uint32_t src_thread_offset,
|
||||
uint32_t src_const_offset)
|
||||
{
|
||||
#if 0 // compute on VALU
|
||||
vector_type<float, 2>::MemoryType dst;
|
||||
|
||||
uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset + src_const_offset);
|
||||
|
||||
asm volatile("\n \
|
||||
global_load_dwordx2 %0, %1, %2, offset:0 \n \
|
||||
s_waitcnt 0 \n \
|
||||
"
|
||||
: "=v"(dst)
|
||||
: "v"(src_thread_offset_u64), "s"(p_src_block));
|
||||
|
||||
return dst;
|
||||
#else // compute on SALU
|
||||
vector_type<float, 2>::MemoryType dst;
|
||||
|
||||
uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset);
|
||||
|
||||
const float* p_src_block_with_offset = p_src_block + src_const_offset;
|
||||
|
||||
asm volatile("\n \
|
||||
global_load_dwordx2 %0, %1, %2, offset:0 \n \
|
||||
;;s_waitcnt 0 \n \
|
||||
"
|
||||
: "=v"(dst)
|
||||
: "v"(src_thread_offset_u64), "s"(p_src_block_with_offset));
|
||||
|
||||
return dst;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ vector_type<float, 4>::MemoryType __global_load<float, 4>(const float* p_src_block,
|
||||
uint32_t src_thread_offset,
|
||||
uint32_t src_const_offset)
|
||||
{
|
||||
#if 0 // compute on VALU
|
||||
vector_type<float, 4>::MemoryType dst;
|
||||
|
||||
uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset + src_const_offset);
|
||||
|
||||
asm volatile("\n \
|
||||
global_load_dwordx4 %0, %1, %2, offset:0 \n \
|
||||
s_waitcnt 0 \n \
|
||||
"
|
||||
: "=v"(dst)
|
||||
: "v"(src_thread_offset_u64), "s"(p_src_block));
|
||||
|
||||
return dst;
|
||||
#else // compute on SALU
|
||||
vector_type<float, 4>::MemoryType dst;
|
||||
|
||||
uint64_t src_thread_offset_u64 = static_cast<uint64_t>(src_thread_offset);
|
||||
|
||||
const float* p_src_block_with_offset = p_src_block + src_const_offset;
|
||||
|
||||
asm volatile("\n \
|
||||
global_load_dwordx4 %0, %1, %2, offset:0 \n \
|
||||
;;s_waitcnt 0 \n \
|
||||
"
|
||||
: "=v"(dst)
|
||||
: "v"(src_thread_offset_u64), "s"(p_src_block_with_offset));
|
||||
|
||||
return dst;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void __global_store<float, 1>(const float& src,
|
||||
float* p_dst_block,
|
||||
uint32_t dst_thread_offset,
|
||||
uint32_t dst_const_offset)
|
||||
{
|
||||
#if 0 // compute on VALU
|
||||
uint64_t dst_thread_offset_u64 = static_cast<uint64_t>(dst_thread_offset + dst_const_offset);
|
||||
|
||||
asm volatile("\n \
|
||||
global_store_dword %0, %1, %2, offset:0 \n \
|
||||
s_waitcnt 0 \n \
|
||||
"
|
||||
:
|
||||
: "v"(dst_thread_offset_u64), "v"(src), "s"(p_dst_block));
|
||||
#else // compute on SALU
|
||||
uint64_t dst_thread_offset_u64 = static_cast<uint64_t>(dst_thread_offset);
|
||||
|
||||
float* p_dst_block_with_offset = p_dst_block + dst_const_offset;
|
||||
|
||||
asm volatile("\n \
|
||||
global_store_dword %0, %1, %2, offset:0 \n \
|
||||
;;s_waitcnt 0 \n \
|
||||
"
|
||||
:
|
||||
: "v"(dst_thread_offset_u64), "v"(src), "s"(p_dst_block_with_offset));
|
||||
#endif
|
||||
}
|
||||
|
||||
// __buffer_load and __buffer_store
|
||||
template <typename T, index_t VectorSize>
|
||||
__device__ typename vector_type<T, VectorSize>::MemoryType
|
||||
__buffer_load(const T* p_src_block, uint32_t src_thread_offset, uint32_t src_const_offset);
|
||||
|
||||
template <typename T, index_t VectorSize>
|
||||
__device__ void __buffer_store(const typename vector_type<T, VectorSize>::MemoryType& src,
|
||||
T* p_dst_block,
|
||||
uint32_t dst_thread_offset,
|
||||
uint32_t dst_const_offset);
|
||||
|
||||
template <>
|
||||
__device__ float __buffer_load<float, 1>(const float* p_src_block,
|
||||
uint32_t src_thread_offset,
|
||||
uint32_t src_const_offset)
|
||||
{
|
||||
float dst;
|
||||
|
||||
@@ -35,7 +183,7 @@ __device__ float buffer_load<float, 1>(const float* p_src_block,
|
||||
reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
|
||||
|
||||
asm volatile("\n \
|
||||
buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \
|
||||
s_waitcnt 0 \n \
|
||||
"
|
||||
: "=v"(dst)
|
||||
@@ -45,9 +193,9 @@ __device__ float buffer_load<float, 1>(const float* p_src_block,
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ vector_type<float, 2>::MemoryType buffer_load<float, 2>(const float* p_src_block,
|
||||
uint32_t src_thread_offset,
|
||||
uint32_t src_const_offset)
|
||||
__device__ vector_type<float, 2>::MemoryType __buffer_load<float, 2>(const float* p_src_block,
|
||||
uint32_t src_thread_offset,
|
||||
uint32_t src_const_offset)
|
||||
{
|
||||
vector_type<float, 2>::MemoryType dst;
|
||||
|
||||
@@ -60,7 +208,7 @@ __device__ vector_type<float, 2>::MemoryType buffer_load<float, 2>(const float*
|
||||
reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
|
||||
|
||||
asm volatile("\n \
|
||||
buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \
|
||||
s_waitcnt 0 \n \
|
||||
"
|
||||
: "=v"(dst)
|
||||
@@ -70,9 +218,9 @@ __device__ vector_type<float, 2>::MemoryType buffer_load<float, 2>(const float*
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ vector_type<float, 4>::MemoryType buffer_load<float, 4>(const float* p_src_block,
|
||||
uint32_t src_thread_offset,
|
||||
uint32_t src_const_offset)
|
||||
__device__ vector_type<float, 4>::MemoryType __buffer_load<float, 4>(const float* p_src_block,
|
||||
uint32_t src_thread_offset,
|
||||
uint32_t src_const_offset)
|
||||
{
|
||||
vector_type<float, 4>::MemoryType dst;
|
||||
|
||||
@@ -85,7 +233,7 @@ __device__ vector_type<float, 4>::MemoryType buffer_load<float, 4>(const float*
|
||||
reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
|
||||
|
||||
asm volatile("\n \
|
||||
buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \
|
||||
s_waitcnt 0 \n \
|
||||
"
|
||||
: "=v"(dst)
|
||||
@@ -95,10 +243,10 @@ __device__ vector_type<float, 4>::MemoryType buffer_load<float, 4>(const float*
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void buffer_store<float, 1>(const float& src,
|
||||
float* p_dst_block,
|
||||
uint32_t dst_thread_offset,
|
||||
uint32_t dst_const_offset)
|
||||
__device__ void __buffer_store<float, 1>(const float& src,
|
||||
float* p_dst_block,
|
||||
uint32_t dst_thread_offset,
|
||||
uint32_t dst_const_offset)
|
||||
{
|
||||
int32x4_t dst_block_setting{0};
|
||||
// fill in byte 0 - 1
|
||||
@@ -109,7 +257,7 @@ __device__ void buffer_store<float, 1>(const float& src,
|
||||
reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
|
||||
|
||||
asm volatile("\n \
|
||||
buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \
|
||||
s_waitcnt 0 \n \
|
||||
"
|
||||
:
|
||||
|
||||
Reference in New Issue
Block a user