mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 18:17:44 +00:00
use buffer_load buffer_store intrinsic
[ROCm/composable_kernel commit: b6e1c52a80]
This commit is contained in:
@@ -835,7 +835,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
|
||||
// 2. src_normal_offset must be calculatd at compile time (guaranteed)
|
||||
// 3. src_merged_offset can be runtime value (no assumption imposed)
|
||||
static_if<SrcMemorySpace == 2>{}([&](auto) {
|
||||
#if 1 // source code
|
||||
#if 0 // source code
|
||||
vector_data = *reinterpret_cast<const src_vector_t*>(
|
||||
&p_src[src_normal_offset + src_merged_offset]);
|
||||
#elif 0 // inline asm using global_load
|
||||
@@ -940,7 +940,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
|
||||
// 2. dst_normal_offset must be calculatd at compile time (guaranteed)
|
||||
// 3. dst_merged_offset can be runtime value (no assumption imposed)
|
||||
static_if<DstMemorySpace == 2>{}([&](auto) {
|
||||
#if 1 // source code
|
||||
#if 0 // source code
|
||||
*reinterpret_cast<dst_vector_t*>(
|
||||
&p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
|
||||
#elif 0 // inline asm using global_store
|
||||
|
||||
@@ -8,6 +8,47 @@ namespace ck {
|
||||
// cast a pointer of LDS to its address
|
||||
extern "C" __attribute__((address_space(3))) __device__ void* __to_local(void* p);
|
||||
|
||||
__device__ float __llvm_amdgcn_buffer_load(int32x4_t rsrc,
|
||||
uint32_t vindex,
|
||||
uint32_t offset,
|
||||
bool glc,
|
||||
bool slc) __asm("llvm.amdgcn.buffer.load");
|
||||
|
||||
__device__ vector_type<float, 2>::MemoryType
|
||||
__llvm_amdgcn_buffer_loadx2(int32x4_t rsrc,
|
||||
uint32_t vindex,
|
||||
uint32_t offset,
|
||||
bool glc,
|
||||
bool slc) __asm("llvm.amdgcn.buffer.load.dwordx2");
|
||||
|
||||
__device__ vector_type<float, 4>::MemoryType
|
||||
__llvm_amdgcn_buffer_loadx4(int32x4_t rsrc,
|
||||
uint32_t vindex,
|
||||
uint32_t offset,
|
||||
bool glc,
|
||||
bool slc) __asm("llvm.amdgcn.buffer.load.dwordx4");
|
||||
|
||||
__device__ void __llvm_amdgcn_buffer_store(float vdata,
|
||||
int32x4_t rsrc,
|
||||
uint32_t vindex,
|
||||
uint32_t offset,
|
||||
bool glc,
|
||||
bool slc) __asm("llvm.amdgcn.buffer.store");
|
||||
|
||||
__device__ void __llvm_amdgcn_buffer_storex2(vector_type<float, 2>::MemoryType vdata,
|
||||
int32x4_t rsrc,
|
||||
uint32_t vindex,
|
||||
uint32_t offset,
|
||||
bool glc,
|
||||
bool slc) __asm("llvm.amdgcn.buffer.store.dwordx2");
|
||||
|
||||
__device__ void __llvm_amdgcn_buffer_storex4(vector_type<float, 4>::MemoryType vdata,
|
||||
int32x4_t rsrc,
|
||||
uint32_t vindex,
|
||||
uint32_t offset,
|
||||
bool glc,
|
||||
bool slc) __asm("llvm.amdgcn.buffer.store.dwordx4");
|
||||
|
||||
// global_load and global_store
|
||||
template <typename T, index_t VectorSize>
|
||||
__device__ typename vector_type<T, VectorSize>::MemoryType __global_load(
|
||||
@@ -186,7 +227,7 @@ __device__ void __global_store<float, 1>(const float& src,
|
||||
#endif
|
||||
}
|
||||
|
||||
// __buffer_load and __buffer_store
|
||||
// buffer_load and buffer_store
|
||||
template <typename T, index_t VectorSize>
|
||||
__device__ typename vector_type<T, VectorSize>::MemoryType __buffer_load(
|
||||
const T* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset);
|
||||
@@ -202,6 +243,7 @@ __device__ float __buffer_load<float, 1>(const float* p_src_block,
|
||||
uint32_t src_thread_data_offset,
|
||||
uint32_t src_const_data_offset)
|
||||
{
|
||||
#if 0
|
||||
float dst;
|
||||
|
||||
uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
|
||||
@@ -223,12 +265,32 @@ __device__ float __buffer_load<float, 1>(const float* p_src_block,
|
||||
: "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
|
||||
|
||||
return dst;
|
||||
#else
|
||||
float dst;
|
||||
|
||||
uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
|
||||
uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float);
|
||||
|
||||
int32x4_t src_block_setting{0};
|
||||
// fill in byte 0 - 1
|
||||
*reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
|
||||
// fill in byte 2
|
||||
reinterpret_cast<int*>(&src_block_setting)[2] = -1;
|
||||
// fill in byte 3
|
||||
reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
|
||||
|
||||
dst = __llvm_amdgcn_buffer_load(
|
||||
src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
|
||||
|
||||
return dst;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ vector_type<float, 2>::MemoryType __buffer_load<float, 2>(
|
||||
const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset)
|
||||
{
|
||||
#if 0
|
||||
vector_type<float, 2>::MemoryType dst;
|
||||
|
||||
uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
|
||||
@@ -250,12 +312,32 @@ __device__ vector_type<float, 2>::MemoryType __buffer_load<float, 2>(
|
||||
: "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
|
||||
|
||||
return dst;
|
||||
#else
|
||||
vector_type<float, 2>::MemoryType dst;
|
||||
|
||||
uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
|
||||
uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float);
|
||||
|
||||
int32x4_t src_block_setting{0};
|
||||
// fill in byte 0 - 1
|
||||
*reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
|
||||
// fill in byte 2
|
||||
reinterpret_cast<int*>(&src_block_setting)[2] = -1;
|
||||
// fill in byte 3
|
||||
reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
|
||||
|
||||
dst = __llvm_amdgcn_buffer_loadx2(
|
||||
src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
|
||||
|
||||
return dst;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ vector_type<float, 4>::MemoryType __buffer_load<float, 4>(
|
||||
const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset)
|
||||
{
|
||||
#if 0
|
||||
vector_type<float, 4>::MemoryType dst;
|
||||
|
||||
uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
|
||||
@@ -277,6 +359,25 @@ __device__ vector_type<float, 4>::MemoryType __buffer_load<float, 4>(
|
||||
: "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
|
||||
|
||||
return dst;
|
||||
#elif 1
|
||||
vector_type<float, 4>::MemoryType dst;
|
||||
|
||||
uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
|
||||
uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float);
|
||||
|
||||
int32x4_t src_block_setting{0};
|
||||
// fill in byte 0 - 1
|
||||
*reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
|
||||
// fill in byte 2
|
||||
reinterpret_cast<int*>(&src_block_setting)[2] = -1;
|
||||
// fill in byte 3
|
||||
reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
|
||||
|
||||
dst = __llvm_amdgcn_buffer_loadx4(
|
||||
src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
|
||||
|
||||
return dst;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
@@ -285,6 +386,7 @@ __device__ void __buffer_store<float, 1>(const float& src,
|
||||
uint32_t dst_thread_data_offset,
|
||||
uint32_t dst_const_data_offset)
|
||||
{
|
||||
#if 0
|
||||
uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
|
||||
uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
|
||||
|
||||
@@ -304,6 +406,107 @@ __device__ void __buffer_store<float, 1>(const float& src,
|
||||
"v"(src),
|
||||
"v"(dst_thread_addr_offset),
|
||||
"s"(dst_const_addr_offset));
|
||||
#else
|
||||
uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
|
||||
uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
|
||||
|
||||
int32x4_t dst_block_setting{0};
|
||||
// fill in byte 0 - 1
|
||||
*reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
|
||||
// fill in byte 2
|
||||
reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
|
||||
// fill in byte 3
|
||||
reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
|
||||
|
||||
__llvm_amdgcn_buffer_store(
|
||||
src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void __buffer_store<float, 2>(const vector_type<float, 2>::MemoryType& src,
|
||||
float* p_dst_block,
|
||||
uint32_t dst_thread_data_offset,
|
||||
uint32_t dst_const_data_offset)
|
||||
{
|
||||
#if 0
|
||||
uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
|
||||
uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
|
||||
|
||||
int32x4_t dst_block_setting{0};
|
||||
// fill in byte 0 - 1
|
||||
*reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
|
||||
// fill in byte 2
|
||||
reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
|
||||
// fill in byte 3
|
||||
reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
|
||||
|
||||
asm volatile("\n \
|
||||
buffer_store_dwordx2 %1, %2, %0, %3 offen offset:0 \n \
|
||||
"
|
||||
:
|
||||
: "s"(dst_block_setting),
|
||||
"v"(src),
|
||||
"v"(dst_thread_addr_offset),
|
||||
"s"(dst_const_addr_offset));
|
||||
#else
|
||||
uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
|
||||
uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
|
||||
|
||||
int32x4_t dst_block_setting{0};
|
||||
// fill in byte 0 - 1
|
||||
*reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
|
||||
// fill in byte 2
|
||||
reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
|
||||
// fill in byte 3
|
||||
reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
|
||||
|
||||
__llvm_amdgcn_buffer_storex2(
|
||||
src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void __buffer_store<float, 4>(const vector_type<float, 4>::MemoryType& src,
|
||||
float* p_dst_block,
|
||||
uint32_t dst_thread_data_offset,
|
||||
uint32_t dst_const_data_offset)
|
||||
{
|
||||
#if 0
|
||||
uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
|
||||
uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
|
||||
|
||||
int32x4_t dst_block_setting{0};
|
||||
// fill in byte 0 - 1
|
||||
*reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
|
||||
// fill in byte 2
|
||||
reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
|
||||
// fill in byte 3
|
||||
reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
|
||||
|
||||
asm volatile("\n \
|
||||
buffer_store_dwordx4 %1, %2, %0, %3 offen offset:0 \n \
|
||||
"
|
||||
:
|
||||
: "s"(dst_block_setting),
|
||||
"v"(src),
|
||||
"v"(dst_thread_addr_offset),
|
||||
"s"(dst_const_addr_offset));
|
||||
#else
|
||||
uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
|
||||
uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float);
|
||||
|
||||
int32x4_t dst_block_setting{0};
|
||||
// fill in byte 0 - 1
|
||||
*reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
|
||||
// fill in byte 2
|
||||
reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
|
||||
// fill in byte 3
|
||||
reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
|
||||
|
||||
__llvm_amdgcn_buffer_storex4(
|
||||
src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__ void vmcnt(index_t cnt)
|
||||
|
||||
@@ -47,7 +47,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
|
||||
wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
|
||||
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
|
||||
|
||||
#if 0
|
||||
#if 1
|
||||
// BlockSize = 256, blockwise-GEMM 128x128, each thread hold 64 data
|
||||
constexpr index_t BlockSize = 256;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user