experimenting global and buffer load/store

[ROCm/composable_kernel commit: 5b7a18c506]
This commit is contained in:
Chao Liu
2019-09-18 02:05:42 -05:00
parent 741a647405
commit 6c5f82174b
4 changed files with 14 additions and 14 deletions

View File

@@ -838,7 +838,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
#if 1 // source code
vector_data = *reinterpret_cast<const src_vector_t*>(
&p_src[src_normal_offset + src_merged_offset]);
#elif 1 // inline asm using global_load
#elif 0 // inline asm using global_load
vector_data = __global_load<TData, SrcDataPerAccess>(
p_src,
static_cast<uint32_t>(src_merged_offset),
@@ -943,7 +943,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
#if 1 // source code
*reinterpret_cast<dst_vector_t*>(
&p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
#elif 1 // inline asm using global_store
#elif 0 // inline asm using global_store
__global_store<TData, DstDataPerAccess>(
vector_data, p_dst, dst_merged_offset, dst_normal_offset);
#elif 1 // inline asm using buffer_store

View File

@@ -183,8 +183,8 @@ __device__ float __buffer_load<float, 1>(const float* p_src_block,
reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
asm volatile("\n \
__buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \
s_waitcnt 0 \n \
buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \
;;s_waitcnt 0 \n \
"
: "=v"(dst)
: "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset));
@@ -208,8 +208,8 @@ __device__ vector_type<float, 2>::MemoryType __buffer_load<float, 2>(const float
reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
asm volatile("\n \
__buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \
s_waitcnt 0 \n \
buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \
;;s_waitcnt 0 \n \
"
: "=v"(dst)
: "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset));
@@ -233,8 +233,8 @@ __device__ vector_type<float, 4>::MemoryType __buffer_load<float, 4>(const float
reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
asm volatile("\n \
__buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \
s_waitcnt 0 \n \
buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \
;;s_waitcnt 0 \n \
"
: "=v"(dst)
: "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset));
@@ -257,8 +257,8 @@ __device__ void __buffer_store<float, 1>(const float& src,
reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
asm volatile("\n \
__buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \
s_waitcnt 0 \n \
buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \
;;s_waitcnt 0 \n \
"
:
: "s"(dst_block_setting), "v"(src), "v"(dst_thread_offset), "s"(dst_const_offset));