mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-12 17:26:00 +00:00
experimenting global and buffer load/store
This commit is contained in:
@@ -838,7 +838,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
|
||||
#if 1 // source code
|
||||
vector_data = *reinterpret_cast<const src_vector_t*>(
|
||||
&p_src[src_normal_offset + src_merged_offset]);
|
||||
#elif 1 // inline asm using global_load
|
||||
#elif 0 // inline asm using global_load
|
||||
vector_data = __global_load<TData, SrcDataPerAccess>(
|
||||
p_src,
|
||||
static_cast<uint32_t>(src_merged_offset),
|
||||
@@ -943,7 +943,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
|
||||
#if 1 // source code
|
||||
*reinterpret_cast<dst_vector_t*>(
|
||||
&p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
|
||||
#elif 1 // inline asm using global_store
|
||||
#elif 0 // inline asm using global_store
|
||||
__global_store<TData, DstDataPerAccess>(
|
||||
vector_data, p_dst, dst_merged_offset, dst_normal_offset);
|
||||
#elif 1 // inline asm using buffer_store
|
||||
|
||||
@@ -183,8 +183,8 @@ __device__ float __buffer_load<float, 1>(const float* p_src_block,
|
||||
reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
|
||||
|
||||
asm volatile("\n \
|
||||
__buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \
|
||||
s_waitcnt 0 \n \
|
||||
buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \
|
||||
;;s_waitcnt 0 \n \
|
||||
"
|
||||
: "=v"(dst)
|
||||
: "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset));
|
||||
@@ -208,8 +208,8 @@ __device__ vector_type<float, 2>::MemoryType __buffer_load<float, 2>(const float
|
||||
reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
|
||||
|
||||
asm volatile("\n \
|
||||
__buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \
|
||||
s_waitcnt 0 \n \
|
||||
buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \
|
||||
;;s_waitcnt 0 \n \
|
||||
"
|
||||
: "=v"(dst)
|
||||
: "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset));
|
||||
@@ -233,8 +233,8 @@ __device__ vector_type<float, 4>::MemoryType __buffer_load<float, 4>(const float
|
||||
reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
|
||||
|
||||
asm volatile("\n \
|
||||
__buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \
|
||||
s_waitcnt 0 \n \
|
||||
buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \
|
||||
;;s_waitcnt 0 \n \
|
||||
"
|
||||
: "=v"(dst)
|
||||
: "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset));
|
||||
@@ -257,8 +257,8 @@ __device__ void __buffer_store<float, 1>(const float& src,
|
||||
reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
|
||||
|
||||
asm volatile("\n \
|
||||
__buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \
|
||||
s_waitcnt 0 \n \
|
||||
buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \
|
||||
;;s_waitcnt 0 \n \
|
||||
"
|
||||
:
|
||||
: "s"(dst_block_setting), "v"(src), "v"(dst_thread_offset), "s"(dst_const_offset));
|
||||
|
||||
@@ -47,7 +47,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
|
||||
wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
|
||||
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
|
||||
|
||||
#if 0
|
||||
#if 1
|
||||
// BlockSize = 256, blockwise-GEMM 128x128, each thread hold 64 data
|
||||
constexpr index_t BlockSize = 256;
|
||||
|
||||
|
||||
@@ -103,7 +103,7 @@ int main(int argc, char* argv[])
|
||||
|
||||
using LeftPads = Sequence<0, 0>;
|
||||
using RightPads = Sequence<0, 0>;
|
||||
#elif 1
|
||||
#elif 0
|
||||
// 1x1 filter, 8x8 image
|
||||
// cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
|
||||
constexpr index_t N = 64;
|
||||
@@ -295,7 +295,7 @@ int main(int argc, char* argv[])
|
||||
|
||||
using LeftPads = Sequence<0, 0>;
|
||||
using RightPads = Sequence<0, 0>;
|
||||
#elif 0
|
||||
#elif 1
|
||||
// 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
|
||||
// cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
|
||||
constexpr index_t N = 128;
|
||||
@@ -341,7 +341,7 @@ int main(int argc, char* argv[])
|
||||
|
||||
using LeftPads = Sequence<3, 0>;
|
||||
using RightPads = Sequence<3, 0>;
|
||||
#elif 1
|
||||
#elif 0
|
||||
// 1x7 filter, 0x3 pad, 17x17 input
|
||||
constexpr index_t N = 128;
|
||||
constexpr index_t C = 128;
|
||||
|
||||
Reference in New Issue
Block a user