From 5b7a18c50601583b28e54905b56e1ac7342b22c3 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Wed, 18 Sep 2019 02:05:42 -0500 Subject: [PATCH] experimenting global and buffer load/store --- .../threadwise_generic_tensor_slice_copy.hpp | 4 ++-- .../include/utility/amd_inline_asm.hpp | 16 ++++++++-------- ...olution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp | 2 +- driver/src/driver.cpp | 6 +++--- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index e661264d86..2a0fffb34d 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -838,7 +838,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 #if 1 // source code vector_data = *reinterpret_cast( &p_src[src_normal_offset + src_merged_offset]); -#elif 1 // inline asm using global_load +#elif 0 // inline asm using global_load vector_data = __global_load( p_src, static_cast(src_merged_offset), @@ -943,7 +943,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 #if 1 // source code *reinterpret_cast( &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data; -#elif 1 // inline asm using global_store +#elif 0 // inline asm using global_store __global_store( vector_data, p_dst, dst_merged_offset, dst_normal_offset); #elif 1 // inline asm using buffer_store diff --git a/composable_kernel/include/utility/amd_inline_asm.hpp b/composable_kernel/include/utility/amd_inline_asm.hpp index 04b6864fa4..5c0b7d8c4b 100644 --- a/composable_kernel/include/utility/amd_inline_asm.hpp +++ b/composable_kernel/include/utility/amd_inline_asm.hpp @@ -183,8 +183,8 @@ __device__ float __buffer_load(const float* p_src_block, reinterpret_cast(&src_block_setting)[3] = 0x00027000; asm volatile("\n \ - __buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \ - s_waitcnt 0 \n \ + buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \ + ;;s_waitcnt 0 \n \ " : "=v"(dst) : "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset)); @@ -208,8 +208,8 @@ __device__ vector_type::MemoryType __buffer_load(const float reinterpret_cast(&src_block_setting)[3] = 0x00027000; asm volatile("\n \ - __buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \ - s_waitcnt 0 \n \ + buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \ + ;;s_waitcnt 0 \n \ " : "=v"(dst) : "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset)); @@ -233,8 +233,8 @@ __device__ vector_type::MemoryType __buffer_load(const float reinterpret_cast(&src_block_setting)[3] = 0x00027000; asm volatile("\n \ - __buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \ - s_waitcnt 0 \n \ + buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \ + ;;s_waitcnt 0 \n \ " : "=v"(dst) : "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset)); @@ -257,8 +257,8 @@ __device__ void __buffer_store(const float& src, reinterpret_cast(&dst_block_setting)[3] = 0x00027000; asm volatile("\n \ - __buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \ - s_waitcnt 0 \n \ + buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \ + ;;s_waitcnt 0 \n \ " : : "s"(dst_block_setting), "v"(src), "v"(dst_thread_offset), "s"(dst_const_offset)); diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp index febd1625e2..7c1f142a8d 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp @@ -47,7 +47,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data()); out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); -#if 0 +#if 1 // BlockSize = 256, blockwise-GEMM 128x128, each thread hold 64 data constexpr index_t BlockSize = 256; diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 166f9382fe..1e1631a89e 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -103,7 +103,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>; -#elif 1 +#elif 0 // 1x1 filter, 8x8 image // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42% constexpr index_t N = 64; @@ -295,7 +295,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>; -#elif 0 +#elif 1 // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81% constexpr index_t N = 128; @@ -341,7 +341,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<3, 0>; using RightPads = Sequence<3, 0>; -#elif 1 +#elif 0 // 1x7 filter, 0x3 pad, 17x17 input constexpr index_t N = 128; constexpr index_t C = 128;