diff --git a/composable_kernel/include/utility/amd_inline_asm.hpp b/composable_kernel/include/utility/amd_inline_asm.hpp index 5c0b7d8c4b..f3930f8733 100644 --- a/composable_kernel/include/utility/amd_inline_asm.hpp +++ b/composable_kernel/include/utility/amd_inline_asm.hpp @@ -10,170 +10,203 @@ extern "C" __attribute__((address_space(3))) __device__ void* __to_local(void* p // global_load and global_store template -__device__ typename vector_type::MemoryType -__global_load(const T* p_src_block, uint32_t src_thread_offset, uint32_t src_const_offset); +__device__ typename vector_type::MemoryType __global_load( + const T* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset); template __device__ void __global_store(const typename vector_type::MemoryType& src, T* p_dst_block, - uint32_t dst_thread_offset, - uint32_t dst_const_offset); + uint32_t dst_thread_data_offset, + uint32_t dst_const_data_offset); template <> __device__ float __global_load(const float* p_src_block, - uint32_t src_thread_offset, - uint32_t src_const_offset) + uint32_t src_thread_data_offset, + uint32_t src_const_data_offset) { -#if 0 // compute on VALU float dst; - uint64_t src_thread_offset_u64 = static_cast(src_thread_offset + src_const_offset); +#if 0 // source code + dst = p_src_block[src_const_data_offset + src_thread_data_offset]; +#elif 0 // use VGPR only + const float* src_thread_addr_offset_u64 = + p_src_block + src_const_data_offset + src_thread_data_offset; + + asm volatile("\n \ + global_load_dword %0, %1 off offset:0 \n \ + s_waitcnt 0 \n \ + " + : "=v"(dst) + : "v"(src_thread_addr_offset_u64)); +#elif 0 // use VGPR and SGPR, do compute on VALU + uint64_t src_thread_addr_offset_u64 = + (src_thread_data_offset + src_const_data_offset) * sizeof(float); asm volatile("\n \ global_load_dword %0, %1, %2, offset:0 \n \ s_waitcnt 0 \n \ " : "=v"(dst) - : "v"(src_thread_offset_u64), "s"(p_src_block)); + : "v"(src_thread_addr_offset_u64), "s"(p_src_block)); +#elif 1 // use VGPR and SGPR, do compute on SALU + uint64_t src_thread_addr_offset_u64 = + static_cast(src_thread_data_offset * sizeof(float)); - return dst; -#else // compute on SALU - float dst; - - uint64_t src_thread_offset_u64 = static_cast(src_thread_offset); - - const float* p_src_block_with_offset = p_src_block + src_const_offset; + const float* p_src_block_with_offset = p_src_block + src_const_data_offset; asm volatile("\n \ global_load_dword %0, %1, %2, offset:0 \n \ - ;;s_waitcnt 0 \n \ + s_waitcnt 0 \n \ " : "=v"(dst) - : "v"(src_thread_offset_u64), "s"(p_src_block_with_offset)); + : "v"(src_thread_addr_offset_u64), "s"(p_src_block_with_offset)); +#endif return dst; -#endif } template <> -__device__ vector_type::MemoryType __global_load(const float* p_src_block, - uint32_t src_thread_offset, - uint32_t src_const_offset) +__device__ vector_type::MemoryType __global_load( + const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset) { -#if 0 // compute on VALU - vector_type::MemoryType dst; + using vector_t = vector_type::MemoryType; - uint64_t src_thread_offset_u64 = static_cast(src_thread_offset + src_const_offset); + vector_t dst; + +#if 0 // source code + dst = *reinterpret_cast(&p_src_block[src_const_data_offset + src_thread_data_offset]); +#elif 0 // use VGPR only + const float* src_thread_addr_offset_u64 = + p_src_block + src_const_data_offset + src_thread_data_offset; + + asm volatile("\n \ + global_load_dwordx2 %0, %1 off offset:0 \n \ + s_waitcnt 0 \n \ + " + : "=v"(dst) + : "v"(src_thread_addr_offset_u64)); +#elif 0 // use VGPR and SGPR, do compute on VALU + uint64_t src_thread_addr_offset_u64 = + (src_thread_data_offset + src_const_data_offset) * sizeof(float); asm volatile("\n \ global_load_dwordx2 %0, %1, %2, offset:0 \n \ s_waitcnt 0 \n \ " : "=v"(dst) - : "v"(src_thread_offset_u64), "s"(p_src_block)); + : "v"(src_thread_addr_offset_u64), "s"(p_src_block)); +#elif 1 // use VGPR and SGPR, do compute on SALU + uint64_t src_thread_addr_offset_u64 = + static_cast(src_thread_data_offset * sizeof(float)); - return dst; -#else // compute on SALU - vector_type::MemoryType dst; - - uint64_t src_thread_offset_u64 = static_cast(src_thread_offset); - - const float* p_src_block_with_offset = p_src_block + src_const_offset; + const float* p_src_block_with_offset = p_src_block + src_const_data_offset; asm volatile("\n \ global_load_dwordx2 %0, %1, %2, offset:0 \n \ - ;;s_waitcnt 0 \n \ + s_waitcnt 0 \n \ " : "=v"(dst) - : "v"(src_thread_offset_u64), "s"(p_src_block_with_offset)); + : "v"(src_thread_addr_offset_u64), "s"(p_src_block_with_offset)); +#endif return dst; -#endif } template <> -__device__ vector_type::MemoryType __global_load(const float* p_src_block, - uint32_t src_thread_offset, - uint32_t src_const_offset) +__device__ vector_type::MemoryType __global_load( + const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset) { -#if 0 // compute on VALU - vector_type::MemoryType dst; + using vector_t = vector_type::MemoryType; - uint64_t src_thread_offset_u64 = static_cast(src_thread_offset + src_const_offset); + vector_t dst; + +#if 0 // source code + dst = *reinterpret_cast(&p_src_block[src_const_data_offset + src_thread_data_offset]); +#elif 0 // use VGPR only + const float* src_thread_addr_offset_u64 = + p_src_block + src_const_data_offset + src_thread_data_offset; + + asm volatile("\n \ + global_load_dwordx4 %0, %1 off offset:0 \n \ + s_waitcnt 0 \n \ + " + : "=v"(dst) + : "v"(src_thread_addr_offset_u64)); +#elif 0 // use VGPR and SGPR, do compute on VALU + uint64_t src_thread_addr_offset_u64 = + (src_thread_data_offset + src_const_data_offset) * sizeof(float); asm volatile("\n \ global_load_dwordx4 %0, %1, %2, offset:0 \n \ s_waitcnt 0 \n \ " : "=v"(dst) - : "v"(src_thread_offset_u64), "s"(p_src_block)); + : "v"(src_thread_addr_offset_u64), "s"(p_src_block)); +#elif 1 // use VGPR and SGPR, do compute on SALU + uint64_t src_thread_addr_offset_u64 = + static_cast(src_thread_data_offset * sizeof(float)); - return dst; -#else // compute on SALU - vector_type::MemoryType dst; - - uint64_t src_thread_offset_u64 = static_cast(src_thread_offset); - - const float* p_src_block_with_offset = p_src_block + src_const_offset; + const float* p_src_block_with_offset = p_src_block + src_const_data_offset; asm volatile("\n \ global_load_dwordx4 %0, %1, %2, offset:0 \n \ - ;;s_waitcnt 0 \n \ + s_waitcnt 0 \n \ " : "=v"(dst) - : "v"(src_thread_offset_u64), "s"(p_src_block_with_offset)); + : "v"(src_thread_addr_offset_u64), "s"(p_src_block_with_offset)); +#endif return dst; -#endif } template <> __device__ void __global_store(const float& src, float* p_dst_block, - uint32_t dst_thread_offset, - uint32_t dst_const_offset) + uint32_t dst_thread_data_offset, + uint32_t dst_const_data_offset) { #if 0 // compute on VALU - uint64_t dst_thread_offset_u64 = static_cast(dst_thread_offset + dst_const_offset); + uint64_t dst_thread_data_offset_u64 = (dst_thread_data_offset + dst_const_data_offset) * sizeof(float); asm volatile("\n \ global_store_dword %0, %1, %2, offset:0 \n \ - s_waitcnt 0 \n \ " : - : "v"(dst_thread_offset_u64), "v"(src), "s"(p_dst_block)); + : "v"(dst_thread_data_offset_u64), "v"(src), "s"(p_dst_block)); #else // compute on SALU - uint64_t dst_thread_offset_u64 = static_cast(dst_thread_offset); + uint64_t dst_thread_data_offset_u64 = dst_thread_data_offset * sizeof(float); - float* p_dst_block_with_offset = p_dst_block + dst_const_offset; + float* p_dst_block_with_offset = p_dst_block + dst_const_data_offset; asm volatile("\n \ global_store_dword %0, %1, %2, offset:0 \n \ - ;;s_waitcnt 0 \n \ " : - : "v"(dst_thread_offset_u64), "v"(src), "s"(p_dst_block_with_offset)); + : "v"(dst_thread_data_offset_u64), "v"(src), "s"(p_dst_block_with_offset)); #endif } // __buffer_load and __buffer_store template -__device__ typename vector_type::MemoryType -__buffer_load(const T* p_src_block, uint32_t src_thread_offset, uint32_t src_const_offset); +__device__ typename vector_type::MemoryType __buffer_load( + const T* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset); template __device__ void __buffer_store(const typename vector_type::MemoryType& src, T* p_dst_block, - uint32_t dst_thread_offset, - uint32_t dst_const_offset); + uint32_t dst_thread_data_offset, + uint32_t dst_const_data_offset); template <> __device__ float __buffer_load(const float* p_src_block, - uint32_t src_thread_offset, - uint32_t src_const_offset) + uint32_t src_thread_data_offset, + uint32_t src_const_data_offset) { float dst; + uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); + uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float); + int32x4_t src_block_setting{0}; // fill in byte 0 - 1 *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); @@ -184,21 +217,23 @@ __device__ float __buffer_load(const float* p_src_block, asm volatile("\n \ buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \ - ;;s_waitcnt 0 \n \ + s_waitcnt 0 \n \ " : "=v"(dst) - : "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset)); + : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset)); return dst; } template <> -__device__ vector_type::MemoryType __buffer_load(const float* p_src_block, - uint32_t src_thread_offset, - uint32_t src_const_offset) +__device__ vector_type::MemoryType __buffer_load( + const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset) { vector_type::MemoryType dst; + uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); + uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float); + int32x4_t src_block_setting{0}; // fill in byte 0 - 1 *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); @@ -209,21 +244,23 @@ __device__ vector_type::MemoryType __buffer_load(const float asm volatile("\n \ buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \ - ;;s_waitcnt 0 \n \ + s_waitcnt 0 \n \ " : "=v"(dst) - : "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset)); + : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset)); return dst; } template <> -__device__ vector_type::MemoryType __buffer_load(const float* p_src_block, - uint32_t src_thread_offset, - uint32_t src_const_offset) +__device__ vector_type::MemoryType __buffer_load( + const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset) { vector_type::MemoryType dst; + uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); + uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float); + int32x4_t src_block_setting{0}; // fill in byte 0 - 1 *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); @@ -234,10 +271,10 @@ __device__ vector_type::MemoryType __buffer_load(const float asm volatile("\n \ buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \ - ;;s_waitcnt 0 \n \ + s_waitcnt 0 \n \ " : "=v"(dst) - : "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset)); + : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset)); return dst; } @@ -245,9 +282,12 @@ __device__ vector_type::MemoryType __buffer_load(const float template <> __device__ void __buffer_store(const float& src, float* p_dst_block, - uint32_t dst_thread_offset, - uint32_t dst_const_offset) + uint32_t dst_thread_data_offset, + uint32_t dst_const_data_offset) { + uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); + uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); + int32x4_t dst_block_setting{0}; // fill in byte 0 - 1 *reinterpret_cast(&dst_block_setting) = p_dst_block; @@ -258,10 +298,12 @@ __device__ void __buffer_store(const float& src, asm volatile("\n \ buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \ - ;;s_waitcnt 0 \n \ " : - : "s"(dst_block_setting), "v"(src), "v"(dst_thread_offset), "s"(dst_const_offset)); + : "s"(dst_block_setting), + "v"(src), + "v"(dst_thread_addr_offset), + "s"(dst_const_addr_offset)); } __device__ void vmcnt(index_t cnt) diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp index 7c1f142a8d..febd1625e2 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp @@ -47,7 +47,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data()); out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); -#if 1 +#if 0 // BlockSize = 256, blockwise-GEMM 128x128, each thread hold 64 data constexpr index_t BlockSize = 256; diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 1e1631a89e..94e91f7567 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -103,7 +103,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>; -#elif 0 +#elif 1 // 1x1 filter, 8x8 image // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42% constexpr index_t N = 64;