diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index 2a0fffb34d..525b7a04f1 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -835,7 +835,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 // 2. src_normal_offset must be calculatd at compile time (guaranteed) // 3. src_merged_offset can be runtime value (no assumption imposed) static_if{}([&](auto) { -#if 1 // source code +#if 0 // source code vector_data = *reinterpret_cast( &p_src[src_normal_offset + src_merged_offset]); #elif 0 // inline asm using global_load @@ -940,7 +940,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 // 2. dst_normal_offset must be calculatd at compile time (guaranteed) // 3. dst_merged_offset can be runtime value (no assumption imposed) static_if{}([&](auto) { -#if 1 // source code +#if 0 // source code *reinterpret_cast( &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data; #elif 0 // inline asm using global_store diff --git a/composable_kernel/include/utility/amd_inline_asm.hpp b/composable_kernel/include/utility/amd_inline_asm.hpp index f3930f8733..fd8669256d 100644 --- a/composable_kernel/include/utility/amd_inline_asm.hpp +++ b/composable_kernel/include/utility/amd_inline_asm.hpp @@ -8,6 +8,47 @@ namespace ck { // cast a pointer of LDS to its address extern "C" __attribute__((address_space(3))) __device__ void* __to_local(void* p); +__device__ float __llvm_amdgcn_buffer_load(int32x4_t rsrc, + uint32_t vindex, + uint32_t offset, + bool glc, + bool slc) __asm("llvm.amdgcn.buffer.load"); + +__device__ vector_type::MemoryType +__llvm_amdgcn_buffer_loadx2(int32x4_t rsrc, + uint32_t vindex, + uint32_t offset, + bool glc, + bool slc) __asm("llvm.amdgcn.buffer.load.dwordx2"); + +__device__ vector_type::MemoryType +__llvm_amdgcn_buffer_loadx4(int32x4_t rsrc, + uint32_t vindex, + uint32_t offset, + bool glc, + bool slc) __asm("llvm.amdgcn.buffer.load.dwordx4"); + +__device__ void __llvm_amdgcn_buffer_store(float vdata, + int32x4_t rsrc, + uint32_t vindex, + uint32_t offset, + bool glc, + bool slc) __asm("llvm.amdgcn.buffer.store"); + +__device__ void __llvm_amdgcn_buffer_storex2(vector_type::MemoryType vdata, + int32x4_t rsrc, + uint32_t vindex, + uint32_t offset, + bool glc, + bool slc) __asm("llvm.amdgcn.buffer.store.dwordx2"); + +__device__ void __llvm_amdgcn_buffer_storex4(vector_type::MemoryType vdata, + int32x4_t rsrc, + uint32_t vindex, + uint32_t offset, + bool glc, + bool slc) __asm("llvm.amdgcn.buffer.store.dwordx4"); + // global_load and global_store template __device__ typename vector_type::MemoryType __global_load( @@ -186,7 +227,7 @@ __device__ void __global_store(const float& src, #endif } -// __buffer_load and __buffer_store +// buffer_load and buffer_store template __device__ typename vector_type::MemoryType __buffer_load( const T* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset); @@ -202,6 +243,7 @@ __device__ float __buffer_load(const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset) { +#if 0 float dst; uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); @@ -223,12 +265,32 @@ __device__ float __buffer_load(const float* p_src_block, : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset)); return dst; +#else + float dst; + + uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); + uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float); + + int32x4_t src_block_setting{0}; + // fill in byte 0 - 1 + *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); + // fill in byte 2 + reinterpret_cast(&src_block_setting)[2] = -1; + // fill in byte 3 + reinterpret_cast(&src_block_setting)[3] = 0x00027000; + + dst = __llvm_amdgcn_buffer_load( + src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false); + + return dst; +#endif } template <> __device__ vector_type::MemoryType __buffer_load( const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset) { +#if 0 vector_type::MemoryType dst; uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); @@ -250,12 +312,32 @@ __device__ vector_type::MemoryType __buffer_load( : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset)); return dst; +#else + vector_type::MemoryType dst; + + uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); + uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float); + + int32x4_t src_block_setting{0}; + // fill in byte 0 - 1 + *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); + // fill in byte 2 + reinterpret_cast(&src_block_setting)[2] = -1; + // fill in byte 3 + reinterpret_cast(&src_block_setting)[3] = 0x00027000; + + dst = __llvm_amdgcn_buffer_loadx2( + src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false); + + return dst; +#endif } template <> __device__ vector_type::MemoryType __buffer_load( const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset) { +#if 0 vector_type::MemoryType dst; uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); @@ -277,6 +359,25 @@ __device__ vector_type::MemoryType __buffer_load( : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset)); return dst; +#elif 1 + vector_type::MemoryType dst; + + uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); + uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float); + + int32x4_t src_block_setting{0}; + // fill in byte 0 - 1 + *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); + // fill in byte 2 + reinterpret_cast(&src_block_setting)[2] = -1; + // fill in byte 3 + reinterpret_cast(&src_block_setting)[3] = 0x00027000; + + dst = __llvm_amdgcn_buffer_loadx4( + src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false); + + return dst; +#endif } template <> @@ -285,6 +386,7 @@ __device__ void __buffer_store(const float& src, uint32_t dst_thread_data_offset, uint32_t dst_const_data_offset) { +#if 0 uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); @@ -304,6 +406,107 @@ __device__ void __buffer_store(const float& src, "v"(src), "v"(dst_thread_addr_offset), "s"(dst_const_addr_offset)); +#else + uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); + uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); + + int32x4_t dst_block_setting{0}; + // fill in byte 0 - 1 + *reinterpret_cast(&dst_block_setting) = p_dst_block; + // fill in byte 2 + reinterpret_cast(&dst_block_setting)[2] = -1; + // fill in byte 3 + reinterpret_cast(&dst_block_setting)[3] = 0x00027000; + + __llvm_amdgcn_buffer_store( + src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false); +#endif +} + +template <> +__device__ void __buffer_store(const vector_type::MemoryType& src, + float* p_dst_block, + uint32_t dst_thread_data_offset, + uint32_t dst_const_data_offset) +{ +#if 0 + uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); + uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); + + int32x4_t dst_block_setting{0}; + // fill in byte 0 - 1 + *reinterpret_cast(&dst_block_setting) = p_dst_block; + // fill in byte 2 + reinterpret_cast(&dst_block_setting)[2] = -1; + // fill in byte 3 + reinterpret_cast(&dst_block_setting)[3] = 0x00027000; + + asm volatile("\n \ + buffer_store_dwordx2 %1, %2, %0, %3 offen offset:0 \n \ + " + : + : "s"(dst_block_setting), + "v"(src), + "v"(dst_thread_addr_offset), + "s"(dst_const_addr_offset)); +#else + uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); + uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); + + int32x4_t dst_block_setting{0}; + // fill in byte 0 - 1 + *reinterpret_cast(&dst_block_setting) = p_dst_block; + // fill in byte 2 + reinterpret_cast(&dst_block_setting)[2] = -1; + // fill in byte 3 + reinterpret_cast(&dst_block_setting)[3] = 0x00027000; + + __llvm_amdgcn_buffer_storex2( + src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false); +#endif +} + +template <> +__device__ void __buffer_store(const vector_type::MemoryType& src, + float* p_dst_block, + uint32_t dst_thread_data_offset, + uint32_t dst_const_data_offset) +{ +#if 0 + uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); + uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); + + int32x4_t dst_block_setting{0}; + // fill in byte 0 - 1 + *reinterpret_cast(&dst_block_setting) = p_dst_block; + // fill in byte 2 + reinterpret_cast(&dst_block_setting)[2] = -1; + // fill in byte 3 + reinterpret_cast(&dst_block_setting)[3] = 0x00027000; + + asm volatile("\n \ + buffer_store_dwordx4 %1, %2, %0, %3 offen offset:0 \n \ + " + : + : "s"(dst_block_setting), + "v"(src), + "v"(dst_thread_addr_offset), + "s"(dst_const_addr_offset)); +#else + uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); + uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); + + int32x4_t dst_block_setting{0}; + // fill in byte 0 - 1 + *reinterpret_cast(&dst_block_setting) = p_dst_block; + // fill in byte 2 + reinterpret_cast(&dst_block_setting)[2] = -1; + // fill in byte 3 + reinterpret_cast(&dst_block_setting)[3] = 0x00027000; + + __llvm_amdgcn_buffer_storex4( + src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false); +#endif } __device__ void vmcnt(index_t cnt) diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp index febd1625e2..7c1f142a8d 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp @@ -47,7 +47,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data()); out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); -#if 0 +#if 1 // BlockSize = 256, blockwise-GEMM 128x128, each thread hold 64 data constexpr index_t BlockSize = 256;