From 740da00aa2335e3106c38abfdaac3d15ce7c6cec Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Fri, 20 Sep 2019 21:45:20 -0500 Subject: [PATCH] refactor --- .../include/utility/amd_intrinsic.hpp | 332 ++++++++++++++++++ 1 file changed, 332 insertions(+) create mode 100644 composable_kernel/include/utility/amd_intrinsic.hpp diff --git a/composable_kernel/include/utility/amd_intrinsic.hpp b/composable_kernel/include/utility/amd_intrinsic.hpp new file mode 100644 index 0000000000..193a55bc74 --- /dev/null +++ b/composable_kernel/include/utility/amd_intrinsic.hpp @@ -0,0 +1,332 @@ +#ifndef CK_AMD_INTRINSIC_HPP +#define CK_AMD_INTRINSIC_HPP + +#include "vector_type.hpp" + +namespace ck { + +__device__ float __llvm_amdgcn_buffer_load(int32x4_t rsrc, + uint32_t vindex, + uint32_t offset, + bool glc, + bool slc) __asm("llvm.amdgcn.buffer.load"); + +__device__ vector_type::MemoryType +__llvm_amdgcn_buffer_loadx2(int32x4_t rsrc, + uint32_t vindex, + uint32_t offset, + bool glc, + bool slc) __asm("llvm.amdgcn.buffer.load.dwordx2"); + +__device__ vector_type::MemoryType +__llvm_amdgcn_buffer_loadx4(int32x4_t rsrc, + uint32_t vindex, + uint32_t offset, + bool glc, + bool slc) __asm("llvm.amdgcn.buffer.load.dwordx4"); + +__device__ void __llvm_amdgcn_buffer_store(float vdata, + int32x4_t rsrc, + uint32_t vindex, + uint32_t offset, + bool glc, + bool slc) __asm("llvm.amdgcn.buffer.store"); + +__device__ void __llvm_amdgcn_buffer_storex2(vector_type::MemoryType vdata, + int32x4_t rsrc, + uint32_t vindex, + uint32_t offset, + bool glc, + bool slc) __asm("llvm.amdgcn.buffer.store.dwordx2"); + +__device__ void __llvm_amdgcn_buffer_storex4(vector_type::MemoryType vdata, + int32x4_t rsrc, + uint32_t vindex, + uint32_t offset, + bool glc, + bool slc) __asm("llvm.amdgcn.buffer.store.dwordx4"); + +// buffer_load and buffer_store +template +__device__ typename vector_type::MemoryType __buffer_load( + const T* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset); + +template +__device__ void __buffer_store(const typename vector_type::MemoryType& src, + T* p_dst_block, + uint32_t dst_thread_data_offset, + uint32_t dst_const_data_offset); + +template <> +__device__ float __buffer_load(const float* p_src_block, + uint32_t src_thread_data_offset, + uint32_t src_const_data_offset) +{ +#if 0 + float dst; + + uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); + uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float); + + int32x4_t src_block_setting{0}; + // fill in byte 0 - 1 + *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); + // fill in byte 2 + reinterpret_cast(&src_block_setting)[2] = -1; + // fill in byte 3 + reinterpret_cast(&src_block_setting)[3] = 0x00027000; + + asm volatile("\n \ + buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \ + s_waitcnt 0 \n \ + " + : "=v"(dst) + : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset)); + + return dst; +#else + float dst; + + uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); + uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float); + + int32x4_t src_block_setting{0}; + // fill in byte 0 - 1 + *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); + // fill in byte 2 + reinterpret_cast(&src_block_setting)[2] = -1; + // fill in byte 3 + reinterpret_cast(&src_block_setting)[3] = 0x00027000; + + dst = __llvm_amdgcn_buffer_load( + src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false); + + return dst; +#endif +} + +template <> +__device__ vector_type::MemoryType __buffer_load( + const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset) +{ +#if 0 + vector_type::MemoryType dst; + + uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); + uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float); + + int32x4_t src_block_setting{0}; + // fill in byte 0 - 1 + *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); + // fill in byte 2 + reinterpret_cast(&src_block_setting)[2] = -1; + // fill in byte 3 + reinterpret_cast(&src_block_setting)[3] = 0x00027000; + + asm volatile("\n \ + buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \ + s_waitcnt 0 \n \ + " + : "=v"(dst) + : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset)); + + return dst; +#else + vector_type::MemoryType dst; + + uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); + uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float); + + int32x4_t src_block_setting{0}; + // fill in byte 0 - 1 + *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); + // fill in byte 2 + reinterpret_cast(&src_block_setting)[2] = -1; + // fill in byte 3 + reinterpret_cast(&src_block_setting)[3] = 0x00027000; + + dst = __llvm_amdgcn_buffer_loadx2( + src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false); + + return dst; +#endif +} + +template <> +__device__ vector_type::MemoryType __buffer_load( + const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset) +{ +#if 0 + vector_type::MemoryType dst; + + uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); + uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float); + + int32x4_t src_block_setting{0}; + // fill in byte 0 - 1 + *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); + // fill in byte 2 + reinterpret_cast(&src_block_setting)[2] = -1; + // fill in byte 3 + reinterpret_cast(&src_block_setting)[3] = 0x00027000; + + asm volatile("\n \ + buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \ + s_waitcnt 0 \n \ + " + : "=v"(dst) + : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset)); + + return dst; +#elif 1 + vector_type::MemoryType dst; + + uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); + uint32_t src_const_addr_offset = src_const_data_offset * sizeof(float); + + int32x4_t src_block_setting{0}; + // fill in byte 0 - 1 + *reinterpret_cast(&src_block_setting) = const_cast(p_src_block); + // fill in byte 2 + reinterpret_cast(&src_block_setting)[2] = -1; + // fill in byte 3 + reinterpret_cast(&src_block_setting)[3] = 0x00027000; + + dst = __llvm_amdgcn_buffer_loadx4( + src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false); + + return dst; +#endif +} + +template <> +__device__ void __buffer_store(const float& src, + float* p_dst_block, + uint32_t dst_thread_data_offset, + uint32_t dst_const_data_offset) +{ +#if 0 + uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); + uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); + + int32x4_t dst_block_setting{0}; + // fill in byte 0 - 1 + *reinterpret_cast(&dst_block_setting) = p_dst_block; + // fill in byte 2 + reinterpret_cast(&dst_block_setting)[2] = -1; + // fill in byte 3 + reinterpret_cast(&dst_block_setting)[3] = 0x00027000; + + asm volatile("\n \ + buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \ + " + : + : "s"(dst_block_setting), + "v"(src), + "v"(dst_thread_addr_offset), + "s"(dst_const_addr_offset)); +#else + uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); + uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); + + int32x4_t dst_block_setting{0}; + // fill in byte 0 - 1 + *reinterpret_cast(&dst_block_setting) = p_dst_block; + // fill in byte 2 + reinterpret_cast(&dst_block_setting)[2] = -1; + // fill in byte 3 + reinterpret_cast(&dst_block_setting)[3] = 0x00027000; + + __llvm_amdgcn_buffer_store( + src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false); +#endif +} + +template <> +__device__ void __buffer_store(const vector_type::MemoryType& src, + float* p_dst_block, + uint32_t dst_thread_data_offset, + uint32_t dst_const_data_offset) +{ +#if 0 + uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); + uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); + + int32x4_t dst_block_setting{0}; + // fill in byte 0 - 1 + *reinterpret_cast(&dst_block_setting) = p_dst_block; + // fill in byte 2 + reinterpret_cast(&dst_block_setting)[2] = -1; + // fill in byte 3 + reinterpret_cast(&dst_block_setting)[3] = 0x00027000; + + asm volatile("\n \ + buffer_store_dwordx2 %1, %2, %0, %3 offen offset:0 \n \ + " + : + : "s"(dst_block_setting), + "v"(src), + "v"(dst_thread_addr_offset), + "s"(dst_const_addr_offset)); +#else + uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); + uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); + + int32x4_t dst_block_setting{0}; + // fill in byte 0 - 1 + *reinterpret_cast(&dst_block_setting) = p_dst_block; + // fill in byte 2 + reinterpret_cast(&dst_block_setting)[2] = -1; + // fill in byte 3 + reinterpret_cast(&dst_block_setting)[3] = 0x00027000; + + __llvm_amdgcn_buffer_storex2( + src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false); +#endif +} + +template <> +__device__ void __buffer_store(const vector_type::MemoryType& src, + float* p_dst_block, + uint32_t dst_thread_data_offset, + uint32_t dst_const_data_offset) +{ +#if 0 + uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); + uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); + + int32x4_t dst_block_setting{0}; + // fill in byte 0 - 1 + *reinterpret_cast(&dst_block_setting) = p_dst_block; + // fill in byte 2 + reinterpret_cast(&dst_block_setting)[2] = -1; + // fill in byte 3 + reinterpret_cast(&dst_block_setting)[3] = 0x00027000; + + asm volatile("\n \ + buffer_store_dwordx4 %1, %2, %0, %3 offen offset:0 \n \ + " + : + : "s"(dst_block_setting), + "v"(src), + "v"(dst_thread_addr_offset), + "s"(dst_const_addr_offset)); +#else + uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); + uint32_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); + + int32x4_t dst_block_setting{0}; + // fill in byte 0 - 1 + *reinterpret_cast(&dst_block_setting) = p_dst_block; + // fill in byte 2 + reinterpret_cast(&dst_block_setting)[2] = -1; + // fill in byte 3 + reinterpret_cast(&dst_block_setting)[3] = 0x00027000; + + __llvm_amdgcn_buffer_storex4( + src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false); +#endif +} + +} // namespace ck +#endif