From 184c6e7d37f27afd45fa894559b45673649d14de Mon Sep 17 00:00:00 2001
From: Chao Liu
Date: Fri, 20 Sep 2019 21:45:03 -0500
Subject: [PATCH] nvidia build

---
 .../tensor_description/tensor_coordinate.hpp  |   4 +-
 .../tensor_coordinate_v2.hpp                  |   4 +-
 .../tensor_description/tensor_descriptor.hpp  |   2 +-
 .../threadwise_generic_tensor_slice_copy.hpp  |  68 +--
 .../include/utility/amd_inline_asm.hpp        | 501 ------------------
 .../include/utility/common_header.hpp         |   4 +
 .../include/utility/config_amd.hpp.in         |   6 +-
 .../include/utility/config_nvidia.hpp.in      |   6 +-
 8 files changed, 41 insertions(+), 554 deletions(-)

diff --git a/composable_kernel/include/tensor_description/tensor_coordinate.hpp b/composable_kernel/include/tensor_description/tensor_coordinate.hpp
index 223d0d5bed..5b4805e9ee 100644
--- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp
+++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp
@@ -325,14 +325,14 @@ struct TensorCoordinate
     private:
     template
     __host__ __device__ static constexpr auto
-    MakeDummyTensorCoordinate(ConstantTensorDescriptor)
+    MakeDummyTensorCoordinate(ConstantTensorDescriptor)
     {
         return NormalTensorCoordinate>();
     }

     template
     __host__ __device__ static constexpr auto
-    MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor)
+    MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor)
     {
         return MergedTensorCoordinate>();
     }
diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp
index 62dc8b4c9a..831088ab25 100644
--- a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp
+++ b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp
@@ -188,7 +188,7 @@ struct TensorCoordinate_v2
     private:
     template
     __host__ __device__ static constexpr auto
-    MakeDummyTensorCoordinate(NativeTensorDescriptor)
+    MakeDummyTensorCoordinate(NativeTensorDescriptor)
     {
         return NativeTensorCoordinate>(
             make_zero_array());
@@ -196,7 +196,7 @@
     template
     __host__ __device__ static constexpr auto
-    MakeDummyTensorCoordinate(TransformedTensorDescriptor)
+    MakeDummyTensorCoordinate(TransformedTensorDescriptor)
     {
         return TransformedTensorCoordinate>(
             make_zero_array());
diff --git a/composable_kernel/include/tensor_description/tensor_descriptor.hpp b/composable_kernel/include/tensor_description/tensor_descriptor.hpp
index 4cff9a45d1..4d0acd7993 100644
--- a/composable_kernel/include/tensor_description/tensor_descriptor.hpp
+++ b/composable_kernel/include/tensor_description/tensor_descriptor.hpp
@@ -346,7 +346,7 @@ struct TransformedTensorDescriptor
         return GetLowerTensorDescriptor().CalculateOffset(CalculateLowerIndex(idx_up));
     }

-#if 0
+#if 1
     struct lambda_sequence_logic_or
     {
         template
diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
index 68e6b5bb02..67b0969a0e 100644
--- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
@@ -21,6 +21,10 @@
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
 #endif

+#ifndef CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
+#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
+#endif
+
 namespace ck {

 // This threadwise copy allows vector access of src and dst.
@@ -835,19 +839,15 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
             // 2. src_normal_offset must be calculated at compile time (guaranteed)
             // 3. src_merged_offset can be runtime value (no assumption imposed)
             static_if{}([&](auto) {
-#if 0 // source code
-                vector_data = *reinterpret_cast(
-                    &p_src[src_normal_offset + src_merged_offset]);
-#elif 0 // inline asm using global_load
-                vector_data = __global_load(
-                    p_src,
-                    static_cast(src_merged_offset),
-                    static_cast(src_normal_offset));
-#elif 1 // inline asm using buffer_load
+#if CK_USE_AMD_INTRINSIC && \
+    CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
                 vector_data = __buffer_load(
                     p_src,
                     static_cast(src_merged_offset),
                     static_cast(src_normal_offset));
+#else
+                vector_data = *reinterpret_cast(
+                    &p_src[src_normal_offset + src_merged_offset]);
 #endif
             }).Else([&](auto) {
                 // src can be all kinds of memory-space.
@@ -940,15 +940,13 @@
             // 2. dst_normal_offset must be calculated at compile time (guaranteed)
             // 3. dst_merged_offset can be runtime value (no assumption imposed)
             static_if{}([&](auto) {
-#if 0 // source code
-                *reinterpret_cast(
-                    &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
-#elif 0 // inline asm using global_store
-                __global_store(
-                    vector_data, p_dst, dst_merged_offset, dst_normal_offset);
-#elif 1 // inline asm using buffer_store
+#if CK_USE_AMD_INTRINSIC && \
+    CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
                 __buffer_store(
                     vector_data, p_dst, dst_merged_offset, dst_normal_offset);
+#else
+                *reinterpret_cast(
+                    &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
 #endif
             }).Else([&](auto) {
                 // dst can be all kinds of memory-space
@@ -1053,15 +1051,6 @@
         auto src_slice_vectorized =
             mSrcSlice.Vectorize(src_vector_access_dim, src_data_per_access);

-#if 0
-        if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
-        {
-            print_ConstantTensorDescriptor("mSrcSlice: ", typename decltype(mSrcSlice)::tensor_desc_type{});
-            print_ConstantTensorDescriptor("src_slice_vector: ", typename decltype(src_slice_vectorized)::tensor_desc_type{});
-        }
-#endif
-
-#if 1 // debug
         ford{}(
             [&](auto src_vector_id) {
                 // load vector from src
@@ -1080,7 +1069,6 @@
                         reinterpret_cast(&vector_data)[i];
                 }
             });
-#endif
     }

     // copy data from buffer into dst
@@ -1093,15 +1081,6 @@
        auto dst_slice_vectorized =
             mDstSlice.Vectorize(dst_vector_access_dim, dst_data_per_access);

-#if 0
-        if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
-        {
-            print_ConstantTensorDescriptor("mDstSlice: ", typename decltype(mDstSlice)::tensor_desc_type{});
-            print_ConstantTensorDescriptor("dst_slice_vector: ", typename decltype(dst_slice_vectorized)::tensor_desc_type{});
-        }
-#endif
-
-#if 1 // debug
         ford{}(
             [&](auto dst_vector_id) {

@@ -1122,7 +1101,6 @@
                 // write vector into dst
                 dst_slice_vectorized(dst_vector_id) = vector_data;
             });
-#endif
     }

 }
@@ -1330,13 +1308,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 const index_t buffer_offset = i * src_data_per_access;

                 static_if{}([&](auto) {
-#if 0 // source code
-                    *reinterpret_cast(&p_long_vector[buffer_offset]) =
-                        *reinterpret_cast(&p_src[src_offset]);
-#elif 1 // inline asm using buffer_load
+#if CK_USE_AMD_INTRINSIC && \
+    CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
                     *reinterpret_cast(&p_long_vector[buffer_offset]) =
                         __buffer_load(
                             p_src, static_cast(src_offset), static_cast(0));
+#else
+                    *reinterpret_cast(&p_long_vector[buffer_offset]) =
+                        *reinterpret_cast(&p_src[src_offset]);
 #endif
                 }).Else([&](auto) {
                     // src can be all kinds of memory-space.
@@ -1358,15 +1337,16 @@
                     (mDstSliceOrigin + (long_vector_data_begin_id + scalar_id)).GetOffset();

                 static_if{}([&](auto) {
-#if 0 // source code
-                    *reinterpret_cast(&p_dst[dst_offset]) =
-                        *reinterpret_cast(&p_long_vector[buffer_offset]);
-#elif 1 // inline asm using buffer_store
+#if CK_USE_AMD_INTRINSIC && \
+    CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
                     __buffer_store(
                         *reinterpret_cast(&p_long_vector[buffer_offset]),
                         p_dst,
                         dst_offset,
                         0);
+#else
+                    *reinterpret_cast(&p_dst[dst_offset]) =
+                        *reinterpret_cast(&p_long_vector[buffer_offset]);
 #endif
                 }).Else([&](auto) {
                     // dst can be all kinds of memory-space
diff --git a/composable_kernel/include/utility/amd_inline_asm.hpp b/composable_kernel/include/utility/amd_inline_asm.hpp
index fd8669256d..0a17b4bd3a 100644
--- a/composable_kernel/include/utility/amd_inline_asm.hpp
+++ b/composable_kernel/include/utility/amd_inline_asm.hpp
@@ -8,507 +8,6 @@ namespace ck {
 // cast a pointer of LDS to its address
 extern "C" __attribute__((address_space(3))) __device__ void* __to_local(void* p);

-__device__ float __llvm_amdgcn_buffer_load(int32x4_t rsrc,
-                                           uint32_t vindex,
-                                           uint32_t offset,
-                                           bool glc,
-                                           bool slc) __asm("llvm.amdgcn.buffer.load");
-
-__device__ vector_type::MemoryType
-__llvm_amdgcn_buffer_loadx2(int32x4_t rsrc,
-                            uint32_t vindex,
-                            uint32_t offset,
-                            bool glc,
-                            bool slc) __asm("llvm.amdgcn.buffer.load.dwordx2");
-
-__device__ vector_type::MemoryType
-__llvm_amdgcn_buffer_loadx4(int32x4_t rsrc,
-                            uint32_t vindex,
-                            uint32_t offset,
-                            bool glc,
-                            bool slc) __asm("llvm.amdgcn.buffer.load.dwordx4");
-
-__device__ void __llvm_amdgcn_buffer_store(float vdata,
-                                           int32x4_t rsrc,
-                                           uint32_t vindex,
-                                           uint32_t offset,
-                                           bool glc,
-                                           bool slc) __asm("llvm.amdgcn.buffer.store");
-
-__device__ void __llvm_amdgcn_buffer_storex2(vector_type::MemoryType vdata,
-                                             int32x4_t rsrc,
-                                             uint32_t vindex,
-                                             uint32_t offset,
-                                             bool glc,
-                                             bool slc) __asm("llvm.amdgcn.buffer.store.dwordx2");
-
-__device__ void __llvm_amdgcn_buffer_storex4(vector_type::MemoryType vdata,
-                                             int32x4_t rsrc,
-                                             uint32_t vindex,
-                                             uint32_t offset,
-                                             bool glc,
-                                             bool slc) __asm("llvm.amdgcn.buffer.store.dwordx4");
-
-// global_load and global_store
-template
-__device__ typename vector_type::MemoryType __global_load(
-    const T* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset);
-
-template
-__device__ void __global_store(const typename vector_type::MemoryType& src,
-                               T* p_dst_block,
-                               uint32_t dst_thread_data_offset,
-                               uint32_t dst_const_data_offset);
-
-template <>
-__device__ float __global_load(const float* p_src_block,
-                               uint32_t src_thread_data_offset,
-                               uint32_t src_const_data_offset)
-{
-    float dst;
-
-#if 0 // source code
-    dst = p_src_block[src_const_data_offset + src_thread_data_offset];
-#elif 0 // use VGPR only
-    const float* src_thread_addr_offset_u64 =
-        p_src_block + src_const_data_offset + src_thread_data_offset;
-
-    asm volatile("\n \
-    global_load_dword %0, %1 off offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset_u64));
-#elif 0 // use VGPR and SGPR, do compute on VALU
-    uint64_t src_thread_addr_offset_u64 =
-        (src_thread_data_offset + src_const_data_offset) * sizeof(float);
-
-    asm volatile("\n \
-    global_load_dword %0, %1, %2, offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset_u64), "s"(p_src_block));
-#elif 1 // use VGPR and SGPR, do compute on SALU
-    uint64_t src_thread_addr_offset_u64 =
-        static_cast(src_thread_data_offset * sizeof(float));
-
-    const float* p_src_block_with_offset = p_src_block + src_const_data_offset;
-
-    asm volatile("\n \
-    global_load_dword %0, %1, %2, offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset_u64), "s"(p_src_block_with_offset));
-#endif
-
-    return dst;
-}
-
-template <>
-__device__ vector_type::MemoryType __global_load(
-    const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset)
-{
-    using vector_t = vector_type::MemoryType;
-
-    vector_t dst;
-
-#if 0 // source code
-    dst = *reinterpret_cast(&p_src_block[src_const_data_offset + src_thread_data_offset]);
-#elif 0 // use VGPR only
-    const float* src_thread_addr_offset_u64 =
-        p_src_block + src_const_data_offset + src_thread_data_offset;
-
-    asm volatile("\n \
-    global_load_dwordx2 %0, %1 off offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset_u64));
-#elif 0 // use VGPR and SGPR, do compute on VALU
-    uint64_t src_thread_addr_offset_u64 =
-        (src_thread_data_offset + src_const_data_offset) * sizeof(float);
-
-    asm volatile("\n \
-    global_load_dwordx2 %0, %1, %2, offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset_u64), "s"(p_src_block));
-#elif 1 // use VGPR and SGPR, do compute on SALU
-    uint64_t src_thread_addr_offset_u64 =
-        static_cast(src_thread_data_offset * sizeof(float));
-
-    const float* p_src_block_with_offset = p_src_block + src_const_data_offset;
-
-    asm volatile("\n \
-    global_load_dwordx2 %0, %1, %2, offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset_u64), "s"(p_src_block_with_offset));
-#endif
-
-    return dst;
-}
-
-template <>
-__device__ vector_type::MemoryType __global_load(
-    const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset)
-{
-    using vector_t = vector_type::MemoryType;
-
-    vector_t dst;
-
-#if 0 // source code
-    dst = *reinterpret_cast(&p_src_block[src_const_data_offset + src_thread_data_offset]);
-#elif 0 // use VGPR only
-    const float* src_thread_addr_offset_u64 =
-        p_src_block + src_const_data_offset + src_thread_data_offset;
-
-    asm volatile("\n \
-    global_load_dwordx4 %0, %1 off offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset_u64));
-#elif 0 // use VGPR and SGPR, do compute on VALU
-    uint64_t src_thread_addr_offset_u64 =
-        (src_thread_data_offset + src_const_data_offset) * sizeof(float);
-
-    asm volatile("\n \
-    global_load_dwordx4 %0, %1, %2, offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset_u64), "s"(p_src_block));
-#elif 1 // use VGPR and SGPR, do compute on SALU
-    uint64_t src_thread_addr_offset_u64 =
-        static_cast(src_thread_data_offset * sizeof(float));
-
-    const float* p_src_block_with_offset = p_src_block + src_const_data_offset;
-
-    asm volatile("\n \
-    global_load_dwordx4 %0, %1, %2, offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset_u64), "s"(p_src_block_with_offset));
-#endif
-
-    return dst;
-}
-
-template <>
-__device__ void __global_store(const float& src,
-                               float* p_dst_block,
-                               uint32_t dst_thread_data_offset,
-                               uint32_t dst_const_data_offset)
-{
-#if 0 // compute on VALU
-    uint64_t dst_thread_data_offset_u64 = (dst_thread_data_offset + dst_const_data_offset) * sizeof(float);
-
-    asm volatile("\n \
-    global_store_dword %0, %1, %2, offset:0 \n \
-    "
-                 :
-                 : "v"(dst_thread_data_offset_u64), "v"(src), "s"(p_dst_block));
-#else // compute on SALU
-    uint64_t dst_thread_data_offset_u64 = dst_thread_data_offset * sizeof(float);
-
-    float* p_dst_block_with_offset = p_dst_block + dst_const_data_offset;
-
-    asm volatile("\n \
-    global_store_dword %0, %1, %2, offset:0 \n \
-    "
-                 :
-                 : "v"(dst_thread_data_offset_u64), "v"(src), "s"(p_dst_block_with_offset));
-#endif
-}
-
-// buffer_load and buffer_store
-template
-__device__ typename vector_type::MemoryType __buffer_load(
-    const T* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset);
-
-template
-__device__ void __buffer_store(const typename vector_type::MemoryType& src,
-                               T* p_dst_block,
-                               uint32_t dst_thread_data_offset,
-                               uint32_t dst_const_data_offset);
-
-template <>
-__device__ float __buffer_load(const float* p_src_block,
-                               uint32_t src_thread_data_offset,
-                               uint32_t src_const_data_offset)
-{
-#if 0
-    float dst;
-
-    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
-    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);
-
-    int32x4_t src_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast(&src_block_setting) = const_cast(p_src_block);
-    // fill in byte 2
-    reinterpret_cast(&src_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast(&src_block_setting)[3] = 0x00027000;
-
-    asm volatile("\n \
-    buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
-
-    return dst;
-#else
-    float dst;
-
-    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
-    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);
-
-    int32x4_t src_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast(&src_block_setting) = const_cast(p_src_block);
-    // fill in byte 2
-    reinterpret_cast(&src_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast(&src_block_setting)[3] = 0x00027000;
-
-    dst = __llvm_amdgcn_buffer_load(
-        src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
-
-    return dst;
-#endif
-}
-
-template <>
-__device__ vector_type::MemoryType __buffer_load(
-    const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset)
-{
-#if 0
-    vector_type::MemoryType dst;
-
-    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
-    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);
-
-    int32x4_t src_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast(&src_block_setting) = const_cast(p_src_block);
-    // fill in byte 2
-    reinterpret_cast(&src_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast(&src_block_setting)[3] = 0x00027000;
-
-    asm volatile("\n \
-    buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
-
-    return dst;
-#else
-    vector_type::MemoryType dst;
-
-    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
-    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);
-
-    int32x4_t src_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast(&src_block_setting) = const_cast(p_src_block);
-    // fill in byte 2
-    reinterpret_cast(&src_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast(&src_block_setting)[3] = 0x00027000;
-
-    dst = __llvm_amdgcn_buffer_loadx2(
-        src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
-
-    return dst;
-#endif
-}
-
-template <>
-__device__ vector_type::MemoryType __buffer_load(
-    const float* p_src_block, uint32_t src_thread_data_offset, uint32_t src_const_data_offset)
-{
-#if 0
-    vector_type::MemoryType dst;
-
-    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
-    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);
-
-    int32x4_t src_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast(&src_block_setting) = const_cast(p_src_block);
-    // fill in byte 2
-    reinterpret_cast(&src_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast(&src_block_setting)[3] = 0x00027000;
-
-    asm volatile("\n \
-    buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \
-    s_waitcnt 0 \n \
-    "
-                 : "=v"(dst)
-                 : "v"(src_thread_addr_offset), "s"(src_block_setting), "s"(src_const_addr_offset));
-
-    return dst;
-#elif 1
-    vector_type::MemoryType dst;
-
-    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
-    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);
-
-    int32x4_t src_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast(&src_block_setting) = const_cast(p_src_block);
-    // fill in byte 2
-    reinterpret_cast(&src_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast(&src_block_setting)[3] = 0x00027000;
-
-    dst = __llvm_amdgcn_buffer_loadx4(
-        src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
-
-    return dst;
-#endif
-}
-
-template <>
-__device__ void __buffer_store(const float& src,
-                               float* p_dst_block,
-                               uint32_t dst_thread_data_offset,
-                               uint32_t dst_const_data_offset)
-{
-#if 0
-    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
-    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);
-
-    int32x4_t dst_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast(&dst_block_setting) = p_dst_block;
-    // fill in byte 2
-    reinterpret_cast(&dst_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast(&dst_block_setting)[3] = 0x00027000;
-
-    asm volatile("\n \
-    buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \
-    "
-                 :
-                 : "s"(dst_block_setting),
-                   "v"(src),
-                   "v"(dst_thread_addr_offset),
-                   "s"(dst_const_addr_offset));
-#else
-    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
-    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);
-
-    int32x4_t dst_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast(&dst_block_setting) = p_dst_block;
-    // fill in byte 2
-    reinterpret_cast(&dst_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast(&dst_block_setting)[3] = 0x00027000;
-
-    __llvm_amdgcn_buffer_store(
-        src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
-#endif
-}
-
-template <>
-__device__ void __buffer_store(const vector_type::MemoryType& src,
-                               float* p_dst_block,
-                               uint32_t dst_thread_data_offset,
-                               uint32_t dst_const_data_offset)
-{
-#if 0
-    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
-    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);
-
-    int32x4_t dst_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast(&dst_block_setting) = p_dst_block;
-    // fill in byte 2
-    reinterpret_cast(&dst_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast(&dst_block_setting)[3] = 0x00027000;
-
-    asm volatile("\n \
-    buffer_store_dwordx2 %1, %2, %0, %3 offen offset:0 \n \
-    "
-                 :
-                 : "s"(dst_block_setting),
-                   "v"(src),
-                   "v"(dst_thread_addr_offset),
-                   "s"(dst_const_addr_offset));
-#else
-    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
-    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);
-
-    int32x4_t dst_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast(&dst_block_setting) = p_dst_block;
-    // fill in byte 2
-    reinterpret_cast(&dst_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast(&dst_block_setting)[3] = 0x00027000;
-
-    __llvm_amdgcn_buffer_storex2(
-        src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
-#endif
-}
-
-template <>
-__device__ void __buffer_store(const vector_type::MemoryType& src,
-                               float* p_dst_block,
-                               uint32_t dst_thread_data_offset,
-                               uint32_t dst_const_data_offset)
-{
-#if 0
-    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
-    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);
-
-    int32x4_t dst_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast(&dst_block_setting) = p_dst_block;
-    // fill in byte 2
-    reinterpret_cast(&dst_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast(&dst_block_setting)[3] = 0x00027000;
-
-    asm volatile("\n \
-    buffer_store_dwordx4 %1, %2, %0, %3 offen offset:0 \n \
-    "
-                 :
-                 : "s"(dst_block_setting),
-                   "v"(src),
-                   "v"(dst_thread_addr_offset),
-                   "s"(dst_const_addr_offset));
-#else
-    uint32_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
-    uint32_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);
-
-    int32x4_t dst_block_setting{0};
-    // fill in byte 0 - 1
-    *reinterpret_cast(&dst_block_setting) = p_dst_block;
-    // fill in byte 2
-    reinterpret_cast(&dst_block_setting)[2] = -1;
-    // fill in byte 3
-    reinterpret_cast(&dst_block_setting)[3] = 0x00027000;
-
-    __llvm_amdgcn_buffer_storex4(
-        src, dst_block_setting, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
-#endif
-}
-
 __device__ void vmcnt(index_t cnt)
 {
     if(cnt == 0)
diff --git a/composable_kernel/include/utility/common_header.hpp b/composable_kernel/include/utility/common_header.hpp
index ed581c95c0..f8b52bba5b 100644
--- a/composable_kernel/include/utility/common_header.hpp
+++ b/composable_kernel/include/utility/common_header.hpp
@@ -22,4 +22,8 @@
 #include "amd_inline_asm.hpp"
 #endif

+#if CK_USE_AMD_INTRINSIC
+#include "amd_intrinsic.hpp"
+#endif
+
 #endif
diff --git a/composable_kernel/include/utility/config_amd.hpp.in b/composable_kernel/include/utility/config_amd.hpp.in
index 1b57256a00..0ee722f22d 100644
--- a/composable_kernel/include/utility/config_amd.hpp.in
+++ b/composable_kernel/include/utility/config_amd.hpp.in
@@ -4,9 +4,11 @@
 #include "hip/hip_runtime.h"
 #include "hip/hip_fp16.h"

+#define CK_UNSIGNED_INDEX_TYPE 0
 #define CK_DEVICE_BACKEND_AMD 1
-#define CK_USE_UNSIGNED_INDEX_TYPE 0
+#define CK_USE_AMD_INTRINSIC 1
 #define CK_USE_AMD_INLINE_ASM 1
+#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
@@ -14,7 +16,7 @@

 namespace ck {

-#if CK_USE_UNSIGNED_INDEX_TYPE
+#if CK_UNSIGNED_INDEX_TYPE
 using index_t = uint32_t;
 #else
 using index_t = int32_t;
diff --git a/composable_kernel/include/utility/config_nvidia.hpp.in b/composable_kernel/include/utility/config_nvidia.hpp.in
index b2f8e3e43e..523b7be589 100644
--- a/composable_kernel/include/utility/config_nvidia.hpp.in
+++ b/composable_kernel/include/utility/config_nvidia.hpp.in
@@ -6,9 +6,11 @@
 #include "nvToolsExt.h"
 #include "helper_cuda.h"

+#define CK_UNSIGNED_INDEX_TYPE 0
 #define CK_DEVICE_BACKEND_NVIDIA 1
-#define CK_USE_UNSIGNED_INDEX_TYPE 0
+#define CK_USE_AMD_INTRINSIC 0
 #define CK_USE_AMD_INLINE_ASM 0
+#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
@@ -16,7 +18,7 @@

 namespace ck {

-#if CK_USE_UNSIGNED_INDEX_TYPE
+#if CK_UNSIGNED_INDEX_TYPE
 using index_t = uint32_t;
 #else
 using index_t = int32_t;
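
Note: the new `amd_intrinsic.hpp` header that `common_header.hpp` now includes is not part of this diff. As a rough sketch of what its intrinsic-based buffer load presumably looks like, reconstructed from the `__buffer_load` implementation this patch deletes from `amd_inline_asm.hpp` (the helper name `buffer_load_float`, the `int32x4_t` typedef, and the descriptor-field comments are illustrative, not taken from the missing header):

```cpp
// Sketch only: mirrors the removed amd_inline_asm.hpp code path that calls
// llvm.amdgcn.buffer.load through a 128-bit buffer resource descriptor.
#include <cstdint>
#include "hip/hip_runtime.h"

typedef int32_t int32x4_t __attribute__((ext_vector_type(4))); // assumed to match CK's vector types

// intrinsic declaration, as it appeared in the removed amd_inline_asm.hpp
__device__ float __llvm_amdgcn_buffer_load(int32x4_t rsrc,
                                           uint32_t vindex,
                                           uint32_t offset,
                                           bool glc,
                                           bool slc) __asm("llvm.amdgcn.buffer.load");

__device__ float buffer_load_float(const float* p_src_block,
                                   uint32_t src_thread_data_offset,
                                   uint32_t src_const_data_offset)
{
    uint32_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
    uint32_t src_const_addr_offset  = src_const_data_offset * sizeof(float);

    // build the buffer resource descriptor:
    //   dwords 0-1: 64-bit base address of the buffer
    //   dword  2:   number of records; -1 disables range checking
    //   dword  3:   descriptor settings (0x00027000, as in the removed code)
    int32x4_t src_block_setting{0};
    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
    reinterpret_cast<int*>(&src_block_setting)[2]  = -1;
    reinterpret_cast<int*>(&src_block_setting)[3]  = 0x00027000;

    // offsets are byte offsets; glc/slc cache modifiers are left off
    return __llvm_amdgcn_buffer_load(
        src_block_setting, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
}
```

The `#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_...` guards above then select between this intrinsic path (AMD backend, where `config_amd.hpp.in` sets both macros to 1) and the plain `reinterpret_cast` pointer access (NVIDIA backend, where `config_nvidia.hpp.in` sets them to 0), which is what lets the nvidia build compile without any AMD-specific code.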