From 10bb81106072e7f9de1c7ce0ed7880e41bd9f517 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 26 Aug 2021 20:05:19 -0500 Subject: [PATCH] Misc fixes (#24) * use cast_pointer_to_generic_address_space() in v6r1 kernel wrapper, DynamcBuffer and buffer_load take customized invalid-element-value, add buffer_load/store for fp64 * use remove_cvref_t --- .../tensor_description/tensor_adaptor.hpp | 3 +- .../tensor_description/tensor_descriptor.hpp | 7 +- .../blockwise_gemm_dlops_v3.hpp | 12 +- .../threadwise_contraction_dlops.hpp | 42 ++-- .../threadwise_gemm_dlops_v3.hpp | 21 +- .../threadwise_tensor_slice_set.hpp | 4 +- .../threadwise_tensor_slice_transfer.hpp | 59 +++--- .../threadwise_tensor_slice_transfer_v2.hpp | 35 ++-- .../include/utility/amd_buffer_addressing.hpp | 183 +++++++++++------- composable_kernel/include/utility/array.hpp | 2 +- .../include/utility/data_type.hpp | 11 ++ .../include/utility/dynamic_buffer.hpp | 122 ++++++------ composable_kernel/include/utility/tuple.hpp | 2 +- .../include/utility/tuple_helper.hpp | 4 +- composable_kernel/include/utility/type.hpp | 3 + ...mplicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp | 9 +- 16 files changed, 267 insertions(+), 252 deletions(-) diff --git a/composable_kernel/include/tensor_description/tensor_adaptor.hpp b/composable_kernel/include/tensor_description/tensor_adaptor.hpp index 3b647e433a..50a8088bba 100644 --- a/composable_kernel/include/tensor_description/tensor_adaptor.hpp +++ b/composable_kernel/include/tensor_description/tensor_adaptor.hpp @@ -189,8 +189,7 @@ struct TensorAdaptor bool is_known = true; static_for<0, Transforms::Size(), 1>{}([&](auto i) { - is_known &= - remove_cv_t>::IsKnownAtCompileTime(); + is_known &= remove_cvref_t::IsKnownAtCompileTime(); }); return is_known && is_known_at_compile_time::value; diff --git a/composable_kernel/include/tensor_description/tensor_descriptor.hpp b/composable_kernel/include/tensor_description/tensor_descriptor.hpp index a6a57ba63b..8f6a5a3e43 100644 --- a/composable_kernel/include/tensor_description/tensor_descriptor.hpp +++ b/composable_kernel/include/tensor_description/tensor_descriptor.hpp @@ -185,8 +185,7 @@ struct TensorDescriptor bool is_known = true; static_for<0, Transforms::Size(), 1>{}([&](auto i) { - is_known &= - remove_cv_t>::IsKnownAtCompileTime(); + is_known &= remove_cvref_t::IsKnownAtCompileTime(); }); return is_known && is_known_at_compile_time::value && @@ -587,11 +586,11 @@ __host__ __device__ constexpr bool coordinate_has_valid_offset(const TensorDesc& template using TensorCoordinate_t = decltype(make_tensor_coordinate( - TensorDesc{}, MultiIndex>::GetNumOfDimension()>{})); + TensorDesc{}, MultiIndex::GetNumOfDimension()>{})); template using TensorCoordinateStep_t = decltype(make_tensor_coordinate_step( - TensorDesc{}, MultiIndex>::GetNumOfDimension()>{})); + TensorDesc{}, MultiIndex::GetNumOfDimension()>{})); } // namespace ck #endif diff --git a/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp index 03f889649e..5cc2f2393e 100644 --- a/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp @@ -110,13 +110,11 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3 const BThreadBuffer& b_thread_buf, CThreadBuffer& c_thread_buf) const { - static_assert(is_same>, - remove_cv_t>>::value && - is_same>, - remove_cv_t>>::value && - is_same>, - remove_cv_t>>::value && - "wrong! inconsistent type"); + static_assert( + is_same, remove_cvref_t>::value && + is_same, remove_cvref_t>::value && + is_same, remove_cvref_t>::value && + "wrong! inconsistent type"); constexpr auto I0 = Number<0>{}; diff --git a/composable_kernel/include/tensor_operation/threadwise_contraction_dlops.hpp b/composable_kernel/include/tensor_operation/threadwise_contraction_dlops.hpp index a925a5cd68..8b75381026 100644 --- a/composable_kernel/include/tensor_operation/threadwise_contraction_dlops.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_contraction_dlops.hpp @@ -55,19 +55,16 @@ struct ThreadwiseGemmDlops_km0m1_kn0n1_m0m1n0n1 CBuffer& c_buf, COriginIdx) { - static_assert( - is_known_at_compile_time>>::value && - is_known_at_compile_time>>::value && - is_known_at_compile_time>>::value, - "wrong! AOriginIdx, BOriginIdx, COringinIdx should be known at compile-time"); + static_assert(is_known_at_compile_time>::value && + is_known_at_compile_time>::value && + is_known_at_compile_time>::value, + "wrong! AOriginIdx, BOriginIdx, COringinIdx should be known at compile-time"); - static_assert(is_same>, - remove_cv_t>>::value && - is_same>, - remove_cv_t>>::value && - is_same>, - remove_cv_t>>::value && - "wrong! inconsistent type"); + static_assert( + is_same, remove_cvref_t>::value && + is_same, remove_cvref_t>::value && + is_same, remove_cvref_t>::value && + "wrong! inconsistent type"); constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -157,19 +154,16 @@ struct ThreadwiseContractionDlops_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_ CBuffer& c_buf, COriginIdx) { - static_assert( - is_known_at_compile_time>>::value && - is_known_at_compile_time>>::value && - is_known_at_compile_time>>::value, - "wrong! AOriginIdx, BOriginIdx, COringinIdx should be known at compile-time"); + static_assert(is_known_at_compile_time>::value && + is_known_at_compile_time>::value && + is_known_at_compile_time>::value, + "wrong! AOriginIdx, BOriginIdx, COringinIdx should be known at compile-time"); - static_assert(is_same>, - remove_cv_t>>::value && - is_same>, - remove_cv_t>>::value && - is_same>, - remove_cv_t>>::value && - "wrong! inconsistent type"); + static_assert( + is_same, remove_cvref_t>::value && + is_same, remove_cvref_t>::value && + is_same, remove_cvref_t>::value && + "wrong! inconsistent type"); constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; diff --git a/composable_kernel/include/tensor_operation/threadwise_gemm_dlops_v3.hpp b/composable_kernel/include/tensor_operation/threadwise_gemm_dlops_v3.hpp index 015ad675fb..f6c15fd85a 100644 --- a/composable_kernel/include/tensor_operation/threadwise_gemm_dlops_v3.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_gemm_dlops_v3.hpp @@ -41,19 +41,16 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3 CDesc::IsKnownAtCompileTime(), "wrong! Desc should be known at compile-time"); - static_assert( - is_known_at_compile_time>>::value && - is_known_at_compile_time>>::value && - is_known_at_compile_time>>::value, - "wrong! AOriginIdx, BOriginIdx, COringinIdx should be known at compile-time"); + static_assert(is_known_at_compile_time>::value && + is_known_at_compile_time>::value && + is_known_at_compile_time>::value, + "wrong! AOriginIdx, BOriginIdx, COringinIdx should be known at compile-time"); - static_assert(is_same>, - remove_cv_t>>::value && - is_same>, - remove_cv_t>>::value && - is_same>, - remove_cv_t>>::value && - "wrong! inconsistent type"); + static_assert( + is_same, remove_cvref_t>::value && + is_same, remove_cvref_t>::value && + is_same, remove_cvref_t>::value && + "wrong! inconsistent type"); constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_set.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_set.hpp index 0c7aa978a7..20e9a5b366 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_set.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_set.hpp @@ -30,11 +30,11 @@ struct ThreadwiseTensorSliceSet_v1 static_assert(Buffer::IsStaticBuffer(), "wrong! DstBuffer need to be StaticBuffer"); - static_assert(is_known_at_compile_time>>::value, + static_assert(is_known_at_compile_time>::value, "wrong! OriginIdx need to be known at compile-time"); // Desc is known at compile-time - constexpr auto desc = remove_cv_t>{}; + constexpr auto desc = remove_cvref_t{}; // OriginIdx is known at compile-time constexpr auto origin_idx = to_multi_index(OriginIdx{}); diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp index e38dbbc8b5..d5c77f4a54 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp @@ -95,18 +95,13 @@ struct ThreadwiseTensorSliceTransfer_v1r3 static_assert(SrcDesc::IsKnownAtCompileTime(), "wrong! SrcDesc need to known at compile-time"); - static_assert( - is_known_at_compile_time>>::value, - "wrong! SrcSliceOrigin need to known at compile-time"); + static_assert(is_known_at_compile_time>::value, + "wrong! SrcSliceOrigin need to known at compile-time"); static_assert(SrcBuffer::IsStaticBuffer(), "wrong! SrcBuffer need to be StaticBuffer"); - // static_assert(is_same>, - // remove_cv_t>>::value, - //"wrong! SrcBuffer data type is wrong"); - // SrcDesc and src_slice_origin_idx are known at compile-time - constexpr auto src_desc = remove_cv_t>{}; + constexpr auto src_desc = remove_cvref_t{}; constexpr auto src_slice_origin_idx = to_multi_index(SrcSliceOriginIdx{}); constexpr auto I0 = Number<0>{}; @@ -421,16 +416,15 @@ struct ThreadwiseTensorSliceTransfer_v2 static_assert(DstDesc::IsKnownAtCompileTime(), "wrong! DstDesc need to known at compile-time"); - static_assert( - is_known_at_compile_time>>::value, - "wrong! DstSliceOrigin need to known at compile-time"); + static_assert(is_known_at_compile_time>::value, + "wrong! DstSliceOrigin need to known at compile-time"); - static_assert(is_same>, - remove_cv_t>>::value && - "wrong! inconsistent type"); + static_assert( + is_same, remove_cvref_t>::value && + "wrong! inconsistent type"); // DstDesc and dst_slice_origin_idx are known at compile-time - constexpr auto dst_desc = remove_cv_t>{}; + constexpr auto dst_desc = remove_cvref_t{}; constexpr auto dst_slice_origin_idx = DstSliceOriginIdx{}; constexpr auto I0 = Number<0>{}; @@ -742,9 +736,9 @@ struct ThreadwiseTensorSliceTransfer_v3 SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds, "wrong!"); - static_assert(is_same>, - remove_cv_t>>::value, - "wrong! SrcBuffer and SrcData data type are inconsistent"); + static_assert( + is_same, remove_cvref_t>::value, + "wrong! SrcBuffer and SrcData data type are inconsistent"); constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -899,9 +893,9 @@ struct ThreadwiseTensorSliceTransfer_v3 DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds, "wrong!"); - static_assert(is_same>, - remove_cv_t>>::value, - "wrong! SrcBuffer or DstBuffer data type is wrong"); + static_assert( + is_same, remove_cvref_t>::value, + "wrong! SrcBuffer or DstBuffer data type is wrong"); constexpr auto I0 = Number<0>{}; constexpr auto I1 = Number<1>{}; @@ -1315,24 +1309,21 @@ struct ThreadwiseTensorSliceTransfer_v4 static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(), "wrong! SrcDesc and DstDesc need to known at compile-time"); - static_assert(is_same>, - remove_cv_t>>::value && - is_same>, - remove_cv_t>>::value, - "wrong! SrcBuffer or DstBuffer data type is wrong"); + static_assert( + is_same, remove_cvref_t>::value && + is_same, remove_cvref_t>::value, + "wrong! SrcBuffer or DstBuffer data type is wrong"); static_assert(DstBuffer::IsStaticBuffer(), "wrong! DstBuffer need to be StaticBuffer"); - static_assert( - is_known_at_compile_time< - remove_cv_t>>::value && - is_known_at_compile_time>>::value, - "wrong! SrcOriginToRefDistance and DstOriginToRefDistance need to be known " - "at compile-time"); + static_assert(is_known_at_compile_time>::value && + is_known_at_compile_time>::value, + "wrong! SrcOriginToRefDistance and DstOriginToRefDistance need to be known " + "at compile-time"); // SrcDesc and DstDesc are known at compile-time - constexpr auto src_desc = remove_cv_t>{}; - constexpr auto dst_desc = remove_cv_t>{}; + constexpr auto src_desc = remove_cvref_t{}; + constexpr auto dst_desc = remove_cvref_t{}; // SrcOriginToRefDisttance and DstOriginToRefDistance are known at compile-time constexpr auto src_ref_to_origin_disp_idx = to_multi_index(SrcRefToOriginDisplacement{}); diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v2.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v2.hpp index ccac4b7b44..bbdaa5fa2b 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v2.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v2.hpp @@ -80,9 +80,9 @@ struct ThreadwiseTensorSliceTransfer_v3r1 SrcBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds, "wrong!"); - static_assert(is_same>, - remove_cv_t>>::value, - "wrong! SrcBuffer and SrcData data type are inconsistent"); + static_assert( + is_same, remove_cvref_t>::value, + "wrong! SrcBuffer and SrcData data type are inconsistent"); // tensor descriptor for src_vector constexpr auto src_vector_tensor_lengths = SrcVectorTensorLengths{}; @@ -248,9 +248,9 @@ struct ThreadwiseTensorSliceTransfer_v3r1 DstBuffer::GetAddressSpace() == AddressSpaceEnum_t::Lds, "wrong!"); - static_assert(is_same>, - remove_cv_t>>::value, - "wrong! SrcBuffer or DstBuffer data type is wrong"); + static_assert( + is_same, remove_cvref_t>::value, + "wrong! SrcBuffer or DstBuffer data type is wrong"); // tensor descriptor for dst_vector constexpr auto dst_vector_tensor_lengths = DstVectorTensorLengths{}; @@ -669,24 +669,21 @@ struct ThreadwiseTensorSliceTransfer_v4r1 static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(), "wrong! SrcDesc and DstDesc need to known at compile-time"); - static_assert(is_same>, - remove_cv_t>>::value && - is_same>, - remove_cv_t>>::value, - "wrong! SrcBuffer or DstBuffer data type is wrong"); + static_assert( + is_same, remove_cvref_t>::value && + is_same, remove_cvref_t>::value, + "wrong! SrcBuffer or DstBuffer data type is wrong"); static_assert(DstBuffer::IsStaticBuffer(), "wrong! DstBuffer need to be StaticBuffer"); - static_assert( - is_known_at_compile_time< - remove_cv_t>>::value && - is_known_at_compile_time>>::value, - "wrong! SrcOriginToRefDistance and DstOriginToRefDistance need to be known " - "at compile-time"); + static_assert(is_known_at_compile_time>::value && + is_known_at_compile_time>::value, + "wrong! SrcOriginToRefDistance and DstOriginToRefDistance need to be known " + "at compile-time"); // SrcDesc and DstDesc are known at compile-time - constexpr auto src_desc = remove_cv_t>{}; - constexpr auto dst_desc = remove_cv_t>{}; + constexpr auto src_desc = remove_cvref_t{}; + constexpr auto dst_desc = remove_cvref_t{}; // SrcOriginToRefDisttance and DstOriginToRefDistance are known at compile-time constexpr auto src_ref_to_origin_disp_idx = to_multi_index(SrcRefToOriginDisplacement{}); diff --git a/composable_kernel/include/utility/amd_buffer_addressing.hpp b/composable_kernel/include/utility/amd_buffer_addressing.hpp index b7fd4bc409..3df53bda44 100644 --- a/composable_kernel/include/utility/amd_buffer_addressing.hpp +++ b/composable_kernel/include/utility/amd_buffer_addressing.hpp @@ -225,13 +225,49 @@ __device__ typename vector_type::type amd_buffer_load_impl(int32x4_t src_w index_t src_wave_addr_offset) { static_assert( - (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8)) || - (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || + (is_same::value && (N == 1 || N == 2 || N == 4)) || + (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8)) || (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8)) || - (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8)), + (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8)) || + (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)), "wrong! not implemented"); - if constexpr(is_same::value) + if constexpr(is_same::value) + { + // use fp32 load to mimic fp64 load + if constexpr(N == 1) + { + const float2_t tmp = llvm_amdgcn_raw_buffer_load_fp32x2( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + + return as_type(tmp); + } + else if constexpr(N == 2) + { + const float4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + + return as_type(tmp); + } + else if constexpr(N == 4) + { + const float4_t f32_0 = llvm_amdgcn_raw_buffer_load_fp32x4( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + + const float4_t f32_1 = + llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource, + src_thread_addr_offset, + src_wave_addr_offset + 4 * sizeof(float), + 0); + vector_type tmp; + + tmp.AsType()(Number<0>{}) = as_type(f32_0); + tmp.AsType()(Number<1>{}) = as_type(f32_1); + + return tmp.AsType()(Number<0>{}); + } + } + else if constexpr(is_same::value) { if constexpr(N == 1) { @@ -283,25 +319,11 @@ __device__ typename vector_type::type amd_buffer_load_impl(int32x4_t src_w } else if constexpr(N == 8) { -#if 0 - vector_type tmp; - - tmp.AsType()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_fp16x4( - src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); - - tmp.AsType()(Number<1>{}) = - llvm_amdgcn_raw_buffer_load_fp16x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset + 4 * sizeof(half_t), - 0); - - return tmp.AsType()(Number<0>{}); -#else + // use fp32 load to mimic fp16 load float4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4( src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); return as_type(tmp); -#endif } } else if constexpr(is_same::value) @@ -433,13 +455,34 @@ __device__ void amd_buffer_store_impl(const typename vector_type::type src index_t dst_wave_addr_offset) { static_assert( - (is_same::value && (N == 1 || N == 2 || N == 4)) || + (is_same::value && (N == 1 || N == 2)) || + (is_same::value && (N == 1 || N == 2 || N == 4)) || + (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8)) || (is_same::value && (N == 1 || N == 2 || N == 4)) || - (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || - (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8)), + (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)), "wrong! not implemented"); - if constexpr(is_same::value) + if constexpr(is_same::value) + { + // use fp32 store to mimic fp64 store + if constexpr(N == 1) + { + llvm_amdgcn_raw_buffer_store_fp32x2(as_type(src_thread_data), + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + else if constexpr(N == 2) + { + llvm_amdgcn_raw_buffer_store_fp32x4(as_type(src_thread_data), + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + } + else if constexpr(is_same::value) { if constexpr(N == 1) { @@ -466,6 +509,49 @@ __device__ void amd_buffer_store_impl(const typename vector_type::type src 0); } } + else if constexpr(is_same::value) + { + if constexpr(N == 1) + { + llvm_amdgcn_raw_buffer_store_fp16(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + else if constexpr(N == 2) + { + llvm_amdgcn_raw_buffer_store_fp16x2(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + else if constexpr(N == 4) + { + llvm_amdgcn_raw_buffer_store_fp16x4(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + else if constexpr(N == 8) + { + vector_type tmp{src_thread_data}; + + llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType()[Number<0>{}], + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + + llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType()[Number<1>{}], + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset + 4 * sizeof(half_t), + 0); + } + } else if constexpr(is_same::value) { if constexpr(N == 1) @@ -552,49 +638,6 @@ __device__ void amd_buffer_store_impl(const typename vector_type::type src 0); } } - else if constexpr(is_same::value) - { - if constexpr(N == 1) - { - llvm_amdgcn_raw_buffer_store_fp16(src_thread_data, - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - } - else if constexpr(N == 2) - { - llvm_amdgcn_raw_buffer_store_fp16x2(src_thread_data, - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - } - else if constexpr(N == 4) - { - llvm_amdgcn_raw_buffer_store_fp16x4(src_thread_data, - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - } - else if constexpr(N == 8) - { - vector_type tmp{src_thread_data}; - - llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType()[Number<0>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset, - 0); - - llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType()[Number<1>{}], - dst_wave_buffer_resource, - dst_thread_addr_offset, - dst_wave_addr_offset + 4 * sizeof(half_t), - 0); - } - } } template @@ -720,7 +763,7 @@ __device__ void amd_buffer_atomic_add_impl(const typename vector_type::typ } // buffer_load requires: -// 1) p_src_wave must be in global memory space +// 1) p_src_wave must point to global memory space // 2) p_src_wave must be a wavewise pointer. // It is user's responsibility to make sure that is true. template @@ -754,7 +797,7 @@ amd_buffer_load_invalid_element_return_return_zero(const T* p_src_wave, } // buffer_load requires: -// 1) p_src_wave must be in global memory space +// 1) p_src_wave must point to global memory space // 2) p_src_wave must be a wavewise pointer. // It is user's responsibility to make sure that is true. template @@ -782,7 +825,7 @@ amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave, } // buffer_store requires: -// 1) p_dst_wave must be global memory +// 1) p_dst_wave must point to global memory // 2) p_dst_wave must be a wavewise pointer. // It is user's responsibility to make sure that is true. template @@ -816,7 +859,7 @@ __device__ void amd_buffer_store(const typename vector_type_maker::type::t } // buffer_atomic_add requires: -// 1) p_dst_wave must be global memory +// 1) p_dst_wave must point to global memory // 2) p_dst_wave must be a wavewise pointer. // It is user's responsibility to make sure that is true. template diff --git a/composable_kernel/include/utility/array.hpp b/composable_kernel/include/utility/array.hpp index 7271094d39..911cefd057 100644 --- a/composable_kernel/include/utility/array.hpp +++ b/composable_kernel/include/utility/array.hpp @@ -48,7 +48,7 @@ struct Array template __host__ __device__ constexpr auto make_array(X&& x, Xs&&... xs) { - using data_type = remove_cv_t>; + using data_type = remove_cvref_t; return Array{{std::forward(x), std::forward(xs)...}}; } diff --git a/composable_kernel/include/utility/data_type.hpp b/composable_kernel/include/utility/data_type.hpp index 24a2190e84..bfaac8a939 100644 --- a/composable_kernel/include/utility/data_type.hpp +++ b/composable_kernel/include/utility/data_type.hpp @@ -73,6 +73,13 @@ struct scalar_type> }; // +template <> +struct scalar_type +{ + using type = double; + static constexpr index_t vector_size = 1; +}; + template <> struct scalar_type { @@ -864,6 +871,10 @@ struct vector_type } }; +// fp64 +using double2_t = typename vector_type::type; +using double4_t = typename vector_type::type; + // fp32 using float2_t = typename vector_type::type; using float4_t = typename vector_type::type; diff --git a/composable_kernel/include/utility/dynamic_buffer.hpp b/composable_kernel/include/utility/dynamic_buffer.hpp index a875afd9be..7029bd850f 100644 --- a/composable_kernel/include/utility/dynamic_buffer.hpp +++ b/composable_kernel/include/utility/dynamic_buffer.hpp @@ -39,18 +39,15 @@ struct DynamicBuffer } template >>::type, - typename scalar_type>>::type>::value, - bool>::type = false> + typename enable_if>::type, + typename scalar_type>::type>::value, + bool>::type = false> __host__ __device__ constexpr auto Get(index_t i, bool is_valid_element) const { // X contains multiple T - constexpr index_t scalar_per_t_vector = - scalar_type>>::vector_size; + constexpr index_t scalar_per_t_vector = scalar_type>::vector_size; - constexpr index_t scalar_per_x_vector = - scalar_type>>::vector_size; + constexpr index_t scalar_per_x_vector = scalar_type>::vector_size; static_assert(scalar_per_x_vector % scalar_per_t_vector == 0, "wrong! X need to be multiple T"); @@ -67,15 +64,14 @@ struct DynamicBuffer if constexpr(InvalidElementUseNumericalZeroValue) { - return amd_buffer_load_invalid_element_return_return_zero< - remove_cv_t>, - t_per_x>(p_data_, i, is_valid_element, element_space_size_); + return amd_buffer_load_invalid_element_return_return_zero, + t_per_x>( + p_data_, i, is_valid_element, element_space_size_); } else { - return amd_buffer_load_invalid_element_return_customized_value< - remove_cv_t>, - t_per_x>( + return amd_buffer_load_invalid_element_return_customized_value, + t_per_x>( p_data_, i, is_valid_element, element_space_size_, invalid_element_value_); } } @@ -94,18 +90,15 @@ struct DynamicBuffer } template >>::type, - typename scalar_type>>::type>::value, - bool>::type = false> + typename enable_if>::type, + typename scalar_type>::type>::value, + bool>::type = false> __host__ __device__ void Set(index_t i, bool is_valid_element, const X& x) { // X contains multiple T - constexpr index_t scalar_per_t_vector = - scalar_type>>::vector_size; + constexpr index_t scalar_per_t_vector = scalar_type>::vector_size; - constexpr index_t scalar_per_x_vector = - scalar_type>>::vector_size; + constexpr index_t scalar_per_x_vector = scalar_type>::vector_size; static_assert(scalar_per_x_vector % scalar_per_t_vector == 0, "wrong! X need to be multiple T"); @@ -115,7 +108,7 @@ struct DynamicBuffer #if CK_USE_AMD_BUFFER_ADDRESSING constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; - amd_buffer_store>, t_per_x>( + amd_buffer_store, t_per_x>( x, p_data_, i, is_valid_element, element_space_size_); #else if(is_valid_element) @@ -136,70 +129,65 @@ struct DynamicBuffer // ISA, so I try to let compiler emit IR "store" which would be lower to // ds_write_b128 // TODO: remove this after compiler fix - if constexpr(is_same>>::type, - int8_t>::value) + if constexpr(is_same>::type, int8_t>::value) { - static_assert( - (is_same>, int8_t>::value && - is_same>, int8_t>::value) || - (is_same>, int8_t>::value && - is_same>, int8x2_t>::value) || - (is_same>, int8_t>::value && - is_same>, int8x4_t>::value) || - (is_same>, int8x4_t>::value && - is_same>, int8x4_t>::value) || - (is_same>, int8x8_t>::value && - is_same>, int8x8_t>::value) || - (is_same>, int8x16_t>::value && - is_same>, int8x16_t>::value), - "wrong! not implemented for this combination, please add " - "implementation"); + static_assert((is_same, int8_t>::value && + is_same, int8_t>::value) || + (is_same, int8_t>::value && + is_same, int8x2_t>::value) || + (is_same, int8_t>::value && + is_same, int8x4_t>::value) || + (is_same, int8x4_t>::value && + is_same, int8x4_t>::value) || + (is_same, int8x8_t>::value && + is_same, int8x8_t>::value) || + (is_same, int8x16_t>::value && + is_same, int8x16_t>::value), + "wrong! not implemented for this combination, please add " + "implementation"); - if constexpr(is_same>, int8_t>::value && - is_same>, int8_t>::value) + if constexpr(is_same, int8_t>::value && + is_same, int8_t>::value) { // HACK: cast pointer of x is bad // TODO: remove this after compiler fix *c_style_pointer_cast(&p_data_[i]) = *c_style_pointer_cast(&x); } - else if constexpr(is_same>, int8_t>::value && - is_same>, int8x2_t>::value) + else if constexpr(is_same, int8_t>::value && + is_same, int8x2_t>::value) { // HACK: cast pointer of x is bad // TODO: remove this after compiler fix *c_style_pointer_cast(&p_data_[i]) = *c_style_pointer_cast(&x); } - else if constexpr(is_same>, int8_t>::value && - is_same>, int8x4_t>::value) + else if constexpr(is_same, int8_t>::value && + is_same, int8x4_t>::value) { // HACK: cast pointer of x is bad // TODO: remove this after compiler fix *c_style_pointer_cast(&p_data_[i]) = *c_style_pointer_cast(&x); } - else if constexpr(is_same>, - int8x4_t>::value && - is_same>, int8x4_t>::value) + else if constexpr(is_same, int8x4_t>::value && + is_same, int8x4_t>::value) { // HACK: cast pointer of x is bad // TODO: remove this after compiler fix *c_style_pointer_cast(&p_data_[i]) = *c_style_pointer_cast(&x); } - else if constexpr(is_same>, - int8x8_t>::value && - is_same>, int8x8_t>::value) + else if constexpr(is_same, int8x8_t>::value && + is_same, int8x8_t>::value) { // HACK: cast pointer of x is bad // TODO: remove this after compiler fix *c_style_pointer_cast(&p_data_[i]) = *c_style_pointer_cast(&x); } - else if constexpr(is_same>, - int8x16_t>::value && - is_same>, int8x16_t>::value) + else if constexpr(is_same, int8x16_t>::value && + is_same, int8x16_t>::value) { // HACK: cast pointer of x is bad // TODO: remove this after compiler fix @@ -224,18 +212,15 @@ struct DynamicBuffer } template >>::type, - typename scalar_type>>::type>::value, - bool>::type = false> + typename enable_if>::type, + typename scalar_type>::type>::value, + bool>::type = false> __host__ __device__ void AtomicAdd(index_t i, bool is_valid_element, const X& x) { // X contains multiple T - constexpr index_t scalar_per_t_vector = - scalar_type>>::vector_size; + constexpr index_t scalar_per_t_vector = scalar_type>::vector_size; - constexpr index_t scalar_per_x_vector = - scalar_type>>::vector_size; + constexpr index_t scalar_per_x_vector = scalar_type>::vector_size; static_assert(scalar_per_x_vector % scalar_per_t_vector == 0, "wrong! X need to be multiple T"); @@ -245,7 +230,7 @@ struct DynamicBuffer #if CK_USE_AMD_BUFFER_ADDRESSING constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; - amd_buffer_atomic_add>, t_per_x>( + amd_buffer_atomic_add, t_per_x>( x, p_data_, i, is_valid_element, element_space_size_); #else if(is_valid_element) @@ -266,9 +251,14 @@ __host__ __device__ constexpr auto make_dynamic_buffer(T* p, ElementSpaceSize el return DynamicBuffer{p, element_space_size}; } -template +template < + AddressSpaceEnum_t BufferAddressSpace, + typename T, + typename ElementSpaceSize, + typename X, + typename enable_if, remove_cvref_t>::value, bool>::type = false> __host__ __device__ constexpr auto -make_dynamic_buffer(T* p, ElementSpaceSize element_space_size, T invalid_element_value) +make_dynamic_buffer(T* p, ElementSpaceSize element_space_size, X invalid_element_value) { return DynamicBuffer{ p, element_space_size, invalid_element_value}; diff --git a/composable_kernel/include/utility/tuple.hpp b/composable_kernel/include/utility/tuple.hpp index ee96a8b435..70f4d77d87 100644 --- a/composable_kernel/include/utility/tuple.hpp +++ b/composable_kernel/include/utility/tuple.hpp @@ -159,7 +159,7 @@ struct Tuple : detail::TupleImpl __host__ __device__ constexpr auto make_tuple(Xs&&... xs) { - return Tuple>...>(std::forward(xs)...); + return Tuple...>(std::forward(xs)...); } } // namespace ck diff --git a/composable_kernel/include/utility/tuple_helper.hpp b/composable_kernel/include/utility/tuple_helper.hpp index 9499a3596c..55a79d2594 100644 --- a/composable_kernel/include/utility/tuple_helper.hpp +++ b/composable_kernel/include/utility/tuple_helper.hpp @@ -14,9 +14,7 @@ struct is_known_at_compile_time> return container_reduce( Tuple{}, [](auto x, bool r) { - return is_known_at_compile_time< - remove_cv_t>>::value & - r; + return is_known_at_compile_time>::value & r; }, true); } diff --git a/composable_kernel/include/utility/type.hpp b/composable_kernel/include/utility/type.hpp index b7902ad496..89a2bdbde6 100644 --- a/composable_kernel/include/utility/type.hpp +++ b/composable_kernel/include/utility/type.hpp @@ -22,6 +22,9 @@ using remove_reference_t = typename std::remove_reference::type; template using remove_cv_t = typename std::remove_cv::type; +template +using remove_cvref_t = remove_cv_t>; + template inline constexpr bool is_pointer_v = std::is_pointer::value; diff --git a/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp b/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp index c1208ac3cb..71239e0ecc 100644 --- a/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp +++ b/composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp @@ -374,13 +374,8 @@ extern "C" __global__ void CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1{}, CGridBlockCluster_BlockId_To_GM10_GN10{})); - const auto desc_tuple = *reinterpret_cast( -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wold-style-cast" - // TODO: how to cast? - (const void*)p_desc_tuple -#pragma clang diagnostic pop - ); + const auto desc_tuple = + *reinterpret_cast(cast_pointer_to_generic_address_space(p_desc_tuple)); const auto a_grid_desc_gk0_gm0_gm10_gm11_gk1 = desc_tuple[I0]; const auto b_grid_desc_gk0_gn0_gn10_gn11_gk1 = desc_tuple[I1];