From 498e71b09822406b1b050c5eb03edebfe04038a6 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 4 Jun 2019 17:02:49 -0500 Subject: [PATCH] try using more constexpr --- ...lution_implicit_gemm_v1_chwn_cyxk_khwn.hpp | 2 +- driver/driver.hip.cpp | 4 +- src/include/Array.hip.hpp | 17 ++- .../ConstantMergedTensorDescriptor.hip.hpp | 103 +++++++++++++++++- src/include/ConstantTensorDescriptor.hip.hpp | 102 +++++++++++++++-- src/include/Sequence.hip.hpp | 48 +++++--- .../blockwise_generic_tensor_slice_op.hip.hpp | 14 +++ src/include/functional.hip.hpp | 16 ++- ...3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp | 6 +- ...4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp | 2 +- 10 files changed, 272 insertions(+), 42 deletions(-) diff --git a/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp b/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp index 919c3503f5..217eb853d9 100644 --- a/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp +++ b/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp @@ -140,7 +140,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, constexpr index_t WeiBlockCopyDataPerRead_K = 4; constexpr index_t OutThreadCopyDataPerWrite_N = 2; -#elif 1 +#elif 0 // for 3x3, 34x34, v1r3, Pascal // for 3x3, 28x28, v1r3, Pascal // for 3x3, 14x14, v1r3, Pascal diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index 9022bcdf2a..bb228d31bf 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -443,7 +443,7 @@ int main(int argc, char* argv[]) constexpr index_t HPad = 0; constexpr index_t WPad = 0; -#elif 1 +#elif 0 // 3x3 filter, 28x28 image constexpr index_t N = 128; constexpr index_t C = 256; @@ -455,7 +455,7 @@ int main(int argc, char* argv[]) constexpr index_t HPad = 0; constexpr index_t WPad = 0; -#elif 0 +#elif 1 // 1x1 filter, 28x28 image constexpr index_t N = 128; constexpr index_t C = 512; diff --git a/src/include/Array.hip.hpp b/src/include/Array.hip.hpp index 14f7a6524d..9d3d385738 100644 --- a/src/include/Array.hip.hpp +++ b/src/include/Array.hip.hpp @@ -18,11 +18,24 @@ struct Array __host__ __device__ constexpr index_t GetSize() const { return NSize; } - __host__ __device__ const TData& operator[](index_t i) const { return mData[i]; } + __host__ __device__ constexpr TData operator[](index_t i) const { return mData[i]; } __host__ __device__ TData& operator[](index_t i) { return mData[i]; } - __host__ __device__ auto PushBack(TData x) const + template + __host__ __device__ constexpr TData Get(Number) const + { + return mData[I]; + } + + template + __host__ __device__ constexpr bool Set(Number, TData x) + { + mData[I] = x; + return true; // for constexpr + } + + __host__ __device__ constexpr auto PushBack(TData x) const { Array new_array; diff --git a/src/include/ConstantMergedTensorDescriptor.hip.hpp b/src/include/ConstantMergedTensorDescriptor.hip.hpp index 157546c979..b595a2f0a4 100644 --- a/src/include/ConstantMergedTensorDescriptor.hip.hpp +++ b/src/include/ConstantMergedTensorDescriptor.hip.hpp @@ -74,7 +74,8 @@ struct ConstantMergedTensorDescriptor return OriginalTensorDesc::GetElementSize(); } - __host__ __device__ static auto +#if 0 + __host__ __device__ static constexpr auto GetOriginalMultiIndexFromMultiIndex(Array multi_id) { Array original_multi_id; @@ -98,21 +99,111 @@ struct ConstantMergedTensorDescriptor return original_multi_id; } - - __host__ __device__ static index_t GetOffsetFromMultiIndex(Array multi_id) +#else + template + struct GetOriginalMultiIndexFromMultiIndex_impl1 { - const auto original_multi_id = GetOriginalMultiIndexFromMultiIndex(multi_id); + const Array& original_multi_id_partial_ref; + Array& original_multi_id_ref; + + __host__ __device__ constexpr GetOriginalMultiIndexFromMultiIndex_impl1( + const Array& original_multi_id_partial, + Array& original_multi_id) + : original_multi_id_partial_ref(original_multi_id_partial), + original_multi_id_ref(original_multi_id) + { + } + + template + constexpr __host__ __device__ bool operator()(Number) const + { + constexpr index_t idim_original = OriginalDimsPartial::Get(Number{}); + + index_t itmp = original_multi_id_partial_ref.Get(Number{}); + + original_multi_id_ref.Set(Number{}, itmp); + + return true; + } + }; + + struct GetOriginalMultiIndexFromMultiIndex_impl0 + { + const Array& multi_id_ref; + Array& original_multi_id_ref; + + __host__ __device__ constexpr GetOriginalMultiIndexFromMultiIndex_impl0( + const Array& multi_id, Array& original_multi_id) + : multi_id_ref(multi_id), original_multi_id_ref(original_multi_id) + { + } + + template + constexpr __host__ __device__ bool operator()(Number) const + { + constexpr auto original_dims_partial = + std::get(std::tuple{}); + + // get partial original-multi-id corresponding to this merged dimension + const auto original_multi_id_partial = + OriginalTensorDesc::Extract(original_dims_partial) + .GetMultiIndexFrom1dIndex(multi_id_ref[IDim]); + + static_for<0, original_dims_partial.GetSize(), 1>{}( + GetOriginalMultiIndexFromMultiIndex_impl1( + original_multi_id_partial, original_multi_id_ref)); + + return true; + } + }; + + __host__ __device__ static constexpr auto + GetOriginalMultiIndexFromMultiIndex(Array multi_id) + { + Array original_multi_id; + + static_for<0, nDim, 1>{}( + GetOriginalMultiIndexFromMultiIndex_impl0(multi_id, original_multi_id)); + + return original_multi_id; + } + + template + __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(Sequence) + { + constexpr auto multi_id = sequence2array(Sequence{}); + + constexpr auto original_multi_id = GetOriginalMultiIndexFromMultiIndex(multi_id); + + return OriginalTensorDesc::GetOffsetFromMultiIndex(original_multi_id); + } +#endif + +#if 0 + // return type is Sequence<...> + template + __host__ __device__ static constexpr auto GetOriginalMultiIndexFromMultiIndex(Sequence) + { + // not implemented + return Sequence<>{}; + } +#endif + + __host__ __device__ static constexpr index_t + GetOffsetFromMultiIndex(Array multi_id) + { + auto original_multi_id = GetOriginalMultiIndexFromMultiIndex(multi_id); return OriginalTensorDesc::GetOffsetFromMultiIndex(original_multi_id); } template - __host__ __device__ static index_t GetOffsetFromMultiIndex(Is... is) + __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(Is... is) { return GetOffsetFromMultiIndex(Array{is...}); } - __host__ __device__ static Array GetMultiIndexFrom1dIndex(index_t id) + __host__ __device__ static constexpr Array GetMultiIndexFrom1dIndex(index_t id) { constexpr auto dummy_desc = make_ConstantTensorDescriptor_packed(GetLengths()); diff --git a/src/include/ConstantTensorDescriptor.hip.hpp b/src/include/ConstantTensorDescriptor.hip.hpp index 788d197351..afafab8e0e 100644 --- a/src/include/ConstantTensorDescriptor.hip.hpp +++ b/src/include/ConstantTensorDescriptor.hip.hpp @@ -4,7 +4,8 @@ template __host__ __device__ constexpr auto calculate_tensor_strides_packed(Lengths) { - return reverse_inclusive_scan_sequence(Lengths{}.PopFront(), mod_conv::multiplies{}) + return reverse_inclusive_scan_sequence( + Lengths{}.PopFront(), mod_conv::multiplies{}, Number<1>{}) .PushBack(Number<1>{}); } @@ -91,8 +92,10 @@ struct ConstantTensorDescriptor return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get()); } +#if 0 template - __host__ __device__ static index_t GetOffsetFromMultiIndex(Array multi_id) + __host__ __device__ static constexpr index_t + GetOffsetFromMultiIndex(Array multi_id) { static_assert(NSize == nDim, "wrong! Dimension not consistent"); @@ -105,9 +108,43 @@ struct ConstantTensorDescriptor return offset; } +#else + template + struct GetOffsetFromMultiIndex_impl + { + Array& multi_id_ref; + index_t& offset_ref; + + __host__ __device__ constexpr GetOffsetFromMultiIndex_impl(Array& multi_id, + index_t& offset) + : multi_id_ref(multi_id), offset_ref(offset) + { + } + + template + __host__ __device__ constexpr bool operator()(Number) const + { + offset_ref += multi_id_ref.Get(Number{}) * Type::GetStride(Number{}); + return true; + } + }; + + template + __host__ __device__ static constexpr index_t + GetOffsetFromMultiIndex(Array multi_id) + { + static_assert(NSize == nDim, "wrong! Dimension not consistent"); + + index_t offset = 0; + + static_for<0, nDim, 1>{}(GetOffsetFromMultiIndex_impl(multi_id, offset)); + + return offset; + } +#endif template - __host__ __device__ static index_t GetOffsetFromMultiIndex(Is... is) + __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(Is... is) { return GetOffsetFromMultiIndex(Array{is...}); } @@ -123,7 +160,8 @@ struct ConstantTensorDescriptor multi_id * GetStrides(), mod_conv::plus{}, Number<0>{}); } - __host__ __device__ static Array GetMultiIndexFrom1dIndex(index_t id) +#if 0 + __host__ __device__ static constexpr Array GetMultiIndexFrom1dIndex(index_t id) { Array multi_id; @@ -141,8 +179,58 @@ struct ConstantTensorDescriptor return multi_id; } +#else + struct GetMultiIndexFrom1dIndex_impl + { + using DummyStrides = decltype(calculate_tensor_strides_packed(GetLengths())); - __host__ __device__ static auto + index_t& id_ref; + Array& multi_id_ref; + + __host__ __device__ constexpr GetMultiIndexFrom1dIndex_impl(index_t& id, + Array& multi_id) + : id_ref(id), multi_id_ref(multi_id) + { + } + + template + __host__ __device__ constexpr bool operator()(Number) const + { + constexpr index_t stride = DummyStrides::Get(Number{}); + multi_id_ref.Set(Number{}, id_ref / stride); + id_ref -= multi_id_ref.Get(Number{}) * stride; + + return true; + } + }; + + __host__ __device__ static constexpr Array GetMultiIndexFrom1dIndex(index_t id) + { + Array multi_id; + + constexpr auto dummy_strides = calculate_tensor_strides_packed(GetLengths()); + + // calculate index in each of the dimensions in the order of their dimension + static_for<0, nDim - 1, 1>{}(GetMultiIndexFrom1dIndex_impl(id, multi_id)); + + index_t itmp = id / dummy_strides.Get(Number{}); + + multi_id.Set(Number{}, itmp); + + return multi_id; + } +#endif + +#if 0 + // return type is Sequence<...> + template + __host__ __device__ static constexpr auto GetMultiIndexFrom1dIndex(Number) + { + return inclusive_scan_sequence(f_impl, GetStrides(), Number{}); + } +#endif + + __host__ __device__ static constexpr auto GetOriginalMultiIndexFromMultiIndex(Array multi_id) { return multi_id; @@ -278,8 +366,8 @@ struct ConstantTensorDescriptor // folded strides constexpr auto fold_strides = Number{} * - reverse_inclusive_scan_sequence(fold_intervals.PushBack(Number<1>{}), - mod_conv::multiplies{}); + reverse_inclusive_scan_sequence( + fold_intervals.PushBack(Number<1>{}), mod_conv::multiplies{}, Number<1>{}); // left and right constexpr auto left = typename arithmetic_sequence_gen<0, IDim, 1>::SeqType{}; diff --git a/src/include/Sequence.hip.hpp b/src/include/Sequence.hip.hpp index 1b96b9351b..b14b88d4d5 100644 --- a/src/include/Sequence.hip.hpp +++ b/src/include/Sequence.hip.hpp @@ -139,31 +139,49 @@ struct arithmetic_sequence_gen typename arithmetic_sequence_gen_impl::SeqType; }; -template +// reverse scan with init +template struct sequence_reverse_inclusive_scan; -template -struct sequence_reverse_inclusive_scan, Reduce> +template +struct sequence_reverse_inclusive_scan, Reduce, Init> { - using old_scan = typename sequence_reverse_inclusive_scan, Reduce>::SeqType; + using old_scan = + typename sequence_reverse_inclusive_scan, Reduce, Init>::SeqType; static constexpr index_t new_reduce = Reduce{}(I, old_scan{}.Front()); using SeqType = typename sequence_merge, old_scan>::SeqType; }; -template -struct sequence_reverse_inclusive_scan, Reduce> +template +struct sequence_reverse_inclusive_scan, Reduce, Init> { - using SeqType = Sequence; + using SeqType = Sequence; }; -template -struct sequence_reverse_inclusive_scan, Reduce> +template +struct sequence_reverse_inclusive_scan, Reduce, Init> { using SeqType = Sequence<>; }; +#if 0 +// reverse scan with token +template +struct sequence_reverse_inclusive_token_scan; + +template +struct sequence_reverse_inclusive_token_scan, F, Token> +{ + using old_scan = typename sequence_reverse_inclusive_token_scan, F, Token>::SeqType; + + static constexpr index_t new_reduce = Reduce{}(I, old_scan{}.Front()); + + using SeqType = typename sequence_merge, old_scan>::SeqType; +}; +#endif + template struct sequence_extract; @@ -434,16 +452,16 @@ transform_sequences(F f, Sequence, Sequence, Sequence) return Sequence{}; } -template -__host__ __device__ constexpr auto reverse_inclusive_scan_sequence(Seq, Reduce) +template +__host__ __device__ constexpr auto reverse_inclusive_scan_sequence(Seq, Reduce, Number) { - return typename sequence_reverse_inclusive_scan::SeqType{}; + return typename sequence_reverse_inclusive_scan::SeqType{}; } -template -__host__ __device__ constexpr auto inclusive_scan_sequence(Seq, Reduce) +template +__host__ __device__ constexpr auto inclusive_scan_sequence(Seq, Reduce, Number) { - return reverse_inclusive_scan_sequence(Seq{}.Reverse(), Reduce{}).Reverse(); + return reverse_inclusive_scan_sequence(Seq{}.Reverse(), Reduce{}, Number{}).Reverse(); } template diff --git a/src/include/blockwise_generic_tensor_slice_op.hip.hpp b/src/include/blockwise_generic_tensor_slice_op.hip.hpp index d7b46cde1b..d080c362e6 100644 --- a/src/include/blockwise_generic_tensor_slice_op.hip.hpp +++ b/src/include/blockwise_generic_tensor_slice_op.hip.hpp @@ -203,6 +203,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths); static_ford{}([&](auto repeat_multi_id_) { +#if 0 constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){}); const auto src_thread_data_multi_id_begin = @@ -216,6 +217,19 @@ struct BlockwiseGenericTensorSliceCopy_v1 const index_t clipboard_offset = thread_tensor_desc.GetOffsetFromMultiIndex( clipboard_data_multi_id_begin); // cannot not constexpr, why? +#else + constexpr auto src_thread_data_multi_id_begin = + repeat_multi_id_ * data_per_cluster_per_dims; + + constexpr auto clipboard_data_multi_id_begin = + repeat_multi_id_ * thread_sub_tensor_lengths; + + constexpr index_t src_offset = + SrcDesc::GetOffsetFromMultiIndex(src_thread_data_multi_id_begin); + + constexpr index_t clipboard_offset = + thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin); +#endif threadwise_generic_tensor_slice_copy_v1(SrcDesc{}, p_src + src_offset + mThreadSrcOffset, diff --git a/src/include/functional.hip.hpp b/src/include/functional.hip.hpp index 94835d294b..f287e5c08e 100644 --- a/src/include/functional.hip.hpp +++ b/src/include/functional.hip.hpp @@ -4,9 +4,9 @@ struct forwarder { template - __host__ __device__ constexpr T operator()(T&& x) const + __host__ __device__ constexpr T&& operator()(T&& x) const { - return std::forward(x); + return static_cast(x); } }; @@ -76,7 +76,7 @@ template struct static_for_impl { template - __host__ __device__ void operator()(F f) const + constexpr __host__ __device__ void operator()(F f) const { static_assert(Remaining % Increment == 0, "wrong! Remaining % Increment != 0"); static_assert(Increment <= Remaining, "will go out-of-range"); @@ -90,7 +90,7 @@ template struct static_for_impl { template - __host__ __device__ void operator()(F) const + constexpr __host__ __device__ void operator()(F) const { // no work left, just return return; @@ -102,13 +102,19 @@ template struct static_for { template - __host__ __device__ void operator()(F f) const + constexpr __host__ __device__ void operator()(F f) const { + static_assert(NBegin <= NEnd, "wrongs! should have NBegin <= NEnd"); + static_assert((NEnd - NBegin) % Increment == 0, "Wrong! should satisfy (NEnd - NBegin) % Increment == 0"); +#if 0 static_if<(NBegin < NEnd)>{}( [&](auto fwd) { static_for_impl{}(f); }); +#else + static_for_impl{}(f); +#endif } }; diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp index ca0ac22944..af7f841644 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp @@ -155,7 +155,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn decltype(wei_c_k_global_desc), decltype(wei_c_k_block_desc), decltype(wei_c_k_block_desc.GetLengths()), - WeiBlockCopyDataPerRead_K>{}; + WeiBlockCopyDataPerRead_K>({0, 0}, {0, 0}); // a series of blockwise batched GEMM // C_matrix += transpose(A_matrix) * B_matrix @@ -235,8 +235,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn } #endif - // set threadwise output tensor to 0 - threadwise_4d_tensor_set_zero(out_k_h_w_n_thread_desc, p_out_thread); + // set threadwise output to 0 + threadwise_matrix_set_zero(c_k_wn_thread_mtx_desc, p_out_thread); for(index_t y = 0; y < Y; ++y) { diff --git a/src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp index 3cb67d4058..80d933b21a 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp @@ -246,7 +246,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw // choose GEMM implementation here const auto run_blockwise_gemm = [&](auto... Xs) { -#if 1 +#if 0 return blockwise_gemm.Run(Xs...); #else return blockwise_gemm.Run_asm(Xs...);