From 4f4aba4872e0b080a112337972c5cc02623c7b1a Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 24 Sep 2019 23:59:47 -0500 Subject: [PATCH] adding GetLinearDimensionMask() --- ...chw_kcyx_nkhw_padded_lds_double_buffer.hpp | 4 +- .../tensor_coordinate_v2.hpp | 5 + .../tensor_description/tensor_descriptor.hpp | 138 ++++++++++-------- .../include/utility/functional.hpp | 12 ++ .../include/utility/sequence.hpp | 84 ++++++++--- 5 files changed, 152 insertions(+), 91 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp index 4d941b5053..6303c09f95 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp @@ -440,9 +440,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf 0, b_thread_data_on_global, 0}) -#if 0 - .Run_generic -#elif 1 +#if 1 .template Run_generic #elif 1 .template Run_optimized_dst_address_calculation diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp index b88b52671a..8004d57c2a 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp @@ -45,6 +45,7 @@ struct NativeTensorCoordinate __host__ __device__ constexpr type operator+=(const Index& idx_diff) { // mIndex is updated here, but some (or all) of its entries may never be used + // compiler should remove those entries as dead code mIndex += idx_diff; mOffset += tensor_desc_type::CalculateOffsetDiff(idx_diff); @@ -55,6 +56,7 @@ struct 
NativeTensorCoordinate __host__ __device__ constexpr type operator-=(const Index& idx_diff) { // mIndex is updated here, but some (or all) of its entries may never be used + // compiler should remove those entries as dead code mIndex -= idx_diff; mOffset -= tensor_desc_type::CalculateOffsetDiff(idx_diff); @@ -136,6 +138,7 @@ struct TransformedTensorCoordinate idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex()); // mIndexUp is updated here, but some (or all) of its entries may never be used + // compiler should remove those entries as dead code mIndexUp += idx_up_diff; return *this; @@ -146,6 +149,8 @@ struct TransformedTensorCoordinate mCoordLow -= tensor_desc_type::CalculateLowerIndexDiff( idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex()); + // mIndex is updated here, but some (or all) of its entries may never be used + // compiler should remove those entries as dead code mIndexUp -= idx_up_diff; return *this; diff --git a/composable_kernel/include/tensor_description/tensor_descriptor.hpp b/composable_kernel/include/tensor_description/tensor_descriptor.hpp index dc2039355c..a934651911 100644 --- a/composable_kernel/include/tensor_description/tensor_descriptor.hpp +++ b/composable_kernel/include/tensor_description/tensor_descriptor.hpp @@ -101,12 +101,12 @@ struct NativeTensorDescriptor return true; } - __host__ __device__ static constexpr auto GetMaskOfLinearDimensions() + __host__ __device__ static constexpr auto GetLinearDimensionMask() { return typename uniform_sequence_gen::type{}; } - __host__ __device__ static constexpr auto GetMaskOfNonLinearDimensions() + __host__ __device__ static constexpr auto GetNonLinearDimensionMask() { return typename uniform_sequence_gen::type{}; } @@ -353,18 +353,27 @@ struct TransformedTensorDescriptor return GetLowerTensorDescriptor().CalculateOffset(CalculateLowerIndex(idx_up)); } -#if 0 - struct lambda_sequence_logic_or +#if 1 + struct lambda_sequence_logical_and { template - __host__ __device__ constexpr auto 
operator()(Seqs... seqs) const + __host__ __device__ constexpr auto operator()(Seqs...) const { - // TODO: should use math::logic_or, after Sequence can take bool - return typename sequence_reduce, Seqs...>::type{}; + return typename sequence_reduce, Seqs...>::type{}; } }; - struct lambda_1 + template + struct lambda_is_true + { + __host__ __device__ constexpr auto operator()(const T& x) const + { + // TODO: remove static_cast once Sequence can take bool as entries + return static_cast(x) == true; + } + }; + + struct lambda_get_linear_dimension_mask_of_single_tranform { // check only one transform at a time template @@ -372,73 +381,73 @@ struct TransformedTensorDescriptor operator()(const Transform& tran, LowDimensionId, UpDimensionId) const { // judge if transformation is linear - constexpr bool is_linear_transform = tran.IsLinearTransform(); + constexpr bool is_linear_transform = Transform::IsLinearTransform(); // judge if all lower dimension are linear - constexpr bool is_all_low_dim_linear = math::reduce_on_sequence( - pick_sequence_elements_by_mask( - GetLowerTensorDescriptor().GetMaskOfLinearDimensions(), LowDimensionId{}), - math::logic_and{}, - integral_constant{}); + constexpr bool are_all_low_dim_linear = sequence_all_of( + pick_sequence_elements_by_ids(GetLowerTensorDescriptor().GetLinearDimensionMask(), + LowDimensionId{}), + lambda_is_true{}); - // judge if upper dimenisons are linear - constexpr bool is_up_dim_nonlinear = !(is_linear_transform && is_all_low_dim_linear); + // create linear mask for upper dimensions + constexpr bool are_up_dim_linear = is_linear_transform && are_all_low_dim_linear; - constexpr auto value_sequence = - typename uniform_sequence_gen::type{}; + constexpr auto mask_of_up_linear_dims = modify_sequence_elements_by_ids( + typename uniform_sequence_gen::type{}, + typename uniform_sequence_gen::type{}, + UpDimensionId{}); - constexpr auto mask_of_up_nonlinear_dims = modifiy_sequence( - typename uniform_sequence_gen::type{}, 
value_sequence, UpDimensionId{}); - - return mask_of_up_nonlinear_dims; - }; - - __host__ __device__ static constexpr bool GetMaskOfNonLinearDimensions() - { - // create tuple of linear dimension masks, for all transformations - constexpr auto tuple_of_nonlinear_dimension_mask = - transform_tuples(lambda_1{}, Transforms{}, LowDimensionIds{}, UpDimensionIds{}); - - // reduce tuple of masks into one mask - constexpr auto nonlinear_dimension_mask = - unpack(lambda_sequence_logic_or{}, tuple_of_nonlinear_dimension_mask); - - return nonlinear_dimension_mask; + return mask_of_up_linear_dims; } + }; - __host__ __device__ static constexpr bool GetMaskOfLinearDimensions() - { - return GetMaskOfNonLinearDimensions().Transform(math::logic_not{}); - } + __host__ __device__ static constexpr auto GetLinearDimensionMask() + { + // create tuple of linear dimension masks, for all transformations + constexpr auto tuple_of_linear_dimension_mask = + transform_tuples(lambda_get_linear_dimension_mask_of_single_tranform{}, + Transforms{}, + LowDimensionIds{}, + UpDimensionIds{}); - template - __host__ __device__ static constexpr bool IsLinearDimension(Number) - { - return GetMaskOfLinearDimensions().At(Number{}); - } + // reduce tuple of masks into one mask + constexpr auto linear_dimension_mask = + unpack(lambda_sequence_logical_and{}, tuple_of_linear_dimension_mask); - __host__ __device__ static constexpr auto GetLinearDimensions() - { - constexpr auto linear_dimension_mask = GetMaskOfLienarDimensions(); + return linear_dimension_mask; + } - return pick_sequence_elements_by_mask( - typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, linear_dimension_mask); - } + __host__ __device__ static constexpr auto GetNonLinearDimensionMask() + { + return GetLinearDimensionMask().Transform(logical_not{}); + } - __host__ __device__ static constexpr auto GetNonLinearDimensions() - { - constexpr auto nonlinear_dimension_mask = - GetMaskOfLienarDimensions().Transform(math::logic_not{}); + template 
+ __host__ __device__ static constexpr bool IsLinearDimension(Number) + { + return GetLinearDimensionMask().At(Number{}); + } - return pick_sequence_elements_by_mask( - typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, nonlinear_dimension_mask); - } + __host__ __device__ static constexpr auto GetLinearDimensions() + { + constexpr auto linear_dimension_mask = GetLinearDimensionMask(); - __host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups() - { - // not implemented - } + return pick_sequence_elements_by_mask( + typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, linear_dimension_mask); + } + + __host__ __device__ static constexpr auto GetNonLinearDimensions() + { + constexpr auto nonlinear_dimension_mask = GetNonLinearDimensionMask(); + + return pick_sequence_elements_by_mask( + typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, nonlinear_dimension_mask); + } + + __host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups() + { + // not implemented + } #endif __host__ __device__ static constexpr bool @@ -457,9 +466,10 @@ struct TransformedTensorDescriptor return flag; } - // Whenever this function is called, it will call CalculateLowerIndex() recursively + // Whenever this function is called, it will call CalculateLowerIndex() recursively. // If you have created a tensor coordinate already, instead of calling this function, - // you should call TransformedTensorCoordinate::IsUpperIndexMappedToValidOffset() + // you should call TensorCoordinate::IsUpperIndexMappedToValidOffset() which would + // be less expensive. 
__host__ __device__ static constexpr bool IsUpperIndexMappedToValidOffset(const UpperIndex& idx_up) { diff --git a/composable_kernel/include/utility/functional.hpp b/composable_kernel/include/utility/functional.hpp index 6232e2243f..c49800827d 100644 --- a/composable_kernel/include/utility/functional.hpp +++ b/composable_kernel/include/utility/functional.hpp @@ -25,6 +25,18 @@ struct swallow } }; +template +struct logical_and +{ + constexpr bool operator()(const T& x, const T& y) const { return x && y; } +}; + +template +struct logical_or +{ + constexpr bool operator()(const T& x, const T& y) const { return x || y; } +}; + template struct logical_not { diff --git a/composable_kernel/include/utility/sequence.hpp b/composable_kernel/include/utility/sequence.hpp index 3ccc8c5f65..85855780e3 100644 --- a/composable_kernel/include/utility/sequence.hpp +++ b/composable_kernel/include/utility/sequence.hpp @@ -311,7 +311,7 @@ struct sequence_reverse> using type = Sequence; }; -#if 0 +#if 1 template struct sequence_reduce { @@ -755,46 +755,82 @@ __host__ __device__ constexpr auto pick_sequence_elements_by_ids(Seq, Sequence{})...>{}; } -#if 0 +#if 1 +namespace detail { +template +struct pick_sequence_elements_by_mask_impl +{ + using new_work_seq = typename conditional::type; + + using type = + typename pick_sequence_elements_by_mask_impl::type; +}; + +template +struct pick_sequence_elements_by_mask_impl, Sequence<>> +{ + using type = WorkSeq; +}; + +} // namespace detail + template __host__ __device__ constexpr auto pick_sequence_elements_by_mask(Seq, Mask) { - // not implemented + static_assert(Seq::Size() == Mask::Size(), "wrong!"); + + return typename detail::pick_sequence_elements_by_mask_impl, Seq, Mask>::type{}; +} + +namespace detail { +template +struct modify_sequence_elements_by_ids_impl +{ + using new_work_seq = decltype(WorkSeq::Modify(RemainIds::Front(), RemainValues::Front())); + + using type = + typename modify_sequence_elements_by_ids_impl::type; +}; + 
+template +struct modify_sequence_elements_by_ids_impl, Sequence<>> +{ + using type = WorkSeq; +}; +} // namespace detail + +template +__host__ __device__ constexpr auto modify_sequence_elements_by_ids(Seq, Values, Ids) +{ + static_assert(Values::Size() == Ids::Size() && Seq::Size() >= Values::Size(), "wrong!"); + + return typename detail::modify_sequence_elements_by_ids_impl::type{}; } #endif -template -struct lambda_reduce_on_sequence -{ - const Reduce& f; - index_t& result; - - __host__ __device__ constexpr lambda_reduce_on_sequence(const Reduce& f_, index_t& result_) - : f(f_), result(result_) - { - } - - template - __host__ __device__ constexpr index_t operator()(IDim) const - { - return result = f(result, Seq::At(IDim{})); - } -}; - template __host__ __device__ constexpr index_t reduce_on_sequence(Seq, Reduce f, Number /*initial_value*/) { index_t result = Init; - static_for<0, Seq::Size(), 1>{}(lambda_reduce_on_sequence(f, result)); + for(index_t i = 0; i < Seq::Size(); ++i) + { + result = f(result, Seq::At(i)); + } return result; } // TODO: a generic any_of for any container template -__host__ __device__ constexpr bool sequence_any_of(Seq, F f /*initial_value*/) +__host__ __device__ constexpr bool sequence_any_of(Seq, F f) { bool flag = false; @@ -808,7 +844,7 @@ __host__ __device__ constexpr bool sequence_any_of(Seq, F f /*initial_value*/) // TODO: a generic all_of for any container template -__host__ __device__ constexpr bool sequence_all_of(Seq, F f /*initial_value*/) +__host__ __device__ constexpr bool sequence_all_of(Seq, F f) { bool flag = true;