adding GetLinearDimensionMask()

2026-05-12 09:16:52 +00:00 · 2019-09-24 23:59:47 -05:00
parent 545d930568
commit 4f4aba4872
5 changed files with 152 additions and 91 deletions
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
@@ -440,9 +440,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
                                                      0,
                                                      b_thread_data_on_global,
                                                      0})
-#if 0
-                .Run_generic
-#elif 1
+#if 1
                .template Run_generic<Float, address_space_t::generic, address_space_t::global>
 #elif 1
                .template Run_optimized_dst_address_calculation<Float, address_space_t::global>
--- a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp
+++ b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp
@@ -45,6 +45,7 @@ struct NativeTensorCoordinate
    __host__ __device__ constexpr type operator+=(const Index& idx_diff)
    {
        // mIndex is updated here, but some (or all) of its entries may never be used
+        // compiler should remove those entries as dead code
        mIndex += idx_diff;

        mOffset += tensor_desc_type::CalculateOffsetDiff(idx_diff);
@@ -55,6 +56,7 @@ struct NativeTensorCoordinate
    __host__ __device__ constexpr type operator-=(const Index& idx_diff)
    {
        // mIndex is updated here, but some (or all) of its entries may never be used
+        // compiler should remove those entries as dead code
        mIndex -= idx_diff;

        mOffset -= tensor_desc_type::CalculateOffsetDiff(idx_diff);
@@ -136,6 +138,7 @@ struct TransformedTensorCoordinate
            idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());

        // mIndexUp is updated here, but some (or all) of its entries may never be used
+        // compiler should remove those entries as dead code
        mIndexUp += idx_up_diff;

        return *this;
@@ -146,6 +149,8 @@ struct TransformedTensorCoordinate
        mCoordLow -= tensor_desc_type::CalculateLowerIndexDiff(
            idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());

+        // mIndex is updated here, but some (or all) of its entries may never be used
+        // compiler should remove those entries as dead code
        mIndexUp -= idx_up_diff;

        return *this;
--- a/composable_kernel/include/tensor_description/tensor_descriptor.hpp
+++ b/composable_kernel/include/tensor_description/tensor_descriptor.hpp
@@ -101,12 +101,12 @@ struct NativeTensorDescriptor
        return true;
    }

-    __host__ __device__ static constexpr auto GetMaskOfLinearDimensions()
+    __host__ __device__ static constexpr auto GetLinearDimensionMask()
    {
        return typename uniform_sequence_gen<nDim, 1>::type{};
    }

-    __host__ __device__ static constexpr auto GetMaskOfNonLinearDimensions()
+    __host__ __device__ static constexpr auto GetNonLinearDimensionMask()
    {
        return typename uniform_sequence_gen<nDim, 0>::type{};
    }
@@ -353,18 +353,27 @@ struct TransformedTensorDescriptor
        return GetLowerTensorDescriptor().CalculateOffset(CalculateLowerIndex(idx_up));
    }

-#if 0
-    struct lambda_sequence_logic_or
+#if 1
+    struct lambda_sequence_logical_and
    {
        template <typename... Seqs>
-        __host__ __device__ constexpr auto operator()(Seqs... seqs) const
+        __host__ __device__ constexpr auto operator()(Seqs...) const
        {
-            // TODO: should use math::logic_or<bool>, after Sequence can take bool
-            return typename sequence_reduce<math::logic_or<bool>, Seqs...>::type{};
+            return typename sequence_reduce<logical_and<index_t>, Seqs...>::type{};
        }
    };

-    struct lambda_1
+    template <typename T>
+    struct lambda_is_true
+    {
+        __host__ __device__ constexpr auto operator()(const T& x) const
+        {
+            // TODO: remove static_cast once Sequence can take bool as entries
+            return static_cast<bool>(x) == true;
+        }
+    };
+
+    struct lambda_get_linear_dimension_mask_of_single_tranform
    {
        // check only one transform at a time
        template <typename Transform, typename LowDimensionId, typename UpDimensionId>
@@ -372,73 +381,73 @@ struct TransformedTensorDescriptor
        operator()(const Transform& tran, LowDimensionId, UpDimensionId) const
        {
            // judge if transformation is linear
-            constexpr bool is_linear_transform = tran.IsLinearTransform();
+            constexpr bool is_linear_transform = Transform::IsLinearTransform();

            // judge if all lower dimension are linear
-            constexpr bool is_all_low_dim_linear = math::reduce_on_sequence(
-                pick_sequence_elements_by_mask(
-                    GetLowerTensorDescriptor().GetMaskOfLinearDimensions(), LowDimensionId{}),
-                math::logic_and<bool>{},
-                integral_constant<bool, true>{});
+            constexpr bool are_all_low_dim_linear = sequence_all_of(
+                pick_sequence_elements_by_ids(GetLowerTensorDescriptor().GetLinearDimensionMask(),
+                                              LowDimensionId{}),
+                lambda_is_true<index_t>{});

-            // judge if upper dimenisons are linear
-            constexpr bool is_up_dim_nonlinear = !(is_linear_transform && is_all_low_dim_linear);
+            // create linear mask for upper dimensions
+            constexpr bool are_up_dim_linear = is_linear_transform && are_all_low_dim_linear;

-            constexpr auto value_sequence =
-                typename uniform_sequence_gen<tran.GetNumOfUpperDimension(),
-                                              is_up_dim_nonlinear>::type{};
+            constexpr auto mask_of_up_linear_dims = modifiy_sequence_by_ids(
+                typename uniform_sequence_gen<nDimUp, 0>::type{},
+                typename uniform_sequence_gen<UpDimensionId::Size(), 1>::type{},
+                UpDimensionId{});

-            constexpr auto mask_of_up_nonlinear_dims = modifiy_sequence(
-                typename uniform_sequence_gen<nDimUp, 0>::type{}, value_sequence, UpDimensionId{});
-
-            return mask_of_up_nonlinear_dims;
-        };
-
-        __host__ __device__ static constexpr bool GetMaskOfNonLinearDimensions()
-        {
-            // create tuple of linear dimension masks, for all transformations
-            constexpr auto tuple_of_nonlinear_dimension_mask =
-                transform_tuples(lambda_1{}, Transforms{}, LowDimensionIds{}, UpDimensionIds{});
-
-            // reduce tuple of masks into one mask
-            constexpr auto nonlinear_dimension_mask =
-                unpack(lambda_sequence_logic_or{}, tuple_of_nonlinear_dimension_mask);
-
-            return nonlinear_dimension_mask;
+            return mask_of_up_linear_dims;
        }
+    };

-        __host__ __device__ static constexpr bool GetMaskOfLinearDimensions()
-        {
-            return GetMaskOfNonLinearDimensions().Transform(math::logic_not<bool>{});
-        }
+    __host__ __device__ static constexpr auto GetLinearDimensionMask()
+    {
+        // create tuple of linear dimension masks, for all transformations
+        constexpr auto tuple_of_linear_dimension_mask =
+            transform_tuples(lambda_get_linear_dimension_mask_of_single_tranform{},
+                             Transforms{},
+                             LowDimensionIds{},
+                             UpDimensionIds{});

-        template <index_t IDim>
-        __host__ __device__ static constexpr bool IsLinearDimension(Number<IDim>)
-        {
-            return GetMaskOfLinearDimensions().At(Number<IDim>{});
-        }
+        // reduce tuple of masks into one mask
+        constexpr auto linear_dimension_mask =
+            unpack(lambda_sequence_logical_and{}, tuple_of_linear_dimension_mask);

-        __host__ __device__ static constexpr auto GetLinearDimensions()
-        {
-            constexpr auto linear_dimension_mask = GetMaskOfLienarDimensions();
+        return linear_dimension_mask;
+    }

-            return pick_sequence_elements_by_mask(
-                typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, linear_dimension_mask);
-        }
+    __host__ __device__ static constexpr auto GetNonLinearDimensionMask()
+    {
+        return GetLinearDimensionMask().Transform(logical_not<index_t>{});
+    }

-        __host__ __device__ static constexpr auto GetNonLinearDimensions()
-        {
-            constexpr auto nonlinear_dimension_mask =
-                GetMaskOfLienarDimensions().Transform(math::logic_not<index_t>{});
+    template <index_t IDim>
+    __host__ __device__ static constexpr bool IsLinearDimension(Number<IDim>)
+    {
+        return GetLinearDimensionMask().At(Number<IDim>{});
+    }

-            return pick_sequence_elements_by_mask(
-                typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, nonlinear_dimension_mask);
-        }
+    __host__ __device__ static constexpr auto GetLinearDimensions()
+    {
+        constexpr auto linear_dimension_mask = GetLinearDimensionMask();

-        __host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups()
-        {
-            // not implemented
-        }
+        return pick_sequence_elements_by_mask(
+            typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, linear_dimension_mask);
+    }
+
+    __host__ __device__ static constexpr auto GetNonLinearDimensions()
+    {
+        constexpr auto nonlinear_dimension_mask = GetNonLinearDimensionMask();
+
+        return pick_sequence_elements_by_mask(
+            typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, nonlinear_dimension_mask);
+    }
+
+    __host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups()
+    {
+        // not implemented
+    }
 #endif

    __host__ __device__ static constexpr bool
@@ -457,9 +466,10 @@ struct TransformedTensorDescriptor
        return flag;
    }

-    // Whenever this function is called, it will call CalculateLowerIndex() recursively
+    // Whenever this function is called, it will call CalculateLowerIndex() recursively.
    // If you have created a tensor coordinate already, instead of calling this function,
-    //   you should call TransformedTensorCoordinate::IsUpperIndexMappedToValidOffset()
+    //   you should call TensorCoordinate::IsUpperIndexMappedToValidOffset() which would
+    //   be less expensive.
    __host__ __device__ static constexpr bool
    IsUpperIndexMappedToValidOffset(const UpperIndex& idx_up)
    {
--- a/composable_kernel/include/utility/functional.hpp
+++ b/composable_kernel/include/utility/functional.hpp
@@ -25,6 +25,18 @@ struct swallow
    }
 };

+template <typename T>
+struct logical_and
+{
+    constexpr bool operator()(const T& x, const T& y) const { return x && y; }
+};
+
+template <typename T>
+struct logical_or
+{
+    constexpr bool operator()(const T& x, const T& y) const { return x || y; }
+};
+
 template <typename T>
 struct logical_not
 {
--- a/composable_kernel/include/utility/sequence.hpp
+++ b/composable_kernel/include/utility/sequence.hpp
@@ -311,7 +311,7 @@ struct sequence_reverse<Sequence<I0, I1>>
    using type = Sequence<I1, I0>;
 };

-#if 0
+#if 1
 template <typename Reduce, typename Seq, typename... Seqs>
 struct sequence_reduce
 {
@@ -755,46 +755,82 @@ __host__ __device__ constexpr auto pick_sequence_elements_by_ids(Seq, Sequence<I
    return Sequence<Seq::At(Number<Is>{})...>{};
 }

-#if 0
+#if 1
+namespace detail {
+template <typename WorkSeq, typename RemainSeq, typename RemainMask>
+struct pick_sequence_elements_by_mask_impl
+{
+    using new_work_seq = typename conditional<RemainMask::Front(),
+                                              decltype(WorkSeq::PushBack(RemainSeq::Front())),
+                                              WorkSeq>::type;
+
+    using type =
+        typename pick_sequence_elements_by_mask_impl<new_work_seq,
+                                                     decltype(RemainSeq::PopFront()),
+                                                     decltype(RemainMask::PopFront())>::type;
+};
+
+template <typename WorkSeq>
+struct pick_sequence_elements_by_mask_impl<WorkSeq, Sequence<>, Sequence<>>
+{
+    using type = WorkSeq;
+};
+
+} // namespace detail
+
 template <typename Seq, typename Mask>
 __host__ __device__ constexpr auto pick_sequence_elements_by_mask(Seq, Mask)
 {
-    // not implemented
+    static_assert(Seq::Size() == Mask::Size(), "wrong!");
+
+    return typename detail::pick_sequence_elements_by_mask_impl<Sequence<>, Seq, Mask>::type{};
+}
+
+namespace detail {
+template <typename WorkSeq, typename RemainValues, typename RemainIds>
+struct modify_sequence_elements_by_ids_impl
+{
+    using new_work_seq = decltype(WorkSeq::Modify(RemainIds::Front(), RemainValues::Front()));
+
+    using type =
+        typename modify_sequence_elements_by_ids_impl<new_work_seq,
+                                                      decltype(RemainValues::PopFront()),
+                                                      decltype(RemainIds::PopFront())>::type;
+};
+
+template <typename WorkSeq>
+struct modify_sequence_elements_by_ids_impl<WorkSeq, Sequence<>, Sequence<>>
+{
+    using type = WorkSeq;
+};
+} // namespace detail
+
+template <typename Seq, typename Values, typename Ids>
+__host__ __device__ constexpr auto modify_sequence_elements_by_ids(Seq, Values, Ids)
+{
+    static_assert(Values::Size() == Ids::Size() && Seq::Size() >= Values::Size(), "wrong!");
+
+    return typename detail::modify_sequence_elements_by_ids_impl<Seq, Values, Ids>::type{};
 }
 #endif

-template <typename Seq, typename Reduce>
-struct lambda_reduce_on_sequence
-{
-    const Reduce& f;
-    index_t& result;
-
-    __host__ __device__ constexpr lambda_reduce_on_sequence(const Reduce& f_, index_t& result_)
-        : f(f_), result(result_)
-    {
-    }
-
-    template <typename IDim>
-    __host__ __device__ constexpr index_t operator()(IDim) const
-    {
-        return result = f(result, Seq::At(IDim{}));
-    }
-};
-
 template <typename Seq, typename Reduce, index_t Init>
 __host__ __device__ constexpr index_t
 reduce_on_sequence(Seq, Reduce f, Number<Init> /*initial_value*/)
 {
    index_t result = Init;

-    static_for<0, Seq::Size(), 1>{}(lambda_reduce_on_sequence<Seq, Reduce>(f, result));
+    for(index_t i = 0; i < Seq::Size(); ++i)
+    {
+        result = f(result, Seq::At(i));
+    }

    return result;
 }

 // TODO: a generic any_of for any container
 template <typename Seq, typename F>
-__host__ __device__ constexpr bool sequence_any_of(Seq, F f /*initial_value*/)
+__host__ __device__ constexpr bool sequence_any_of(Seq, F f)
 {
    bool flag = false;

@@ -808,7 +844,7 @@ __host__ __device__ constexpr bool sequence_any_of(Seq, F f /*initial_value*/)

 // TODO: a generic all_of for any container
 template <typename Seq, typename F>
-__host__ __device__ constexpr bool sequence_all_of(Seq, F f /*initial_value*/)
+__host__ __device__ constexpr bool sequence_all_of(Seq, F f)
 {
    bool flag = true;