From 4f4aba4872e0b080a112337972c5cc02623c7b1a Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Tue, 24 Sep 2019 23:59:47 -0500 Subject: [PATCH] adding GetLinearDimensionMask() --- ...chw_kcyx_nkhw_padded_lds_double_buffer.hpp | 4 +- .../tensor_coordinate_v2.hpp | 5 + .../tensor_description/tensor_descriptor.hpp | 138 ++++++++++-------- .../include/utility/functional.hpp | 12 ++ .../include/utility/sequence.hpp | 84 ++++++++--- 5 files changed, 152 insertions(+), 91 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp index 4d941b5053..6303c09f95 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp @@ -440,9 +440,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf 0, b_thread_data_on_global, 0}) -#if 0 - .Run_generic -#elif 1 +#if 1 .template Run_generic #elif 1 .template Run_optimized_dst_address_calculation diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp index b88b52671a..8004d57c2a 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp @@ -45,6 +45,7 @@ struct NativeTensorCoordinate __host__ __device__ constexpr type operator+=(const Index& idx_diff) { // mIndex is updated here, but some (or all) of its entries may never be used + // compiler should remove those entries as dead code mIndex += idx_diff; mOffset += tensor_desc_type::CalculateOffsetDiff(idx_diff); @@ -55,6 +56,7 @@ struct 
NativeTensorCoordinate __host__ __device__ constexpr type operator-=(const Index& idx_diff) { // mIndex is updated here, but some (or all) of its entries may never be used + // compiler should remove those entries as dead code mIndex -= idx_diff; mOffset -= tensor_desc_type::CalculateOffsetDiff(idx_diff); @@ -136,6 +138,7 @@ struct TransformedTensorCoordinate idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex()); // mIndexUp is updated here, but some (or all) of its entries may never be used + // compiler should remove those entries as dead code mIndexUp += idx_up_diff; return *this; @@ -146,6 +149,8 @@ struct TransformedTensorCoordinate mCoordLow -= tensor_desc_type::CalculateLowerIndexDiff( idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex()); + // mIndex is updated here, but some (or all) of its entries may never be used + // compiler should remove those entries as dead code mIndexUp -= idx_up_diff; return *this; diff --git a/composable_kernel/include/tensor_description/tensor_descriptor.hpp b/composable_kernel/include/tensor_description/tensor_descriptor.hpp index dc2039355c..a934651911 100644 --- a/composable_kernel/include/tensor_description/tensor_descriptor.hpp +++ b/composable_kernel/include/tensor_description/tensor_descriptor.hpp @@ -101,12 +101,12 @@ struct NativeTensorDescriptor return true; } - __host__ __device__ static constexpr auto GetMaskOfLinearDimensions() + __host__ __device__ static constexpr auto GetLinearDimensionMask() { return typename uniform_sequence_gen::type{}; } - __host__ __device__ static constexpr auto GetMaskOfNonLinearDimensions() + __host__ __device__ static constexpr auto GetNonLinearDimensionMask() { return typename uniform_sequence_gen::type{}; } @@ -353,18 +353,27 @@ struct TransformedTensorDescriptor return GetLowerTensorDescriptor().CalculateOffset(CalculateLowerIndex(idx_up)); } -#if 0 - struct lambda_sequence_logic_or +#if 1 + struct lambda_sequence_logical_and { template - __host__ __device__ constexpr auto 
operator()(Seqs... seqs) const + __host__ __device__ constexpr auto operator()(Seqs...) const { - // TODO: should use math::logic_or, after Sequence can take bool - return typename sequence_reduce, Seqs...>::type{}; + return typename sequence_reduce, Seqs...>::type{}; } }; - struct lambda_1 + template + struct lambda_is_true + { + __host__ __device__ constexpr auto operator()(const T& x) const + { + // TODO: remove static_cast once Sequence can take bool as entries + return static_cast(x) == true; + } + }; + + struct lambda_get_linear_dimension_mask_of_single_tranform { // check only one transform at a time template @@ -372,73 +381,73 @@ struct TransformedTensorDescriptor operator()(const Transform& tran, LowDimensionId, UpDimensionId) const { // judge if transformation is linear - constexpr bool is_linear_transform = tran.IsLinearTransform(); + constexpr bool is_linear_transform = Transform::IsLinearTransform(); // judge if all lower dimension are linear - constexpr bool is_all_low_dim_linear = math::reduce_on_sequence( - pick_sequence_elements_by_mask( - GetLowerTensorDescriptor().GetMaskOfLinearDimensions(), LowDimensionId{}), - math::logic_and{}, - integral_constant{}); + constexpr bool are_all_low_dim_linear = sequence_all_of( + pick_sequence_elements_by_ids(GetLowerTensorDescriptor().GetLinearDimensionMask(), + LowDimensionId{}), + lambda_is_true{}); - // judge if upper dimenisons are linear - constexpr bool is_up_dim_nonlinear = !(is_linear_transform && is_all_low_dim_linear); + // create linear mask for upper dimensions + constexpr bool are_up_dim_linear = is_linear_transform && are_all_low_dim_linear; - constexpr auto value_sequence = - typename uniform_sequence_gen::type{}; + constexpr auto mask_of_up_linear_dims = modify_sequence_elements_by_ids( + typename uniform_sequence_gen::type{}, + typename uniform_sequence_gen::type{}, + UpDimensionId{}); - constexpr auto mask_of_up_nonlinear_dims = modifiy_sequence( - typename uniform_sequence_gen::type{}, 
value_sequence, UpDimensionId{}); - - return mask_of_up_nonlinear_dims; - }; - - __host__ __device__ static constexpr bool GetMaskOfNonLinearDimensions() - { - // create tuple of linear dimension masks, for all transformations - constexpr auto tuple_of_nonlinear_dimension_mask = - transform_tuples(lambda_1{}, Transforms{}, LowDimensionIds{}, UpDimensionIds{}); - - // reduce tuple of masks into one mask - constexpr auto nonlinear_dimension_mask = - unpack(lambda_sequence_logic_or{}, tuple_of_nonlinear_dimension_mask); - - return nonlinear_dimension_mask; + return mask_of_up_linear_dims; } + }; - __host__ __device__ static constexpr bool GetMaskOfLinearDimensions() - { - return GetMaskOfNonLinearDimensions().Transform(math::logic_not{}); - } + __host__ __device__ static constexpr auto GetLinearDimensionMask() + { + // create tuple of linear dimension masks, for all transformations + constexpr auto tuple_of_linear_dimension_mask = + transform_tuples(lambda_get_linear_dimension_mask_of_single_tranform{}, + Transforms{}, + LowDimensionIds{}, + UpDimensionIds{}); - template - __host__ __device__ static constexpr bool IsLinearDimension(Number) - { - return GetMaskOfLinearDimensions().At(Number{}); - } + // reduce tuple of masks into one mask + constexpr auto linear_dimension_mask = + unpack(lambda_sequence_logical_and{}, tuple_of_linear_dimension_mask); - __host__ __device__ static constexpr auto GetLinearDimensions() - { - constexpr auto linear_dimension_mask = GetMaskOfLienarDimensions(); + return linear_dimension_mask; + } - return pick_sequence_elements_by_mask( - typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, linear_dimension_mask); - } + __host__ __device__ static constexpr auto GetNonLinearDimensionMask() + { + return GetLinearDimensionMask().Transform(logical_not{}); + } - __host__ __device__ static constexpr auto GetNonLinearDimensions() - { - constexpr auto nonlinear_dimension_mask = - GetMaskOfLienarDimensions().Transform(math::logic_not{}); + template 
+ __host__ __device__ static constexpr bool IsLinearDimension(Number) + { + return GetLinearDimensionMask().At(Number{}); + } - return pick_sequence_elements_by_mask( - typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, nonlinear_dimension_mask); - } + __host__ __device__ static constexpr auto GetLinearDimensions() + { + constexpr auto linear_dimension_mask = GetLinearDimensionMask(); - __host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups() - { - // not implemented - } + return pick_sequence_elements_by_mask( + typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, linear_dimension_mask); + } + + __host__ __device__ static constexpr auto GetNonLinearDimensions() + { + constexpr auto nonlinear_dimension_mask = GetNonLinearDimensionMask(); + + return pick_sequence_elements_by_mask( + typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, nonlinear_dimension_mask); + } + + __host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups() + { + // not implemented + } #endif __host__ __device__ static constexpr bool @@ -457,9 +466,10 @@ struct TransformedTensorDescriptor return flag; } - // Whenever this function is called, it will call CalculateLowerIndex() recursively + // Whenever this function is called, it will call CalculateLowerIndex() recursively. // If you have created a tensor coordinate already, instead of calling this function, - // you should call TransformedTensorCoordinate::IsUpperIndexMappedToValidOffset() + // you should call TensorCoordinate::IsUpperIndexMappedToValidOffset() which would + // be less expensive. 
__host__ __device__ static constexpr bool IsUpperIndexMappedToValidOffset(const UpperIndex& idx_up) { diff --git a/composable_kernel/include/utility/functional.hpp b/composable_kernel/include/utility/functional.hpp index 6232e2243f..c49800827d 100644 --- a/composable_kernel/include/utility/functional.hpp +++ b/composable_kernel/include/utility/functional.hpp @@ -25,6 +25,18 @@ struct swallow } }; +template +struct logical_and +{ + constexpr bool operator()(const T& x, const T& y) const { return x && y; } +}; + +template +struct logical_or +{ + constexpr bool operator()(const T& x, const T& y) const { return x || y; } +}; + template struct logical_not { diff --git a/composable_kernel/include/utility/sequence.hpp b/composable_kernel/include/utility/sequence.hpp index 3ccc8c5f65..85855780e3 100644 --- a/composable_kernel/include/utility/sequence.hpp +++ b/composable_kernel/include/utility/sequence.hpp @@ -311,7 +311,7 @@ struct sequence_reverse> using type = Sequence; }; -#if 0 +#if 1 template struct sequence_reduce { @@ -755,46 +755,82 @@ __host__ __device__ constexpr auto pick_sequence_elements_by_ids(Seq, Sequence{})...>{}; } -#if 0 +#if 1 +namespace detail { +template +struct pick_sequence_elements_by_mask_impl +{ + using new_work_seq = typename conditional::type; + + using type = + typename pick_sequence_elements_by_mask_impl::type; +}; + +template +struct pick_sequence_elements_by_mask_impl, Sequence<>> +{ + using type = WorkSeq; +}; + +} // namespace detail + template __host__ __device__ constexpr auto pick_sequence_elements_by_mask(Seq, Mask) { - // not implemented + static_assert(Seq::Size() == Mask::Size(), "wrong!"); + + return typename detail::pick_sequence_elements_by_mask_impl, Seq, Mask>::type{}; +} + +namespace detail { +template +struct modify_sequence_elements_by_ids_impl +{ + using new_work_seq = decltype(WorkSeq::Modify(RemainIds::Front(), RemainValues::Front())); + + using type = + typename modify_sequence_elements_by_ids_impl::type; +}; + 
+template +struct modify_sequence_elements_by_ids_impl, Sequence<>> +{ + using type = WorkSeq; +}; +} // namespace detail + +template +__host__ __device__ constexpr auto modify_sequence_elements_by_ids(Seq, Values, Ids) +{ + static_assert(Values::Size() == Ids::Size() && Seq::Size() >= Values::Size(), "wrong!"); + + return typename detail::modify_sequence_elements_by_ids_impl::type{}; } #endif -template -struct lambda_reduce_on_sequence -{ - const Reduce& f; - index_t& result; - - __host__ __device__ constexpr lambda_reduce_on_sequence(const Reduce& f_, index_t& result_) - : f(f_), result(result_) - { - } - - template - __host__ __device__ constexpr index_t operator()(IDim) const - { - return result = f(result, Seq::At(IDim{})); - } -}; - template __host__ __device__ constexpr index_t reduce_on_sequence(Seq, Reduce f, Number /*initial_value*/) { index_t result = Init; - static_for<0, Seq::Size(), 1>{}(lambda_reduce_on_sequence(f, result)); + for(index_t i = 0; i < Seq::Size(); ++i) + { + result = f(result, Seq::At(i)); + } return result; } // TODO: a generic any_of for any container template -__host__ __device__ constexpr bool sequence_any_of(Seq, F f /*initial_value*/) +__host__ __device__ constexpr bool sequence_any_of(Seq, F f) { bool flag = false; @@ -808,7 +844,7 @@ __host__ __device__ constexpr bool sequence_any_of(Seq, F f /*initial_value*/) // TODO: a generic all_of for any container template -__host__ __device__ constexpr bool sequence_all_of(Seq, F f /*initial_value*/) +__host__ __device__ constexpr bool sequence_all_of(Seq, F f) { bool flag = true;