Shuffle fix for gfx950 (#3491)

* solve compiler issue
* solve the gfx950 mfma shuffle regression
* refactor jenkinsfile to handle arch name better
* [CK TILE] set divisor to count of thread along k dimension
* fix the compiler error
* solve degradation
* Finish the multiplies fix
* fix the scales
* solve compilation error
* solve the composes
* solve the error of tile sweeper
* fix the test and example
* fix for gfx950

---------

Co-authored-by: Max Podkorytov <4273004+tenpercent@users.noreply.github.com>
Co-authored-by: illsilin_amdeng <Illia.Silin@amd.com>
Co-authored-by: Cong Ma <congma13@amd.com>
2026-04-20 06:49:15 +00:00 · 2026-01-14 01:21:29 +08:00
parent 9908a87c31
commit 00c46785a8
33 changed files with 161 additions and 152 deletions
--- a/include/ck_tile/core/algorithm/coordinate_transform.hpp
+++ b/include/ck_tile/core/algorithm/coordinate_transform.hpp
@@ -564,7 +564,7 @@ struct merge_v2_magic_division : public base_transform<LowLengths::size(), 1>
    using UpperIndex = multi_index<1>;

    using UpLengths =
-        decltype(make_tuple(container_reduce(LowLengths{}, multiplies{}, number<1>{})));
+        decltype(make_tuple(container_reduce(LowLengths{}, multiplies<>{}, number<1>{})));

    using LowLengthsMagicDivisor = decltype(generate_tuple(
        lambda_merge_generate_MagicDivision_calculate_magic_divisor<LowLengths>{},
@@ -584,7 +584,7 @@ struct merge_v2_magic_division : public base_transform<LowLengths::size(), 1>
          low_lengths_magic_divisor_{generate_tuple(
              [&](auto i) { return magic_division::calculate_magic_numbers(low_lengths[i]); },
              number<NDimLow>{})},
-          up_lengths_{make_tuple(container_reduce(low_lengths, multiplies{}, I1))}
+          up_lengths_{make_tuple(container_reduce(low_lengths, multiplies<>{}, I1))}
    {
        static_assert(LowerIndex::size() == NDimLow, "wrong!");
    }
@@ -707,10 +707,10 @@ struct merge_v3_division_mod : public base_transform<LowLengths::size(), 1>
    using UpperIndex = multi_index<1>;

    using LowLengthsScan =
-        decltype(container_reverse_exclusive_scan(LowLengths{}, multiplies{}, number<1>{}));
+        decltype(container_reverse_exclusive_scan(LowLengths{}, multiplies<>{}, number<1>{}));

    using UpLengths =
-        decltype(make_tuple(container_reduce(LowLengths{}, multiplies{}, number<1>{})));
+        decltype(make_tuple(container_reduce(LowLengths{}, multiplies<>{}, number<1>{})));

    LowLengths low_lengths_;
    LowLengthsScan low_lengths_scan_;
@@ -721,8 +721,8 @@ struct merge_v3_division_mod : public base_transform<LowLengths::size(), 1>
    CK_TILE_HOST_DEVICE constexpr merge_v3_division_mod(const LowLengths& low_lengths)
        : low_lengths_{low_lengths},
          low_lengths_scan_{
-              container_reverse_exclusive_scan(low_lengths, multiplies{}, number<1>{})},
-          up_lengths_{make_tuple(container_reduce(low_lengths, multiplies{}, number<1>{}))}
+              container_reverse_exclusive_scan(low_lengths, multiplies<>{}, number<1>{})},
+          up_lengths_{make_tuple(container_reduce(low_lengths, multiplies<>{}, number<1>{}))}
    {
        static_assert(LowerIndex::size() == NDimLow, "wrong!");
    }
@@ -832,7 +832,7 @@ struct unmerge : public base_transform<1, UpLengths::size()>
    using UpperIndex = multi_index<NDimUp>;

    using UpLengthsScan =
-        decltype(container_reverse_exclusive_scan(UpLengths{}, multiplies{}, number<1>{}));
+        decltype(container_reverse_exclusive_scan(UpLengths{}, multiplies<>{}, number<1>{}));

    UpLengths up_lengths_;
    UpLengthsScan up_lengths_scan_;
@@ -841,7 +841,8 @@ struct unmerge : public base_transform<1, UpLengths::size()>

    CK_TILE_HOST_DEVICE constexpr unmerge(const UpLengths& up_lengths)
        : up_lengths_{up_lengths},
-          up_lengths_scan_{container_reverse_exclusive_scan(up_lengths, multiplies{}, number<1>{})}
+          up_lengths_scan_{
+              container_reverse_exclusive_scan(up_lengths, multiplies<>{}, number<1>{})}
    {
    }

--- a/include/ck_tile/core/algorithm/space_filling_curve.hpp
+++ b/include/ck_tile/core/algorithm/space_filling_curve.hpp
@@ -19,7 +19,7 @@ template <typename TensorLengths,
 struct space_filling_curve
 {
    static constexpr index_t TensorSize =
-        reduce_on_sequence(TensorLengths{}, multiplies{}, number<1>{});
+        reduce_on_sequence(TensorLengths{}, multiplies<>{}, number<1>{});
    static_assert(0 < TensorSize,
                  "space_filling_curve should be used to access a non-empty tensor");

@@ -28,7 +28,7 @@ struct space_filling_curve
    using Index = multi_index<nDim>;

    static constexpr index_t ScalarPerVector =
-        reduce_on_sequence(ScalarsPerAccess{}, multiplies{}, number<1>{});
+        reduce_on_sequence(ScalarsPerAccess{}, multiplies<>{}, number<1>{});

    static constexpr auto access_lengths   = TensorLengths{} / ScalarsPerAccess{};
    static constexpr auto dim_access_order = DimAccessOrder{};
@@ -49,7 +49,7 @@ struct space_filling_curve
        static_assert(TensorLengths{} % ScalarsPerAccess{} ==
                      typename uniform_sequence_gen<TensorLengths::size(), 0>::type{});

-        return reduce_on_sequence(TensorLengths{}, multiplies{}, number<1>{}) / ScalarPerVector;
+        return reduce_on_sequence(TensorLengths{}, multiplies<>{}, number<1>{}) / ScalarPerVector;
    }

    template <index_t AccessIdx1dHead, index_t AccessIdx1dTail>
@@ -94,7 +94,7 @@ struct space_filling_curve
 #else

        constexpr auto access_strides =
-            container_reverse_exclusive_scan(ordered_access_lengths, multiplies{}, number<1>{});
+            container_reverse_exclusive_scan(ordered_access_lengths, multiplies<>{}, number<1>{});

        constexpr auto idx_1d = number<AccessIdx1d>{};
        // Given tensor strides \p access_lengths, and 1D index of space-filling-curve, compute the
--- a/include/ck_tile/core/container/sequence.hpp
+++ b/include/ck_tile/core/container/sequence.hpp
@@ -1237,10 +1237,11 @@ constexpr auto reverse_slice_sequence(Seq,
 {
    static_assert(Seq::size() == Mask::size());
    static_assert(SliceSize != 0, "slice size zero is invalid");
-    static_assert(container_reduce(pick_sequence_elements_by_mask(Seq{}, Mask{}), multiplies{}, 1) %
-                          SliceSize ==
-                      0,
-                  "slice size can't evenly divide input sizes");
+    static_assert(
+        container_reduce(pick_sequence_elements_by_mask(Seq{}, Mask{}), multiplies<>{}, 1) %
+                SliceSize ==
+            0,
+        "slice size can't evenly divide input sizes");
    using sliced_type =
        impl::reverse_slice_sequence_impl<Seq,
                                          Mask,
--- a/include/ck_tile/core/numeric/math.hpp
+++ b/include/ck_tile/core/numeric/math.hpp
@@ -42,7 +42,7 @@ struct scales
 };

 template <typename Scale>
-CK_TILE_HOST_DEVICE_EXTERN scales(Scale) -> scales<Scale>;
+scales(Scale) -> scales<Scale>;

 template <typename Left = void, typename Right = Left>
 struct plus
@@ -65,8 +65,6 @@ struct plus<void, void>
    }
 };

-CK_TILE_HOST_DEVICE_EXTERN plus() -> plus<void, void>;
-
 template <typename Left = void, typename Right = Left>
 struct minus
 {
@@ -88,8 +86,6 @@ struct minus<void, void>
    }
 };

-CK_TILE_HOST_DEVICE_EXTERN minus() -> minus<void, void>;
-
 template <typename Left = void, typename Right = Left>
 struct multiplies
 {
@@ -111,8 +107,6 @@ struct multiplies<void, void>
    }
 };

-CK_TILE_HOST_DEVICE_EXTERN multiplies() -> multiplies<void, void>;
-
 template <typename T>
 struct maximize
 {
@@ -341,8 +335,6 @@ struct equal<void, void>
    }
 };

-CK_TILE_HOST_DEVICE_EXTERN equal() -> equal<void, void>;
-
 template <>
 struct equal<float, float>
 {
@@ -382,8 +374,6 @@ struct less<void, void>
    }
 };

-CK_TILE_HOST_DEVICE_EXTERN less() -> less<void, void>;
-
 template <typename Left = void, typename Right = Left>
 struct less_equal
 {
@@ -405,8 +395,6 @@ struct less_equal<void, void>
    }
 };

-CK_TILE_HOST_DEVICE_EXTERN less_equal() -> less_equal<void, void>;
-
 template <>
 struct less_equal<float, float>
 {
--- a/include/ck_tile/core/tensor/load_tile_transpose.hpp
+++ b/include/ck_tile/core/tensor/load_tile_transpose.hpp
@@ -434,7 +434,7 @@ CK_TILE_DEVICE auto load_tile_transpose_with_offset(
                  "the vector length is not the same!");
    constexpr index_t vecLoadSize = y_in_lengths[NDimYIn - 1];
    constexpr index_t num_of_access =
-        reduce_on_sequence(y_in_lengths, multiplies{}, number<1>{}) / vecLoadSize;
+        reduce_on_sequence(y_in_lengths, multiplies<>{}, number<1>{}) / vecLoadSize;

    using DataVec = array<typename BottomTensorView_::DataType, vecLoadSize>;
    static_for<0, num_of_access, 1>{}([&](auto iAccess) {
--- a/include/ck_tile/core/tensor/static_distributed_tensor.hpp
+++ b/include/ck_tile/core/tensor/static_distributed_tensor.hpp
@@ -229,7 +229,7 @@ set_tile_if(static_distributed_tensor<DataType, StaticTileDistribution>& out_ten
 template <typename YLengths, index_t XUnpacks>
 CK_TILE_HOST_DEVICE constexpr auto get_y_unpacks_from_x_unpacks(YLengths, number<XUnpacks>)
 {
-    constexpr auto y_size  = reduce_on_sequence(YLengths{}, multiplies{}, number<1>{});
+    constexpr auto y_size  = reduce_on_sequence(YLengths{}, multiplies<>{}, number<1>{});
    constexpr auto y_packs = number<XUnpacks>{};
    static_assert(y_size % y_packs == 0);
    constexpr auto y_slice_size = y_size / y_packs;
--- a/include/ck_tile/core/tensor/sweep_tile.hpp
+++ b/include/ck_tile/core/tensor/sweep_tile.hpp
@@ -297,12 +297,12 @@ struct tile_sweeper

 // partial deduction is not allowed
 // template <typename T, typename F, typename U>
-// CK_TILE_HOST_DEVICE_EXTERN tile_sweeper(const F&, U = {})->tile_sweeper<T, F, U>;
+// tile_sweeper(const F&, U = {})->tile_sweeper<T, F, U>;

 // deduction guide
 template <typename T,
          typename F,
          typename U = typename uniform_sequence_gen<T::get_num_of_dimension(), 1>::type>
-CK_TILE_HOST_DEVICE_EXTERN tile_sweeper(const T&, const F&, U = {}) -> tile_sweeper<T, F, U>;
+tile_sweeper(const T&, const F&, U = {}) -> tile_sweeper<T, F, U>;

 } // namespace ck_tile
--- a/include/ck_tile/core/tensor/tensor_adaptor.hpp
+++ b/include/ck_tile/core/tensor/tensor_adaptor.hpp
@@ -76,7 +76,7 @@ struct tensor_adaptor
            number<ndim_top_>{});

        // TODO: make container_reduce support tuple of number and index_t
-        return container_reduce(lengths, multiplies{}, number<1>{});
+        return container_reduce(lengths, multiplies<>{}, number<1>{});
    }

    template <index_t IDimHidden>
--- a/include/ck_tile/core/tensor/tensor_descriptor.hpp
+++ b/include/ck_tile/core/tensor/tensor_descriptor.hpp
@@ -382,7 +382,7 @@ make_naive_tensor_descriptor_packed(const tuple<Lengths...>& lengths,

    constexpr auto visible_dim_hidden_ids = typename arithmetic_sequence_gen<1, N + 1, 1>::type{};

-    const auto element_space_size = container_reduce(lengths, multiplies{}, long_number<1>{});
+    const auto element_space_size = container_reduce(lengths, multiplies<>{}, long_number<1>{});

    constexpr index_t first_dim_length = []() {
        if constexpr(is_constant_v<remove_cvref_t<decltype(element_space_size)>>)
@@ -428,7 +428,7 @@ CK_TILE_HOST_DEVICE constexpr auto make_naive_tensor_descriptor_packed_with_offs
    number<GuaranteedLastDimensionVectorLength> = number<-1>{})
 {
    const auto desc_0 = [&]() {
-        const auto element_space_size = container_reduce(lengths, multiplies{}, long_number<1>{});
+        const auto element_space_size = container_reduce(lengths, multiplies<>{}, long_number<1>{});

        const auto transforms = make_tuple(make_offset_transform(element_space_size, offset));

@@ -491,8 +491,12 @@ make_naive_tensor_descriptor_aligned(const tuple<Lengths...>& lengths, Align ali
            }
            else
            {
-                return container_reduce(
-                    lengths, multiplies{}, number<stride_n_minus_2>{}, i + I1, number<N - 1>{}, I1);
+                return container_reduce(lengths,
+                                        multiplies<>{},
+                                        number<stride_n_minus_2>{},
+                                        i + I1,
+                                        number<N - 1>{},
+                                        I1);
            }
        },
        number<N>{});
--- a/include/ck_tile/core/tensor/tile_distribution.hpp
+++ b/include/ck_tile/core/tensor/tile_distribution.hpp
@@ -113,7 +113,7 @@ struct tile_distribution
        return generate_tuple(
            [&](auto i) {
                constexpr index_t x_length =
-                    container_reduce(typename DstrEncode::HsLengthss{}[i], multiplies{}, 1);
+                    container_reduce(typename DstrEncode::HsLengthss{}[i], multiplies<>{}, 1);

                return number<x_length>{};
            },
@@ -583,8 +583,8 @@ CK_TILE_HOST_DEVICE constexpr auto slice_distribution_from_x(
            if constexpr(x_slice_ends[i] == -1)
            {
                // -1 means till the end
-                constexpr auto x_length_ =
-                    container_reduce(typename Encoding::HsLengthss{}[i], multiplies{}, number<1>{});
+                constexpr auto x_length_ = container_reduce(
+                    typename Encoding::HsLengthss{}[i], multiplies<>{}, number<1>{});
                return x_length_;
            }
            else
--- a/include/ck_tile/core/tensor/tile_window_linear.hpp
+++ b/include/ck_tile/core/tensor/tile_window_linear.hpp
@@ -277,7 +277,7 @@ struct tile_window_linear
    {
        constexpr auto linear_coord = get_bottom_linear_coordinate(number<i_access>{});
        constexpr auto is_pure_linear_tensor =
-            reduce_on_sequence(LinearBottomDims{}, multiplies{}, number<1>{});
+            reduce_on_sequence(LinearBottomDims{}, multiplies<>{}, number<1>{});
        if constexpr(is_pure_linear_tensor)
        {
            // this case usually is a LDS window, everything is known at compile tile.
--- a/include/ck_tile/core/utility/functional_with_tuple.hpp
+++ b/include/ck_tile/core/utility/functional_with_tuple.hpp
@@ -69,9 +69,9 @@ struct static_uford_one_shot_impl
    CK_TILE_HOST_DEVICE constexpr void operator()(F f, CurrentUnpackIds, number<current_acc>) const
    {
        constexpr auto r_lens_stride =
-            reverse_exclusive_scan_sequence(RemainLengths{}, multiplies{}, number<1>{});
+            reverse_exclusive_scan_sequence(RemainLengths{}, multiplies<>{}, number<1>{});
        constexpr auto r_upks_stride =
-            reverse_exclusive_scan_sequence(RamainUnpacks{}, multiplies{}, number<1>{});
+            reverse_exclusive_scan_sequence(RamainUnpacks{}, multiplies<>{}, number<1>{});

        constexpr index_t current_stride = r_lens_stride.front() / r_upks_stride.front();
        constexpr index_t pack_len       = RamainUnpacks::front();
@@ -127,7 +127,7 @@ template <class Lengths,
          class Orders  = typename arithmetic_sequence_gen<0, Lengths::size(), 1>::type>
 struct static_uford
 {
-    static constexpr index_t num_packs = reduce_on_sequence(Unpacks{}, multiplies{}, number<1>{});
+    static constexpr index_t num_packs = reduce_on_sequence(Unpacks{}, multiplies<>{}, number<1>{});

    CK_TILE_HOST_DEVICE constexpr static_uford()
    {
@@ -142,7 +142,7 @@ struct static_uford
    {
        using L_ = decltype(Lengths{} / Unpacks{});

-        return reduce_on_sequence(L_{}, multiplies{}, number<1>{});
+        return reduce_on_sequence(L_{}, multiplies<>{}, number<1>{});
    }

    // F signature: F(sequence<...> multi_id...)
--- a/include/ck_tile/core/utility/unary_element_function.hpp
+++ b/include/ck_tile/core/utility/unary_element_function.hpp
@@ -47,8 +47,11 @@ struct composes<F>
    F f_;
 };

-template <typename... Ts>
-CK_TILE_HOST_DEVICE_EXTERN composes(Ts&&...) -> composes<remove_cvref_t<Ts>...>;
+template <class... Ts>
+CK_TILE_HOST_DEVICE constexpr auto make_composes(Ts&&... ts)
+{
+    return composes<remove_cvref_t<Ts>...>{std::forward<Ts>(ts)...};
+}

 template <typename SaturateType>
 struct saturates