[ck_tile] refactor reduce kernel (#3257)

* refactor reduce kernel

- Rename Reduce kernel as per convention

- Move kept_dim and reduce_dims from runtime to compile-time parameters

- Update Reduce2dProblem template to include KeptDim, ReduceDims, and
Rank

- Remove the IsSupportedArgument validation function, as it is no longer
necessary. We no longer pass GuaranteedLastDimensionVectorStride when
constructing the tensor view or descriptor, which removes the bounds that
were previously enforced. We still calculate and use the vector size.

- Update reduce example to demonstrate NCHW->NHW reduction with
non-contiguous support

- Update tests

The kernel now handles both contiguous and non-contiguous memory layouts.

* fix compile errors
This commit is contained in:
Yashvardhan Agarwal
2025-12-17 21:46:08 +02:00
committed by GitHub
parent 92653168c2
commit ea10a78203
5 changed files with 89 additions and 130 deletions

View File

@@ -16,7 +16,7 @@
namespace ck_tile {
template <typename Problem_, typename Policy_ = Reduce2dDefaultPolicy>
struct Reduce
struct ReduceKernel
{
using Problem = ck_tile::remove_cvref_t<Problem_>;
using Policy = ck_tile::remove_cvref_t<Policy_>;
@@ -33,7 +33,7 @@ struct Reduce
private:
// Helper function to calculate optimal vector size for input tensor
template <typename InputShape, typename ReduceDims>
template <typename ReduceDims, index_t Rank, index_t NumReduceDim>
static constexpr index_t CalculateInputVectorSize()
{
using S = typename Problem::BlockShape;
@@ -41,8 +41,8 @@ struct Reduce
constexpr index_t thread_tile_vector_size = S::ThreadTile_N;
// Check if innermost reduce dimension is the last dimension (stride 1).
constexpr auto innermost_reduce_dim = ReduceDims{}.at(number<ReduceDims{}.size() - 1>{});
constexpr bool is_innermost_contiguous = (innermost_reduce_dim == InputShape{}.size() - 1);
constexpr index_t innermost_reduce_dim = ReduceDims::at(number<NumReduceDim - 1>{});
constexpr bool is_innermost_contiguous = (innermost_reduce_dim == Rank - 1);
// If innermost reduce dimension is not the last dim (not contiguous), limit vectorization
constexpr index_t stride_based_vector_size =
@@ -63,29 +63,28 @@ struct Reduce
}
public:
template <typename InputShape, typename InputStrides, typename KeptDim, typename ReduceDims>
template <typename InputShape, typename InputStrides>
CK_TILE_DEVICE void operator()(const XDataType* p_x,
YDataType* p_y,
InputShape input_shape,
InputStrides input_strides,
KeptDim kept_dim,
ReduceDims reduce_dims) const
InputStrides input_strides) const
{
using S = typename Problem::BlockShape;
const auto iM = get_block_id() * S::Block_M;
static_assert(kept_dim.size() + reduce_dims.size() == InputShape::size(),
static_assert(Problem::KeptDim::size() + Problem::ReduceDims::size() == Problem::Rank,
"Size of kept dimensions + reduced dimensions must equal input tensor rank");
// Extract lengths based on kept and reduced dimensions
const auto kept_lens = [&]() {
return generate_tuple([&](auto I) { return input_shape.at(number<kept_dim.at(I)>{}); },
number<kept_dim.size()>{});
return generate_tuple(
[&](auto I) { return input_shape.at(number<Problem::KeptDim::at(I)>{}); },
number<Problem::KeptDim::size()>{});
}();
const auto reduce_lens = [&]() {
return generate_tuple(
[&](auto I) { return input_shape.at(number<reduce_dims.at(I)>{}); },
number<reduce_dims.size()>{});
[&](auto I) { return input_shape.at(number<Problem::ReduceDims::at(I)>{}); },
number<Problem::ReduceDims::size()>{});
}();
const auto kept_merge_transform = make_merge_transform(kept_lens);
@@ -96,11 +95,13 @@ struct Reduce
type_convert<XDataType>(reduce_func.template GetIdentityValue<ComputeDataType>());
// Calculate optimal vector size for input tensor
constexpr auto x_tensor_vector_size = CalculateInputVectorSize<InputShape, ReduceDims>();
constexpr auto x_tensor_vector_size = CalculateInputVectorSize<typename Problem::ReduceDims,
Problem::Rank,
Problem::NumReduceDim>();
// Create input tensor view with custom padding value
auto desc = make_naive_tensor_descriptor(
input_shape, input_strides, number<x_tensor_vector_size>{}, number<1>{});
input_shape, input_strides, number<x_tensor_vector_size>{});
// Create buffer view with custom padding value
auto buffer_view = make_buffer_view<address_space_enum::global>(
@@ -109,10 +110,11 @@ struct Reduce
// Create tensor view with custom padding
const auto x_tensor = tensor_view<decltype(buffer_view), decltype(desc)>{buffer_view, desc};
const auto transformed_x_tensor = pad_tensor_view(
transform_tensor_view(x_tensor,
make_tuple(kept_merge_transform, reduce_merge_transform),
make_tuple(kept_dim, reduce_dims),
make_tuple(sequence<0>{}, sequence<1>{})),
transform_tensor_view(
x_tensor,
make_tuple(kept_merge_transform, reduce_merge_transform),
make_tuple(typename Problem::KeptDim{}, typename Problem::ReduceDims{}),
make_tuple(sequence<0>{}, sequence<1>{})),
make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
sequence<0, 1>{});
@@ -122,25 +124,25 @@ struct Reduce
[&](auto I) {
// Calculate stride for dimension I as product of all following dimensions
index_t stride = 1;
static_for<I + 1, kept_dim.size(), 1>{}(
static_for<I + 1, Problem::KeptDim::size(), 1>{}(
[&](auto J) { stride *= kept_lens.at(number<J>{}); });
return stride;
},
number<kept_dim.size()>{});
number<Problem::KeptDim::size()>{});
}();
// Calculate optimal vector size for output tensor
constexpr auto y_tensor_vector_size = CalculateOutputVectorSize();
const auto y_m = make_naive_tensor_view<address_space_enum::global>(
p_y, kept_lens, kept_strides, number<y_tensor_vector_size>{}, number<1>{});
p_y, kept_lens, kept_strides, number<y_tensor_vector_size>{});
// Transform output tensor to 1D merged view
// This creates a view compatible with the 2D reduction pattern
const auto y_merged = transform_tensor_view(
y_m,
make_tuple(kept_merge_transform),
make_tuple(typename arithmetic_sequence_gen<0, kept_dim.size(), 1>::type{}),
make_tuple(typename arithmetic_sequence_gen<0, Problem::KeptDim::size(), 1>::type{}),
make_tuple(sequence<0>{}));
auto x_window = make_tile_window(transformed_x_tensor,
@@ -179,49 +181,6 @@ struct Reduce
store_tile(y_window, cast_tile<YDataType>(y_compute));
}
/// @brief Validates if the given arguments are supported by the 2D reduction kernel.
///
/// @param y_continous_dim Size of the continuous dimension of the output tensor.
/// Must be a multiple of ThreadTile_N for proper thread mapping.
///
/// @param input_strides The stride configuration of the input tensor.
/// The last stride must be 1 to ensure contiguous memory access
/// and enable efficient vectorized loads.
///
/// @return true if the arguments are supported, false otherwise.
/// Error messages are logged when CK_TILE_LOGGING is enabled.
///
/// @note Requirements:
/// - y_continous_dim % ThreadTile_N == 0 (for proper thread distribution)
/// - input_strides[-1] == 1 (for contiguous memory access)
template <typename InputStrides>
CK_TILE_HOST static bool IsSupportedArgument(index_t y_continous_dim,
InputStrides input_strides)
{
using S = typename Problem::BlockShape;
if(y_continous_dim % S::ThreadTile_N != 0)
{
if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
{
CK_TILE_ERROR("Total reduction size should be a multiple of ThreadTile_N!");
}
return false;
}
if(input_strides.at(number<input_strides.size() - 1>{}) != 1)
{
if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
{
CK_TILE_ERROR(
"Input tensor's last stride must be 1 to support correct vector access!");
}
return false;
}
return true;
}
};
} // namespace ck_tile

View File

@@ -12,6 +12,9 @@ template <typename XDataType_,
typename YDataType_,
typename BlockShape_,
typename ReduceOp_,
typename KeptDim_,
typename ReduceDims_,
index_t Rank_,
bool OutputIndex_ = false>
struct Reduce2dProblem
{
@@ -20,7 +23,11 @@ struct Reduce2dProblem
using YDataType = remove_cvref_t<YDataType_>;
using BlockShape = remove_cvref_t<BlockShape_>;
using ReduceOp = ReduceOp_;
using KeptDim = remove_cvref_t<KeptDim_>;
using ReduceDims = remove_cvref_t<ReduceDims_>;
static constexpr index_t Rank = Rank_;
static constexpr index_t NumReduceDim = ReduceDims::size();
static constexpr bool kOutputIndex = OutputIndex_;
static constexpr bool kNeedCrossLaneSync = BlockShape::ThreadPerWarp_N > 1;
static constexpr bool kNeedCrossWarpSync = BlockShape::WarpPerBlock_N > 1;