comments resolution

2026-06-30 19:57:40 +00:00 · 2025-07-24 17:59:43 +00:00
parent f515d29036
commit 295cf136a0
8 changed files with 354 additions and 176 deletions
--- a/example/ck_tile/05_reduce/reduce.cpp
+++ b/example/ck_tile/05_reduce/reduce.cpp
@@ -1,3 +1,6 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
 #include "ck_tile/host.hpp"
 #include "ck_tile/ops/reduce.hpp"
 #include <cstring>
@@ -5,9 +8,10 @@
 auto create_args(int argc, char* argv[])
 {
    ck_tile::ArgParser arg_parser;
-    arg_parser.insert("m", "2048", "m dimension")
-        .insert("n", "1024", "n dimension")
-        .insert("k", "2", "k dimension")
+    arg_parser.insert("n", "32", "n dimension")
+        .insert("h", "7", "h dimension")
+        .insert("w", "7", "w dimension")
+        .insert("c", "512", "c dimension")
        .insert("v", "1", "cpu validation or not")
        .insert("prec", "fp16", "precision")
        .insert("warmup", "0", "cold iter")
@@ -24,27 +28,28 @@ bool run(const ck_tile::ArgParser& arg_parser)
    using ComputeDataType = float;
    using YDataType       = DataType;

-    ck_tile::index_t m = arg_parser.get_int("m");
-    ck_tile::index_t n = arg_parser.get_int("n");
-    ck_tile::index_t k = arg_parser.get_int("k");
+    ck_tile::index_t N = arg_parser.get_int("n");
+    ck_tile::index_t H = arg_parser.get_int("h");
+    ck_tile::index_t W = arg_parser.get_int("w");
+    ck_tile::index_t C = arg_parser.get_int("c");
    int do_validation  = arg_parser.get_int("v");
    int warmup         = arg_parser.get_int("warmup");
    int repeat         = arg_parser.get_int("repeat");

-    std::vector<ck_tile::index_t> problem_shape = {m, n, k};
-    std::vector<ck_tile::index_t> strides(3);
-    strides[0] = n * k;
-    strides[1] = k;
-    strides[2] = 1;
+    std::vector<ck_tile::index_t> problem_shape = {N, H, W, C};
+    std::vector<ck_tile::index_t> strides(4);
+    strides[0] = H * W * C;
+    strides[1] = W * C;
+    strides[2] = C;
+    strides[3] = 1;

    // Define reduction specification:
-    // dimension 0 is kept, dimensions 1,2 are reduced
-    constexpr auto kept_dim    = ck_tile::sequence<0>{}; // Which dimension to keep (pass-through)
-    constexpr auto reduce_dims = ck_tile::sequence<1, 2>{}; // Which dimensions to reduce (merge)
+    constexpr auto kept_dim    = ck_tile::sequence<0, 3>{}; // Which dimension to keep
+    constexpr auto reduce_dims = ck_tile::sequence<1, 2>{}; // Which dimensions to reduce

    ck_tile::HostTensor<XDataType> x_host(problem_shape, strides);
-    ck_tile::HostTensor<YDataType> y_host_ref({problem_shape[kept_dim.at(0)]}, {1});
-    ck_tile::HostTensor<YDataType> y_host_dev({problem_shape[kept_dim.at(0)]}, {1});
+    ck_tile::HostTensor<YDataType> y_host_ref({N,C}, {C,1});
+    ck_tile::HostTensor<YDataType> y_host_dev({N,C}, {C,1});

    ck_tile::FillUniformDistribution<XDataType>{-5.f, 5.f}(x_host);

@@ -67,8 +72,9 @@ bool run(const ck_tile::ArgParser& arg_parser)

    constexpr ck_tile::index_t kBlockSize  = 256;
    constexpr ck_tile::index_t kBlockPerCu = 1;
+    ck_tile::index_t kept_dim_len_prod = N*C;
    ck_tile::index_t kGridSize =
-        (problem_shape[kept_dim.at(0)] + BlockTile::at(ck_tile::number<0>{}) - 1) /
+        (kept_dim_len_prod + BlockTile::at(ck_tile::number<0>{}) - 1) /
        BlockTile::at(ck_tile::number<0>{});
    std::cout << "grid size " << kGridSize << std::endl;

@@ -79,10 +85,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
    using Kernel = ck_tile::Reduce<Porblem>;

    // Create input tensor shape and strides
-    auto input_shape   = ck_tile::make_tuple(problem_shape[0], problem_shape[1], problem_shape[2]);
-    auto input_strides = ck_tile::make_tuple(strides[0], strides[1], strides[2]);
+    auto input_shape   = ck_tile::make_tuple(problem_shape[0], problem_shape[1], problem_shape[2], problem_shape[3]);
+    auto input_strides = ck_tile::make_tuple(strides[0], strides[1], strides[2], strides[3]);

-    if(!Kernel::IsSupportedArgument(arg_parser))
+    if(!Kernel::IsSupportedArgument(C)) //output tensor's continuous dimension
    {
        throw std::runtime_error("Wrong! Arguments not supported!\n");
    }
@@ -101,7 +107,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
                                       reduce_dims));

    std::size_t num_btype =
-        sizeof(XDataType) * m * n * k + sizeof(YDataType) * problem_shape[kept_dim.at(0)];
+        sizeof(XDataType) * N * C * H * W + sizeof(YDataType) * N * C;

    float gb_per_sec = num_btype / 1.E6 / ave_time;

@@ -114,7 +120,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
        // reference
        ck_tile::reference_reduce<XDataType, ComputeDataType, YDataType>(
            x_host, y_host_ref, ReduceOp{}, kept_dim, reduce_dims);
-        y_buf.FromDevice(y_host_dev.mData.data());
+        y_buf.FromDevice(y_host_dev.mData.data());   
        pass = ck_tile::check_err(y_host_dev, y_host_ref);

        std::cout << "valid:" << (pass ? "y" : "n") << std::flush << std::endl;
--- a/include/ck_tile/core/utility/reduce_operator.hpp
+++ b/include/ck_tile/core/utility/reduce_operator.hpp
@@ -26,7 +26,8 @@ struct Add
    }

    template <typename T,
-              typename = std::enable_if_t<std::is_same_v<T, half_t> || std::is_same_v<T, bf16_t>>>
+              typename = std::enable_if_t<std::is_same_v<T, half_t> || std::is_same_v<T, bf16_t> ||
+                                          std::is_same_v<T, fp8_t> || std::is_same_v<T, bf8_t>>>
    CK_TILE_HOST_DEVICE constexpr T operator()(T& y, T x) const
    {
        float y_ = type_convert<float>(y);
@@ -54,6 +55,16 @@ struct SquareAdd
        return y + (x * x);
    }

+    template <typename T,
+              typename = std::enable_if_t<std::is_same_v<T, half_t> || std::is_same_v<T, bf16_t> ||
+                                          std::is_same_v<T, fp8_t> || std::is_same_v<T, bf8_t>>>
+    CK_TILE_HOST_DEVICE constexpr T operator()(T& y, T x) const
+    {
+        float y_ = type_convert<float>(y);
+        float x_ = type_convert<float>(x);
+        return type_convert<T>(y_ + (x_ * x_));
+    }
+
    // For combining partial results
    template <typename T,
              typename = std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double> ||
@@ -64,6 +75,16 @@ struct SquareAdd
        return partial1 + partial2; // Just add the partial sums, don't square again
    }

+    template <typename T,
+              typename = std::enable_if_t<std::is_same_v<T, half_t> || std::is_same_v<T, bf16_t> ||
+                                          std::is_same_v<T, fp8_t> || std::is_same_v<T, bf8_t>>>
+    CK_TILE_HOST_DEVICE constexpr T combine_partial_results(T& partial1, T& partial2) const
+    {
+        float partial1_ = type_convert<float>(partial1);
+        float partial2_ = type_convert<float>(partial2);
+        return type_convert<T>(partial1_ + partial2_);
+    }
+
    static constexpr bool requires_special_combine = true;
 };

@@ -71,7 +92,9 @@ struct Max
 {
    template <typename T,
              typename = std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double> ||
-                                          std::is_same_v<T, int32_t> || std::is_same_v<T, int8_t>>>
+                                          std::is_same_v<T, int32_t> || std::is_same_v<T, int8_t> ||
+                                          std::is_same_v<T, half_t> || std::is_same_v<T, bf16_t> ||
+                                          std::is_same_v<T, fp8_t> || std::is_same_v<T, bf8_t>>>
    CK_TILE_HOST_DEVICE static constexpr T GetIdentityValue()
    {
        return numeric<T>::min();
@@ -79,7 +102,9 @@ struct Max

    template <typename T,
              typename = std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double> ||
-                                          std::is_same_v<T, int32_t> || std::is_same_v<T, int8_t>>>
+                                          std::is_same_v<T, int32_t> || std::is_same_v<T, int8_t> ||
+                                          std::is_same_v<T, half_t> || std::is_same_v<T, bf16_t> ||
+                                          std::is_same_v<T, fp8_t> || std::is_same_v<T, bf8_t>>>
    CK_TILE_HOST_DEVICE constexpr T operator()(const T& y, const T x) const
    {
        return max(y, x);
@@ -92,7 +117,9 @@ struct AbsMax
 {
    template <typename T,
              typename = std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double> ||
-                                          std::is_same_v<T, int32_t> || std::is_same_v<T, int8_t>>>
+                                          std::is_same_v<T, int32_t> || std::is_same_v<T, int8_t> ||
+                                          std::is_same_v<T, half_t> || std::is_same_v<T, bf16_t> ||
+                                          std::is_same_v<T, fp8_t> || std::is_same_v<T, bf8_t>>>
    CK_TILE_HOST_DEVICE static constexpr T GetIdentityValue()
    {
        return numeric<T>::min();
@@ -100,7 +127,9 @@ struct AbsMax

    template <typename T,
              typename = std::enable_if_t<std::is_same_v<T, float> || std::is_same_v<T, double> ||
-                                          std::is_same_v<T, int32_t> || std::is_same_v<T, int8_t>>>
+                                          std::is_same_v<T, int32_t> || std::is_same_v<T, int8_t> ||
+                                          std::is_same_v<T, half_t> || std::is_same_v<T, bf16_t> ||
+                                          std::is_same_v<T, fp8_t> || std::is_same_v<T, bf8_t>>>
    CK_TILE_HOST_DEVICE constexpr T operator()(const T& y, const T x) const
    {
        return max(y, abs(x));
--- a/include/ck_tile/host/reference/reference_reduce.hpp
+++ b/include/ck_tile/host/reference/reference_reduce.hpp
@@ -45,45 +45,76 @@ CK_TILE_HOST void reference_reduce(const HostTensor<XDataType>& x_tensor,
                                   ReduceDims reduce_dims)
 {
    const auto& x_lengths = x_tensor.mDesc.get_lengths();
-    const auto kept_len   = x_lengths[kept_dim.at(0)];
+    const auto& x_strides = x_tensor.mDesc.get_strides();
+    const auto& y_strides = y_tensor.mDesc.get_strides();
+    
+    // Calculate total kept elements (product of all kept dimension lengths)
+    index_t total_kept_elements = 1;
+    static_for<0, kept_dim.size(), 1>{}(
+        [&](auto i) { total_kept_elements *= x_lengths[kept_dim.at(i)]; });

-    // Calculate total reduce elements
+    // Calculate total reduce elements (product of all reduce dimension lengths)
    index_t total_reduce_elements = 1;
    static_for<0, reduce_dims.size(), 1>{}(
        [&](auto i) { total_reduce_elements *= x_lengths[reduce_dims.at(i)]; });

-    auto f = [&](auto kept_idx) {
+    auto f = [&](auto linear_kept_idx) {
        ComputeDataType v_acc = reduce_op.template GetIdentityValue<ComputeDataType>();

+        // Convert linear kept index to multi-dimensional kept indices
+        std::vector<index_t> kept_indices(kept_dim.size());
+        index_t temp_kept = linear_kept_idx;
+        static_for<0, kept_dim.size(), 1>{}([&](auto i) {
+            constexpr auto dim_idx = kept_dim.size() - 1 - i;
+            constexpr auto dim = kept_dim.at(dim_idx);
+            const auto len = x_lengths[dim];
+            kept_indices[dim_idx] = temp_kept % len;
+            temp_kept /= len;
+        });
+
        for(index_t reduce_idx = 0; reduce_idx < total_reduce_elements; ++reduce_idx)
        {
-            // Convert linear index to multi-dimensional indices
-            std::vector<index_t> indices(x_lengths.size(), 0);
-            indices[kept_dim.at(0)] = kept_idx;
-
-            index_t temp = reduce_idx;
+            // Convert linear reduce index to multi-dimensional reduce indices
+            std::vector<index_t> reduce_indices(reduce_dims.size());
+            index_t temp_reduce = reduce_idx;
            static_for<0, reduce_dims.size(), 1>{}([&](auto i) {
-                constexpr auto dim = reduce_dims.at(reduce_dims.size() - 1 - i);
-                const auto len     = x_lengths[dim];
-                indices[dim]       = temp % len;
-                temp /= len;
+                constexpr auto dim_idx = reduce_dims.size() - 1 - i;
+                constexpr auto dim = reduce_dims.at(dim_idx);
+                const auto len = x_lengths[dim];
+                reduce_indices[dim_idx] = temp_reduce % len;
+                temp_reduce /= len;
            });

-            // Flat tensor access
-            index_t flat_idx    = 0;
-            const auto& strides = x_tensor.mDesc.get_strides();
-            for(size_t d = 0; d < indices.size(); ++d)
+            // Build full input tensor indices by combining kept and reduce indices
+            std::vector<index_t> full_indices(x_lengths.size(), 0);
+            static_for<0, kept_dim.size(), 1>{}([&](auto i) {
+                full_indices[kept_dim.at(i)] = kept_indices[i];
+            });
+            static_for<0, reduce_dims.size(), 1>{}([&](auto i) {
+                full_indices[reduce_dims.at(i)] = reduce_indices[i];
+            });
+
+            // Calculate flat input tensor index
+            index_t flat_x_idx = 0;
+            for(size_t d = 0; d < full_indices.size(); ++d)
            {
-                flat_idx += indices[d] * strides[d];
+                flat_x_idx += full_indices[d] * x_strides[d];
            }
-            const auto v_a = type_convert<ComputeDataType>(x_tensor.mData[flat_idx]);
+            const auto v_a = type_convert<ComputeDataType>(x_tensor.mData[flat_x_idx]);

            v_acc = reduce_op(v_acc, v_a);
        }

-        y_tensor(kept_idx) = type_convert<YDataType>(v_acc);
+        // Calculate output tensor index using kept indices and output strides
+        // The output tensor has the same structure as the kept dimensions
+        index_t flat_y_idx = 0;
+        static_for<0, kept_dim.size(), 1>{}([&](auto i) {
+            flat_y_idx += kept_indices[i] * y_strides[i];
+        });
+
+        y_tensor.mData[flat_y_idx] = type_convert<YDataType>(v_acc);
    };

-    make_ParallelTensorFunctor(f, kept_len)(std::thread::hardware_concurrency());
+    make_ParallelTensorFunctor(f, total_kept_elements)(std::thread::hardware_concurrency());
 }
 } // namespace ck_tile
--- a/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp
+++ b/include/ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -25,53 +25,6 @@ struct Reduce
    using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
    using YDataType       = ck_tile::remove_cvref_t<typename Problem::YDataType>;

-#if 0
-    CK_TILE_DEVICE void operator()(const XDataType* p_x, YDataType* p_y, index_t M, index_t N)
-    const
-    {
-        using S = typename Problem::BlockShape;
-
-        const auto x_m_n = make_naive_tensor_view<address_space_enum::global>(
-            p_x, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
-
-        const auto y_m = make_naive_tensor_view_packed<address_space_enum::global>(
-            p_y, make_tuple(M), number<1>{});
-
-        const auto iM = get_block_id() * S::Block_M;
-
-        auto x_window = make_tile_window(x_m_n,
-                                         make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
-                                         {iM, 0},
-                                         Policy::template MakeXBlockTileDistribution<Problem>());
-
-        auto y_window = make_tile_window(y_m, make_tuple(number<S::Block_M>{}), {iM});
-
-        const auto f_reduce = [](const auto& v0, const auto& v1) { return v0 + v1; };
-
-        const XDataType reduce_init_value = 0;
-
-        constexpr auto reduce_dims = sequence<1>{};
-
-        auto y_compute = decltype(block_tile_reduce<ComputeDataType>(
-            load_tile(x_window), reduce_dims, f_reduce, reduce_init_value)){};
-
-        set_tile(y_compute, reduce_init_value);
-
-        index_t num_n_tile_iteration =
-            __builtin_amdgcn_readfirstlane(integer_divide_ceil(N, S::Block_N));
-
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
-        {
-            const auto x = load_tile(x_window);
-            block_tile_reduce(y_compute, x, reduce_dims, f_reduce);
-            move_tile_window(x_window, {0, S::Block_N});
-        }
-
-        block_tile_reduce_sync(y_compute, f_reduce);
-
-        store_tile(y_window, cast_tile<YDataType>(y_compute));
-    }
-#else
    template <typename InputShape, typename InputStrides, typename KeptDim, typename ReduceDims>
    CK_TILE_DEVICE void operator()(const XDataType* p_x,
                                   YDataType* p_y,
@@ -83,26 +36,31 @@ struct Reduce
        using S       = typename Problem::BlockShape;
        const auto iM = get_block_id() * S::Block_M;

+        static_assert(kept_dim.size() + reduce_dims.size() == InputShape::size(), 
+                      "Size of kept dimensions + reduced dimensions must equal input tensor rank");
+
        // Extract lengths based on kept and reduced dimensions
-        const auto kept_len    = input_shape.at(number<kept_dim.at(0)>{});
+        const auto kept_lens = [&]() {
+            return generate_tuple(
+                [&](auto I) { return input_shape.at(number<kept_dim.at(I)>{}); },
+                number<kept_dim.size()>{});
+        }();
        const auto reduce_lens = [&]() {
            return generate_tuple(
                [&](auto I) { return input_shape.at(number<reduce_dims.at(I)>{}); },
                number<reduce_dims.size()>{});
        }();

-        // Create transforms
-        const auto pass_through_transform = make_pass_through_transform(kept_len);
-        const auto merge_transform        = make_merge_transform(reduce_lens);
+        const auto kept_merge_transform   = make_merge_transform(kept_lens);
+        const auto reduce_merge_transform = make_merge_transform(reduce_lens);

        auto reduce_func = typename Problem::ReduceOp{};
        const XDataType custom_padding_value =
            type_convert<XDataType>(reduce_func.template GetIdentityValue<ComputeDataType>());

        // Create input tensor view with custom padding value
-        // First create the descriptor
        auto desc = make_naive_tensor_descriptor(
-            input_shape, input_strides, number<S::Vector_N>{}, number<1>{});
+            input_shape, input_strides, number<S::ThreadTile_N>{}, number<1>{});

        // Create buffer view with custom padding value
        auto buffer_view = make_buffer_view<address_space_enum::global>(
@@ -112,21 +70,43 @@ struct Reduce
        const auto x_tensor = tensor_view<decltype(buffer_view), decltype(desc)>{buffer_view, desc};
        const auto transformed_x_tensor = pad_tensor_view(
            transform_tensor_view(x_tensor,
-                                  ck_tile::make_tuple(pass_through_transform, merge_transform),
-                                  ck_tile::make_tuple(kept_dim, reduce_dims),
-                                  ck_tile::make_tuple(sequence<0>{}, sequence<1>{})),
+                                  make_tuple(kept_merge_transform, reduce_merge_transform),
+                                  make_tuple(kept_dim, reduce_dims),
+                                  make_tuple(sequence<0>{}, sequence<1>{})),
            make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
            sequence<0, 1>{});

-        const auto y_m = make_naive_tensor_view_packed<address_space_enum::global>(
-            p_y, make_tuple(kept_len), number<1>{});
+        // Calculate strides for output tensor based on its own dimensions
+        const auto kept_strides = [&]() {
+            return generate_tuple(
+                [&](auto I) {
+                    // Calculate stride for dimension I as product of all following dimensions
+                    index_t stride = 1;
+                    static_for<I + 1, kept_dim.size(), 1>{}([&](auto J) {
+                        stride *= kept_lens.at(number<J>{});
+                    });
+                    return stride;
+                },
+                number<kept_dim.size()>{});
+        }();
+
+        const auto y_m = make_naive_tensor_view<address_space_enum::global>(
+            p_y, kept_lens, kept_strides, number<16 / sizeof(YDataType)>{}, number<1>{});
+
+        // Transform output tensor to 1D merged view
+        // This creates a view compatible with the 2D reduction pattern
+        const auto y_merged = transform_tensor_view(
+            y_m,
+            make_tuple(kept_merge_transform),
+            make_tuple(typename arithmetic_sequence_gen<0, kept_dim.size(), 1>::type{}),
+            make_tuple(sequence<0>{}));

        auto x_window = make_tile_window(transformed_x_tensor,
                                         make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
                                         {iM, 0},
                                         Policy::template MakeXBlockTileDistribution<Problem>());

-        auto y_window = make_tile_window(y_m, make_tuple(number<S::Block_M>{}), {iM});
+        auto y_window = make_tile_window(y_merged, make_tuple(number<S::Block_M>{}), {iM});

        __shared__ char smem[Policy::template GetSmemSize<Problem>()];

@@ -158,22 +138,22 @@ struct Reduce
        store_tile(y_window, cast_tile<YDataType>(y_compute));
    }

-    template <typename ArgParser>
-    CK_TILE_HOST static bool IsSupportedArgument(const ArgParser& arg_parser)
+    CK_TILE_HOST static bool IsSupportedArgument(index_t y_continous_dim)
    {
        using S = typename Problem::BlockShape;
-        if(arg_parser.get_int("n") % S::Vector_N != 0)
+
+        if(y_continous_dim % S::ThreadTile_N != 0)
        {
            if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
            {
-                CK_TILE_ERROR("Size of n dimension should be a multiple of Vector_N !");
+                CK_TILE_ERROR("Total reduction size should be a multiple of ThreadTile_N!");
            }
            return false;
        }
+
        return true;
    }

-#endif
 };

 } // namespace ck_tile
--- a/include/ck_tile/ops/reduce/pipeline/reduce2d_default_policy.hpp
+++ b/include/ck_tile/ops/reduce/pipeline/reduce2d_default_policy.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -18,8 +18,8 @@ struct Reduce2dDefaultPolicy
        return make_static_tile_distribution(
            tile_distribution_encoding<
                sequence<>,
-                tuple<sequence<S::Repeat_M, S::WarpPerBlock_M, S::ThreadPerWarp_M, S::Vector_M>,
-                      sequence<S::Repeat_N, S::WarpPerBlock_N, S::ThreadPerWarp_N, S::Vector_N>>,
+                tuple<sequence<S::Repeat_M, S::WarpPerBlock_M, S::ThreadPerWarp_M, S::ThreadTile_M>,
+                      sequence<S::Repeat_N, S::WarpPerBlock_N, S::ThreadPerWarp_N, S::ThreadTile_N>>,
                tuple<sequence<1, 2>, sequence<1, 2>>,
                tuple<sequence<1, 1>, sequence<2, 2>>,
                sequence<1, 1, 2, 2>,
--- a/include/ck_tile/ops/reduce/pipeline/reduce2d_problem.hpp
+++ b/include/ck_tile/ops/reduce/pipeline/reduce2d_problem.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

--- a/include/ck_tile/ops/reduce/pipeline/reduce2d_shape.hpp
+++ b/include/ck_tile/ops/reduce/pipeline/reduce2d_shape.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -10,7 +10,7 @@ namespace ck_tile {
 template <typename BlockWarps, // num warps along seq<M, N>
          typename BlockTile,  // block size, seq<M, N>
          typename WarpTile,   // warp size, seq<M, N>
-          typename Vector>     // contiguous pixels(vector size) along seq<M, N>
+          typename ThreadTile>     // contiguous pixels(vector size) along seq<M, N>
 struct Reduce2dShape
 {
    static constexpr index_t Block_M = BlockTile::at(number<0>{});
@@ -19,14 +19,14 @@ struct Reduce2dShape
    static constexpr index_t Warp_M = WarpTile::at(number<0>{});
    static constexpr index_t Warp_N = WarpTile::at(number<1>{});

-    static constexpr index_t Vector_M = Vector::at(number<0>{});
-    static constexpr index_t Vector_N = Vector::at(number<1>{});
+    static constexpr index_t ThreadTile_M = ThreadTile::at(number<0>{});
+    static constexpr index_t ThreadTile_N = ThreadTile::at(number<1>{});

    static constexpr index_t WarpPerBlock_M = BlockWarps::at(number<0>{});
    static constexpr index_t WarpPerBlock_N = BlockWarps::at(number<1>{});

-    static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M;
-    static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N;
+    static constexpr index_t ThreadPerWarp_M = Warp_M / ThreadTile_M;
+    static constexpr index_t ThreadPerWarp_N = Warp_N / ThreadTile_N;

    static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M);
    static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N);
--- a/test/ck_tile/reduce/test_reduce2d.cpp
+++ b/test/ck_tile/reduce/test_reduce2d.cpp
@@ -11,14 +11,9 @@
 #include "ck_tile/host.hpp"
 #include "ck_tile/ops/reduce.hpp"
 #include "ck_tile/host/kernel_launch.hpp"
-#include "ck_tile/ops/reduce/kernel/reduce2d_kernel.hpp"
-#include "ck_tile/ops/reduce/pipeline/reduce2d_problem.hpp"
-#include "ck_tile/ops/reduce/pipeline/reduce2d_default_policy.hpp"
-#include "ck_tile/ops/reduce/pipeline/reduce2d_shape.hpp"
-#include "ck_tile/host/reference/reference_reduce.hpp"

 template <typename Tuple>
-class TestCkTileReduce2d : public ::testing::Test
+class TestCkTileReduce : public ::testing::Test
 {
    protected:
    using XDataType       = std::tuple_element_t<0, Tuple>;
@@ -28,25 +23,23 @@ class TestCkTileReduce2d : public ::testing::Test
    using BlockWarps_     = std::tuple_element_t<4, Tuple>;
    using BlockTile_      = std::tuple_element_t<5, Tuple>;
    using WarpTile_       = std::tuple_element_t<6, Tuple>;
-    using Vector_         = std::tuple_element_t<7, Tuple>;
+    using ThreadTile_     = std::tuple_element_t<7, Tuple>;

-    using TestReduce2dShape = ck_tile::Reduce2dShape<BlockWarps_, BlockTile_, WarpTile_, Vector_>;
+    using TestReduce2dShape = ck_tile::Reduce2dShape<BlockWarps_, BlockTile_, WarpTile_, ThreadTile_>;

-    void RunTest(ck_tile::index_t m, ck_tile::index_t n, ck_tile::index_t k)
+    template<std::size_t InputDim, typename KeptDimSeq, typename ReduceDimSeq>
+    void RunGenericTest(const std::vector<ck_tile::index_t>& input_shape, 
+                       const std::vector<ck_tile::index_t>& input_strides,
+                       const std::vector<ck_tile::index_t>& output_shape,
+                       const std::vector<ck_tile::index_t>& output_strides,
+                       ck_tile::index_t kept_dim_len_prod,
+                       ck_tile::index_t total_reduce_elements,
+                       KeptDimSeq kept_dims, 
+                       ReduceDimSeq reduce_dims)
    {
-        // Problem shape: 3D tensor [M, N, K] -> reduce along [N, K] -> output [M]
-        std::vector<ck_tile::index_t> problem_shape = {m, n, k};
-        std::vector<ck_tile::index_t> strides(3);
-        strides[0] = n * k; // M stride
-        strides[1] = k;     // N stride
-        strides[2] = 1;     // K stride
-
-        constexpr auto kept_dim    = ck_tile::sequence<0>{};
-        constexpr auto reduce_dims = ck_tile::sequence<1, 2>{};
-
-        ck_tile::HostTensor<XDataType> h_x(problem_shape, strides);
-        ck_tile::HostTensor<YDataType> h_y({problem_shape[kept_dim.at(0)]}, {1});
-        ck_tile::HostTensor<YDataType> h_y_ref({problem_shape[kept_dim.at(0)]}, {1});
+        ck_tile::HostTensor<XDataType> h_x(input_shape, input_strides);
+        ck_tile::HostTensor<YDataType> h_y(output_shape, output_strides);
+        ck_tile::HostTensor<YDataType> h_y_ref(output_shape, output_strides);

        ck_tile::FillUniformDistribution<XDataType>{-5.f, 5.f}(h_x);
        h_y.SetZero();
@@ -67,36 +60,47 @@ class TestCkTileReduce2d : public ::testing::Test
        // Launch configuration
        constexpr ck_tile::index_t kBlockSize  = 256;
        constexpr ck_tile::index_t kBlockPerCu = 1;
+        
        ck_tile::index_t kGridSize =
-            (problem_shape[kept_dim.at(0)] + TestReduce2dShape::Block_M - 1) /
+            (kept_dim_len_prod + TestReduce2dShape::Block_M - 1) /
            TestReduce2dShape::Block_M;

-        auto input_shape =
-            ck_tile::make_tuple(problem_shape[0], problem_shape[1], problem_shape[2]);
-        auto input_strides = ck_tile::make_tuple(strides[0], strides[1], strides[2]);
+        // Generic helper to create tuple from vector based on compile-time size
+        auto make_shape_tuple = []<std::size_t N>(const std::vector<ck_tile::index_t>& vec) {
+            return [&vec]<std::size_t... I>(std::index_sequence<I...>) {
+                return ck_tile::make_tuple(vec[I]...);
+            }(std::make_index_sequence<N>{});
+        };
+
+        auto input_shape_tuple = make_shape_tuple.template operator()<InputDim>(input_shape);
+        auto input_strides_tuple = make_shape_tuple.template operator()<InputDim>(input_strides);
+
+        if(!Kernel::IsSupportedArgument(output_shape[output_shape.size()-1])) //output tensor's continuous dimension
+        {
+            throw std::runtime_error("Wrong! Arguments not supported!\n");
+        }

        ck_tile::launch_kernel(ck_tile::stream_config{nullptr, false, 0},
-                               ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
-                                   Kernel{},
-                                   kGridSize,
-                                   kBlockSize,
-                                   0,
-                                   static_cast<XDataType*>(d_x_mem.GetDeviceBuffer()),
-                                   static_cast<YDataType*>(d_y_mem.GetDeviceBuffer()),
-                                   input_shape,
-                                   input_strides,
-                                   kept_dim,
-                                   reduce_dims));
+                                ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
+                                    Kernel{},
+                                    kGridSize,
+                                    kBlockSize,
+                                    0,
+                                    static_cast<XDataType*>(d_x_mem.GetDeviceBuffer()),
+                                    static_cast<YDataType*>(d_y_mem.GetDeviceBuffer()),
+                                    input_shape_tuple,
+                                    input_strides_tuple,
+                                    kept_dims,
+                                    reduce_dims));

        // Get results back
        d_y_mem.FromDevice(h_y.data());

        // Reference computation
        ck_tile::reference_reduce<XDataType, ComputeDataType, YDataType>(
-            h_x, h_y_ref, ReduceOpType{}, kept_dim, reduce_dims);
+            h_x, h_y_ref, ReduceOpType{}, kept_dims, reduce_dims);

        // Calculate proper error thresholds based on data types and number of accumulations
-        const auto total_reduce_elements = n * k;
        const auto rtol = ck_tile::get_relative_threshold<XDataType, YDataType, ComputeDataType>(
            total_reduce_elements);
        const auto atol = ck_tile::get_absolute_threshold<XDataType, YDataType, ComputeDataType>(
@@ -107,23 +111,126 @@ class TestCkTileReduce2d : public ::testing::Test
        EXPECT_TRUE(result);
    }

-    void RunTest2D(ck_tile::index_t m, ck_tile::index_t n)
+    // Convenience functions for specific dimensional patterns
+    void RunTest2D_KeepDim0_ReduceDim1(ck_tile::index_t dim0, ck_tile::index_t dim1)
    {
-        // 2D case: [M, N] -> reduce along [N] -> output [M]
-        RunTest(m, n, 1);
+        constexpr auto kept_dims = ck_tile::sequence<0>{};
+        constexpr auto reduce_dims = ck_tile::sequence<1>{};
+        
+        // Input shape and strides
+        std::vector<ck_tile::index_t> input_shape = {dim0, dim1};
+        std::vector<ck_tile::index_t> input_strides = {dim1, 1};
+        
+        // Output shape and strides (keep dim0)
+        std::vector<ck_tile::index_t> output_shape = {dim0};
+        std::vector<ck_tile::index_t> output_strides = {1};
+        
+        // Calculate products
+        ck_tile::index_t kept_dim_len_prod = dim0;
+        ck_tile::index_t total_reduce_elements = dim1;
+        
+        RunGenericTest<2>(input_shape, input_strides, output_shape, output_strides,
+                         kept_dim_len_prod, total_reduce_elements, kept_dims, reduce_dims);
    }
+
+    void RunTest3D_KeepDim0_ReduceDim12(ck_tile::index_t dim0, ck_tile::index_t dim1, ck_tile::index_t dim2)
+    {
+        constexpr auto kept_dims = ck_tile::sequence<0>{};
+        constexpr auto reduce_dims = ck_tile::sequence<1, 2>{};
+        
+        // Input shape and strides
+        std::vector<ck_tile::index_t> input_shape = {dim0, dim1, dim2};
+        std::vector<ck_tile::index_t> input_strides = {dim1 * dim2, dim2, 1};
+        
+        // Output shape and strides (keep dim0)
+        std::vector<ck_tile::index_t> output_shape = {dim0};
+        std::vector<ck_tile::index_t> output_strides = {1};
+        
+        // Calculate products
+        ck_tile::index_t kept_dim_len_prod = dim0;  // product of kept dimensions
+        ck_tile::index_t total_reduce_elements = dim1 * dim2;  // product of reduced dimensions
+        
+        RunGenericTest<3>(input_shape, input_strides, output_shape, output_strides,
+                         kept_dim_len_prod, total_reduce_elements, kept_dims, reduce_dims);
+    }
+
+    void RunTest3D_KeepDim01_ReduceDim2(ck_tile::index_t dim0, ck_tile::index_t dim1, ck_tile::index_t dim2)
+    {
+        constexpr auto kept_dims = ck_tile::sequence<0,1>{};
+        constexpr auto reduce_dims = ck_tile::sequence<2>{};
+        
+        // Input shape and strides
+        std::vector<ck_tile::index_t> input_shape = {dim0, dim1, dim2};
+        std::vector<ck_tile::index_t> input_strides = {dim1 * dim2, dim2, 1};
+        
+        // Output shape and strides (keep dim0)
+        std::vector<ck_tile::index_t> output_shape = {dim0, dim1};
+        std::vector<ck_tile::index_t> output_strides = {dim1, 1};
+        
+        // Calculate products
+        ck_tile::index_t kept_dim_len_prod = dim0 * dim1;  // product of kept dimensions
+        ck_tile::index_t total_reduce_elements = dim2;  // product of reduced dimensions
+        
+        RunGenericTest<3>(input_shape, input_strides, output_shape, output_strides,
+                         kept_dim_len_prod, total_reduce_elements, kept_dims, reduce_dims);
+    }
+
+
+
+    void RunTest4D_KeepDim01_ReduceDim23(ck_tile::index_t N, ck_tile::index_t C, ck_tile::index_t H, ck_tile::index_t W)
+    {
+        constexpr auto kept_dims = ck_tile::sequence<0, 1>{};
+        constexpr auto reduce_dims = ck_tile::sequence<2, 3>{};
+        
+        // Input shape and strides
+        std::vector<ck_tile::index_t> input_shape = {N, C, H, W};
+        std::vector<ck_tile::index_t> input_strides = {C * H * W, H * W, W, 1};
+        
+        // Output shape and strides (keep dim0, dim1)
+        std::vector<ck_tile::index_t> output_shape = {N, C};
+        std::vector<ck_tile::index_t> output_strides = {C, 1};
+        
+        // Calculate products
+        ck_tile::index_t kept_dim_len_prod = N * C;  // product of kept dimensions
+        ck_tile::index_t total_reduce_elements = H * W;  // product of reduced dimensions
+        
+        RunGenericTest<4>(input_shape, input_strides, output_shape, output_strides,
+                         kept_dim_len_prod, total_reduce_elements, kept_dims, reduce_dims);
+    }
+
+    void RunTest4D_KeepDim03_ReduceDim12(ck_tile::index_t N, ck_tile::index_t H, ck_tile::index_t W, ck_tile::index_t C)
+    {
+        constexpr auto kept_dims = ck_tile::sequence<0, 3>{};
+        constexpr auto reduce_dims = ck_tile::sequence<1, 2>{};
+        
+        // Input shape and strides
+        std::vector<ck_tile::index_t> input_shape = {N, H, W, C};
+        std::vector<ck_tile::index_t> input_strides = {H * W * C, W * C, C, 1};
+        
+        // Output shape and strides (keep dim0, dim1)
+        std::vector<ck_tile::index_t> output_shape = {N, C};
+        std::vector<ck_tile::index_t> output_strides = {C, 1};
+        
+        // Calculate products
+        ck_tile::index_t kept_dim_len_prod = N * C;  // product of kept dimensions
+        ck_tile::index_t total_reduce_elements = H * W;  // product of reduced dimensions
+        
+        RunGenericTest<4>(input_shape, input_strides, output_shape, output_strides,
+                         kept_dim_len_prod, total_reduce_elements, kept_dims, reduce_dims);
+    }
+
 };

 // Shape parameters for different test configurations
 using Shape1_BlockWarps = ck_tile::sequence<4, 1>;
 using Shape1_BlockTile  = ck_tile::sequence<128, 128>;
 using Shape1_WarpTile   = ck_tile::sequence<32, 128>;
-using Shape1_Vector     = ck_tile::sequence<8, 8>;
+using Shape1_ThreadTile     = ck_tile::sequence<8, 8>;

 using Shape2_BlockWarps = ck_tile::sequence<2, 2>; // Cross-warp reduction test
 using Shape2_BlockTile  = ck_tile::sequence<2, 1024>;
 using Shape2_WarpTile   = ck_tile::sequence<1, 512>;
-using Shape2_Vector     = ck_tile::sequence<1, 8>;
+using Shape2_ThreadTile     = ck_tile::sequence<1, 8>;

 // Test configurations for different data types and operations
 using TestConfig_F32_Add = std::tuple<float,
@@ -133,7 +240,7 @@ using TestConfig_F32_Add = std::tuple<float,
                                      Shape1_BlockWarps,
                                      Shape1_BlockTile,
                                      Shape1_WarpTile,
-                                      Shape1_Vector>;
+                                      Shape1_ThreadTile>;

 using TestConfig_F16_Add = std::tuple<ck_tile::half_t,
                                      float,
@@ -142,7 +249,7 @@ using TestConfig_F16_Add = std::tuple<ck_tile::half_t,
                                      Shape1_BlockWarps,
                                      Shape1_BlockTile,
                                      Shape1_WarpTile,
-                                      Shape1_Vector>;
+                                      Shape1_ThreadTile>;

 using TestConfig_F32_CrossWarp = std::tuple<float,
                                            float,
@@ -151,7 +258,7 @@ using TestConfig_F32_CrossWarp = std::tuple<float,
                                            Shape2_BlockWarps,
                                            Shape2_BlockTile,
                                            Shape2_WarpTile,
-                                            Shape2_Vector>;
+                                            Shape2_ThreadTile>;

 using TestConfig_F32_Max = std::tuple<float,
                                      float,
@@ -160,7 +267,7 @@ using TestConfig_F32_Max = std::tuple<float,
                                      Shape1_BlockWarps,
                                      Shape1_BlockTile,
                                      Shape1_WarpTile,
-                                      Shape1_Vector>;
+                                      Shape1_ThreadTile>;

 using TestConfig_F32_SquareAdd = std::tuple<float,
                                            float,
@@ -169,7 +276,7 @@ using TestConfig_F32_SquareAdd = std::tuple<float,
                                            Shape1_BlockWarps,
                                            Shape1_BlockTile,
                                            Shape1_WarpTile,
-                                            Shape1_Vector>;
+                                            Shape1_ThreadTile>;

 using TestTypes = ::testing::Types<TestConfig_F32_Add,
                                   TestConfig_F16_Add,
@@ -177,12 +284,37 @@ using TestTypes = ::testing::Types<TestConfig_F32_Add,
                                   TestConfig_F32_Max,
                                   TestConfig_F32_SquareAdd>;

-TYPED_TEST_SUITE(TestCkTileReduce2d, TestTypes);
+TYPED_TEST_SUITE(TestCkTileReduce, TestTypes);

-TYPED_TEST(TestCkTileReduce2d, test) { this->RunTest(128, 128, 1); }
+// 2D Tests - Keep dim0, reduce dim1
+TYPED_TEST(TestCkTileReduce, Test2D_KeepDim0_ReduceDim1_64x32) 
+{ 
+    this->RunTest2D_KeepDim0_ReduceDim1(64, 32); 
+}

-TYPED_TEST(TestCkTileReduce2d, Reduce3D_512_1024_16) { this->RunTest(512, 1024, 16); }
+TYPED_TEST(TestCkTileReduce, Test2D_KeepDim0_ReduceDim1_1024x512) 
+{ 
+    this->RunTest2D_KeepDim0_ReduceDim1(1024, 512); 
+}

-TYPED_TEST(TestCkTileReduce2d, Reduce3D_150_170_6) { this->RunTest(150, 64, 3); }
+// 3D Tests - Keep dim0, reduce dim1,2
+TYPED_TEST(TestCkTileReduce, Test3D_KeepDim0_ReduceDim12_128x128x1) 
+{ 
+    this->RunTest3D_KeepDim0_ReduceDim12(128, 128, 8); 
+}
+// 3D Tests - Keep dim0,1, reduce dim1
+TYPED_TEST(TestCkTileReduce, Test3D_KeepDim01_ReduceDim2_512x1024x16) 
+{ 
+    this->RunTest3D_KeepDim01_ReduceDim2(512, 1024, 16); 
+}

-TYPED_TEST(TestCkTileReduce2d, Reduce2D_128_128) { this->RunTest2D(128, 128); }
+// 4D Tests - Keep dim0,1, reduce dim2,3 (NCHW -> NC)
+TYPED_TEST(TestCkTileReduce, Test4D_KeepDim01_ReduceDim23_32x256x16x16) 
+{ 
+    this->RunTest4D_KeepDim01_ReduceDim23(32, 256, 16, 16); 
+}
+// 4D Tests - Keep dim0,3, reduce dim1,2 (NHWC -> NC)
+TYPED_TEST(TestCkTileReduce, Test4D_KeepDim03_ReduceDim12_16x32x32x128) 
+{ 
+    this->RunTest4D_KeepDim03_ReduceDim12(16, 32, 32, 128); 
+}