[ck_tile] refactor reduce kernel (#3257)

* refactor reduce kernel

- Rename Reduce kernel as per convention

- Move kept_dim and reduce_dims from runtime to compile-time parameters

- Update Reduce2dProblem template to include KeptDim, ReduceDims, and
Rank

- Remove the IsSupportedArgument validation function, as it is unnecessary.
We no longer use GuaranteedLastDimensionVectorStride when creating the
tensor view or descriptor, which removes the bounds enforced earlier. We
still calculate and use the vector size.

- Update reduce example to demonstrate NCHW->NHW reduction with
non-contiguous support

- Update tests

The kernel now handles both contiguous and non-contiguous memory layouts.

* fix compile errors
This commit is contained in:
Yashvardhan Agarwal
2025-12-17 21:46:08 +02:00
committed by GitHub
parent 92653168c2
commit ea10a78203
5 changed files with 89 additions and 130 deletions

View File

@@ -286,7 +286,6 @@ template <typename CDataType,
typename ELayout = ck_tile::tensor_layout::gemm::RowMajor>
float reduce_stage2(const GemmSplitKHostArgs& args, const ck_tile::stream_config& s)
{
const ck_tile::index_t reduce_dim_size = args.k_batch; // Number of partial results to reduce
// Calculate output size based on the final output tensor dimensions
const ck_tile::index_t output_size = args.M * args.N;
@@ -303,27 +302,28 @@ float reduce_stage2(const GemmSplitKHostArgs& args, const ck_tile::stream_config
constexpr auto reduce_dims = ck_tile::sequence<0>{}; // Reduce k_batch dimension
using ReduceOp = ck_tile::ReduceOp::Add;
using BlockWarps = ck_tile::sequence<4, 1>;
using BlockTile = ck_tile::sequence<128, 128>;
using WarpTile = ck_tile::sequence<32, 128>;
using ThreadTile = ck_tile::sequence<8, 8>;
using BlockWarps = ck_tile::sequence<1, 1>;
using BlockTile = ck_tile::sequence<256, 1>;
using WarpTile = ck_tile::sequence<256, 1>;
using ThreadTile = ck_tile::sequence<1, 1>;
constexpr ck_tile::index_t kBlockPerCu = 1;
ck_tile::index_t kGridSize = (output_size + BlockTile::at(ck_tile::number<0>{}) - 1) /
BlockTile::at(ck_tile::number<0>{});
using Shape = ck_tile::Reduce2dShape<BlockWarps, BlockTile, WarpTile, ThreadTile>;
using Problem =
ck_tile::Reduce2dProblem<CDataType, ComputeDataType, CDataType, Shape, ReduceOp>;
using Kernel = ck_tile::Reduce<Problem>;
using Shape = ck_tile::Reduce2dShape<BlockWarps, BlockTile, WarpTile, ThreadTile>;
using Problem = ck_tile::Reduce2dProblem<CDataType,
ComputeDataType,
CDataType,
Shape,
ReduceOp,
decltype(kept_dim),
decltype(reduce_dims),
3>;
using Kernel = ck_tile::ReduceKernel<Problem>;
const ck_tile::index_t kBlockSize = Kernel::BlockSize();
if(!Kernel::IsSupportedArgument(reduce_dim_size, workspace_strides))
{
throw std::runtime_error("Wrong! Reduction arguments not supported!\n");
}
if(s.log_level_ > 0)
{
std::cout << "Stage 2 - Launching Reduction kernel" << '\n'
@@ -343,9 +343,7 @@ float reduce_stage2(const GemmSplitKHostArgs& args, const ck_tile::stream_config
static_cast<const CDataType*>(args.e_ptr), // workspace input
static_cast<CDataType*>(args.final_output_ptr), // final output
workspace_shape,
workspace_strides,
kept_dim,
reduce_dims));
workspace_strides));
return ave_time;
}

View File

@@ -9,14 +9,14 @@
auto create_args(int argc, char* argv[])
{
ck_tile::ArgParser arg_parser;
arg_parser.insert("n", "32", "n dimension")
.insert("h", "7", "h dimension")
.insert("w", "7", "w dimension")
.insert("c", "512", "c dimension")
arg_parser.insert("n", "16", "n dimension")
.insert("h", "64", "h dimension")
.insert("w", "32", "w dimension")
.insert("c", "960", "c dimension")
.insert("v", "1", "cpu validation or not")
.insert("prec", "fp16", "precision")
.insert("warmup", "5", "cold iter")
.insert("repeat", "20", "hot iter")
.insert("warmup", "20", "cold iter")
.insert("repeat", "100", "hot iter")
.insert("json", "0", "0: No Json, 1: Dump Results in Json format")
.insert("jsonfile", "reduce.json", "json file name to dump results");
@@ -47,12 +47,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
strides[3] = 1;
// Define reduction specification:
constexpr auto kept_dim = ck_tile::sequence<0, 3>{}; // Which dimension to keep
constexpr auto reduce_dims = ck_tile::sequence<1, 2>{}; // Which dimensions to reduce
constexpr auto kept_dim = ck_tile::sequence<1, 2, 3>{}; // Which dimension to keep
constexpr auto reduce_dims = ck_tile::sequence<0>{}; // Which dimensions to reduce
ck_tile::HostTensor<XDataType> x_host(problem_shape, strides);
ck_tile::HostTensor<YDataType> y_host_ref({N, C}, {C, 1});
ck_tile::HostTensor<YDataType> y_host_dev({N, C}, {C, 1});
ck_tile::HostTensor<YDataType> y_host_ref({H, W, C}, {W * C, C, 1});
ck_tile::HostTensor<YDataType> y_host_dev({H, W, C}, {W * C, C, 1});
ck_tile::FillUniformDistribution<XDataType>{-5.f, 5.f}(x_host);
@@ -62,40 +62,40 @@ bool run(const ck_tile::ArgParser& arg_parser)
x_buf.ToDevice(x_host.data());
using ReduceOp = ck_tile::ReduceOp::Add;
using BlockWarps = ck_tile::sequence<4, 1>;
using BlockTile = ck_tile::sequence<128, 128>;
using WarpTile = ck_tile::sequence<32, 128>;
using Vector = ck_tile::sequence<8, 8>;
using BlockWarps = ck_tile::sequence<1, 1>;
using BlockTile = ck_tile::sequence<256, 1>;
using WarpTile = ck_tile::sequence<256, 1>;
using ThreadTile = ck_tile::sequence<1, 1>;
// cross warp-reduce
// using BlockWarps = ck_tile::sequence<2, 2>;
// using BlockTile = ck_tile::sequence<2, 1024>;
// using WarpTile = ck_tile::sequence<1, 512>;
// using Vector = ck_tile::sequence<1, 8>;
// using ThreadTile = ck_tile::sequence<1, 8>;
constexpr ck_tile::index_t kBlockPerCu = 1;
ck_tile::index_t kept_dim_len_prod = N * C;
ck_tile::index_t kept_dim_len_prod = H * W * C;
ck_tile::index_t kGridSize = (kept_dim_len_prod + BlockTile::at(ck_tile::number<0>{}) - 1) /
BlockTile::at(ck_tile::number<0>{});
std::cout << "grid size " << kGridSize << std::endl;
using Shape = ck_tile::Reduce2dShape<BlockWarps, BlockTile, WarpTile, Vector>;
using Porblem =
ck_tile::Reduce2dProblem<XDataType, ComputeDataType, YDataType, Shape, ReduceOp>;
using Shape = ck_tile::Reduce2dShape<BlockWarps, BlockTile, WarpTile, ThreadTile>;
using Porblem = ck_tile::Reduce2dProblem<XDataType,
ComputeDataType,
YDataType,
Shape,
ReduceOp,
decltype(kept_dim),
decltype(reduce_dims),
4>;
using Kernel = ck_tile::Reduce<Porblem>;
using Kernel = ck_tile::ReduceKernel<Porblem>;
const ck_tile::index_t kBlockSize = Kernel::BlockSize();
// Create input tensor shape and strides
auto input_shape =
ck_tile::make_tuple(problem_shape[0], problem_shape[1], problem_shape[2], problem_shape[3]);
auto input_strides = ck_tile::make_tuple(strides[0], strides[1], strides[2], strides[3]);
if(!Kernel::IsSupportedArgument(
C, input_strides)) // output tensor's continuous dimension and input strides
{
throw std::runtime_error("Wrong! Arguments not supported!\n");
}
float ave_time = launch_kernel(
ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
ck_tile::make_kernel<kBlockPerCu>(Kernel{},
@@ -105,11 +105,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
static_cast<XDataType*>(x_buf.GetDeviceBuffer()),
static_cast<YDataType*>(y_buf.GetDeviceBuffer()),
input_shape,
input_strides,
kept_dim,
reduce_dims));
input_strides));
std::size_t num_btype = sizeof(XDataType) * N * C * H * W + sizeof(YDataType) * N * C;
std::size_t num_btype = sizeof(XDataType) * N * H * W * C + sizeof(YDataType) * H * W * C;
float gb_per_sec = num_btype / 1.E6 / ave_time;
@@ -149,8 +147,8 @@ int main(int argc, char* argv[])
{
return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
}
// else if(data_type == "bf16")
// {
// return run<ck_tile::bf16_t>(arg_parser) ? 0 : -2;
// }
else if(data_type == "bf16")
{
return run<ck_tile::bf16_t>(arg_parser) ? 0 : -2;
}
}