[CK_TILE] Pooling FWD (Lwpck 3683) (#2956)

* Pooling 2D/3D with refernce * Tests & cleanup - added test for ppoling - cleanup - removed 2d example * Comment resolution - README added - example target name rectified - appropriate arg description and comments added * clang-format * appropriate blocksize calc * modifications for future indexing addition - instead of transforming views we now transform the descriptors, so that the same descriptor can be re-used for index tensor in the future * some basic fixes * comment resolutions * comment resolutions --------- Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com>
2026-05-02 20:51:23 +00:00 · 2025-10-09 17:13:26 +03:00
parent 9d4bfe3932
commit 7b6451b68e
14 changed files with 1317 additions and 0 deletions
--- a/include/ck_tile/ops/pooling/pipeline/pool_default_policy.hpp
+++ b/include/ck_tile/ops/pooling/pipeline/pool_default_policy.hpp
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/reduce/block/block_reduce2d_problem.hpp"
+#include "ck_tile/ops/reduce/block/block_reduce2d.hpp"
+
+namespace ck_tile {
+
+struct PoolDefaultPolicy
+{
+    template <typename Problem>
+    CK_TILE_DEVICE static constexpr auto MakeXBlockTileDistribution()
+    {
+        using S = typename Problem::BlockShape;
+        return make_static_tile_distribution(
+            tile_distribution_encoding<
+                sequence<>,
+                tuple<
+                    sequence<S::Repeat_M, S::WarpPerBlock_M, S::ThreadPerWarp_M, S::ThreadTile_M>,
+                    sequence<S::Repeat_N, S::WarpPerBlock_N, S::ThreadPerWarp_N, S::ThreadTile_N>>,
+                tuple<sequence<1, 2>, sequence<1, 2>>,
+                tuple<sequence<1, 1>, sequence<2, 2>>,
+                sequence<1, 1, 2, 2>,
+                sequence<0, 3, 0, 3>>{});
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2d()
+    {
+        using P_ = BlockReduce2dProblem<typename Problem::InDataType,
+                                        typename Problem::ComputeDataType,
+                                        typename Problem::BlockShape>;
+        return BlockReduce2d<P_>{};
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2dSync()
+    {
+        using P_ = BlockReduce2dProblem<typename Problem::InDataType,
+                                        typename Problem::ComputeDataType,
+                                        typename Problem::BlockShape>;
+        return BlockReduce2dSync<P_>{};
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockReduce2dCrossWarpSync()
+    {
+        using P_ = BlockReduce2dProblem<typename Problem::InDataType,
+                                        typename Problem::ComputeDataType,
+                                        typename Problem::BlockShape>;
+        return BlockReduce2dCrossWarpSync<P_>{};
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        if constexpr(Problem::kNeedCrossWarpSync)
+        {
+            using P_ = BlockReduce2dProblem<typename Problem::InDataType,
+                                            typename Problem::ComputeDataType,
+                                            typename Problem::BlockShape>;
+
+            using block_reduce2d = BlockReduce2d<P_>;
+            using x_block_tile =
+                decltype(make_static_distributed_tensor<typename Problem::InDataType>(
+                    MakeXBlockTileDistribution<Problem>()));
+            using y_block_tile = decltype(block_reduce2d::template MakeYBlockTile<x_block_tile>());
+
+            return GetBlockReduce2dCrossWarpSync<Problem>().template GetSmemSize<y_block_tile>();
+        }
+        else
+        {
+            return 1; // zero size arrays are an extension
+        }
+    }
+};
+} // namespace ck_tile
--- a/include/ck_tile/ops/pooling/pipeline/pool_problem.hpp
+++ b/include/ck_tile/ops/pooling/pipeline/pool_problem.hpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+
+namespace ck_tile {
+
+template <typename InDataType_,
+          typename OutDataType_,
+          typename ComputeDataType_,
+          typename IndexDataType_,
+          typename ReduceOp_,
+          bool OutputIndex_,
+          bool PropagateNan_,
+          typename BlockShape_>
+struct PoolProblem
+{
+    using InDataType      = remove_cvref_t<InDataType_>;
+    using OutDataType     = remove_cvref_t<OutDataType_>;
+    using ComputeDataType = remove_cvref_t<ComputeDataType_>;
+    using IndexDataType   = remove_cvref_t<IndexDataType_>;
+    using BlockShape      = remove_cvref_t<BlockShape_>;
+    using ReduceOp        = ReduceOp_;
+    using OutputIndex     = bool_constant<OutputIndex_>;
+    using PropagateNan    = bool_constant<PropagateNan_>;
+
+    static constexpr bool kNeedCrossLaneSync = BlockShape::ThreadPerWarp_N > 1;
+    static constexpr bool kNeedCrossWarpSync = BlockShape::WarpPerBlock_N > 1;
+};
+
+} // namespace ck_tile
--- a/include/ck_tile/ops/pooling/pipeline/pool_shape.hpp
+++ b/include/ck_tile/ops/pooling/pipeline/pool_shape.hpp
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+
+namespace ck_tile {
+
+template <typename BlockWarps, // num warps along seq<M, N>
+          typename BlockTile,  // block size, seq<M, N>
+          typename WarpTile,   // warp size, seq<M, N>
+          typename ThreadTile> // contiguous pixels(vector size) along seq<M, N>
+struct PoolShape
+{
+    static constexpr index_t Block_M = BlockTile::at(number<0>{});
+    static constexpr index_t Block_N = BlockTile::at(number<1>{});
+
+    static constexpr index_t Warp_M = WarpTile::at(number<0>{});
+    static constexpr index_t Warp_N = WarpTile::at(number<1>{});
+
+    static constexpr index_t ThreadTile_M = ThreadTile::at(number<0>{});
+    static constexpr index_t ThreadTile_N = ThreadTile::at(number<1>{});
+
+    static constexpr index_t WarpPerBlock_M = BlockWarps::at(number<0>{});
+    static constexpr index_t WarpPerBlock_N = BlockWarps::at(number<1>{});
+
+    static_assert(Warp_M % ThreadTile_M == 0, "Warp_M must be divisible by ThreadTile_M");
+    static_assert(Warp_N % ThreadTile_N == 0, "Warp_N must be divisible by ThreadTile_N");
+    static_assert((Warp_M * Warp_N / ThreadTile_M / ThreadTile_N) % ck_tile::get_warp_size() == 0,
+                  "Warp_M * Warp_N / ThreadTile_M / ThreadTile_N must be a multiple of warp size");
+
+    // Scale factor to account for warp size
+    // WarpSizeScaleFactor = warp tile/ thread tile / warp size
+    static constexpr index_t WarpSizeScaleFactor =
+        Warp_M * Warp_N / ThreadTile_M / ThreadTile_N / ck_tile::get_warp_size();
+
+    static constexpr index_t WarpSizeScaleFactor_M =
+        (Warp_M / ThreadTile_M > Warp_N / ThreadTile_N) ? WarpSizeScaleFactor : 1;
+    static constexpr index_t WarpSizeScaleFactor_N =
+        (Warp_M / ThreadTile_M > Warp_N / ThreadTile_N) ? 1 : WarpSizeScaleFactor;
+
+    static constexpr index_t ThreadPerWarp_M = Warp_M / ThreadTile_M / WarpSizeScaleFactor_M;
+    static constexpr index_t ThreadPerWarp_N = Warp_N / ThreadTile_N / WarpSizeScaleFactor_N;
+
+    static_assert((Block_M * WarpSizeScaleFactor_M) % (WarpPerBlock_M * Warp_M) == 0,
+                  "Block_M * WarpSizeScaleFactor_M must be divisible by WarpPerBlock_M * Warp_M");
+    static_assert((Block_N * WarpSizeScaleFactor_N) % (WarpPerBlock_N * Warp_N) == 0,
+                  "Block_N * WarpSizeScaleFactor_N must be divisible by WarpPerBlock_N * Warp_N");
+
+    static constexpr index_t Repeat_M = Block_M * WarpSizeScaleFactor_M / (WarpPerBlock_M * Warp_M);
+    static constexpr index_t Repeat_N = Block_N * WarpSizeScaleFactor_N / (WarpPerBlock_N * Warp_N);
+
+    static constexpr index_t BlockSize =
+        ck_tile::get_warp_size() * reduce_on_sequence(BlockWarps{}, multiplies{}, number<1>{});
+};
+} // namespace ck_tile