[CK Tile] multi reduce improvements (#3607)

* WIP: refactoring

* Swap the nesting order of the operation and data loops

* Improve memory coalescing

* Add comments

* Enforce same identity element for the reduce operations

* Re-add compile time constant

* Add a comment and re-add __builtin_amdgcn_readfirstlane(0) to the loop initialization

---------

Co-authored-by: Damien Lejeune <damien.lejeune@amd.com>
This commit is contained in:
damien-lejeune
2026-01-27 21:56:09 +01:00
committed by GitHub
parent 23cefda140
commit 91e32f305f
2 changed files with 97 additions and 63 deletions

View File

@@ -39,26 +39,20 @@ using TestConfig_F16_Add = std::tuple<ck_tile::half_t,
Shape1_WarpTile,
Shape1_ThreadTile>;
using TestConfig_F16_Add_Max = std::tuple<
using TestConfig_F16_Add_SumSquare = std::tuple<
ck_tile::half_t,
float,
ck_tile::half_t,
ck_tile::tuple<ck_tile::ReduceOp::Add, ck_tile::ReduceOp::Max, ck_tile::ReduceOp::Add>,
ck_tile::tuple<ck_tile::element_wise::PassThrough,
ck_tile::element_wise::PassThrough,
ck_tile::element_wise::UnarySquare>,
ck_tile::tuple<ck_tile::element_wise::PassThrough,
ck_tile::element_wise::PassThrough,
ck_tile::element_wise::UnaryDivide>,
ck_tile::tuple<ck_tile::element_wise::PassThrough,
ck_tile::element_wise::PassThrough,
ck_tile::element_wise::PassThrough>,
ck_tile::tuple<ck_tile::ReduceOp::Add, ck_tile::ReduceOp::Add>,
ck_tile::tuple<ck_tile::element_wise::PassThrough, ck_tile::element_wise::UnarySquare>,
ck_tile::tuple<ck_tile::element_wise::PassThrough, ck_tile::element_wise::UnaryDivide>,
ck_tile::tuple<ck_tile::element_wise::PassThrough, ck_tile::element_wise::PassThrough>,
Shape1_BlockWarps,
Shape1_BlockTile,
Shape1_WarpTile,
Shape1_ThreadTile>;
using TestTypes = ::testing::Types<TestConfig_F16_Add, TestConfig_F16_Add_Max>;
using TestTypes = ::testing::Types<TestConfig_F16_Add, TestConfig_F16_Add_SumSquare>;
TYPED_TEST_SUITE(TestCkTileMultiReduceThreadwise, TestTypes);