[CK_TILE] Adjust kBlockSize of reduce example for better perf (#1779)

* Observed a 2x perf improvement with kBlockSize = 256
* Using 512 threads may lead to redundant computations
This commit is contained in:
ClementLinCF
2025-01-13 12:50:32 +08:00
committed by GitHub
parent 3d50f57f43
commit 0b8f117f1a

View File

@@ -52,7 +52,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
// using WarpTile = ck_tile::sequence<1, 512>;
// using Vector = ck_tile::sequence<1, 8>;
-constexpr ck_tile::index_t kBlockSize = 512;
+constexpr ck_tile::index_t kBlockSize = 256;
constexpr ck_tile::index_t kBlockPerCu = 1;
ck_tile::index_t kGridSize = (m / BlockTile::at(ck_tile::number<0>{}));
std::cout << "grid size " << kGridSize << std::endl;