mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-19 22:39:03 +00:00
[CK_TILE] Adjust kBlockSize of reduce example for better perf (#1779)
* Observed a 2x perf improvement with kBlockSize = 256 * Using 512 threads may lead to redundant computations
This commit is contained in:
@@ -52,7 +52,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
|
||||
// using WarpTile = ck_tile::sequence<1, 512>;
|
||||
// using Vector = ck_tile::sequence<1, 8>;
|
||||
|
||||
constexpr ck_tile::index_t kBlockSize = 512;
|
||||
constexpr ck_tile::index_t kBlockSize = 256;
|
||||
constexpr ck_tile::index_t kBlockPerCu = 1;
|
||||
ck_tile::index_t kGridSize = (m / BlockTile::at(ck_tile::number<0>{}));
|
||||
std::cout << "grid size " << kGridSize << std::endl;
|
||||
|
||||
Reference in New Issue
Block a user