From bbd54d3dfbcabc959f0ff98bb94be1059a3397f8 Mon Sep 17 00:00:00 2001
From: ClementLinCF <162283536+ClementLinCF@users.noreply.github.com>
Date: Mon, 13 Jan 2025 12:50:32 +0800
Subject: [PATCH] [CK_TILE] Adjust kBlockSize of reduce example for better perf (#1779)

* Observed a 2x perf improvement with kBlockSize = 256
* Using 512 threads may lead to redundant computations

[ROCm/composable_kernel commit: 0b8f117f1ae765f0e490368de7f0c5d7591b17b6]
---
 example/ck_tile/05_reduce/reduce.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/example/ck_tile/05_reduce/reduce.cpp b/example/ck_tile/05_reduce/reduce.cpp
index 005541dc62..602661f779 100644
--- a/example/ck_tile/05_reduce/reduce.cpp
+++ b/example/ck_tile/05_reduce/reduce.cpp
@@ -52,7 +52,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     // using WarpTile = ck_tile::sequence<1, 512>;
     // using Vector = ck_tile::sequence<1, 8>;
 
-    constexpr ck_tile::index_t kBlockSize = 512;
+    constexpr ck_tile::index_t kBlockSize = 256;
     constexpr ck_tile::index_t kBlockPerCu = 1;
     ck_tile::index_t kGridSize = (m / BlockTile::at(ck_tile::number<0>{}));
     std::cout << "grid size " << kGridSize << std::endl;