Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
Kawrakow
2025-09-23 16:43:02 +02:00
committed by GitHub
parent 079231c291
commit 8b4208e789

View File

@@ -1408,7 +1408,7 @@ void launch_fattn_mma(
//const bool use_stream_k = cc >= CC_ADA_LOVELACE || tiles_efficiency_percent < 75;
// On my RTX-4080 the above is slightly slower for PP. It would be useful to try and see what happens on Blackwell
-const bool use_stream_k = tiles_efficiency_percent < 75;
+const bool use_stream_k = tiles_efficiency_percent < 75 || Q->ne[1] > 2048;
blocks_num.x = use_stream_k ? nblocks_stream_k : ntiles_total;
blocks_num.y = 1;