This commit is contained in:
Iwan Kawrakow
2025-09-23 17:25:47 +03:00
parent 18f04350e9
commit 0a70ca0bc0

View File

@@ -1408,7 +1408,7 @@ void launch_fattn_mma(
//const bool use_stream_k = cc >= CC_ADA_LOVELACE || tiles_efficiency_percent < 75;
// On my RTX-4080 the above is slightly slower for PP. It would be useful to try and see what happens on Blackwell
-const bool use_stream_k = tiles_efficiency_percent < 75;
+const bool use_stream_k = tiles_efficiency_percent < 75 || Q->ne[1] > 2048;
blocks_num.x = use_stream_k ? nblocks_stream_k : ntiles_total;
blocks_num.y = 1;