Slightly better PP (prompt processing) performance

This commit is contained in:
Iwan Kawrakow
2025-09-01 09:17:19 +03:00
parent e2e5b270c5
commit bb21114ab4

View File

@@ -1235,7 +1235,9 @@ void launch_fattn_mma(
const int nblocks_stream_k = max_blocks;
const bool use_stream_k = cc >= CC_ADA_LOVELACE || tiles_efficiency_percent < 75;
//const bool use_stream_k = cc >= CC_ADA_LOVELACE || tiles_efficiency_percent < 75;
// On my RTX-4080, the condition above (which always enables stream-k on Ada Lovelace and newer) is slightly slower for PP (prompt processing).
// TODO: benchmark on Blackwell to see whether the same holds there.
const bool use_stream_k = tiles_efficiency_percent < 75;
blocks_num.x = use_stream_k ? nblocks_stream_k : ntiles_total;
blocks_num.y = 1;