From bb21114ab4a57f800451d9b5264cca8e563f4977 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Mon, 1 Sep 2025 09:17:19 +0300 Subject: [PATCH] Slightly better PP --- ggml/src/ggml-cuda/fattn-mma-f16.cuh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh index 3603c040..16c8c24f 100644 --- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh @@ -1235,7 +1235,9 @@ void launch_fattn_mma( const int nblocks_stream_k = max_blocks; - const bool use_stream_k = cc >= CC_ADA_LOVELACE || tiles_efficiency_percent < 75; + //const bool use_stream_k = cc >= CC_ADA_LOVELACE || tiles_efficiency_percent < 75; + // On my RTX-4080 the above is slightly slower for PP. It would be useful to try and see what happens on Blackwell + const bool use_stream_k = tiles_efficiency_percent < 75; blocks_num.x = use_stream_k ? nblocks_stream_k : ntiles_total; blocks_num.y = 1;