From bb21114ab4a57f800451d9b5264cca8e563f4977 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Date: Mon, 1 Sep 2025 09:17:19 +0300
Subject: [PATCH] Slightly better PP

---
 ggml/src/ggml-cuda/fattn-mma-f16.cuh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
index 3603c040..16c8c24f 100644
--- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -1235,7 +1235,9 @@ void launch_fattn_mma(
 
         const int nblocks_stream_k = max_blocks;
 
-        const bool use_stream_k = cc >= CC_ADA_LOVELACE || tiles_efficiency_percent < 75;
+        //const bool use_stream_k = cc >= CC_ADA_LOVELACE || tiles_efficiency_percent < 75;
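+        // On my RTX 4080 (Ada Lovelace), always enabling stream-k for cc >= CC_ADA_LOVELACE (the condition above) is slightly slower for PP (prompt processing), so only tile efficiency decides now. TODO: benchmark on Blackwell.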
+        const bool use_stream_k = tiles_efficiency_percent < 75;
 
         blocks_num.x = use_stream_k ? nblocks_stream_k : ntiles_total;
         blocks_num.y = 1;