From 0a70ca0bc05abeccf78867545555ab6079ec2c4d Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Tue, 23 Sep 2025 17:25:47 +0300
Subject: [PATCH] Fix #772

---
 ggml/src/ggml-cuda/fattn-mma-f16.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
index 0c1aaf7d..27328319 100644
--- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -1408,7 +1408,7 @@ void launch_fattn_mma(
 
     //const bool use_stream_k = cc >= CC_ADA_LOVELACE || tiles_efficiency_percent < 75;
     // On my RTX-4080 the above is slightly slower for PP. It would be useful to try and see what happens on Blackwell
-    const bool use_stream_k = tiles_efficiency_percent < 75;
+    const bool use_stream_k = tiles_efficiency_percent < 75 || Q->ne[1] > 2048;
 
     blocks_num.x = use_stream_k ? nblocks_stream_k : ntiles_total;
     blocks_num.y = 1;