From 27e8ed645445d07bb0cac26b1d70bed6b8eaa89c Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Tue, 2 Sep 2025 18:24:45 +0300
Subject: [PATCH] This seems very slightly better

---
 ggml/src/ggml-cuda/fattn-mma-f16.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
index cfff9a3b..0c1aaf7d 100644
--- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -1400,7 +1400,7 @@ void launch_fattn_mma(
     dim3 blocks_num;
     if (stream_k) {
         // For short contexts it can be faster to have the SMs work on whole tiles because this lets us skip the fixup.
-        const int max_blocks = 2*nsm;
+        const int max_blocks = Q->ne[1] > 1 ? 2*nsm : nsm;
         const int tiles_nwaves = (ntiles_total + max_blocks - 1) / max_blocks;
         const int tiles_efficiency_percent = 100 * ntiles_total / (max_blocks*tiles_nwaves);
 