fattn-mma-f16: in the stream-k path, cap max_blocks at nsm instead of 2*nsm unless Q->ne[1] > 1. This seems very slightly better.

2026-02-27 08:34:09 +00:00 · 2025-09-02 18:24:45 +03:00
parent 32e223df46
commit 27e8ed6454
1 changed files with 1 additions and 1 deletions
--- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -1400,7 +1400,7 @@ void launch_fattn_mma(
    dim3 blocks_num;
    if (stream_k) {
        // For short contexts it can be faster to have the SMs work on whole tiles because this lets us skip the fixup.
-        const int max_blocks = 2*nsm;
+        const int max_blocks = Q->ne[1] > 1 ? 2*nsm : nsm;
        const int tiles_nwaves = (ntiles_total + max_blocks - 1) / max_blocks;
        const int tiles_efficiency_percent = 100 * ntiles_total / (max_blocks*tiles_nwaves);