Better CPU prompt processing performance for SWA models (#696)

* This does the trick for PP

* Compute mask bounds when creating the mask

* Set mask bounds for all supported SWA models

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Kawrakow authored on 2025-08-17 10:30:27 +03:00 (committed by GitHub)
commit d4d017766e, parent 259cbf0bde
5 changed files with 140 additions and 30 deletions
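The core idea of the change: for every query row of the KQ mask, record the first and one-past-the-last KV column that is not masked out, and hand those bounds to the flash-attention op so the CPU kernel can skip the masked columns entirely. For SWA layers the unmasked window is at most roughly n_swa wide, which is where the prompt-processing gain comes from. The kernel-side consumption of the bounds is not part of this diff; the snippet below is only a minimal sketch of the idea, with masked_row_sum as a hypothetical stand-in for the per-row attention loop.

#include <cstdint>

// Minimal sketch (not the actual kernel): per-row bounds shrink the KV loop.
// bounds layout (I32, [2, n_tokens]): bounds[2*i + 0] = first unmasked KV index,
// bounds[2*i + 1] = one past the last unmasked KV index for query row i.
static float masked_row_sum(const float * kq, const float * kq_mask,
                            const int32_t * bounds, int i, int n_kv) {
    const int j0 = bounds ? bounds[2*i + 0] : 0;     // first column worth visiting
    const int j1 = bounds ? bounds[2*i + 1] : n_kv;  // one past the last such column
    float sum = 0.0f;
    for (int j = j0; j < j1; ++j) {
        // columns outside [j0, j1) hold -INFINITY in the mask and contribute nothing
        sum += kq[i*n_kv + j] + kq_mask[i*n_kv + j];
    }
    return sum;
}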


@@ -2513,6 +2513,8 @@ struct llama_context {
struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
struct ggml_tensor * inp_scale = nullptr; // F32 [n_tokens]
struct ggml_tensor * inp_mask_bounds = nullptr; // I32 [2, n_tokens]
struct ggml_tensor * inp_mask_bounds_swa = nullptr; // I32 [2, n_tokens]
};
struct llama_lora_weight {
@@ -7943,7 +7945,8 @@ static struct ggml_tensor * llm_build_kqv(
float kq_scale,
const llm_build_cb & cb,
int il,
- ggml_tensor * sinks = nullptr) {
+ ggml_tensor * sinks = nullptr,
+ ggml_tensor * bounds = nullptr) {
const llama_model & model = lctx.model;
const llama_hparams & hparams = lctx.model.hparams;
const llama_cparams & cparams = lctx.cparams;
@@ -7990,7 +7993,8 @@ static struct ggml_tensor * llm_build_kqv(
cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
- ggml_flash_attn_ext_add_sinks(cur, sinks);
+ ggml_flash_attn_ext_add_sinks (cur, sinks);
+ ggml_flash_attn_ext_add_bounds(cur, bounds);
// Some models produced NaNs/gibberish when FA is computed with f16 precision on CUDA
// For DeepSeek-2, it is perfectly fine with fp16 for PP, but I get gibberish when using fp16 for TG.
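ggml_flash_attn_ext_add_bounds is introduced by this commit and its definition is not shown in these hunks. Presumably it attaches the optional bounds tensor as an extra source of the GGML_OP_FLASH_ATTN_EXT node, mirroring how ggml_flash_attn_ext_add_sinks attaches the sinks tensor; the sketch below is a guess under that assumption (in particular the src slot index), not the fork's actual code.

// Hedged sketch: attach the optional bounds tensor to an existing flash-attention node.
void ggml_flash_attn_ext_add_bounds(struct ggml_tensor * a, struct ggml_tensor * bounds) {
    if (!bounds) {
        return;             // nothing to attach, leave the op unchanged
    }
    GGML_ASSERT(a->op == GGML_OP_FLASH_ATTN_EXT);
    a->src[5] = bounds;     // q, k, v, mask, sinks assumed to occupy src[0..4]
}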
@@ -8148,7 +8152,8 @@ static struct ggml_tensor * llm_build_kv(
float kq_scale,
const llm_build_cb & cb,
int il,
- ggml_tensor * sinks = nullptr) {
+ ggml_tensor * sinks = nullptr,
+ ggml_tensor * bounds = nullptr) {
const llama_hparams & hparams = lctx.model.hparams;
const llama_cparams & cparams = lctx.cparams;
@@ -8163,7 +8168,7 @@ static struct ggml_tensor * llm_build_kv(
struct ggml_tensor * cur;
cur = llm_build_kqv(ctx, lctx, kv, graph, wo, wo_b,
- q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il, sinks);
+ q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il, sinks, bounds);
cb(cur, "kqv_out", il);
return cur;
@@ -8298,6 +8303,8 @@ struct llm_build_context {
lctx.inp_pos_bucket = nullptr;
lctx.inp_embd_enc = nullptr;
lctx.inp_KQ_mask_cross = nullptr;
lctx.inp_mask_bounds = nullptr;
lctx.inp_mask_bounds_swa = nullptr;
}
void free() {
@@ -8478,6 +8485,9 @@ struct llm_build_context {
cb(lctx.inp_KQ_mask, "KQ_mask", -1);
ggml_set_input(lctx.inp_KQ_mask);
lctx.inp_mask_bounds = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, 2, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
ggml_set_input(lctx.inp_mask_bounds);
return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
}
@@ -8490,6 +8500,9 @@ struct llm_build_context {
cb(lctx.inp_KQ_mask_swa, "KQ_mask_swa", -1);
ggml_set_input(lctx.inp_KQ_mask_swa);
lctx.inp_mask_bounds_swa = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, 2, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
ggml_set_input(lctx.inp_mask_bounds_swa);
return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask_swa, GGML_TYPE_F16) : lctx.inp_KQ_mask_swa;
}
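Both bounds tensors are allocated with GGML_PAD(n_tokens, GGML_KQ_MASK_PAD) rows so they line up with the padded KQ mask. GGML_PAD(x, n) rounds x up to the next multiple of n, i.e. ((x + n - 1) & ~(n - 1)); with a pad of 32, for example, GGML_PAD(100, 32) = 128, and the extra padded rows get effectively empty bounds written in llama_set_inputs further down.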
@@ -8658,6 +8671,7 @@ struct llm_build_context {
bool use_rope = model.arch == LLM_ARCH_LLAMA4 ? (il + 1) % hparams.n_no_rope_layer_step != 0 : true;
auto this_KQ_mask = hparams.n_swa > 0 && hparams.n_swa_pattern > 0 && il % hparams.n_swa_pattern < (hparams.n_swa_pattern - 1) ?
KQ_mask_swa : KQ_mask;
auto bounds = this_KQ_mask == KQ_mask_swa ? lctx.inp_mask_bounds_swa : lctx.inp_mask_bounds;
// norm
cur = llm_build_norm(ctx0, inpL, hparams,
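The bounds selection mirrors the mask selection: a layer gets the SWA bounds exactly when it uses the SWA mask, i.e. when il % n_swa_pattern < n_swa_pattern - 1. For example, with n_swa_pattern = 4, layers with il % 4 in {0, 1, 2} are sliding-window layers and every fourth layer (il % 4 == 3) uses the global mask together with the full-cache bounds.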
@@ -8722,7 +8736,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, this_KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
+ Kcur, Vcur, Qcur, this_KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il, nullptr, bounds);
}
if (il == n_layer - 1) {
@@ -11223,7 +11237,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask_swa, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+ Kcur, Vcur, Qcur, KQ_mask_swa, n_tokens, kv_head, n_kv, 1.0f, cb, il, nullptr, lctx.inp_mask_bounds_swa);
}
if (il == n_layer - 1) {
@@ -12112,6 +12126,7 @@ struct llm_build_context {
for (int il = 0; il < n_layer; ++il) {
// (il % 2) layers use SWA
struct ggml_tensor * KQ_mask_l = (il % 2 == 0) ? KQ_mask_swa : KQ_mask;
auto bounds = KQ_mask_l == KQ_mask_swa ? lctx.inp_mask_bounds_swa : lctx.inp_mask_bounds;
// norm
cur = llm_build_norm(ctx0, inpL, hparams,
@@ -12154,7 +12169,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+ Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f, cb, il, nullptr, bounds);
}
cur = llm_build_norm(ctx0, cur, hparams,
@@ -12257,6 +12272,7 @@ struct llm_build_context {
const float freq_base_l = is_sliding ? 10000.0f : freq_base;
const float freq_scale_l = is_sliding ? 1.0f : freq_scale;
struct ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask;
auto bounds = is_sliding ? lctx.inp_mask_bounds_swa : lctx.inp_mask_bounds;
// norm
cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il);
@@ -12291,7 +12307,7 @@ struct llm_build_context {
cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL,
- Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, hparams.f_attention_scale, cb, il);
+ Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, hparams.f_attention_scale, cb, il, nullptr, bounds);
}
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, cb, il);
@@ -14303,6 +14319,7 @@ struct llm_build_context {
// fourth layer uses global attention without positional embeddings
const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1);
struct ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask;
auto bounds = is_sliding ? lctx.inp_mask_bounds_swa : lctx.inp_mask_bounds;
// norm
cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM, cb, il);
@@ -14356,7 +14373,7 @@ struct llm_build_context {
}
cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur,
- KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
+ KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il, nullptr, bounds);
}
if (il == n_layer - 1) {
@@ -15407,6 +15424,7 @@ struct llm_build_context {
ggml_tensor * inpSA = inpL;
struct ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask;
auto bounds = is_sliding ? lctx.inp_mask_bounds_swa : lctx.inp_mask_bounds;
// norm
cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, cb, il);
@@ -15446,7 +15464,7 @@ struct llm_build_context {
cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, kq_scale, cb, il, model.layers[il].attn_sinks);
+ Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, kq_scale, cb, il, model.layers[il].attn_sinks, bounds);
cb(cur, "attn_out", il);
}
@@ -15965,16 +15983,26 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
float * data = nullptr;
float * data_swa = nullptr;
int32_t * bounds = nullptr;
int32_t * bounds_swa = nullptr;
if (lctx.inp_KQ_mask) {
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
data = (float *) lctx.inp_KQ_mask->data;
}
if (lctx.inp_mask_bounds) {
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mask_bounds->buffer));
bounds = (int32_t *)lctx.inp_mask_bounds->data;
}
if (lctx.inp_KQ_mask_swa) {
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_swa->buffer));
data_swa = (float *) lctx.inp_KQ_mask_swa->data;
}
if (lctx.inp_mask_bounds_swa) {
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mask_bounds_swa->buffer));
bounds_swa = (int32_t *)lctx.inp_mask_bounds_swa->data;
}
// For causal attention, use only the previous KV cells
// of the correct sequence for each token of the batch.
@@ -16023,6 +16051,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
}
}
if (h == 0 && bounds) {
for (int i = 0; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
int min = n_kv, max = 0;
for (int j = 0; j < n_kv; ++j) {
if (data[i*n_kv + j] > -INFINITY) {
min = std::min(min, j);
max = std::max(max, j);
}
}
bounds[2*i + 0] = min;
bounds[2*i + 1] = max+1;
}
}
}
if (data_swa) {
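A quick sanity check of the scan above (ignoring cache-padding details): for the first causal batch of 4 tokens going into an empty cache, row i of the mask allows cells 0..i, so the computed bounds are {0, 1}, {0, 2}, {0, 3}, {0, 4}. A fully masked padding row yields min = n_kv and max+1 = 1, i.e. an empty range for any consumer that iterates j from bounds[2*i] to bounds[2*i + 1].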
@@ -16031,6 +16072,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
data_swa[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
}
}
if (h == 0 && bounds_swa) {
for (int i = 0; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
int min = n_kv, max = 0;
for (int j = 0; j < n_kv; ++j) {
if (data_swa[i*n_kv + j] > -INFINITY) {
min = std::min(min, j);
max = std::max(max, j);
}
}
bounds_swa[2*i + 0] = min;
bounds_swa[2*i + 1] = max+1;
}
}
}
}
} else {
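The second scan just above fills bounds_swa the same way, but against the sliding-window mask, where each row's unmasked range is at most roughly n_swa wide regardless of prompt length. As a rough illustration, assuming the CPU kernel consumes the bounds as sketched near the top: with n_kv = 32768 and n_swa = 4096, an SWA layer's attention rows visit at most about 4096 KV columns instead of 32768, roughly an 8x reduction in per-row work, which is the prompt-processing speedup the title refers to.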