Alternative CUDA FA for SWA models (#754)
* Bounds for flash attention
* Add n_swa to FA parameters
* Fix it
* This seems very slightly better
* Using vec kernel when we have SWA
* Need also this
* f32 vec kernel
* This is slightly better

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
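The diff below covers only the graph-building side: llm_build_kqv and llm_build_kv gain an n_swa parameter, the builders pass hparams.n_swa for layers that use the sliding-window KQ mask, and the value is stashed in the flash-attention node's op_params so the CUDA backend can see it. The CUDA kernel changes themselves are not shown here. As a rough illustration of the "Bounds for flash attention" idea from the commit message, the standalone sketch below shows the window arithmetic; swa_kv_bounds is a made-up helper, it assumes the common convention that a SWA query at position p attends to the n_swa positions ending at p, and it is not code from this repository.

#include <algorithm>
#include <cstdint>
#include <utility>

// Illustrative sketch, not ik_llama.cpp code: given the absolute positions
// covered by one block of queries and the sliding-window size n_swa, return
// the half-open range [kv_first, kv_last) of KV-cache positions that any
// query in the block can attend to.  Assumes causal masking plus a window in
// which a query at position p sees positions [p - n_swa + 1, p].
static std::pair<int32_t, int32_t> swa_kv_bounds(int32_t q_pos_first, int32_t q_pos_last,
                                                 int32_t n_swa, int32_t n_kv) {
    const int32_t kv_last = std::min<int32_t>(q_pos_last + 1, n_kv);
    if (n_swa <= 0) {
        return { 0, kv_last };                     // no window: plain causal range
    }
    const int32_t kv_first = std::max<int32_t>(q_pos_first - n_swa + 1, 0);
    return { kv_first, kv_last };                  // KV tiles outside this range can be skipped
}

For single-token generation this caps the scanned KV range at n_swa entries regardless of context length, which is why a vector kernel with tight bounds becomes attractive for SWA layers.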
@@ -7985,7 +7985,7 @@ static struct ggml_tensor * llm_build_kqv(
                     float     kq_scale,
          const llm_build_cb & cb,
                     int       il,
-                 ggml_tensor * sinks = nullptr) {
+                 ggml_tensor * sinks = nullptr, int n_swa = 0) {
     const llama_model   & model   = lctx.model;
     const llama_hparams & hparams = lctx.model.hparams;
     const llama_cparams & cparams = lctx.cparams;
@@ -8033,6 +8033,9 @@ static struct ggml_tensor * llm_build_kqv(
         cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
                 hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
         ggml_flash_attn_ext_add_sinks(cur, sinks);
+        if (n_swa > 0) {
+            ((int32_t *)cur->op_params)[4] = n_swa;
+        }

         // Some models produced NaNs/gibberish when FA is computed with f16 precision on CUDA
         // For DeepSeek-2, it is perfectly fine with fp16 for PP, but I get gibberish when using fp16 for TG.
@@ -8190,7 +8193,7 @@ static struct ggml_tensor * llm_build_kv(
                     float     kq_scale,
          const llm_build_cb & cb,
                     int       il,
-                 ggml_tensor * sinks = nullptr) {
+                 ggml_tensor * sinks = nullptr, int n_swa = 0) {
     const llama_hparams & hparams = lctx.model.hparams;
     const llama_cparams & cparams = lctx.cparams;

@@ -8205,7 +8208,7 @@ static struct ggml_tensor * llm_build_kv(
     struct ggml_tensor * cur;

     cur = llm_build_kqv(ctx, lctx, kv, graph, wo, wo_b,
-            q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il, sinks);
+            q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il, sinks, n_swa);
     cb(cur, "kqv_out", il);

     return cur;
@@ -8766,7 +8769,8 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, this_KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
+                        Kcur, Vcur, Qcur, this_KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il, nullptr,
+                        this_KQ_mask == KQ_mask_swa ? hparams.n_swa : 0);
             }

             if (il == n_layer - 1) {
@@ -12198,7 +12202,8 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f, cb, il, nullptr,
+                        KQ_mask_l == KQ_mask_swa ? hparams.n_swa : 0);
             }

             cur = llm_build_norm(ctx0, cur, hparams,
@@ -12335,7 +12340,8 @@ struct llm_build_context {
                 cb(Kcur, "Kcur", il);

                 cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, hparams.f_attention_scale, cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, hparams.f_attention_scale, cb, il, nullptr,
+                        KQ_mask_l == KQ_mask_swa ? hparams.n_swa : 0);
             }

             cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, cb, il);
@@ -14400,7 +14406,8 @@ struct llm_build_context {
                 }

                 cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur,
-                        KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
+                        KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il, nullptr,
+                        is_sliding ? hparams.n_swa : 0);
             }

             if (il == n_layer - 1) {
@@ -15490,7 +15497,8 @@ struct llm_build_context {
                 cb(Kcur, "Kcur", il);

                 cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, kq_scale, cb, il, model.layers[il].attn_sinks);
+                        Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, kq_scale, cb, il, model.layers[il].attn_sinks,
+                        is_sliding ? hparams.n_swa : 0);

                 cb(cur, "attn_out", il);
             }
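On the consuming side (not part of this diff), a backend can read the window size back out of the same op_params slot that llm_build_kqv fills in above and use it when deciding how to run the node. Below is a minimal, self-contained sketch of that decision, assuming the policy stated in the commit message ("Using vec kernel when we have SWA"); pick_fa_kernel and the fa_kernel enum are illustrative names, not the actual CUDA dispatch code.

#include <cstdint>

// Sketch only.  llm_build_kqv (diff above) stores the sliding-window size in
// slot 4 of the flash-attention node's op_params:
//     ((int32_t *)cur->op_params)[4] = n_swa;
// A backend can read the same slot when it plans the kernel launch.  The
// selection policy below follows the commit message and is an assumption,
// not the repository's actual CUDA dispatch code.
enum class fa_kernel { tile, vec };

static fa_kernel pick_fa_kernel(const int32_t * op_params) {
    const int32_t n_swa = op_params[4];
    // SWA layer: each query touches at most n_swa KV entries, so a vector
    // kernel that honours tight per-query bounds is preferred here.
    return n_swa > 0 ? fa_kernel::vec : fa_kernel::tile;
}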