soft_cap_max: initial CPU version of fused softcap + soft_max

With this vanilla CPU implementation I'm already getting a ~3% speedup for Gemma-2-9b with a prompt of 8192 tokens.
2026-04-29 02:41:47 +00:00 · 2024-08-21 13:31:56 +03:00
parent bd99ed7d0a
commit 6e5d728040
3 changed files with 219 additions and 5 deletions
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -8324,10 +8324,12 @@ static struct ggml_tensor * llm_build_kqv(
        }

        if (hparams.attn_soft_cap) {
-            kq = ggml_softcap(ctx, kq, 1.0f / hparams.f_attn_logit_softcapping, hparams.f_attn_logit_softcapping);
+            //kq = ggml_softcap(ctx, kq, 1.0f / hparams.f_attn_logit_softcapping, hparams.f_attn_logit_softcapping);
+            kq = ggml_softcap_max(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias,
+                    1.0f / hparams.f_attn_logit_softcapping, hparams.f_attn_logit_softcapping);
+        } else {
+            kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
        }
-
-        kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
        cb(kq, "kq_soft_max_ext", il);

        GGML_ASSERT(kv.size == n_ctx);