Hadamard transforms for K-cache - CPU only (#1033)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2026-03-14 15:57:37 +00:00 · 2025-12-04 06:51:11 +01:00
parent 08961718f3
commit 658ced0abd
13 changed files with 155 additions and 20 deletions
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -52,6 +52,7 @@ llm_build_context::llm_build_context(
        fused_up_gate    (cparams.fused_up_gate),
        fused_mmad       (cparams.fused_mmad),
        rope_cache       (cparams.rope_cache),
+        k_cache_hadamard (cparams.k_cache_hadamard),
        min_experts      (cparams.min_experts),
        thresh_experts   (cparams.thresh_experts),
        pooling_type     (cparams.pooling_type),
@@ -1466,6 +1467,13 @@ ggml_tensor * llm_build_context::llm_build_kv(
    const llama_hparams & hparams = lctx.model.hparams;
    const llama_cparams & cparams = lctx.cparams;

+    if (cparams.k_cache_hadamard) {
+        q_cur = ggml_hadamard(ctx, q_cur, hparams.n_embd_head_k);
+        k_cur = ggml_hadamard(ctx, k_cur, hparams.n_embd_head_k);
+        cb(q_cur, "Qcur_hadamard", il);
+        cb(k_cur, "Kcur_hadamard", il);
+    }
+
    // these nodes are added to the graph together so that they are not reordered
    // by doing so, the number of splits in the graph is reduced
    ggml_build_forward_expand(graph, q_cur);
@@ -9375,6 +9383,12 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
                    Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
                    cb(Qcur, "Qcur_temp_scaled", il_cb);
                }
+                if (cparams.k_cache_hadamard) {
+                    Qcur = ggml_hadamard(ctx0, Qcur, hparams.n_embd_head_k);
+                    Kcur = ggml_hadamard(ctx0, Kcur, hparams.n_embd_head_k);
+                    cb(Qcur, "Qcur_hadamard", il_cb);
+                    cb(Kcur, "Kcur_hadamard", il_cb);
+                }
                ggml_build_forward_expand(gf, Qcur);
                ggml_build_forward_expand(gf, Kcur);
                ggml_build_forward_expand(gf, Vcur);