FlashMLA-2: slightly smaller compute buffer size

2026-02-25 07:34:10 +00:00 · 2025-03-12 15:06:31 +02:00
parent 3db0f1cc7f
commit 765c03d09b
1 changed file with 3 additions and 1 deletion
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -13661,6 +13661,9 @@ struct llm_build_context {
                            auto k_nope = ggml_cast(ctx0, k_nope_f32, kv_self.kv_l[il]->type);
                            cb(k_nope, "k_nope", il);

+                            ggml_build_forward_expand(gf, k_nope);
+                            ggml_build_forward_expand(gf, v);
+
                            auto kv_cache_rope = ggml_view_3d(ctx0, kv_self.kv_l[il], n_embd_head_qk_rope, n_kv, 1,
                                    kv_self.kv_l[il]->nb[1], kv_self.kv_l[il]->nb[2], ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank));

@@ -13673,7 +13676,6 @@ struct llm_build_context {
                            cb(k, "k", il);

                            ggml_build_forward_expand(gf, k);
-                            ggml_build_forward_expand(gf, v);
                        }
                        else {
                            // Hahaha, we need to convert the KV cache for this layer to f32 because the general purpose ML library ggml does not