From 765c03d09bf64bf68a51ee1ecfa46ffcec379d13 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Wed, 12 Mar 2025 15:06:31 +0200
Subject: [PATCH] FlashMLA-2: slightly smaller compute buffer size

---
 src/llama.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index d5bf5e9f..cc15cf33 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -13661,6 +13661,9 @@ struct llm_build_context {
                 auto k_nope = ggml_cast(ctx0, k_nope_f32, kv_self.kv_l[il]->type);
                 cb(k_nope, "k_nope", il);
 
+                ggml_build_forward_expand(gf, k_nope);
+                ggml_build_forward_expand(gf, v);
+
                 auto kv_cache_rope = ggml_view_3d(ctx0, kv_self.kv_l[il], n_embd_head_qk_rope, n_kv, 1,
                         kv_self.kv_l[il]->nb[1], kv_self.kv_l[il]->nb[2],
                         ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank));
@@ -13673,7 +13676,6 @@ struct llm_build_context {
                 cb(k, "k", il);
 
                 ggml_build_forward_expand(gf, k);
-                ggml_build_forward_expand(gf, v);
 
             } else {
                 // Hahaha, we need to convert the KV cache for this layer to f32 because the general purpose ML library ggml does not
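
Editor's note (not part of the patch): the change adds a ggml_build_forward_expand() call for k_nope and moves the one for v up, so both tensors enter the graph immediately after they are produced instead of after the later view/concat nodes. Since ggml's graph allocator assigns compute-buffer memory in node order, expanding these tensors earlier presumably lets their large intermediates (such as the f32 k_nope_f32) be recycled by subsequent nodes, which would account for the slightly smaller compute buffer. The standalone sketch below only illustrates the ggml_build_forward_expand API under that reading; the tensor names and sizes are invented for the example and do not come from the patch.

#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16u*1024u*1024u,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // two dummy inputs and one intermediate, standing in for k_nope_f32 etc.
    struct ggml_tensor * a   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    struct ggml_tensor * b   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    struct ggml_tensor * tmp = ggml_add(ctx, a, b);        // large intermediate
    struct ggml_tensor * out = ggml_scale(ctx, tmp, 2.0f); // consumes tmp

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    // Expanding `out` here fixes its (and tmp's) position in the node order;
    // any ops built into the graph afterwards come later in that order, so an
    // allocator that frees tensors after their last use can reuse tmp's memory.
    ggml_build_forward_expand(gf, out);

    ggml_free(ctx);
    return 0;
}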