From 765c03d09bf64bf68a51ee1ecfa46ffcec379d13 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Wed, 12 Mar 2025 15:06:31 +0200
Subject: [PATCH] FlashMLA-2: slightly smaller compute buffer size

---
 src/llama.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index d5bf5e9f..cc15cf33 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -13661,6 +13661,9 @@ struct llm_build_context {
                 auto k_nope = ggml_cast(ctx0, k_nope_f32, kv_self.kv_l[il]->type);
                 cb(k_nope, "k_nope", il);
 
+                ggml_build_forward_expand(gf, k_nope);
+                ggml_build_forward_expand(gf, v);
+
                 auto kv_cache_rope = ggml_view_3d(ctx0, kv_self.kv_l[il], n_embd_head_qk_rope, n_kv, 1,
                         kv_self.kv_l[il]->nb[1], kv_self.kv_l[il]->nb[2],
                         ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank));
@@ -13673,7 +13676,6 @@ struct llm_build_context {
                 cb(k, "k", il);
 
                 ggml_build_forward_expand(gf, k);
-                ggml_build_forward_expand(gf, v);
 
             } else {
                 // Hahaha, we need to convert the KV cache for this layer to f32 because the general purpose ML library ggml does not
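
Editor's note (not part of the patch): the change adds a ggml_build_forward_expand() call for k_nope and moves the one for v up, so both tensors enter the graph immediately after they are produced instead of after the later view/concat nodes. Since ggml's graph allocator assigns compute-buffer memory in node order, expanding these tensors earlier presumably lets their large intermediates (such as the f32 k_nope_f32) be recycled by subsequent nodes, which would account for the slightly smaller compute buffer. The standalone sketch below only illustrates the ggml_build_forward_expand API under that reading; the tensor names and sizes are invented for the example and do not come from the patch.

#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16u*1024u*1024u,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // two dummy inputs and one intermediate, standing in for k_nope_f32 etc.
    struct ggml_tensor * a   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    struct ggml_tensor * b   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    struct ggml_tensor * tmp = ggml_add(ctx, a, b);        // large intermediate
    struct ggml_tensor * out = ggml_scale(ctx, tmp, 2.0f); // consumes tmp

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    // Expanding `out` here fixes its (and tmp's) position in the node order;
    // any ops built into the graph afterwards come later in that order, so an
    // allocator that frees tensors after their last use can reuse tmp's memory.
    ggml_build_forward_expand(gf, out);

    ggml_free(ctx);
    return 0;
}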