From a4c29905d0c01b3cc0e739a890a30326bf757bb2 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Thu, 13 Nov 2025 06:36:33 +0200
Subject: [PATCH] This is perhaps cleaner

---
 src/llama-context.h |  4 ++--
 src/llama.cpp       | 22 ++++++++++------------
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/src/llama-context.h b/src/llama-context.h
index c9ab3595..bb21d880 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -210,7 +210,7 @@ struct llama_context {
     std::unique_ptr prev;
 
     void reset_scheduler();
-    bool can_reuse_graph(const llama_batch & u_batch) const;
+    bool can_reuse_graph(const llama_batch & u_batch);
 
     struct CacheCopy {
         ggml_tensor * cpy = nullptr;
@@ -218,6 +218,6 @@ struct llama_context {
     };
     std::vector<CacheCopy> cache_copies;
 
-    void update_cache_copies();
+    bool update_cache_copies();
 };
 
diff --git a/src/llama.cpp b/src/llama.cpp
index 8e14cda0..bcf9433c 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -548,7 +548,7 @@ void llama_context::reset_scheduler() {
     prev.reset();
 }
 
-bool llama_context::can_reuse_graph(const llama_batch & u_batch) const {
+bool llama_context::can_reuse_graph(const llama_batch & u_batch) {
     if (!prev || !prev->graph) return false;
     if (u_batch.n_tokens > 1) return false;
     if (u_batch.embd) return false;
@@ -556,31 +556,30 @@ bool llama_context::can_reuse_graph(const llama_batch & u_batch) const {
     return u_batch.all_seq_id == prev->all_seq_id &&
            kv_self.head > 0 &&
            kv_self.n == prev->n_kv &&
-           n_outputs == prev->n_outputs;
+           n_outputs == prev->n_outputs &&
+           update_cache_copies();
 }
 
-void llama_context::update_cache_copies() {
+bool llama_context::update_cache_copies() {
     int n_layer = cache_copies.size()/2;
-    GGML_ASSERT((int)kv_self.k_l.size() == n_layer);
-    GGML_ASSERT(kv_self.v_l.empty() || (int)kv_self.v_l.size() == n_layer);
-    //printf("%s: head = %d\n", __func__, kv_self.head);
+    if ((int)kv_self.k_l.size() != n_layer) return false;
+    if (!(kv_self.v_l.empty() || (int)kv_self.v_l.size() == n_layer)) return false;
     for (int il = 0; il < n_layer; ++il) {
         auto& c = cache_copies[2*il+0];
-        GGML_ASSERT(c.cpy->op == GGML_OP_CPY);
-        GGML_ASSERT(c.cpy->view_src == kv_self.k_l[il]);
+        if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != kv_self.k_l[il]) return false;
         c.cpy->view_offs = kv_self.head*c.step;
         c.cpy->src[1]->data = (char *)kv_self.k_l[il]->data + c.cpy->view_offs;
         c.cpy->data = c.cpy->src[1]->data;
     }
-    if (kv_self.v_l.empty()) return;
+    if (kv_self.v_l.empty()) return true;
     for (int il = 0; il < n_layer; ++il) {
         auto& c = cache_copies[2*il+1];
-        GGML_ASSERT(c.cpy->op == GGML_OP_CPY);
-        GGML_ASSERT(c.cpy->view_src == kv_self.v_l[il]);
+        if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != kv_self.v_l[il]) return false;
         c.cpy->view_offs = kv_self.head*c.step;
         c.cpy->src[1]->data = (char *)kv_self.v_l[il]->data + c.cpy->view_offs;
         c.cpy->data = c.cpy->src[1]->data;
     }
+    return true;
 }
 
 llama_context::llama_context(const llama_model & model)
@@ -2971,7 +2970,6 @@ static int llama_decode_internal(
     } else {
         //printf("Reusing graph\n");
         gf = lctx.prev->graph;
-        lctx.update_cache_copies();
     }
 
     // the output is always the last tensor in the graph
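
Note on the change: the essential shift is that update_cache_copies() stops being an
unconditional post-decision fixup guarded by hard GGML_ASSERTs and becomes a fallible
validation step evaluated as the last condition of can_reuse_graph(), which is also why
that method loses its const qualifier (it now mutates the cached CPY nodes on success).
Below is a minimal standalone sketch of this validate-and-patch pattern; the names
CopyFixup, CachedPlan and try_reuse are hypothetical and not part of llama.cpp, only the
control flow mirrors the patched code.

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Hypothetical stand-in for the ggml CPY nodes the patch re-points.
    struct CopyFixup {
        char  * base = nullptr; // destination buffer (analogue of kv_self.k_l[il]->data)
        char ** dst  = nullptr; // cached node's destination pointer to re-point
        size_t  step = 0;       // bytes per KV slot (analogue of CacheCopy::step)
    };

    struct CachedPlan {
        std::vector<CopyFixup> fixups;

        // Analogue of the new bool update_cache_copies(): any mismatch returns
        // false so the caller rebuilds, instead of tripping a GGML_ASSERT.
        bool try_reuse(size_t head, size_t expected_fixups) {
            if (fixups.size() != expected_fixups) return false;   // shape check
            for (auto & f : fixups) {
                if (!f.base || !f.dst) return false;              // structural check
                *f.dst = f.base + head * f.step;                  // patch destination
            }
            return true;                                          // valid and updated
        }
    };

    int main() {
        char kbuf[64] = {};
        char * node_dst = nullptr;

        CachedPlan plan;
        plan.fixups.push_back({kbuf, &node_dst, 8});

        // Single decision point, as in the updated llama_decode_internal() path:
        // a true result means the copies were already re-pointed at the new head.
        if (plan.try_reuse(/*head=*/3, /*expected_fixups=*/1)) {
            std::printf("reuse, dst offset = %td bytes\n", node_dst - kbuf); // 24
        } else {
            std::printf("rebuild graph\n");
        }
        return 0;
    }

Folding the check into the predicate gives the caller a single reuse decision point,
which is what lets the final hunk in llama_decode_internal() drop the separate
lctx.update_cache_copies() call: once can_reuse_graph() returns true, the cached graph
is already consistent with the new kv_self.head.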