From b40d11b22d637073e4c98d94c47bb573e52e153d Mon Sep 17 00:00:00 2001
From: firecoperana
Date: Sat, 15 Nov 2025 15:04:16 +0000
Subject: [PATCH] Fix kv cache save and load for GLM model (#965)

Co-authored-by: firecoperana
---
 examples/server/server.cpp | 2 +-
 src/llama.cpp              | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 196ea225..966947c0 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1837,7 +1837,7 @@ struct server_context {
             LLAMA_LOG_INFO("prompt cache save took %.2f ms\n", (ggml_time_us() - t_start) / 1000.0);
         }
         // has prompts saved earlier to load
-        if (!prompt_cache->states.empty()) {
+        if (prompt_cache && !prompt_cache->states.empty()) {
             const int64_t t_start = ggml_time_us();
             ret->server_cached_prompt.tokens = server_tokens(tokens.get_text_tokens(), false); // copy cache tokens
             ret->prompt_load(*prompt_cache, task.tokens);
diff --git a/src/llama.cpp b/src/llama.cpp
index bcf9433c..c7cc3965 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5286,7 +5286,7 @@ struct llama_data_write {
         // 1 -> transposed V cache
         // 2 -> no V cache (as it may be the case with MLA)
         const uint32_t v_state = kv_self.v_l.empty() ? 2 : kv_self.v_trans ? 1 : 0;
-        const uint32_t n_layer = hparams.n_layer;
+        const uint32_t n_layer = kv_self.k_l.size();
 
         write(&v_state, sizeof(v_state));
         write(&n_layer, sizeof(n_layer));
@@ -5591,7 +5591,7 @@ struct llama_data_read {
         read_to(&v_state, sizeof(v_state));
         read_to(&n_layer, sizeof(n_layer));
 
-        if (n_layer != hparams.n_layer) {
+        if (n_layer != kv_self.k_l.size()) {
             LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer);
             return false;
         }
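
Note on the fix: for GLM models the number of layers that actually hold KV tensors (kv_self.k_l.size()) can evidently differ from the model's hparams.n_layer, so both the layer count written to the session file and the load-time validation must come from the cache itself. Below is a minimal sketch of that invariant using simplified stand-in types; kv_cache_sketch and the two helper functions are hypothetical illustrations, not the real llama.cpp structures.

// Illustrative sketch only: a simplified stand-in for llama.cpp's KV cache.
#include <cstdint>
#include <cstdio>
#include <vector>

struct kv_cache_sketch {
    std::vector<int> k_l;     // one entry per layer that holds a K cache
    std::vector<int> v_l;     // empty when MLA keeps no separate V cache
    bool             v_trans = false;
};

// write side: derive the saved layer count from the cache itself
uint32_t saved_layer_count(const kv_cache_sketch & kv) {
    return (uint32_t) kv.k_l.size();
}

// read side: validate the loaded count against the same source of truth,
// not against the model's total layer count
bool layer_count_matches(const kv_cache_sketch & kv, uint32_t n_layer_from_file) {
    if (n_layer_from_file != kv.k_l.size()) {
        fprintf(stderr, "mismatched layer count (%u instead of %zu)\n",
                n_layer_from_file, kv.k_l.size());
        return false;
    }
    return true;
}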