Fix kv cache save and load for GLM model (#965)

Co-authored-by: firecoperana <firecoperana>
2026-02-27 00:24:11 +00:00 · 2025-11-15 15:04:16 +00:00
parent 5ec0def0ef
commit b40d11b22d
2 changed files with 3 additions and 3 deletions
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1837,7 +1837,7 @@ struct server_context {
                LLAMA_LOG_INFO("prompt cache save took %.2f ms\n", (ggml_time_us() - t_start) / 1000.0);
            }
            // has prompts saved earlier to load
-            if (!prompt_cache->states.empty()) {
+            if (prompt_cache && !prompt_cache->states.empty()) {
                const int64_t t_start = ggml_time_us();
                ret->server_cached_prompt.tokens = server_tokens(tokens.get_text_tokens(), false); // copy cache tokens
                ret->prompt_load(*prompt_cache, task.tokens);
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5286,7 +5286,7 @@ struct llama_data_write {
        //          1 -> transposed V cache
        //          2 -> no V cache (as it may be the case with MLA)
        const uint32_t v_state = kv_self.v_l.empty() ? 2 : kv_self.v_trans ? 1 : 0;
-        const uint32_t n_layer = hparams.n_layer;
+        const uint32_t n_layer = kv_self.k_l.size();

        write(&v_state, sizeof(v_state));
        write(&n_layer, sizeof(n_layer));
@@ -5591,7 +5591,7 @@ struct llama_data_read {
        read_to(&v_state, sizeof(v_state));
        read_to(&n_layer, sizeof(n_layer));

-        if (n_layer != hparams.n_layer) {
+        if (n_layer != kv_self.k_l.size()) {
            LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer);
            return false;
        }