Mirror of https://github.com/ikawrakow/ik_llama.cpp.git, synced 2026-04-30 11:21:56 +00:00
Fix kv cache save and load for GLM model (#965)
Co-authored-by: firecoperana <firecoperana>
@@ -1837,7 +1837,7 @@ struct server_context {
         LLAMA_LOG_INFO("prompt cache save took %.2f ms\n", (ggml_time_us() - t_start) / 1000.0);
     }
     // has prompts saved earlier to load
-    if (!prompt_cache->states.empty()) {
+    if (prompt_cache && !prompt_cache->states.empty()) {
         const int64_t t_start = ggml_time_us();
         ret->server_cached_prompt.tokens = server_tokens(tokens.get_text_tokens(), false); // copy cache tokens
         ret->prompt_load(*prompt_cache, task.tokens);
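The first hunk guards a possibly-null prompt cache before reading its saved states. A minimal sketch of the failure mode, using illustrative types that stand in for the server's real definitions:

    // Hypothetical stand-ins for the server types; only the guard pattern matters.
    #include <memory>
    #include <vector>

    struct prompt_cache_state { /* saved KV data for one prompt */ };

    struct prompt_cache_t {
        std::vector<prompt_cache_state> states;
    };

    void load_saved_prompts(const std::unique_ptr<prompt_cache_t> & prompt_cache) {
        // Old check: `!prompt_cache->states.empty()` dereferences a null pointer
        // whenever prompt caching is disabled and the cache was never allocated.
        if (prompt_cache && !prompt_cache->states.empty()) {
            // ... restore the saved prompts into the new slot ...
        }
    }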
@@ -5286,7 +5286,7 @@ struct llama_data_write {
     // 1 -> transposed V cache
     // 2 -> no V cache (as it may be the case with MLA)
     const uint32_t v_state = kv_self.v_l.empty() ? 2 : kv_self.v_trans ? 1 : 0;
-    const uint32_t n_layer = hparams.n_layer;
+    const uint32_t n_layer = kv_self.k_l.size();

     write(&v_state, sizeof(v_state));
     write(&n_layer, sizeof(n_layer));
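The write side now records how many K tensors the cache actually holds rather than the model's nominal layer count. The diff doesn't spell out why the two differ for GLM; plausibly some layers carry no per-layer K tensor in this cache, so `kv_self.k_l.size()` is the count that matters for serialization. A sketch under those assumptions, with `kv_cache_sketch` as an illustrative stand-in:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct ggml_tensor; // opaque here

    struct kv_cache_sketch {
        std::vector<ggml_tensor *> k_l; // one K tensor per *cached* layer
    };

    // Serialize the layer count taken from the cache itself; hparams.n_layer
    // may overcount when some layers have no KV tensors.
    void write_layer_count(const kv_cache_sketch & kv_self, FILE * f) {
        const uint32_t n_layer = (uint32_t) kv_self.k_l.size();
        fwrite(&n_layer, sizeof(n_layer), 1, f);
    }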
@@ -5591,7 +5591,7 @@ struct llama_data_read {
     read_to(&v_state, sizeof(v_state));
     read_to(&n_layer, sizeof(n_layer));

-    if (n_layer != hparams.n_layer) {
+    if (n_layer != kv_self.k_l.size()) {
         LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer);
         return false;
     }
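The read side validates the stored count against the same quantity, so a state file written from a GLM cache round-trips even when `kv_self.k_l.size() != hparams.n_layer`. A matching sketch, continuing the illustrative types above:

    // Read the serialized layer count and reject a mismatched cache layout.
    bool read_layer_count(const kv_cache_sketch & kv_self, FILE * f) {
        uint32_t n_layer = 0;
        if (fread(&n_layer, sizeof(n_layer), 1, f) != 1) {
            return false;
        }
        if (n_layer != kv_self.k_l.size()) {
            fprintf(stderr, "mismatched layer count (%u instead of %zu)\n",
                    n_layer, kv_self.k_l.size());
            return false;
        }
        return true;
    }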