From fc03b9adbcac490fbf9a11ff347332d79101f601 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Sat, 15 Mar 2025 18:35:37 +0200 Subject: [PATCH] Fix case where wkv_b is quantized with k- or i-quants. --- src/llama.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index de81e8c1..477044d0 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8157,7 +8157,8 @@ static bool llm_load_tensors( size_t max_wk_size = 0; for (auto& l : model.layers) { if (!l.wk_b) { - auto size = ggml_row_size(l.wkv_b->type, n_embd_head_qk_nope)*kv_lora_rank*n_head; + auto new_type = ggml_is_quantized(l.wkv_b->type) ? GGML_TYPE_Q8_0 : l.wkv_b->type; + auto size = ggml_row_size(new_type, n_embd_head_qk_nope)*kv_lora_rank*n_head; max_wk_size = std::max(max_wk_size, size); if (!ggml_backend_buffer_is_host(l.wkv_b->buffer)) { max_wkv_size = std::max(max_wkv_size, ggml_nbytes(l.wkv_b));