From 3208660d20db57e27ea98f4773bb7c3ad210c4e6 Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Fri, 6 Mar 2026 07:25:40 +0100 Subject: [PATCH] Be able to quantize mmproj files (#1367) --- src/llama-hparams.cpp | 7 ++++--- src/llama-model-loader.h | 2 +- src/llama-quantize.cpp | 15 ++++++++++++--- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 7ea29c47..ab9953a1 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -39,7 +39,7 @@ static inline const char * llm_expert_gating_func_name(llm_expert_gating_func_ty void llm_load_hparams( llama_model_loader & ml, - llama_model & model) { + llama_model & model, bool ignore_vocab) { auto & hparams = model.hparams; const gguf_context * ctx = ml.meta; @@ -54,11 +54,13 @@ void llm_load_hparams( model.gguf_kv.emplace(name, value); } + ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); + // get general kv ml.get_key(LLM_KV_GENERAL_NAME, model.name, false); // get hparams kv - ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab); + ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, !ignore_vocab); // everything past this point is not vocab-related if (hparams.vocab_only) { @@ -67,7 +69,6 @@ void llm_load_hparams( ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train); ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd); - ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index c59eaf4f..e9d72a43 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -176,7 +176,7 @@ struct llama_model_loader { void llm_load_arch(llama_model_loader & ml, llama_model & model); -void llm_load_hparams(llama_model_loader & ml, llama_model & model); +void 
llm_load_hparams(llama_model_loader & ml, llama_model & model, bool ignore_vocab = false); struct create_tensors_helper_interface { virtual ~create_tensors_helper_interface() = default; diff --git a/src/llama-quantize.cpp b/src/llama-quantize.cpp index bdd5321e..925fb4e9 100644 --- a/src/llama-quantize.cpp +++ b/src/llama-quantize.cpp @@ -1026,8 +1026,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s ml.init_mappings(false); // no prefetching llama_model model; - llm_load_arch(ml, model); - llm_load_hparams(ml, model); + try { + llm_load_arch(ml, model); + } catch(const std::exception & e) { + LLAMA_LOG_WARN("llm_load_arch failed (expected for mmproj files): %s\n", e.what()); + } + try { + llm_load_hparams(ml, model, true); + } catch(const std::exception & e) { + LLAMA_LOG_WARN("llm_load_hparams failed (expected for mmproj files): %s\n", e.what()); + } struct quantize_state_internal qs(model, params); @@ -1159,7 +1167,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // - qs.n_attention_wv == 3 * model.hparams.n_layer for Encoder-Decoder models // - model.arch == LLM_ARCH_DECI for Deci-Nemotron models // - GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer || qs.n_attention_wv == 3 * (int)model.hparams.n_layer || model.arch == LLM_ARCH_DECI) && "n_attention_wv is unexpected"); + GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer || qs.n_attention_wv == 3 * (int)model.hparams.n_layer || + model.arch == LLM_ARCH_DECI || model.arch == LLM_ARCH_UNKNOWN) && "n_attention_wv is unexpected"); size_t total_size_org = 0; size_t total_size_new = 0;