From 3208660d20db57e27ea98f4773bb7c3ad210c4e6 Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Fri, 6 Mar 2026 07:25:40 +0100 Subject: [PATCH] Be able to quantize mmproj files (#1367) --- src/llama-hparams.cpp | 7 ++++--- src/llama-model-loader.h | 2 +- src/llama-quantize.cpp | 15 ++++++++++++--- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 7ea29c47..ab9953a1 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -39,7 +39,7 @@ static inline const char * llm_expert_gating_func_name(llm_expert_gating_func_ty void llm_load_hparams( llama_model_loader & ml, - llama_model & model) { + llama_model & model, bool ignore_vocab) { auto & hparams = model.hparams; const gguf_context * ctx = ml.meta; @@ -54,11 +54,13 @@ void llm_load_hparams( model.gguf_kv.emplace(name, value); } + ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); + // get general kv ml.get_key(LLM_KV_GENERAL_NAME, model.name, false); // get hparams kv - ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab); + ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, !ignore_vocab); // everything past this point is not vocab-related if (hparams.vocab_only) { @@ -67,7 +69,6 @@ void llm_load_hparams( ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train); ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd); - ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index c59eaf4f..e9d72a43 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -176,7 +176,7 @@ struct llama_model_loader { void llm_load_arch(llama_model_loader & ml, llama_model & model); -void llm_load_hparams(llama_model_loader & ml, llama_model & model); +void 
llm_load_hparams(llama_model_loader & ml, llama_model & model, bool ignore_vocab = false); struct create_tensors_helper_interface { virtual ~create_tensors_helper_interface() = default; diff --git a/src/llama-quantize.cpp b/src/llama-quantize.cpp index bdd5321e..925fb4e9 100644 --- a/src/llama-quantize.cpp +++ b/src/llama-quantize.cpp @@ -1026,8 +1026,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s ml.init_mappings(false); // no prefetching llama_model model; - llm_load_arch(ml, model); - llm_load_hparams(ml, model); + try { + llm_load_arch(ml, model); + } catch(const std::exception & e) { + LLAMA_LOG_WARN("llm_load_arch failed (expected for mmproj files): %s\n", e.what()); + } + try { + llm_load_hparams(ml, model, true); + } catch(const std::exception & e) { + LLAMA_LOG_WARN("llm_load_hparams failed (expected for mmproj files): %s\n", e.what()); + } struct quantize_state_internal qs(model, params); @@ -1159,7 +1167,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // - qs.n_attention_wv == 3 * model.hparams.n_layer for Encoder-Decoder models // - model.arch == LLM_ARCH_DECI for Deci-Nemotron models // - GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer || qs.n_attention_wv == 3 * (int)model.hparams.n_layer || model.arch == LLM_ARCH_DECI) && "n_attention_wv is unexpected"); + GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer || qs.n_attention_wv == 3 * (int)model.hparams.n_layer || + model.arch == LLM_ARCH_DECI || model.arch == LLM_ARCH_UNKNOWN) && "n_attention_wv is unexpected"); size_t total_size_org = 0; size_t total_size_new = 0;