Allow quantization of ffn_gate_inp (#896)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Author: Kawrakow
Date: 2025-11-05 10:44:32 +02:00
Committed by: GitHub
Parent: 15159a87d4
Commit: abb966eba1
4 changed files with 19 additions and 2 deletions
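Background: llama_model_quantize_internal previously left the MoE expert gating tensors (ffn_gate_inp.weight) unquantized unconditionally. This commit adds a ffn_gate_inp_type field to the quantize params so callers can opt in. A minimal caller sketch, assuming the new field lives in llama_model_quantize_params alongside the existing output_tensor_type / attn_q_type overrides and defaults to GGML_TYPE_COUNT (file names are placeholders):

    // Sketch only, not part of this commit.
    #include "llama.h"

    int main() {
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype             = LLAMA_FTYPE_MOSTLY_Q4_K_M; // base quantization for all tensors
        params.ffn_gate_inp_type = GGML_TYPE_Q8_0;            // new: quantize ffn_gate_inp.weight to Q8_0
        // Leaving ffn_gate_inp_type at GGML_TYPE_COUNT keeps the old behavior:
        // the expert gating tensors are not quantized at all.
        return (int) llama_model_quantize("model-f16.gguf", "model-out.gguf", &params);
    }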


@@ -1245,7 +1245,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // do not quantize expert gating tensors
         // NOTE: can't use LLM_TN here because the layer number is not known
-        quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
+        if (name.find("ffn_gate_inp.weight") != std::string::npos) {
+            if (params->ffn_gate_inp_type == GGML_TYPE_COUNT || params->ffn_gate_inp_type == tensor->type) {
+                quantize = false;
+            }
+        }
+        //quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;

         // do not quantize positional embeddings and token types (BERT)
         quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
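With the first hunk, the blanket skip becomes conditional: ffn_gate_inp.weight is left alone only when no override was requested (ffn_gate_inp_type == GGML_TYPE_COUNT) or the requested type already matches the tensor's type, in which case quantizing would be a no-op. In every other case quantize stays true and the tensor falls through to the type-selection code in the next hunk. Substring matching via name.find is used instead of strcmp because the full tensor name carries the layer number (e.g. blk.0.ffn_gate_inp.weight), as the NOTE above points out.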
@@ -1328,6 +1333,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
                 new_type = params->output_tensor_type;
             }
+            if (params->ffn_gate_inp_type < GGML_TYPE_COUNT && name.find("ffn_gate_inp.weight") != std::string::npos) {
+                new_type = params->ffn_gate_inp_type;
+            }
             if (params->attn_q_type < GGML_TYPE_COUNT && strcmp(tensor->name, "attn_q.weight") == 0) {
                 new_type = params->attn_q_type;
             }
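The second hunk wires the override into the same pattern the other per-tensor overrides follow: a value strictly below GGML_TYPE_COUNT is an explicit type, anything else means "use the default". A hypothetical helper (not in the commit) that captures the rule:

    #include "ggml.h"

    // Hypothetical, for illustration only: per-tensor overrides in
    // llama_model_quantize_internal win over the default type exactly
    // when they name a real type, i.e. compare below GGML_TYPE_COUNT.
    static ggml_type pick_type(ggml_type default_type, ggml_type override_type) {
        return override_type < GGML_TYPE_COUNT ? override_type : default_type;
    }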