Quantization mixes tweaks

2026-02-24 23:24:13 +00:00 · 2025-01-30 11:49:45 +02:00
parent 2e6b523853
commit cc39e3a90c
1 changed files with 13 additions and 5 deletions
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16075,7 +16075,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
            else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || is_iq2_m ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
            ++qs.i_attention_wv;
        }
-        else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
+        else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k") != std::string::npos) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q") != std::string::npos) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (name.find("attn_qkv.weight") != std::string::npos) {
@@ -16088,7 +16091,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
            ++qs.i_ffn_down;
        }
        else if (name.find("attn_output.weight") != std::string::npos) {
-            if (qs.model.hparams.n_expert == 8) {
+            if (qs.model.hparams.n_expert >= 4) {
                new_type = GGML_TYPE_Q5_K;
            } else {
                if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_K;
@@ -16188,9 +16191,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
            else if (new_type == GGML_TYPE_Q5_K) new_type = GGML_TYPE_Q6_K;
        }
        ++qs.i_attention_wv;
-    } else if (name.find("attn_k.weight") != std::string::npos) {
+    } else if (name.find("attn_k") != std::string::npos) {
        if (qs.params->attn_k_type < GGML_TYPE_COUNT) new_type = qs.params->attn_k_type;
-        else if (qs.model.hparams.n_expert == 8) {
+        else if (qs.model.hparams.n_expert >= 8) {
            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
            // TODO: explore better strategies
            new_type = GGML_TYPE_Q8_0;
@@ -16201,8 +16204,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS_R4) {
            new_type = GGML_TYPE_IQ2_S;
        }
-    } else if (name.find("attn_q.weight") != std::string::npos) {
+    } else if (name.find("attn_q") != std::string::npos) {
        if (qs.params->attn_q_type < GGML_TYPE_COUNT) new_type = qs.params->attn_q_type;
+        else if (qs.model.hparams.n_expert >= 8) {
+            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
+            // TODO: explore better strategies
+            new_type = GGML_TYPE_Q8_0;
+        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
            new_type = GGML_TYPE_IQ3_XXS;
        }