q2_K: allow it to detect ternary nets and quantize accordingly

2026-03-01 01:24:08 +00:00 · 2024-08-05 11:59:36 +03:00
parent 74f2f50abf
commit 695c7eef49
4 changed files with 55 additions and 4 deletions
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16071,12 +16071,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                    }
                }
            }
-            if ((new_type == GGML_TYPE_IQ2_XXS ||
+            if (!params->ignore_imatrix_rules && !imatrix &&
+                (new_type == GGML_TYPE_IQ2_XXS ||
                 new_type == GGML_TYPE_IQ2_XS  ||
                 new_type == GGML_TYPE_IQ2_S   ||
                 new_type == GGML_TYPE_IQ1_S   ||
                (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight"))  ||
-                (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
+                (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0))) {
                LLAMA_LOG_ERROR("\n\n============================================================\n");
                LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
                LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
@@ -16441,6 +16442,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
        /*.only_copy                   =*/ false,
        /*.pure                        =*/ false,
        /*.keep_split                  =*/ false,
+        /*.ignore_imatrix_rules        =*/ false,
        /*.imatrix                     =*/ nullptr,
        /*.kv_overrides                =*/ nullptr,
    };