Add command-line option to validate quants

This commit is contained in:
Iwan Kawrakow
2025-08-27 08:31:59 +03:00
parent 3add753ed7
commit c04b918a01
4 changed files with 13 additions and 3 deletions

View File

@@ -1166,6 +1166,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.use_thp = true;
return true;
}
if (arg == "-vq" || arg == "--validate-quants") {
params.validate_quants = true;
return true;
}
if (arg == "--numa") {
CHECK_ARG
std::string value(argv[i]);
@@ -2571,6 +2575,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
mparams.check_tensors = params.check_tensors;
mparams.repack_tensors = params.repack_tensors;
mparams.use_thp = params.use_thp;
mparams.validate_quants = params.validate_quants;
if (params.kv_overrides.empty()) {
mparams.kv_overrides = NULL;
} else {
@@ -3719,6 +3724,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
fprintf(stream, "repack: %s # default: false\n", params.repack_tensors ? "true" : "false");
fprintf(stream, "use_thp: %s # default: false\n", params.use_thp ? "true" : "false");
fprintf(stream, "validate_quants: %s # default: false\n", params.validate_quants ? "true" : "false");
fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);

View File

@@ -209,6 +209,7 @@ struct gpt_params {
bool check_tensors = false; // validate tensor data
bool repack_tensors = false; // repack tensors if interleaved variant is available
bool use_thp = false; // use transparent huge pages (linux only)
bool validate_quants = false; // if true, check for NaNs while loading the model
std::string cache_type_k = "f16"; // KV cache data type for the K
std::string cache_type_v = "f16"; // KV cache data type for the V

View File

@@ -377,7 +377,8 @@ extern "C" {
bool use_mlock; // force system to keep model in RAM
bool check_tensors; // validate model tensor data
bool repack_tensors;// repack if available
bool use_thp; // uase transparent huge pages (linux only)
bool use_thp; // use transparent huge pages (linux only)
bool validate_quants; // if true, check for NaNs while loading the model
};
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations

View File

@@ -4751,6 +4751,7 @@ static bool llm_load_tensors(
int main_gpu,
const float * tensor_split,
bool use_mlock,
bool validate_quants,
llama_progress_callback progress_callback,
void * progress_callback_user_data) {
model.t_start_us = ggml_time_us();
@@ -7261,7 +7262,7 @@ static bool llm_load_tensors(
if (n_modified > 0) printf("============ Modified %d tensors\n", n_modified);
}
if (true) {
if (validate_quants) {
int nbad = 0;
for (auto& it : model.tensors_by_name) {
if (ggml_backend_buffer_is_host(it.second->buffer)) {
@@ -7374,7 +7375,8 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
#endif
if (!llm_load_tensors(
ml, model, params.n_gpu_layers, params.mla, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
ml, model, params.n_gpu_layers, params.mla, params.split_mode, params.main_gpu, params.tensor_split,
params.use_mlock, params.validate_quants,
params.progress_callback, params.progress_callback_user_data
)) {
return -2;