From e760b4dc41bee8abb0ee1ef2cb04e4ad89044651 Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Wed, 27 Aug 2025 19:00:17 +0300
Subject: [PATCH] Check for NaNs while loading the model. (#727)

* Check for NaNs while loading the model.

* Also tell which experts have NaNs.

* Add command line option to validate quants

* Add checks for more quantization types

* Add checks for more quantization types

---------

Co-authored-by: Iwan Kawrakow
---
 common/common.cpp             |   6 ++
 common/common.h               |   1 +
 ggml/src/iqk/iqk_quantize.cpp | 172 ++++++++++++++++++++++++++++++++++
 ggml/src/iqk/iqk_quantize.h   |   2 +
 include/llama.h               |   3 +-
 src/llama.cpp                 |  17 +++-
 6 files changed, 199 insertions(+), 2 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index c1e94323..459e7c27 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1166,6 +1166,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.use_thp = true;
         return true;
     }
+    if (arg == "-vq" || arg == "--validate-quants") {
+        params.validate_quants = true;
+        return true;
+    }
     if (arg == "--numa") {
         CHECK_ARG
         std::string value(argv[i]);
@@ -2571,6 +2575,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     mparams.check_tensors = params.check_tensors;
     mparams.repack_tensors = params.repack_tensors;
     mparams.use_thp = params.use_thp;
+    mparams.validate_quants = params.validate_quants;
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
     } else {
@@ -3719,6 +3724,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
     fprintf(stream, "repack: %s # default: false\n", params.repack_tensors ? "true" : "false");
     fprintf(stream, "use_thp: %s # default: false\n", params.use_thp ? "true" : "false");
+    fprintf(stream, "validate_quants: %s # default: false\n", params.validate_quants ? "true" : "false");
     fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
     fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
     fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
diff --git a/common/common.h b/common/common.h
index 1bf0f235..fb15c77f 100644
--- a/common/common.h
+++ b/common/common.h
@@ -209,6 +209,7 @@ struct gpt_params {
     bool check_tensors = false; // validate tensor data
     bool repack_tensors = false; // repack tensors if interleaved variant is available
     bool use_thp = false; // use transparent huge pages (linux only)
+    bool validate_quants = false; // if true, check for NaNs while loading the model
 
     std::string cache_type_k = "f16"; // KV cache data type for the K
     std::string cache_type_v = "f16"; // KV cache data type for the V
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index 22c57d06..531f2877 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -9868,3 +9868,175 @@ void vec_dot_iq4_kt_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx
 
 #endif
 }
+
+namespace {
+template <typename Block>
+inline int check_row_for_blocks_256_fp16(int nblock, const Block * x) {
+    int nbad = 0;
+    for (int ib = 0; ib < nblock; ++ib) {
+        float d = GGML_FP16_TO_FP32(x[ib].d);
+        if (isnan(d)) ++nbad;
+    }
+    return nbad;
+}
+template <typename Block>
+bool check_tensor_for_blocks_256_fp16(const ggml_tensor * tensor) {
+    int nblock = tensor->ne[0]/QK_K;
+    int nbad = 0;
+    for (int row = 0; row < ggml_nrows(tensor); ++row) {
+        auto x = (const Block *)((const char *)tensor->data + tensor->nb[1]*row);
+        nbad += check_row_for_blocks_256_fp16(nblock, x);
+    }
+    if (nbad > 0) {
+        fprintf(stderr, "%s: found %d NaN block scales out of %ld blocks in tensor %s\n", __func__,
+                nbad, ggml_nrows(tensor)*nblock, tensor->name);
+        if (tensor->ne[2] > 1) {
+            int nb = tensor->ne[0]/QK_K;
+            for (int64_t i02 = 0; i02 < tensor->ne[2]; ++i02) {
+                int nbad_expert = 0;
+                auto xex = (const char *)((const char *)tensor->data + i02*tensor->nb[2]);
+                for (int64_t i01 = 0; i01 < tensor->ne[1]; ++i01) {
+                    auto xr = (const Block *)(xex + i01*tensor->nb[1]);
+                    nbad_expert += check_row_for_blocks_256_fp16(nb, xr);
+                }
+                if (nbad_expert > 0) fprintf(stderr," there are %d NaN block scales for expert %ld\n", nbad_expert, i02);
+            }
+        }
+        return false;
+    }
+    return true;
+}
+template <typename Block>
+inline int check_row_for_blocks_256_fp16(int nblock, const Block * x, int nr) {
+    int nbad = 0;
+    for (int ib = 0; ib < nblock; ++ib) {
+        for (int j = 0; j < nr; ++j) {
+            if (!isfinite(GGML_FP16_TO_FP32(x[ib].d[j]))) ++nbad;
+        }
+    }
+    return nbad;
+}
+template <typename Block, int nr>
+bool check_tensor_for_blocks_256_fp16_repacked(const ggml_tensor * tensor) {
+    int nblock = tensor->ne[0]/QK_K;
+    int nbad = 0;
+    for (int row = 0; row < ggml_nrows(tensor); row += nr) {
+        auto x = (const Block *)((const char *)tensor->data + tensor->nb[1]*row);
+        nbad += check_row_for_blocks_256_fp16(nblock, x, nr);
+    }
+    if (nbad > 0) {
+        fprintf(stderr, "%s: found %d NaN block scales out of %ld blocks in tensor %s\n", __func__,
+                nbad, ggml_nrows(tensor)*nblock, tensor->name);
+        if (tensor->ne[2] > 1) {
+            int nb = tensor->ne[0]/QK_K;
+            for (int64_t i02 = 0; i02 < tensor->ne[2]; ++i02) {
+                int nbad_expert = 0;
+                auto xex = (const char *)((const char *)tensor->data + i02*tensor->nb[2]);
+                for (int64_t i01 = 0; i01 < tensor->ne[1]; i01 += nr) {
+                    auto xr = (const Block *)(xex + i01*tensor->nb[1]);
+                    nbad_expert += check_row_for_blocks_256_fp16(nb, xr, nr);
+                }
+                if (nbad_expert > 0) fprintf(stderr," there are %d NaN block scales for expert %ld\n", nbad_expert, i02);
+            }
+        }
return false; + } + return true; +} +struct F32Scale { + static inline int check_row(const char * data) { + float d = *(const float *)data; + return isfinite(d) ? 0 : 1; + } +}; +struct F16Scale { + static inline int check_row(const char * data) { + float d = GGML_FP16_TO_FP32(*(const ggml_half *)data); + return isfinite(d) ? 0 : 1; + } +}; +template +struct F32ScaleRX { + static inline int check_row(const char * data) { + auto d = (const float *)data; + int nbad = 0; + for (int i = 0; i < nr; ++i) { + if (!isfinite(d[i])) ++nbad; + } + return nbad; + } +}; +template +struct F16ScaleRX { + static inline int check_row(const char * data) { + auto d = (const ggml_half *)data; + int nbad = 0; + for (int i = 0; i < nr; ++i) { + if (!isfinite(GGML_FP16_TO_FP32(d[i]))) ++nbad; + } + return nbad; + } +}; +template +bool check_tensor_row_scales(const ggml_tensor * tensor) { + auto row_size = ggml_row_size(tensor->type, tensor->ne[0]); + int num_rows = ggml_nrows(tensor); + auto data = (const char *)tensor->data; + int nbad = 0; + for (int row = 0; row < num_rows; ++row) { + nbad += RS::check_row(data); + data += row_size; + } + if (nbad > 0) { + fprintf(stderr, "%s: found %d NaN row scales out of %d rows in tensor %s\n", __func__, + nbad, num_rows, tensor->name); + return false; + } + return true; +} +} + +bool iqk_validate_tensor(const ggml_tensor * tensor) { + if (!tensor) return true; + if (!ggml_is_contiguous(tensor)) return true; + + switch (tensor->type) { + case GGML_TYPE_IQ2_K: return check_tensor_for_blocks_256_fp16(tensor); + case GGML_TYPE_IQ3_K: return check_tensor_for_blocks_256_fp16(tensor); + case GGML_TYPE_IQ4_K: return check_tensor_for_blocks_256_fp16(tensor); + case GGML_TYPE_IQ5_K: return check_tensor_for_blocks_256_fp16(tensor); + case GGML_TYPE_IQ6_K: return check_tensor_for_blocks_256_fp16(tensor); + case GGML_TYPE_IQ2_XXS: return check_tensor_for_blocks_256_fp16(tensor); + case GGML_TYPE_IQ2_XS: return check_tensor_for_blocks_256_fp16(tensor); + case GGML_TYPE_IQ2_S: return check_tensor_for_blocks_256_fp16(tensor); + case GGML_TYPE_IQ3_XXS: return check_tensor_for_blocks_256_fp16(tensor); + case GGML_TYPE_IQ3_S: return check_tensor_for_blocks_256_fp16(tensor); + case GGML_TYPE_IQ4_XS: return check_tensor_for_blocks_256_fp16(tensor); + case GGML_TYPE_IQ2_K_R4: return check_tensor_for_blocks_256_fp16_repacked(tensor); + case GGML_TYPE_IQ3_K_R4: return check_tensor_for_blocks_256_fp16_repacked(tensor); + case GGML_TYPE_IQ4_K_R4: return check_tensor_for_blocks_256_fp16_repacked(tensor); + case GGML_TYPE_IQ5_K_R4: return check_tensor_for_blocks_256_fp16_repacked(tensor); + case GGML_TYPE_IQ2_XXS_R4: return check_tensor_for_blocks_256_fp16_repacked(tensor); + case GGML_TYPE_IQ2_XS_R4: return check_tensor_for_blocks_256_fp16_repacked(tensor); + case GGML_TYPE_IQ2_S_R4: return check_tensor_for_blocks_256_fp16_repacked(tensor); + case GGML_TYPE_IQ3_XXS_R4: return check_tensor_for_blocks_256_fp16_repacked(tensor); + case GGML_TYPE_IQ3_S_R4: return check_tensor_for_blocks_256_fp16_repacked(tensor); + case GGML_TYPE_IQ4_XS_R8: return check_tensor_for_blocks_256_fp16_repacked(tensor); + case GGML_TYPE_IQ2_BN: + case GGML_TYPE_IQ4_KSS: + case GGML_TYPE_IQ4_KS: + case GGML_TYPE_IQ5_KS: return check_tensor_row_scales(tensor); + case GGML_TYPE_IQ2_BN_R4: + case GGML_TYPE_IQ4_KS_R4: + case GGML_TYPE_IQ5_KS_R4: return check_tensor_row_scales>(tensor); + case GGML_TYPE_IQ1_BN: + case GGML_TYPE_IQ2_KS: + case GGML_TYPE_IQ2_KL: + case GGML_TYPE_IQ3_KS: return check_tensor_row_scales(tensor); + 
+        case GGML_TYPE_IQ1_S_R4:
+        case GGML_TYPE_IQ1_M_R4: return check_tensor_row_scales<F16ScaleRX<4>>(tensor);
+
+        default: break;
+    }
+    return true;
+}
diff --git a/ggml/src/iqk/iqk_quantize.h b/ggml/src/iqk/iqk_quantize.h
index b512c9dd..5f062261 100644
--- a/ggml/src/iqk/iqk_quantize.h
+++ b/ggml/src/iqk/iqk_quantize.h
@@ -326,6 +326,8 @@ void iqk_quantize_any(int from_type, int to_type,
                       const void * GGML_RESTRICT x, void * GGML_RESTRICT y, void * work_buffer,
                       to_float_t to_float, from_float_t from_float, int ith, int nth);
 
+bool iqk_validate_tensor(const struct ggml_tensor * src);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/llama.h b/include/llama.h
index ffbd9c94..a5939769 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -377,7 +377,8 @@ extern "C" {
         bool use_mlock; // force system to keep model in RAM
         bool check_tensors; // validate model tensor data
         bool repack_tensors;// repack if available
-        bool use_thp; // uase transparent huge pages (linux only)
+        bool use_thp; // use transparent huge pages (linux only)
+        bool validate_quants; // if true, check for NaNs while loading the model
     };
 
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
diff --git a/src/llama.cpp b/src/llama.cpp
index f74345a5..9a6ef293 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4751,6 +4751,7 @@ static bool llm_load_tensors(
         int main_gpu,
         const float * tensor_split,
         bool use_mlock,
+        bool validate_quants,
         llama_progress_callback progress_callback,
         void * progress_callback_user_data) {
     model.t_start_us = ggml_time_us();
@@ -7261,6 +7262,19 @@ static bool llm_load_tensors(
        if (n_modified > 0) printf("============ Modified %d tensors\n", n_modified);
    }
 
+    if (validate_quants) {
+        int nbad = 0;
+        for (auto& it : model.tensors_by_name) {
+            if (ggml_backend_buffer_is_host(it.second->buffer)) {
+                if (!iqk_validate_tensor(it.second)) ++nbad;
+            }
+        }
+        if (nbad > 0) {
+            LLAMA_LOG_ERROR("Found %d bad tensors in model\n", nbad);
+            throw std::runtime_error("Bad tensors in model");
+        }
+    }
+
     if (!ml.use_mmap && ml.repack_tensors) {
         int n_repacked = 0;
         for (auto& it : model.tensors_by_name) {
@@ -7361,7 +7375,8 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
 #endif
 
         if (!llm_load_tensors(
-            ml, model, params.n_gpu_layers, params.mla, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
+            ml, model, params.n_gpu_layers, params.mla, params.split_mode, params.main_gpu, params.tensor_split,
+            params.use_mlock, params.validate_quants,
             params.progress_callback, params.progress_callback_user_data
         )) {
             return -2;
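
Usage sketch: besides the new -vq / --validate-quants command line flag, the check can be enabled programmatically by setting validate_quants in llama_model_params before loading a model. The snippet below is a minimal sketch, assuming this fork keeps upstream llama.cpp's llama_backend_init / llama_model_default_params / llama_load_model_from_file entry points; "model.gguf" is a placeholder path.

    // validate_quants_demo.cpp -- sketch of enabling the NaN check on load
    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        mparams.validate_quants = true; // field added by this patch; only host-buffer tensors are scanned

        // If any block/row scale is NaN, llm_load_tensors throws "Bad tensors in model",
        // which the loader catches, so a NULL model is returned here.
        llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (!model) {
            fprintf(stderr, "load failed (or quant validation found NaNs)\n");
            llama_backend_free();
            return 1;
        }

        llama_free_model(model);
        llama_backend_free();
        return 0;
    }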