Mirror of https://github.com/ikawrakow/ik_llama.cpp.git (synced 2026-01-26 17:20:01 +00:00)
Check for NaNs while loading the model. (#727)
* Check for NaNs while loading the model.
* Also tell which experts have NaNs.
* Add a command line option to validate quants.
* Add checks for more quantization types.

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
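A NaN in a single quantization scale propagates into every dot product that touches the affected blocks, so a model that loads without complaint can still produce pure garbage output. The checks below scan the scales as tensors are loaded and, for MoE tensors, narrow the damage down to the experts that contain it; the command line option that enables the scan is added elsewhere in the commit.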
@@ -9868,3 +9868,175 @@ void vec_dot_iq4_kt_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx
#endif
}

namespace {
template <typename Block>
inline int check_row_for_blocks_256_fp16(int nblock, const Block * x) {
    int nbad = 0;
    for (int ib = 0; ib < nblock; ++ib) {
        float d = GGML_FP16_TO_FP32(x[ib].d);
        if (isnan(d)) ++nbad;
    }
    return nbad;
}
template <typename Block>
bool check_tensor_for_blocks_256_fp16(const ggml_tensor * tensor) {
    int nblock = tensor->ne[0]/QK_K;
    int nbad = 0;
    for (int row = 0; row < ggml_nrows(tensor); ++row) {
        auto x = (const Block *)((const char *)tensor->data + tensor->nb[1]*row);
        nbad += check_row_for_blocks_256_fp16(nblock, x);
    }
    if (nbad > 0) {
        fprintf(stderr, "%s: found %d NaN block scales out of %ld blocks in tensor %s\n", __func__,
                nbad, ggml_nrows(tensor)*nblock, tensor->name);
        if (tensor->ne[2] > 1) {
            int nb = tensor->ne[0]/QK_K;
            for (int64_t i02 = 0; i02 < tensor->ne[2]; ++i02) {
                int nbad_expert = 0;
                auto xex = (const char *)tensor->data + i02*tensor->nb[2];
                for (int64_t i01 = 0; i01 < tensor->ne[1]; ++i01) {
                    auto xr = (const Block *)(xex + i01*tensor->nb[1]);
                    nbad_expert += check_row_for_blocks_256_fp16(nb, xr);
                }
                if (nbad_expert > 0) fprintf(stderr, "  there are %d NaN block scales for expert %ld\n", nbad_expert, i02);
            }
        }
        return false;
    }
    return true;
}
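// A minimal sketch of the block shape the template above assumes. The name is
// hypothetical (the real block_iq2_k, block_iq4_xs, ... are defined in the
// project's headers), but all check_row_for_blocks_256_fp16 requires is an
// fp16 member `d` holding the super-block scale for QK_K = 256 weights:
struct block_example {
    ggml_half d;          // fp16 super-block scale; the value the check inspects
    uint8_t   qs[QK_K/2]; // packed quants; layout varies per type, never inspected here
};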
template <typename Block>
inline int check_row_for_blocks_256_fp16(int nblock, const Block * x, int nr) {
    int nbad = 0;
    for (int ib = 0; ib < nblock; ++ib) {
        for (int j = 0; j < nr; ++j) {
            if (!isfinite(GGML_FP16_TO_FP32(x[ib].d[j]))) ++nbad;
        }
    }
    return nbad;
}
template <typename Block, int nr>
bool check_tensor_for_blocks_256_fp16_repacked(const ggml_tensor * tensor) {
    int nblock = tensor->ne[0]/QK_K;
    int nbad = 0;
    for (int row = 0; row < ggml_nrows(tensor); row += nr) {
        auto x = (const Block *)((const char *)tensor->data + tensor->nb[1]*row);
        nbad += check_row_for_blocks_256_fp16(nblock, x, nr);
    }
    if (nbad > 0) {
        fprintf(stderr, "%s: found %d NaN block scales out of %ld blocks in tensor %s\n", __func__,
                nbad, ggml_nrows(tensor)*nblock, tensor->name);
        if (tensor->ne[2] > 1) {
            int nb = tensor->ne[0]/QK_K;
            for (int64_t i02 = 0; i02 < tensor->ne[2]; ++i02) {
                int nbad_expert = 0;
                auto xex = (const char *)tensor->data + i02*tensor->nb[2];
                for (int64_t i01 = 0; i01 < tensor->ne[1]; i01 += nr) {
                    auto xr = (const Block *)(xex + i01*tensor->nb[1]);
                    nbad_expert += check_row_for_blocks_256_fp16(nb, xr, nr);
                }
                if (nbad_expert > 0) fprintf(stderr, "  there are %d NaN block scales for expert %ld\n", nbad_expert, i02);
            }
        }
        return false;
    }
    return true;
}
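// The repacked ("_r4"/"_r8") types interleave nr rows into each block, which
// is why the scale becomes an array and the loops above step by nr rows at a
// time. A hypothetical sketch of the assumed shape (the real block_iq4_k_r4
// and friends are defined elsewhere in the project):
template <int nr>
struct block_example_rx {
    ggml_half d[nr];         // one fp16 scale per interleaved row
    uint8_t   qs[nr*QK_K/2]; // quants of nr rows, interleaved; not inspected here
};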
struct F32Scale {
    static inline int check_row(const char * data) {
        float d = *(const float *)data;
        return isfinite(d) ? 0 : 1;
    }
};
struct F16Scale {
    static inline int check_row(const char * data) {
        float d = GGML_FP16_TO_FP32(*(const ggml_half *)data);
        return isfinite(d) ? 0 : 1;
    }
};
template <int nr>
struct F32ScaleRX {
    static inline int check_row(const char * data) {
        auto d = (const float *)data;
        int nbad = 0;
        for (int i = 0; i < nr; ++i) {
            if (!isfinite(d[i])) ++nbad;
        }
        return nbad;
    }
};
template <int nr>
struct F16ScaleRX {
    static inline int check_row(const char * data) {
        auto d = (const ggml_half *)data;
        int nbad = 0;
        for (int i = 0; i < nr; ++i) {
            if (!isfinite(GGML_FP16_TO_FP32(d[i]))) ++nbad;
        }
        return nbad;
    }
};
template <typename RS>
bool check_tensor_row_scales(const ggml_tensor * tensor) {
    auto row_size = ggml_row_size(tensor->type, tensor->ne[0]);
    int num_rows = ggml_nrows(tensor);
    auto data = (const char *)tensor->data;
    int nbad = 0;
    for (int row = 0; row < num_rows; ++row) {
        nbad += RS::check_row(data);
        data += row_size;
    }
    if (nbad > 0) {
        fprintf(stderr, "%s: found %d NaN row scales out of %d rows in tensor %s\n", __func__,
                nbad, num_rows, tensor->name);
        return false;
    }
    return true;
}
}

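// Note on the row-scale types (IQ4_KS, IQ2_BN, ...): instead of one fp16 scale
// per 256-weight block they keep a single f32 or f16 scale per row, stored at
// the start of the row; the Scale/ScaleRX policies above read exactly that
// leading value, which is the layout check_tensor_row_scales assumes.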
bool iqk_validate_tensor(const ggml_tensor * tensor) {
    if (!tensor) return true;
    if (!ggml_is_contiguous(tensor)) return true;

    switch (tensor->type) {
        case GGML_TYPE_IQ2_K:      return check_tensor_for_blocks_256_fp16<block_iq2_k>(tensor);
        case GGML_TYPE_IQ3_K:      return check_tensor_for_blocks_256_fp16<block_iq3_k>(tensor);
        case GGML_TYPE_IQ4_K:      return check_tensor_for_blocks_256_fp16<block_iq4_k>(tensor);
        case GGML_TYPE_IQ5_K:      return check_tensor_for_blocks_256_fp16<block_iq5_k>(tensor);
        case GGML_TYPE_IQ6_K:      return check_tensor_for_blocks_256_fp16<block_iq6_k>(tensor);
        case GGML_TYPE_IQ2_XXS:    return check_tensor_for_blocks_256_fp16<block_iq2_xxs>(tensor);
        case GGML_TYPE_IQ2_XS:     return check_tensor_for_blocks_256_fp16<block_iq2_xs>(tensor);
        case GGML_TYPE_IQ2_S:      return check_tensor_for_blocks_256_fp16<block_iq2_s>(tensor);
        case GGML_TYPE_IQ3_XXS:    return check_tensor_for_blocks_256_fp16<block_iq3_xxs>(tensor);
        case GGML_TYPE_IQ3_S:      return check_tensor_for_blocks_256_fp16<block_iq3_s>(tensor);
        case GGML_TYPE_IQ4_XS:     return check_tensor_for_blocks_256_fp16<block_iq4_xs>(tensor);
        case GGML_TYPE_IQ2_K_R4:   return check_tensor_for_blocks_256_fp16_repacked<block_iq2_k_r4, 4>(tensor);
        case GGML_TYPE_IQ3_K_R4:   return check_tensor_for_blocks_256_fp16_repacked<block_iq3_k_r4, 4>(tensor);
        case GGML_TYPE_IQ4_K_R4:   return check_tensor_for_blocks_256_fp16_repacked<block_iq4_k_r4, 4>(tensor);
        case GGML_TYPE_IQ5_K_R4:   return check_tensor_for_blocks_256_fp16_repacked<block_iq5_k_r4, 4>(tensor);
        case GGML_TYPE_IQ2_XXS_R4: return check_tensor_for_blocks_256_fp16_repacked<block_iq2_xxs_r4, 4>(tensor);
        case GGML_TYPE_IQ2_XS_R4:  return check_tensor_for_blocks_256_fp16_repacked<block_iq2_xs_r4, 4>(tensor);
        case GGML_TYPE_IQ2_S_R4:   return check_tensor_for_blocks_256_fp16_repacked<block_iq2_s_r4, 4>(tensor);
        case GGML_TYPE_IQ3_XXS_R4: return check_tensor_for_blocks_256_fp16_repacked<block_iq3_xxs_r4, 4>(tensor);
        case GGML_TYPE_IQ3_S_R4:   return check_tensor_for_blocks_256_fp16_repacked<block_iq3_s_r4, 4>(tensor);
        case GGML_TYPE_IQ4_XS_R8:  return check_tensor_for_blocks_256_fp16_repacked<block_iq4_xs_r8, 8>(tensor);
        case GGML_TYPE_IQ2_BN:
        case GGML_TYPE_IQ4_KSS:
        case GGML_TYPE_IQ4_KS:
        case GGML_TYPE_IQ5_KS:     return check_tensor_row_scales<F32Scale>(tensor);
        case GGML_TYPE_IQ2_BN_R4:
        case GGML_TYPE_IQ4_KS_R4:
        case GGML_TYPE_IQ5_KS_R4:  return check_tensor_row_scales<F32ScaleRX<4>>(tensor);
        case GGML_TYPE_IQ1_BN:
        case GGML_TYPE_IQ2_KS:
        case GGML_TYPE_IQ2_KL:
        case GGML_TYPE_IQ3_KS:     return check_tensor_row_scales<F16Scale>(tensor);
        case GGML_TYPE_IQ1_S_R4:
        case GGML_TYPE_IQ1_M_R4:   return check_tensor_row_scales<F16ScaleRX<4>>(tensor);

        default: break;
    }
    return true;
}
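// A hedged sketch of how a loader could drive the validator. This call site is
// an illustration, not part of the commit; only iqk_validate_tensor and the
// ggml types are real. Because the function returns true for tensor types it
// has no check for, it is safe to run over every weight unconditionally:
static bool validate_model_weights(ggml_tensor ** weights, int n_weights) {
    bool ok = true;
    for (int i = 0; i < n_weights; ++i) {
        if (!iqk_validate_tensor(weights[i])) ok = false; // keep scanning so every bad tensor gets reported
    }
    return ok;
}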
@@ -326,6 +326,8 @@ void iqk_quantize_any(int from_type, int to_type,
                       const void * GGML_RESTRICT x, void * GGML_RESTRICT y, void * work_buffer,
                       to_float_t to_float, from_float_t from_float, int ith, int nth);

bool iqk_validate_tensor(const struct ggml_tensor * src);

#ifdef __cplusplus
}
#endif