From e760b4dc41bee8abb0ee1ef2cb04e4ad89044651 Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Wed, 27 Aug 2025 19:00:17 +0300
Subject: [PATCH] Check for NaNs while loading the model. (#727)

* Check for NaNs while loading the model.

* Also tell which experts have NaNs.

* Add command line option to validate quants

* Add checks for more quantization types

* Add checks for more quantization types

---------

Co-authored-by: Iwan Kawrakow
---
 common/common.cpp             |   6 ++
 common/common.h               |   1 +
 ggml/src/iqk/iqk_quantize.cpp | 172 ++++++++++++++++++++++++++++++++++
 ggml/src/iqk/iqk_quantize.h   |   2 +
 include/llama.h               |   3 +-
 src/llama.cpp                 |  17 +++-
 6 files changed, 199 insertions(+), 2 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index c1e94323..459e7c27 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1166,6 +1166,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.use_thp = true;
         return true;
     }
+    if (arg == "-vq" || arg == "--validate-quants") {
+        params.validate_quants = true;
+        return true;
+    }
     if (arg == "--numa") {
         CHECK_ARG
         std::string value(argv[i]);
@@ -2571,6 +2575,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     mparams.check_tensors = params.check_tensors;
     mparams.repack_tensors = params.repack_tensors;
     mparams.use_thp = params.use_thp;
+    mparams.validate_quants = params.validate_quants;
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
     } else {
@@ -3719,6 +3724,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
     fprintf(stream, "repack: %s # default: false\n", params.repack_tensors ? "true" : "false");
     fprintf(stream, "use_thp: %s # default: false\n", params.use_thp ? "true" : "false");
+    fprintf(stream, "validate_quants: %s # default: false\n", params.validate_quants ? "true" : "false");
     fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
     fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
     fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
diff --git a/common/common.h b/common/common.h
index 1bf0f235..fb15c77f 100644
--- a/common/common.h
+++ b/common/common.h
@@ -209,6 +209,7 @@ struct gpt_params {
     bool check_tensors = false; // validate tensor data
     bool repack_tensors = false; // repack tensors if interleaved variant is available
     bool use_thp = false; // use transparent huge pages (linux only)
+    bool validate_quants = false; // if true, check for NaNs while loading the model
 
     std::string cache_type_k = "f16"; // KV cache data type for the K
     std::string cache_type_v = "f16"; // KV cache data type for the V
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index 22c57d06..531f2877 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -9868,3 +9868,175 @@ void vec_dot_iq4_kt_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx
 
 #endif
 }
+
+namespace {
+template <typename Block>
+inline int check_row_for_blocks_256_fp16(int nblock, const Block * x) {
+    int nbad = 0;
+    for (int ib = 0; ib < nblock; ++ib) {
+        float d = GGML_FP16_TO_FP32(x[ib].d);
+        if (isnan(d)) ++nbad;
+    }
+    return nbad;
+}
+template <typename Block>
+bool check_tensor_for_blocks_256_fp16(const ggml_tensor * tensor) {
+    int nblock = tensor->ne[0]/QK_K;
+    int nbad = 0;
+    for (int row = 0; row < ggml_nrows(tensor); ++row) {
+        auto x = (const Block *)((const char *)tensor->data + tensor->nb[1]*row);
+        nbad += check_row_for_blocks_256_fp16(nblock, x);
+    }
+    if (nbad > 0) {
+        fprintf(stderr, "%s: found %d NaN block scales out of %ld blocks in tensor %s\n", __func__,
+                nbad, ggml_nrows(tensor)*nblock, tensor->name);
+        if (tensor->ne[2] > 1) {
+            int nb = tensor->ne[0]/QK_K;
+            for (int64_t i02 = 0; i02 < tensor->ne[2]; ++i02) {
+                int nbad_expert = 0;
+                auto xex = (const char *)((const char *)tensor->data + i02*tensor->nb[2]);
+                for (int64_t i01 = 0; i01 < tensor->ne[1]; ++i01) {
+                    auto xr = (const Block *)(xex + i01*tensor->nb[1]);
+                    nbad_expert += check_row_for_blocks_256_fp16(nb, xr);
+                }
+                if (nbad_expert > 0) fprintf(stderr," there are %d NaN block scales for expert %ld\n", nbad_expert, i02);
+            }
+        }
+        return false;
+    }
+    return true;
+}
+template <typename Block>
+inline int check_row_for_blocks_256_fp16(int nblock, const Block * x, int nr) {
+    int nbad = 0;
+    for (int ib = 0; ib < nblock; ++ib) {
+        for (int j = 0; j < nr; ++j) {
+            if (!isfinite(GGML_FP16_TO_FP32(x[ib].d[j]))) ++nbad;
+        }
+    }
+    return nbad;
+}
+template <typename Block, int nr>
+bool check_tensor_for_blocks_256_fp16_repacked(const ggml_tensor * tensor) {
+    int nblock = tensor->ne[0]/QK_K;
+    int nbad = 0;
+    for (int row = 0; row < ggml_nrows(tensor); row += nr) {
+        auto x = (const Block *)((const char *)tensor->data + tensor->nb[1]*row);
+        nbad += check_row_for_blocks_256_fp16(nblock, x, nr);
+    }
+    if (nbad > 0) {
+        fprintf(stderr, "%s: found %d NaN block scales out of %ld blocks in tensor %s\n", __func__,
+                nbad, ggml_nrows(tensor)*nblock, tensor->name);
+        if (tensor->ne[2] > 1) {
+            int nb = tensor->ne[0]/QK_K;
+            for (int64_t i02 = 0; i02 < tensor->ne[2]; ++i02) {
+                int nbad_expert = 0;
+                auto xex = (const char *)((const char *)tensor->data + i02*tensor->nb[2]);
+                for (int64_t i01 = 0; i01 < tensor->ne[1]; i01 += nr) {
+                    auto xr = (const Block *)(xex + i01*tensor->nb[1]);
+                    nbad_expert += check_row_for_blocks_256_fp16(nb, xr, nr);
+                }
+                if (nbad_expert > 0) fprintf(stderr," there are %d NaN block scales for expert %ld\n", nbad_expert, i02);
+            }
+        }
return false; + } + return true; +} +struct F32Scale { + static inline int check_row(const char * data) { + float d = *(const float *)data; + return isfinite(d) ? 0 : 1; + } +}; +struct F16Scale { + static inline int check_row(const char * data) { + float d = GGML_FP16_TO_FP32(*(const ggml_half *)data); + return isfinite(d) ? 0 : 1; + } +}; +template +struct F32ScaleRX { + static inline int check_row(const char * data) { + auto d = (const float *)data; + int nbad = 0; + for (int i = 0; i < nr; ++i) { + if (!isfinite(d[i])) ++nbad; + } + return nbad; + } +}; +template +struct F16ScaleRX { + static inline int check_row(const char * data) { + auto d = (const ggml_half *)data; + int nbad = 0; + for (int i = 0; i < nr; ++i) { + if (!isfinite(GGML_FP16_TO_FP32(d[i]))) ++nbad; + } + return nbad; + } +}; +template +bool check_tensor_row_scales(const ggml_tensor * tensor) { + auto row_size = ggml_row_size(tensor->type, tensor->ne[0]); + int num_rows = ggml_nrows(tensor); + auto data = (const char *)tensor->data; + int nbad = 0; + for (int row = 0; row < num_rows; ++row) { + nbad += RS::check_row(data); + data += row_size; + } + if (nbad > 0) { + fprintf(stderr, "%s: found %d NaN row scales out of %d rows in tensor %s\n", __func__, + nbad, num_rows, tensor->name); + return false; + } + return true; +} +} + +bool iqk_validate_tensor(const ggml_tensor * tensor) { + if (!tensor) return true; + if (!ggml_is_contiguous(tensor)) return true; + + switch (tensor->type) { + case GGML_TYPE_IQ2_K: return check_tensor_for_blocks_256_fp16(tensor); + case GGML_TYPE_IQ3_K: return check_tensor_for_blocks_256_fp16(tensor); + case GGML_TYPE_IQ4_K: return check_tensor_for_blocks_256_fp16(tensor); + case GGML_TYPE_IQ5_K: return check_tensor_for_blocks_256_fp16(tensor); + case GGML_TYPE_IQ6_K: return check_tensor_for_blocks_256_fp16(tensor); + case GGML_TYPE_IQ2_XXS: return check_tensor_for_blocks_256_fp16(tensor); + case GGML_TYPE_IQ2_XS: return check_tensor_for_blocks_256_fp16(tensor); + case GGML_TYPE_IQ2_S: return check_tensor_for_blocks_256_fp16(tensor); + case GGML_TYPE_IQ3_XXS: return check_tensor_for_blocks_256_fp16(tensor); + case GGML_TYPE_IQ3_S: return check_tensor_for_blocks_256_fp16(tensor); + case GGML_TYPE_IQ4_XS: return check_tensor_for_blocks_256_fp16(tensor); + case GGML_TYPE_IQ2_K_R4: return check_tensor_for_blocks_256_fp16_repacked(tensor); + case GGML_TYPE_IQ3_K_R4: return check_tensor_for_blocks_256_fp16_repacked(tensor); + case GGML_TYPE_IQ4_K_R4: return check_tensor_for_blocks_256_fp16_repacked(tensor); + case GGML_TYPE_IQ5_K_R4: return check_tensor_for_blocks_256_fp16_repacked(tensor); + case GGML_TYPE_IQ2_XXS_R4: return check_tensor_for_blocks_256_fp16_repacked(tensor); + case GGML_TYPE_IQ2_XS_R4: return check_tensor_for_blocks_256_fp16_repacked(tensor); + case GGML_TYPE_IQ2_S_R4: return check_tensor_for_blocks_256_fp16_repacked(tensor); + case GGML_TYPE_IQ3_XXS_R4: return check_tensor_for_blocks_256_fp16_repacked(tensor); + case GGML_TYPE_IQ3_S_R4: return check_tensor_for_blocks_256_fp16_repacked(tensor); + case GGML_TYPE_IQ4_XS_R8: return check_tensor_for_blocks_256_fp16_repacked(tensor); + case GGML_TYPE_IQ2_BN: + case GGML_TYPE_IQ4_KSS: + case GGML_TYPE_IQ4_KS: + case GGML_TYPE_IQ5_KS: return check_tensor_row_scales(tensor); + case GGML_TYPE_IQ2_BN_R4: + case GGML_TYPE_IQ4_KS_R4: + case GGML_TYPE_IQ5_KS_R4: return check_tensor_row_scales>(tensor); + case GGML_TYPE_IQ1_BN: + case GGML_TYPE_IQ2_KS: + case GGML_TYPE_IQ2_KL: + case GGML_TYPE_IQ3_KS: return check_tensor_row_scales(tensor); + 
+        case GGML_TYPE_IQ1_S_R4:
+        case GGML_TYPE_IQ1_M_R4: return check_tensor_row_scales<F16ScaleRX<4>>(tensor);
+
+        default: break;
+    }
+    return true;
+}
diff --git a/ggml/src/iqk/iqk_quantize.h b/ggml/src/iqk/iqk_quantize.h
index b512c9dd..5f062261 100644
--- a/ggml/src/iqk/iqk_quantize.h
+++ b/ggml/src/iqk/iqk_quantize.h
@@ -326,6 +326,8 @@ void iqk_quantize_any(int from_type, int to_type,
                       const void * GGML_RESTRICT x, void * GGML_RESTRICT y, void * work_buffer,
                       to_float_t to_float, from_float_t from_float, int ith, int nth);
 
+bool iqk_validate_tensor(const struct ggml_tensor * src);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/llama.h b/include/llama.h
index ffbd9c94..a5939769 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -377,7 +377,8 @@ extern "C" {
         bool use_mlock; // force system to keep model in RAM
         bool check_tensors; // validate model tensor data
         bool repack_tensors;// repack if available
-        bool use_thp; // uase transparent huge pages (linux only)
+        bool use_thp; // use transparent huge pages (linux only)
+        bool validate_quants; // if true, check for NaNs while loading the model
     };
 
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
diff --git a/src/llama.cpp b/src/llama.cpp
index f74345a5..9a6ef293 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4751,6 +4751,7 @@ static bool llm_load_tensors(
         int main_gpu,
         const float * tensor_split,
         bool use_mlock,
+        bool validate_quants,
         llama_progress_callback progress_callback,
         void * progress_callback_user_data) {
     model.t_start_us = ggml_time_us();
@@ -7261,6 +7262,19 @@ static bool llm_load_tensors(
        if (n_modified > 0) printf("============ Modified %d tensors\n", n_modified);
    }
 
+    if (validate_quants) {
+        int nbad = 0;
+        for (auto& it : model.tensors_by_name) {
+            if (ggml_backend_buffer_is_host(it.second->buffer)) {
+                if (!iqk_validate_tensor(it.second)) ++nbad;
+            }
+        }
+        if (nbad > 0) {
+            LLAMA_LOG_ERROR("Found %d bad tensors in model\n", nbad);
+            throw std::runtime_error("Bad tensors in model");
+        }
+    }
+
     if (!ml.use_mmap && ml.repack_tensors) {
         int n_repacked = 0;
         for (auto& it : model.tensors_by_name) {
@@ -7361,7 +7375,8 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
 #endif
 
         if (!llm_load_tensors(
-            ml, model, params.n_gpu_layers, params.mla, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
+            ml, model, params.n_gpu_layers, params.mla, params.split_mode, params.main_gpu, params.tensor_split,
+            params.use_mlock, params.validate_quants,
             params.progress_callback, params.progress_callback_user_data
         )) {
             return -2;
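
Usage sketch: besides the new -vq / --validate-quants command line flag, the check can be enabled programmatically by setting validate_quants in llama_model_params before loading a model. The snippet below is a minimal sketch, assuming this fork keeps upstream llama.cpp's llama_backend_init / llama_model_default_params / llama_load_model_from_file entry points; "model.gguf" is a placeholder path.

    // validate_quants_demo.cpp -- sketch of enabling the NaN check on load
    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        mparams.validate_quants = true; // field added by this patch; only host-buffer tensors are scanned

        // If any block/row scale is NaN, llm_load_tensors throws "Bad tensors in model",
        // which the loader catches, so a NULL model is returned here.
        llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (!model) {
            fprintf(stderr, "load failed (or quant validation found NaNs)\n");
            llama_backend_free();
            return 1;
        }

        llama_free_model(model);
        llama_backend_free();
        return 0;
    }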