Add command-line option to validate quants

This commit is contained in:
Iwan Kawrakow
2025-08-27 08:31:59 +03:00
parent 3add753ed7
commit c04b918a01
4 changed files with 13 additions and 3 deletions

View File

@@ -1166,6 +1166,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.use_thp = true;
return true;
}
if (arg == "-vq" || arg == "--validate-quants") {
params.validate_quants = true;
return true;
}
if (arg == "--numa") {
CHECK_ARG
std::string value(argv[i]);
@@ -2571,6 +2575,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
mparams.check_tensors = params.check_tensors;
mparams.repack_tensors = params.repack_tensors;
mparams.use_thp = params.use_thp;
mparams.validate_quants = params.validate_quants;
if (params.kv_overrides.empty()) {
mparams.kv_overrides = NULL;
} else {
@@ -3719,6 +3724,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
fprintf(stream, "repack: %s # default: false\n", params.repack_tensors ? "true" : "false");
fprintf(stream, "use_thp: %s # default: false\n", params.use_thp ? "true" : "false");
fprintf(stream, "validate_quants: %s # default: false\n", params.validate_quants ? "true" : "false");
fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);

View File

@@ -209,6 +209,7 @@ struct gpt_params {
bool check_tensors = false; // validate tensor data
bool repack_tensors = false; // repack tensors if interleaved variant is available
bool use_thp = false; // use transparent huge pages (linux only)
bool validate_quants = false; // if true, check for NaNs while loading the model
std::string cache_type_k = "f16"; // KV cache data type for the K
std::string cache_type_v = "f16"; // KV cache data type for the V

View File

@@ -377,7 +377,8 @@ extern "C" {
bool use_mlock; // force system to keep model in RAM
bool check_tensors; // validate model tensor data
bool repack_tensors;// repack if available
bool use_thp; // uase transparent huge pages (linux only)
bool use_thp; // use transparent huge pages (linux only)
bool validate_quants; // if true, check for NaNs while loading the model
};
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations

View File

@@ -4751,6 +4751,7 @@ static bool llm_load_tensors(
int main_gpu,
const float * tensor_split,
bool use_mlock,
bool validate_quants,
llama_progress_callback progress_callback,
void * progress_callback_user_data) {
model.t_start_us = ggml_time_us();
@@ -7261,7 +7262,7 @@ static bool llm_load_tensors(
if (n_modified > 0) printf("============ Modified %d tensors\n", n_modified);
}
if (true) {
if (validate_quants) {
int nbad = 0;
for (auto& it : model.tensors_by_name) {
if (ggml_backend_buffer_is_host(it.second->buffer)) {
@@ -7374,7 +7375,8 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
#endif
if (!llm_load_tensors(
ml, model, params.n_gpu_layers, params.mla, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
ml, model, params.n_gpu_layers, params.mla, params.split_mode, params.main_gpu, params.tensor_split,
params.use_mlock, params.validate_quants,
params.progress_callback, params.progress_callback_user_data
)) {
return -2;