Mirror of https://github.com/ikawrakow/ik_llama.cpp.git
Add command line option to validate quants
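In short: this commit adds a `-vq` / `--validate-quants` command line flag, threads the new `validate_quants` setting from `gpt_params` through `llama_model_params` into `llm_load_tensors`, and uses it to gate the existing scan that counts bad (NaN) values in host tensors while the model loads. Hedged sketches of the check and of C API usage follow the relevant hunks below.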
@@ -1166,6 +1166,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.use_thp = true;
         return true;
     }
+    if (arg == "-vq" || arg == "--validate-quants") {
+        params.validate_quants = true;
+        return true;
+    }
     if (arg == "--numa") {
         CHECK_ARG
         std::string value(argv[i]);
@@ -2571,6 +2575,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     mparams.check_tensors = params.check_tensors;
     mparams.repack_tensors = params.repack_tensors;
     mparams.use_thp = params.use_thp;
+    mparams.validate_quants = params.validate_quants;
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
     } else {
@@ -3719,6 +3724,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
     fprintf(stream, "repack: %s # default: false\n", params.repack_tensors ? "true" : "false");
     fprintf(stream, "use_thp: %s # default: false\n", params.use_thp ? "true" : "false");
+    fprintf(stream, "validate_quants: %s # default: false\n", params.validate_quants ? "true" : "false");
     fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
     fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
     fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
@@ -209,6 +209,7 @@ struct gpt_params {
     bool check_tensors = false; // validate tensor data
     bool repack_tensors = false; // repack tensors if interleaved variant is available
     bool use_thp = false; // use transparent huge pages (linux only)
+    bool validate_quants = false; // if true, check for NaNs while loading the model
 
     std::string cache_type_k = "f16"; // KV cache data type for the K
     std::string cache_type_v = "f16"; // KV cache data type for the V
@@ -377,7 +377,8 @@ extern "C" {
         bool use_mlock; // force system to keep model in RAM
         bool check_tensors; // validate model tensor data
         bool repack_tensors;// repack if available
-        bool use_thp; // uase transparent huge pages (linux only)
+        bool use_thp; // use transparent huge pages (linux only)
+        bool validate_quants; // if true, check for NaNs while loading the model
     };
 
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
@@ -4751,6 +4751,7 @@ static bool llm_load_tensors(
         int main_gpu,
         const float * tensor_split,
         bool use_mlock,
+        bool validate_quants,
         llama_progress_callback progress_callback,
         void * progress_callback_user_data) {
     model.t_start_us = ggml_time_us();
@@ -7261,7 +7262,7 @@ static bool llm_load_tensors(
         if (n_modified > 0) printf("============ Modified %d tensors\n", n_modified);
     }
 
-    if (true) {
+    if (validate_quants) {
         int nbad = 0;
         for (auto& it : model.tensors_by_name) {
             if (ggml_backend_buffer_is_host(it.second->buffer)) {
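The loop body after `ggml_backend_buffer_is_host(...)` is cut off in this hunk, so what follows is only a minimal, self-contained sketch of the kind of finiteness check the `validate_quants` gate enables. `count_bad_values` is a hypothetical helper, not code from this commit; in the real loop the data would come from the model's tensors (dequantized first for quantized types) rather than a hand-built vector.

// Sketch only: the finiteness scan behind "check for NaNs while loading".
#include <cmath>
#include <cstdio>
#include <vector>

// Hypothetical helper (not part of the diff): count non-finite values
// (NaN or +/-Inf) in a row of dequantized f32 data.
static int count_bad_values(const float * data, size_t n) {
    int nbad = 0;
    for (size_t i = 0; i < n; ++i) {
        if (!std::isfinite(data[i])) ++nbad;
    }
    return nbad;
}

int main() {
    // Stand-in for one dequantized tensor row; two values are bad.
    std::vector<float> row = {1.0f, 2.0f, NAN, 0.5f, INFINITY};
    int nbad = count_bad_values(row.data(), row.size());
    if (nbad > 0) printf("found %d bad values\n", nbad);
    return nbad > 0 ? 1 : 0;
}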
@@ -7374,7 +7375,8 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
 #endif
 
     if (!llm_load_tensors(
-            ml, model, params.n_gpu_layers, params.mla, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
+            ml, model, params.n_gpu_layers, params.mla, params.split_mode, params.main_gpu, params.tensor_split,
+            params.use_mlock, params.validate_quants,
             params.progress_callback, params.progress_callback_user_data
         )) {
             return -2;
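Because `validate_quants` is now a field of `llama_model_params`, the check can also be enabled from the C API rather than the CLI. A hedged usage sketch, assuming only what this diff shows (the new field) plus the long-standing `llama_model_default_params` / `llama_load_model_from_file` / `llama_free_model` entry points; whether bad quants actually abort the load is not shown in the truncated hunk above:

#include "llama.h"
#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s model.gguf\n", argv[0]);
        return 1;
    }
    llama_model_params mparams = llama_model_default_params();
    mparams.validate_quants = true; // field added by this commit:
                                    // scan host tensors for NaNs during load
    llama_model * model = llama_load_model_from_file(argv[1], mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load %s\n", argv[1]);
        return 1;
    }
    // ... use the model ...
    llama_free_model(model);
    return 0;
}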