From 170467e8358799bde1cb454bcbf9ddd8ac4e87c1 Mon Sep 17 00:00:00 2001 From: Nexes the Elder <124105151+Nexesenex@users.noreply.github.com> Date: Wed, 25 Feb 2026 07:25:15 +0100 Subject: [PATCH] Llama-quantize: Partial requant feature (#1313) * Partial Requant feature for llama-quantize - Inspired by the recently ported --dry-run feature. - Allows partially requantizing a split quantized .gguf by requantizing only the missing splits in the destination directory. - Works both for GGUF files which are split tensor by tensor, or by groups of several tensors (though the latter is not much tested beyond 2 tensors per split). - Vibe coded. * Create output directory if it doesn't exist in llama-quantize * Create output directory if it doesn't exist in gguf-split * Add exit when directory fails to be created on Windows * Use std::filesystem * cleanup --- examples/gguf-split/gguf-split.cpp | 17 +++++++++++ examples/quantize/quantize.cpp | 5 +++- include/llama.h | 1 + src/llama-quantize.cpp | 47 +++++++++++++++++++++++++++++- src/llama.cpp | 1 + 5 files changed, 69 insertions(+), 2 deletions(-) diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index 7e62657e..ecb7fab6 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -190,6 +191,18 @@ static void zeros(std::ofstream & file, size_t n) { } } +static void ensure_output_directory(const std::string & filepath) { + std::filesystem::path p(filepath); + if (p.has_parent_path()) { + std::error_code ec; + std::filesystem::create_directories(p.parent_path(), ec); + if (ec) { + fprintf(stderr, "Failed to create directory '%s': %s\n", p.parent_path().string().c_str(), ec.message().c_str()); + exit(EXIT_FAILURE); + } + } +} + struct split_strategy { const split_params params; std::ifstream & f_input; @@ -310,6 +323,8 @@ struct split_strategy { char split_path[PATH_MAX] = {0}; 
llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split); + ensure_output_directory(split_path); + // open the output file printf("Writing file %s ... ", split_path); fflush(stdout); @@ -401,6 +416,8 @@ static void gguf_merge(const split_params & split_params) { int n_split = 1; int total_tensors = 0; + ensure_output_directory(split_params.output); + // avoid overwriting existing output file if (std::ifstream(split_params.output.c_str())) { fprintf(stderr, "%s: output file %s already exists\n", __func__, split_params.output.c_str()); diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 0aaaa421..bca2c45e 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -151,7 +151,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp // [[noreturn]] static void usage(const char * executable) { - printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--hide-imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--ffn-gate-inp-type] [--attn-q-type] [--attn-k-type] [--attn-v-type] [--attn-qkv-type] [--attn-output-type] [--ffn-gate-type] [--ffn-down-type] [--ffn-up-type] [--keep-split] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); + printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--hide-imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--ffn-gate-inp-type] [--attn-q-type] [--attn-k-type] [--attn-v-type] [--attn-qkv-type] [--attn-output-type] [--ffn-gate-type] [--ffn-down-type] [--ffn-up-type] [--keep-split] [--partial-requant] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. 
Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n"); @@ -175,6 +175,7 @@ static void usage(const char * executable) { printf(" --ffn-down-type ggml_type: use this ggml_type for the ffn_down tensor.\n"); printf(" --ffn-up-type ggml_type: use this ggml_type for the ffn_up tensor.\n\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); + printf(" --partial-requant: quantize only missing split files in the split quantized .gguf destination directory\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n\n"); printf("Note: --include-weights and --exclude-weights cannot be used together\n"); @@ -466,6 +467,8 @@ int main(int argc, char ** argv) { } } else if (strcmp(argv[arg_idx], "--keep-split") == 0) { params.keep_split = true; + } else if (strcmp(argv[arg_idx], "--partial-requant") == 0) { + params.partial_requant = true; } else { usage(argv[0]); } diff --git a/include/llama.h b/include/llama.h index 104f5d40..d73440fa 100644 --- a/include/llama.h +++ b/include/llama.h @@ -491,6 +491,7 @@ extern "C" { bool ignore_imatrix_rules; // If set to true, the built-in rules for refusing to quantize into certain quants without imatrix are ignored bool only_repack; // Only repack tensors bool dry_run; // + bool partial_requant; // quantize only missing split files in the split quantized .gguf destination directory void * imatrix; // pointer to importance matrix data void * kv_overrides; // pointer to vector containing overrides void * custom_quants; // pointer to vector containing custom quantization rules diff --git a/src/llama-quantize.cpp 
b/src/llama-quantize.cpp index ee298ec7..68abb7c7 100644 --- a/src/llama-quantize.cpp +++ b/src/llama-quantize.cpp @@ -11,6 +11,7 @@ #include #include #include +#include // // quantization @@ -39,6 +40,18 @@ static void zeros(std::ofstream & file, size_t n) { } } +static void ensure_output_directory(const std::string & filepath) { + std::filesystem::path p(filepath); + if (p.has_parent_path()) { + std::error_code ec; + std::filesystem::create_directories(p.parent_path(), ec); + if (ec) { + fprintf(stderr, "Failed to create directory '%s': %s\n", p.parent_path().string().c_str(), ec.message().c_str()); + exit(EXIT_FAILURE); + } + } +} + struct quantize_state_internal { const llama_model & model; const llama_model_quantize_params * params; @@ -1039,8 +1052,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } const size_t align = GGUF_DEFAULT_ALIGNMENT; + + ensure_output_directory(fname_out); + struct gguf_context * ctx_out = gguf_init_empty(); + // Early exit if partial_requant is enabled and output file already exists + if (params->partial_requant && !params->keep_split) { + std::ifstream test_file(fname_out); + if (test_file) { + LLAMA_LOG_INFO("%s: output file %s exists, skipping\n", __func__, fname_out.c_str()); + gguf_free(ctx_out); + return; + } + } + // copy the KV pairs from the input file gguf_set_kv (ctx_out, ml.meta); gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV @@ -1179,6 +1205,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s int cur_split = -1; std::ofstream fout; + std::vector split_skipped(n_split, false); auto close_ofstream = [&]() { // Write metadata and close file handler if (fout.is_open()) { @@ -1202,6 +1229,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s fname = std::string(split_path); } + if (params->partial_requant) { + std::ifstream test_file(fname); + if (test_file) { + 
LLAMA_LOG_INFO("%s: split file %s exists, skipping\n", __func__, fname.c_str()); + split_skipped[cur_split] = true; + fout = std::ofstream(); + return; + } + } + + ensure_output_directory(fname); fout = std::ofstream(fname, std::ios::binary); fout.exceptions(std::ofstream::failbit); // fail fast on write errors const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]); @@ -1219,6 +1257,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s new_ofstream(weight->idx); } + if (params->partial_requant && split_skipped[cur_split]) { + const std::string name = ggml_get_name(tensor); + gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), tensor->type); + gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), tensor->data, ggml_nbytes(tensor)); + continue; + } + const std::string name = ggml_get_name(tensor); if (!ml.use_mmap) { @@ -1511,7 +1556,7 @@ QuantizationDone:; total_size_org += ggml_nbytes(tensor); total_size_new += new_size; - if (!params->dry_run) { + if (!params->dry_run && !split_skipped[cur_split]) { // update the gguf meta data as we go gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type); gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size); diff --git a/src/llama.cpp b/src/llama.cpp index 4442e2dd..ae8875b5 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4414,6 +4414,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { /*.ignore_imatrix_rules =*/ false, /*.only_repack =*/ false, /*.dry_run =*/ false, + /*.partial_requant =*/ false, /*.imatrix =*/ nullptr, /*.kv_overrides =*/ nullptr, /*.custom_quants =*/ nullptr,