Llama-quantize: Partial requant feature (#1313)

* Partial requant feature for llama-quantize - Inspired by the recently ported --dry-run feature. - Allows partially requantizing a split quantized .gguf by requantizing only the splits that are missing in the destination directory. - Works both for GGUFs that are split tensor by tensor and for GGUFs split by groups of several tensors (though the latter is not much tested beyond 2 tensors per split). - Vibe coded. * Create output directory if it doesn't exist in llama-quantize * Create output directory if it doesn't exist in gguf-split * Add exit when directory fails to be created on Windows * Use std::filesystem * cleanup
2026-02-27 00:24:11 +00:00 · 2026-02-25 07:25:15 +01:00
parent 68431b049a
commit 170467e835
5 changed files with 69 additions and 2 deletions
--- a/src/llama-quantize.cpp
+++ b/src/llama-quantize.cpp
@@ -11,6 +11,7 @@
 #include <regex>
 #include <mutex>
 #include <fstream>
+#include <filesystem>

 //
 // quantization
@@ -39,6 +40,18 @@ static void zeros(std::ofstream & file, size_t n) {
    }
 }

+// Make sure the directory that will contain `filepath` exists, creating any
+// missing intermediate directories along the way. A path without a directory
+// component is left alone. If creation fails, the error is printed to stderr
+// and the process exits with EXIT_FAILURE.
+static void ensure_output_directory(const std::string & filepath) {
+    const std::filesystem::path parent_dir = std::filesystem::path(filepath).parent_path();
+    if (parent_dir.empty()) {
+        return; // no directory component, nothing to create
+    }
+
+    // error_code overload: failures are reported through `err` instead of an exception
+    std::error_code err;
+    std::filesystem::create_directories(parent_dir, err);
+    if (!err) {
+        return;
+    }
+
+    fprintf(stderr, "Failed to create directory '%s': %s\n", parent_dir.string().c_str(), err.message().c_str());
+    exit(EXIT_FAILURE);
+}
+
 struct quantize_state_internal {
    const llama_model                 & model;
    const llama_model_quantize_params * params;
@@ -1039,8 +1052,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
    }

    const size_t align = GGUF_DEFAULT_ALIGNMENT;
+
+    ensure_output_directory(fname_out);
+
    struct gguf_context * ctx_out = gguf_init_empty();

+    // Early exit if partial_requant is enabled and output file already exists
+    if (params->partial_requant && !params->keep_split) {
+        std::ifstream test_file(fname_out);
+        if (test_file) {
+            LLAMA_LOG_INFO("%s: output file %s exists, skipping\n", __func__, fname_out.c_str());
+            gguf_free(ctx_out);
+            return;
+        }
+    }
+
    // copy the KV pairs from the input file
    gguf_set_kv     (ctx_out, ml.meta);
    gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
@@ -1179,6 +1205,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

    int cur_split = -1;
    std::ofstream fout;
+    std::vector<bool> split_skipped(n_split, false);
    auto close_ofstream = [&]() {
        // Write metadata and close file handler
        if (fout.is_open()) {
@@ -1202,6 +1229,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            fname = std::string(split_path);
        }

+        if (params->partial_requant) {
+            std::ifstream test_file(fname);
+            if (test_file) {
+                LLAMA_LOG_INFO("%s: split file %s exists, skipping\n", __func__, fname.c_str());
+                split_skipped[cur_split] = true;
+                fout = std::ofstream();
+                return;
+            }
+        }
+
+        ensure_output_directory(fname);
        fout = std::ofstream(fname, std::ios::binary);
        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
        const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]);
@@ -1219,6 +1257,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            new_ofstream(weight->idx);
        }

+        if (params->partial_requant && split_skipped[cur_split]) {
+            const std::string name = ggml_get_name(tensor);
+            gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), tensor->type);
+            gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), tensor->data, ggml_nbytes(tensor));
+            continue;
+        }
+
        const std::string name = ggml_get_name(tensor);

        if (!ml.use_mmap) {
@@ -1511,7 +1556,7 @@ QuantizationDone:;
        total_size_org += ggml_nbytes(tensor);
        total_size_new += new_size;

-        if (!params->dry_run) {
+        if (!params->dry_run && !split_skipped[cur_split]) {
            // update the gguf meta data as we go
            gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
            gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);