Merge remote-tracking branch 'origin/main' into ik/fused_delta_net_2

2026-05-11 08:30:19 +00:00 · 2026-02-25 13:19:09 +00:00
parent a8ef7e20e7 c77ec4b8b8
commit ef2ab07b5b
7 changed files with 74 additions and 5 deletions
--- a/examples/gguf-split/gguf-split.cpp
+++ b/examples/gguf-split/gguf-split.cpp
@@ -7,6 +7,7 @@
 #include <fstream>
 #include <string>
 #include <vector>
+#include <filesystem>

 #include <stdio.h>
 #include <string.h>
@@ -190,6 +191,18 @@ static void zeros(std::ofstream & file, size_t n) {
    }
 }

+// Creates the parent directory chain of `filepath` when it has one; on failure reports to stderr and exits.
+static void ensure_output_directory(const std::string & filepath) {
+    const std::filesystem::path dir = std::filesystem::path(filepath).parent_path();
+    if (dir.empty()) { return; } // bare file name: nothing to create
+    std::error_code err;
+    std::filesystem::create_directories(dir, err);
+    if (err) {
+        fprintf(stderr, "Failed to create directory '%s': %s\n", dir.string().c_str(), err.message().c_str());
+        exit(EXIT_FAILURE);
+    }
+}
+
 struct split_strategy {
    const split_params params;
    std::ifstream & f_input;
@@ -310,6 +323,8 @@ struct split_strategy {
            char split_path[PATH_MAX] = {0};
            llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split);

+            ensure_output_directory(split_path);
+
            // open the output file
            printf("Writing file %s ... ", split_path);
            fflush(stdout);
@@ -401,6 +416,8 @@ static void gguf_merge(const split_params & split_params) {
    int n_split = 1;
    int total_tensors = 0;

+    ensure_output_directory(split_params.output);
+
    // avoid overwriting existing output file
    if (std::ifstream(split_params.output.c_str())) {
        fprintf(stderr, "%s: output file %s already exists\n", __func__, split_params.output.c_str());
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -151,7 +151,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
 //
 [[noreturn]]
 static void usage(const char * executable) {
-    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--hide-imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--ffn-gate-inp-type] [--attn-q-type] [--attn-k-type] [--attn-v-type] [--attn-qkv-type] [--attn-output-type] [--ffn-gate-type] [--ffn-down-type] [--ffn-up-type] [--keep-split] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
+    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--hide-imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--ffn-gate-inp-type] [--attn-q-type] [--attn-k-type] [--attn-v-type] [--attn-qkv-type] [--attn-output-type] [--ffn-gate-type] [--ffn-down-type] [--ffn-up-type] [--keep-split] [--partial-requant] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
    printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
    printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
    printf("  --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
@@ -175,6 +175,7 @@ static void usage(const char * executable) {
    printf("      --ffn-down-type ggml_type: use this ggml_type for the ffn_down tensor.\n");
    printf("      --ffn-up-type ggml_type: use this ggml_type for the ffn_up tensor.\n\n");
    printf("  --keep-split: will generate quantized model in the same shards as input\n");
+    printf("  --partial-requant: quantize only missing split files in the split quantized .gguf destination directory\n");
    printf("  --override-kv KEY=TYPE:VALUE\n");
    printf("      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n\n");
    printf("Note: --include-weights and --exclude-weights cannot be used together\n");
@@ -466,6 +467,8 @@ int main(int argc, char ** argv) {
            }
        } else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
            params.keep_split = true;
+        } else if (strcmp(argv[arg_idx], "--partial-requant") == 0) {
+            params.partial_requant = true;
        } else {
            usage(argv[0]);
        }
--- a/ggml/src/iqk/iqk_mul_mat.cpp
+++ b/ggml/src/iqk/iqk_mul_mat.cpp
@@ -1435,7 +1435,7 @@ void iqk_fused_delta_net_impl(int n_heads, int n_tokens, int n_seqs,
                auto vk = _mm256_loadu_ps(k_t + i);
                vqksum = _mm256_fmadd_ps(vk, vq, vqksum);
            }
-            kq_sum    = hsum_float_8(vqksum);
+            kq_sum = hsum_float_8(vqksum);
 #else
            for (int i = 0; i < head_dim; ++i) {
                kq_sum += k_t[i] * q_t[i];
--- a/include/llama.h
+++ b/include/llama.h
@@ -492,6 +492,7 @@ extern "C" {
        bool ignore_imatrix_rules;           // If set to true, the built-in rules for refusing to quantize into certain quants without imatrix are ignored
        bool only_repack;                    // Only repack tensors
        bool dry_run;                        //
+        bool partial_requant;                // quantize only missing split files in the split quantized .gguf destination directory
        void * imatrix;                      // pointer to importance matrix data
        void * kv_overrides;                 // pointer to vector containing overrides
        void * custom_quants;                // pointer to vector containing custom quantization rules
--- a/src/llama-load-tensors.cpp
+++ b/src/llama-load-tensors.cpp
@@ -312,7 +312,9 @@ ggml_context * create_tensors_helper::get_context_for_tensor(ggml_context * ctx,
        for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
            std::regex pattern(overrides->pattern);
            if (std::regex_search(name, pattern)) {
-                LLAMA_LOG_INFO("Tensor %s buffer type overriden to %s\n", name.c_str(), ggml_backend_buft_name(overrides->buft));
+                const struct ggml_tensor * cur = ml.get_tensor_meta(name.c_str());
+                const size_t nbytes = cur ? ggml_nbytes(cur) : 0;
+                LLAMA_LOG_INFO("Tensor %s (size = %.2f MiB) buffer type overriden to %s\n", name.c_str(), nbytes/1024./1024., ggml_backend_buft_name(overrides->buft));
                ctx = ctx_for_buft(overrides->buft);
                break;
            }
--- a/src/llama-quantize.cpp
+++ b/src/llama-quantize.cpp
@@ -11,6 +11,7 @@
 #include <regex>
 #include <mutex>
 #include <fstream>
+#include <filesystem>

 //
 // quantization
@@ -39,6 +40,18 @@ static void zeros(std::ofstream & file, size_t n) {
    }
 }

+// Creates the parent directory chain of `filepath` when it has one; on failure reports to stderr and exits.
+static void ensure_output_directory(const std::string & filepath) {
+    const std::filesystem::path dir = std::filesystem::path(filepath).parent_path();
+    if (dir.empty()) { return; } // bare file name: nothing to create
+    std::error_code err;
+    std::filesystem::create_directories(dir, err);
+    if (err) {
+        fprintf(stderr, "Failed to create directory '%s': %s\n", dir.string().c_str(), err.message().c_str());
+        exit(EXIT_FAILURE);
+    }
+}
+
 struct quantize_state_internal {
    const llama_model                 & model;
    const llama_model_quantize_params * params;
@@ -1039,8 +1052,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
    }

    const size_t align = GGUF_DEFAULT_ALIGNMENT;
+
+    ensure_output_directory(fname_out);
+
    struct gguf_context * ctx_out = gguf_init_empty();

+    // Early exit if partial_requant is enabled and output file already exists
+    if (params->partial_requant && !params->keep_split) {
+        std::ifstream test_file(fname_out);
+        if (test_file) {
+            LLAMA_LOG_INFO("%s: output file %s exists, skipping\n", __func__, fname_out.c_str());
+            gguf_free(ctx_out);
+            return;
+        }
+    }
+
    // copy the KV pairs from the input file
    gguf_set_kv     (ctx_out, ml.meta);
    gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
@@ -1179,6 +1205,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

    int cur_split = -1;
    std::ofstream fout;
+    std::vector<bool> split_skipped(n_split, false);
    auto close_ofstream = [&]() {
        // Write metadata and close file handler
        if (fout.is_open()) {
@@ -1202,6 +1229,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            fname = std::string(split_path);
        }

+        if (params->partial_requant) {
+            std::ifstream test_file(fname);
+            if (test_file) {
+                LLAMA_LOG_INFO("%s: split file %s exists, skipping\n", __func__, fname.c_str());
+                split_skipped[cur_split] = true;
+                fout = std::ofstream();
+                return;
+            }
+        }
+
+        ensure_output_directory(fname);
        fout = std::ofstream(fname, std::ios::binary);
        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
        const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]);
@@ -1219,6 +1257,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            new_ofstream(weight->idx);
        }

+        if (params->partial_requant && split_skipped[cur_split]) {
+            const std::string name = ggml_get_name(tensor);
+            gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), tensor->type);
+            gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), tensor->data, ggml_nbytes(tensor));
+            continue;
+        }
+
        const std::string name = ggml_get_name(tensor);

        if (!ml.use_mmap) {
@@ -1511,7 +1556,7 @@ QuantizationDone:;
        total_size_org += ggml_nbytes(tensor);
        total_size_new += new_size;

-        if (!params->dry_run) {
+        if (!params->dry_run && !split_skipped[cur_split]) {
            // update the gguf meta data as we go
            gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
            gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2209,7 +2209,7 @@ static bool llm_load_tensors(

    // print memory requirements
    for (ggml_backend_buffer_t buf : model.bufs) {
-        LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
+        LLAMA_LOG_DEBUG("%s: %10s buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
    }

    // populate tensors_by_name
@@ -4415,6 +4415,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
        /*.ignore_imatrix_rules        =*/ false,
        /*.only_repack                 =*/ false,
        /*.dry_run                     =*/ false,
+        /*.partial_requant             =*/ false,
        /*.imatrix                     =*/ nullptr,
        /*.kv_overrides                =*/ nullptr,
        /*.custom_quants               =*/ nullptr,