From cfb6747776e075a203b4d9241d3294350b0e9a20 Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Tue, 24 Feb 2026 15:21:52 +0100 Subject: [PATCH] llama-quantize: --dry-run option (#1309) --- examples/quantize/quantize.cpp | 2 + include/llama.h | 1 + src/llama-quantize.cpp | 109 ++++++++++++++++++--------------- src/llama.cpp | 1 + 4 files changed, 64 insertions(+), 49 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 9e4bcd6d..0aaaa421 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -355,6 +355,8 @@ int main(int argc, char ** argv) { params.quantize_output_tensor = false; } else if (strcmp(argv[arg_idx], "--ignore-imatrix-rules") == 0) { params.ignore_imatrix_rules = true; + } else if (strcmp(argv[arg_idx], "--dry-run") == 0) { + params.dry_run = true; } else if (strcmp(argv[arg_idx], "--repack") == 0) { params.only_repack = true; } else if (strcmp(argv[arg_idx], "--repack-pattern") == 0) { diff --git a/include/llama.h b/include/llama.h index 6ab628e3..a8feef50 100644 --- a/include/llama.h +++ b/include/llama.h @@ -490,6 +490,7 @@ extern "C" { bool keep_split; // quantize to the same number of shards bool ignore_imatrix_rules; // If set to true, the built-in rules for refusing to quantize into certain quants without imatrix are ignored bool only_repack; // Only repack tensors + bool dry_run; // If set to true, only compute and report quantized tensor sizes; no output file is written void * imatrix; // pointer to importance matrix data void * kv_overrides; // pointer to vector containing overrides void * custom_quants; // pointer to vector containing custom quantization rules diff --git a/src/llama-quantize.cpp b/src/llama-quantize.cpp index 42e3fd75..ee298ec7 100644 --- a/src/llama-quantize.cpp +++ b/src/llama-quantize.cpp @@ -1190,6 +1190,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } }; auto new_ofstream = [&](int index) { + if (params->dry_run) { + return; + } cur_split = index; GGML_ASSERT(ctx_outs[cur_split] && "Find uninitialized 
gguf_context"); std::string fname = fname_out; @@ -1267,8 +1270,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s quantize &= name.find("attn_rel_b.weight") == std::string::npos; enum ggml_type new_type; - void * new_data; - size_t new_size; + void * new_data = nullptr; + size_t new_size = 0; if (params->only_repack) { ggml_type repacked_type = (ggml_type)iqk_repacked_type(tensor); @@ -1289,19 +1292,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (modify || repacked_type != tensor->type) { new_type = repacked_type; new_size = ggml_nbytes(tensor); - if ((int)work.size() < new_size) work.resize(new_size); - new_data = work.data(); + if (!params->dry_run) { + if ((int)work.size() < new_size) work.resize(new_size); + new_data = work.data(); - auto aux_tensor = *tensor; - aux_tensor.data = work.data(); - std::memcpy(aux_tensor.data, tensor->data, new_size); + auto aux_tensor = *tensor; + aux_tensor.data = work.data(); + std::memcpy(aux_tensor.data, tensor->data, new_size); - if (repacked_type != tensor->type) { - iqk_repack_tensor(&aux_tensor); - GGML_ASSERT(aux_tensor.type == repacked_type); - } else { - bool did_modify = iqk_modify_tensor(&aux_tensor); - GGML_ASSERT(did_modify); + if (repacked_type != tensor->type) { + iqk_repack_tensor(&aux_tensor); + GGML_ASSERT(aux_tensor.type == repacked_type); + } else { + bool did_modify = iqk_modify_tensor(&aux_tensor); + GGML_ASSERT(did_modify); + } } } else { @@ -1448,17 +1453,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } - float * f32_data; - - if (tensor->type == GGML_TYPE_F32) { - f32_data = (float *) tensor->data; - } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) { - throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type))); - } else { - llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread); - f32_data 
= (float *) f32_conv_buf.data(); - } - int chunk_size_multiplier = 1; auto [working_type, num_rows] = interleaved_properties(new_type); if (tensor->ne[1] % num_rows != 0) { @@ -1470,30 +1464,45 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type)); fflush(stdout); - if (work.size() < (size_t)nelements * 4) { - work.resize(nelements * 4); // upper bound on size - } - new_data = work.data(); + if (params->dry_run) { + new_size = tensor->ne[2] * tensor->ne[1] * ggml_row_size(new_type, tensor->ne[0]); + } else { + float * f32_data; - const int64_t n_per_row = tensor->ne[0]; - const int64_t nrows = tensor->ne[1]; + if (tensor->type == GGML_TYPE_F32) { + f32_data = (float *) tensor->data; + } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) { + throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type))); + } else { + llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread); + f32_data = (float *) f32_conv_buf.data(); + } - static const int64_t min_chunk_size = 32 * 512; - const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) * - chunk_size_multiplier; + if (work.size() < (size_t)nelements * 4) { + work.resize(nelements * 4); // upper bound on size + } + new_data = work.data(); - const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1]; - const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size; - const int64_t nthread_use = nthread > 1 ? 
std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1; + const int64_t n_per_row = tensor->ne[0]; + const int64_t nrows = tensor->ne[1]; - // quantize each expert separately since they have different importance matrices - new_size = 0; - for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) { - const float * f32_data_03 = f32_data + i03 * nelements_matrix; - void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows; - const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr; + static const int64_t min_chunk_size = 32 * 512; + const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) * + chunk_size_multiplier; - new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use); + const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1]; + const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size; + const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1; + + // quantize each expert separately since they have different importance matrices + new_size = 0; + for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) { + const float * f32_data_03 = f32_data + i03 * nelements_matrix; + void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows; + const float * imatrix_03 = imatrix ? 
imatrix + i03 * n_per_row : nullptr; + + new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use); + } } LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); } @@ -1502,13 +1511,15 @@ QuantizationDone:; total_size_org += ggml_nbytes(tensor); total_size_new += new_size; - // update the gguf meta data as we go - gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type); - gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size); + if (!params->dry_run) { + // update the gguf meta data as we go + gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type); + gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size); - // write tensor data + padding - fout.write((const char *) new_data, new_size); - zeros(fout, GGML_PAD(new_size, align) - new_size); + // write tensor data + padding + fout.write((const char *) new_data, new_size); + zeros(fout, GGML_PAD(new_size, align) - new_size); + } } close_ofstream(); for (auto & c:ctx_outs) { diff --git a/src/llama.cpp b/src/llama.cpp index d9a5a709..573c042d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4412,6 +4412,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { /*.keep_split =*/ false, /*.ignore_imatrix_rules =*/ false, /*.only_repack =*/ false, + /*.dry_run =*/ false, /*.imatrix =*/ nullptr, /*.kv_overrides =*/ nullptr, /*.custom_quants =*/ nullptr,