From cfb6747776e075a203b4d9241d3294350b0e9a20 Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Tue, 24 Feb 2026 15:21:52 +0100 Subject: [PATCH] llama-quantize: --dry-run option (#1309) --- examples/quantize/quantize.cpp | 2 + include/llama.h | 1 + src/llama-quantize.cpp | 109 ++++++++++++++++++--------------- src/llama.cpp | 1 + 4 files changed, 64 insertions(+), 49 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 9e4bcd6d..0aaaa421 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -355,6 +355,8 @@ int main(int argc, char ** argv) { params.quantize_output_tensor = false; } else if (strcmp(argv[arg_idx], "--ignore-imatrix-rules") == 0) { params.ignore_imatrix_rules = true; + } else if (strcmp(argv[arg_idx], "--dry-run") == 0) { + params.dry_run = true; } else if (strcmp(argv[arg_idx], "--repack") == 0) { params.only_repack = true; } else if (strcmp(argv[arg_idx], "--repack-pattern") == 0) { diff --git a/include/llama.h b/include/llama.h index 6ab628e3..a8feef50 100644 --- a/include/llama.h +++ b/include/llama.h @@ -490,6 +490,7 @@ extern "C" { bool keep_split; // quantize to the same number of shards bool ignore_imatrix_rules; // If set to true, the built-in rules for refusing to quantize into certain quants without imatrix are ignored bool only_repack; // Only repack tensors + bool dry_run; // If set to true, only compute and report quantized tensor sizes; no output file is written void * imatrix; // pointer to importance matrix data void * kv_overrides; // pointer to vector containing overrides void * custom_quants; // pointer to vector containing custom quantization rules diff --git a/src/llama-quantize.cpp b/src/llama-quantize.cpp index 42e3fd75..ee298ec7 100644 --- a/src/llama-quantize.cpp +++ b/src/llama-quantize.cpp @@ -1190,6 +1190,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } }; auto new_ofstream = [&](int index) { + if (params->dry_run) { + return; + } cur_split = index; GGML_ASSERT(ctx_outs[cur_split] && "Find uninitialized 
gguf_context"); std::string fname = fname_out; @@ -1267,8 +1270,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s quantize &= name.find("attn_rel_b.weight") == std::string::npos; enum ggml_type new_type; - void * new_data; - size_t new_size; + void * new_data = nullptr; + size_t new_size = 0; if (params->only_repack) { ggml_type repacked_type = (ggml_type)iqk_repacked_type(tensor); @@ -1289,19 +1292,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (modify || repacked_type != tensor->type) { new_type = repacked_type; new_size = ggml_nbytes(tensor); - if ((int)work.size() < new_size) work.resize(new_size); - new_data = work.data(); + if (!params->dry_run) { + if ((int)work.size() < new_size) work.resize(new_size); + new_data = work.data(); - auto aux_tensor = *tensor; - aux_tensor.data = work.data(); - std::memcpy(aux_tensor.data, tensor->data, new_size); + auto aux_tensor = *tensor; + aux_tensor.data = work.data(); + std::memcpy(aux_tensor.data, tensor->data, new_size); - if (repacked_type != tensor->type) { - iqk_repack_tensor(&aux_tensor); - GGML_ASSERT(aux_tensor.type == repacked_type); - } else { - bool did_modify = iqk_modify_tensor(&aux_tensor); - GGML_ASSERT(did_modify); + if (repacked_type != tensor->type) { + iqk_repack_tensor(&aux_tensor); + GGML_ASSERT(aux_tensor.type == repacked_type); + } else { + bool did_modify = iqk_modify_tensor(&aux_tensor); + GGML_ASSERT(did_modify); + } } } else { @@ -1448,17 +1453,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } - float * f32_data; - - if (tensor->type == GGML_TYPE_F32) { - f32_data = (float *) tensor->data; - } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) { - throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type))); - } else { - llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread); - f32_data 
= (float *) f32_conv_buf.data(); - } - int chunk_size_multiplier = 1; auto [working_type, num_rows] = interleaved_properties(new_type); if (tensor->ne[1] % num_rows != 0) { @@ -1470,30 +1464,45 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type)); fflush(stdout); - if (work.size() < (size_t)nelements * 4) { - work.resize(nelements * 4); // upper bound on size - } - new_data = work.data(); + if (params->dry_run) { + new_size = tensor->ne[2] * tensor->ne[1] * ggml_row_size(new_type, tensor->ne[0]); + } else { + float * f32_data; - const int64_t n_per_row = tensor->ne[0]; - const int64_t nrows = tensor->ne[1]; + if (tensor->type == GGML_TYPE_F32) { + f32_data = (float *) tensor->data; + } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) { + throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type))); + } else { + llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread); + f32_data = (float *) f32_conv_buf.data(); + } - static const int64_t min_chunk_size = 32 * 512; - const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) * - chunk_size_multiplier; + if (work.size() < (size_t)nelements * 4) { + work.resize(nelements * 4); // upper bound on size + } + new_data = work.data(); - const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1]; - const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size; - const int64_t nthread_use = nthread > 1 ? 
std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1; + const int64_t n_per_row = tensor->ne[0]; + const int64_t nrows = tensor->ne[1]; - // quantize each expert separately since they have different importance matrices - new_size = 0; - for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) { - const float * f32_data_03 = f32_data + i03 * nelements_matrix; - void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows; - const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr; + static const int64_t min_chunk_size = 32 * 512; + const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) * + chunk_size_multiplier; - new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use); + const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1]; + const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size; + const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1; + + // quantize each expert separately since they have different importance matrices + new_size = 0; + for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) { + const float * f32_data_03 = f32_data + i03 * nelements_matrix; + void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows; + const float * imatrix_03 = imatrix ? 
imatrix + i03 * n_per_row : nullptr; + + new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use); + } } LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); } @@ -1502,13 +1511,15 @@ QuantizationDone:; total_size_org += ggml_nbytes(tensor); total_size_new += new_size; - // update the gguf meta data as we go - gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type); - gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size); + if (!params->dry_run) { + // update the gguf meta data as we go + gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type); + gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size); - // write tensor data + padding - fout.write((const char *) new_data, new_size); - zeros(fout, GGML_PAD(new_size, align) - new_size); + // write tensor data + padding + fout.write((const char *) new_data, new_size); + zeros(fout, GGML_PAD(new_size, align) - new_size); + } } close_ofstream(); for (auto & c:ctx_outs) { diff --git a/src/llama.cpp b/src/llama.cpp index d9a5a709..573c042d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4412,6 +4412,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { /*.keep_split =*/ false, /*.ignore_imatrix_rules =*/ false, /*.only_repack =*/ false, + /*.dry_run =*/ false, /*.imatrix =*/ nullptr, /*.kv_overrides =*/ nullptr, /*.custom_quants =*/ nullptr,