llama-quantize: --dry-run option

This commit is contained in:
Kawrakow
2026-02-23 15:16:10 +00:00
parent 68bd30d99c
commit 1a40325265
4 changed files with 64 additions and 49 deletions

View File

@@ -355,6 +355,8 @@ int main(int argc, char ** argv) {
params.quantize_output_tensor = false;
} else if (strcmp(argv[arg_idx], "--ignore-imatrix-rules") == 0) {
params.ignore_imatrix_rules = true;
} else if (strcmp(argv[arg_idx], "--dry-run") == 0) {
params.dry_run = true;
} else if (strcmp(argv[arg_idx], "--repack") == 0) {
params.only_repack = true;
} else if (strcmp(argv[arg_idx], "--repack-pattern") == 0) {

View File

@@ -490,6 +490,7 @@ extern "C" {
bool keep_split; // quantize to the same number of shards
bool ignore_imatrix_rules; // If set to true, the built-in rules for refusing to quantize into certain quants without imatrix are ignored
bool only_repack; // Only repack tensors
bool dry_run; // If set to true, only compute and report tensor sizes; no output file is written
void * imatrix; // pointer to importance matrix data
void * kv_overrides; // pointer to vector containing overrides
void * custom_quants; // pointer to vector containing custom quantization rules

View File

@@ -1190,6 +1190,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
}
};
auto new_ofstream = [&](int index) {
if (params->dry_run) {
return;
}
cur_split = index;
GGML_ASSERT(ctx_outs[cur_split] && "Find uninitialized gguf_context");
std::string fname = fname_out;
@@ -1267,8 +1270,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
quantize &= name.find("attn_rel_b.weight") == std::string::npos;
enum ggml_type new_type;
void * new_data;
size_t new_size;
void * new_data = nullptr;
size_t new_size = 0;
if (params->only_repack) {
ggml_type repacked_type = (ggml_type)iqk_repacked_type(tensor);
@@ -1289,6 +1292,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (modify || repacked_type != tensor->type) {
new_type = repacked_type;
new_size = ggml_nbytes(tensor);
if (!params->dry_run) {
if ((int)work.size() < new_size) work.resize(new_size);
new_data = work.data();
@@ -1304,6 +1308,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
GGML_ASSERT(did_modify);
}
}
}
else {
new_type = tensor->type;
new_size = ggml_nbytes(tensor);
@@ -1448,17 +1453,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
}
}
float * f32_data;
if (tensor->type == GGML_TYPE_F32) {
f32_data = (float *) tensor->data;
} else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
} else {
llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread);
f32_data = (float *) f32_conv_buf.data();
}
int chunk_size_multiplier = 1;
auto [working_type, num_rows] = interleaved_properties(new_type);
if (tensor->ne[1] % num_rows != 0) {
@@ -1470,6 +1464,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
fflush(stdout);
if (params->dry_run) {
new_size = tensor->ne[2] * tensor->ne[1] * ggml_row_size(new_type, tensor->ne[0]);
} else {
float * f32_data;
if (tensor->type == GGML_TYPE_F32) {
f32_data = (float *) tensor->data;
} else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
} else {
llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread);
f32_data = (float *) f32_conv_buf.data();
}
if (work.size() < (size_t)nelements * 4) {
work.resize(nelements * 4); // upper bound on size
}
@@ -1495,6 +1503,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
}
}
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
}
@@ -1502,6 +1511,7 @@ QuantizationDone:;
total_size_org += ggml_nbytes(tensor);
total_size_new += new_size;
if (!params->dry_run) {
// update the gguf meta data as we go
gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);
@@ -1510,6 +1520,7 @@ QuantizationDone:;
fout.write((const char *) new_data, new_size);
zeros(fout, GGML_PAD(new_size, align) - new_size);
}
}
close_ofstream();
for (auto & c:ctx_outs) {
gguf_free(c);

View File

@@ -4412,6 +4412,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
/*.keep_split =*/ false,
/*.ignore_imatrix_rules =*/ false,
/*.only_repack =*/ false,
/*.dry_run =*/ false,
/*.imatrix =*/ nullptr,
/*.kv_overrides =*/ nullptr,
/*.custom_quants =*/ nullptr,