mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-26 08:04:09 +00:00
Llama-quantize: Partial requant feature (#1313)
* Partial requant feature for llama-quantize - Inspired by the recently ported --dry-run feature. - Allows partially requantizing a split quantized .gguf by requantizing only the splits that are missing from the destination directory. - Works both for GGUFs that are split tensor by tensor and for GGUFs that are split by groups of several tensors (though the latter is not well tested beyond 2 tensors per split). - Vibe coded. * Create output directory if it doesn't exist in llama-quantize * Create output directory if it doesn't exist in gguf-split * Add exit when directory fails to be created on Windows * Use std::filesystem * cleanup
This commit is contained in:
@@ -7,6 +7,7 @@
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <filesystem>
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
@@ -190,6 +191,18 @@ static void zeros(std::ofstream & file, size_t n) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Ensure that the directory which will contain `filepath` exists.
 *
 * When `filepath` has a parent component, that directory and any missing
 * ancestors are created. A path with no parent component (a bare file name)
 * is left alone, so nothing is created in that case.
 *
 * On failure an error message is printed to stderr and the process is
 * terminated with EXIT_FAILURE; the function does not return in that case.
 *
 * @param filepath path of the file that is about to be written
 */
static void ensure_output_directory(const std::string & filepath) {
    // Compute the parent once; an empty parent is exactly the
    // "!has_parent_path()" case.
    const std::filesystem::path parent = std::filesystem::path(filepath).parent_path();
    if (parent.empty()) {
        return;
    }

    // Use the error_code overload so a failure is reported here with a
    // readable message instead of surfacing as an exception.
    std::error_code ec;
    std::filesystem::create_directories(parent, ec);
    if (ec) {
        fprintf(stderr, "Failed to create directory '%s': %s\n", parent.string().c_str(), ec.message().c_str());
        exit(EXIT_FAILURE);
    }
}
|
||||
|
||||
struct split_strategy {
|
||||
const split_params params;
|
||||
std::ifstream & f_input;
|
||||
@@ -310,6 +323,8 @@ struct split_strategy {
|
||||
char split_path[PATH_MAX] = {0};
|
||||
llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split);
|
||||
|
||||
ensure_output_directory(split_path);
|
||||
|
||||
// open the output file
|
||||
printf("Writing file %s ... ", split_path);
|
||||
fflush(stdout);
|
||||
@@ -401,6 +416,8 @@ static void gguf_merge(const split_params & split_params) {
|
||||
int n_split = 1;
|
||||
int total_tensors = 0;
|
||||
|
||||
ensure_output_directory(split_params.output);
|
||||
|
||||
// avoid overwriting existing output file
|
||||
if (std::ifstream(split_params.output.c_str())) {
|
||||
fprintf(stderr, "%s: output file %s already exists\n", __func__, split_params.output.c_str());
|
||||
|
||||
@@ -151,7 +151,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
|
||||
//
|
||||
[[noreturn]]
|
||||
static void usage(const char * executable) {
|
||||
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--hide-imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--ffn-gate-inp-type] [--attn-q-type] [--attn-k-type] [--attn-v-type] [--attn-qkv-type] [--attn-output-type] [--ffn-gate-type] [--ffn-down-type] [--ffn-up-type] [--keep-split] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
|
||||
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--hide-imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--ffn-gate-inp-type] [--attn-q-type] [--attn-k-type] [--attn-v-type] [--attn-qkv-type] [--attn-output-type] [--ffn-gate-type] [--ffn-down-type] [--ffn-up-type] [--keep-split] [--partial-requant] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
|
||||
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
|
||||
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
|
||||
printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
|
||||
@@ -175,6 +175,7 @@ static void usage(const char * executable) {
|
||||
printf(" --ffn-down-type ggml_type: use this ggml_type for the ffn_down tensor.\n");
|
||||
printf(" --ffn-up-type ggml_type: use this ggml_type for the ffn_up tensor.\n\n");
|
||||
printf(" --keep-split: will generate quantized model in the same shards as input\n");
|
||||
printf(" --partial-requant: quantize only missing split files in the split quantized .gguf destination directory\n");
|
||||
printf(" --override-kv KEY=TYPE:VALUE\n");
|
||||
printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n\n");
|
||||
printf("Note: --include-weights and --exclude-weights cannot be used together\n");
|
||||
@@ -466,6 +467,8 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
} else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
|
||||
params.keep_split = true;
|
||||
} else if (strcmp(argv[arg_idx], "--partial-requant") == 0) {
|
||||
params.partial_requant = true;
|
||||
} else {
|
||||
usage(argv[0]);
|
||||
}
|
||||
|
||||
@@ -491,6 +491,7 @@ extern "C" {
|
||||
bool ignore_imatrix_rules; // If set to true, the built-in rules for refusing to quantize into certain quants without imatrix are ignored
|
||||
bool only_repack; // Only repack tensors
|
||||
bool dry_run; //
|
||||
bool partial_requant; // quantize only missing split files in the split quantized .gguf destination directory
|
||||
void * imatrix; // pointer to importance matrix data
|
||||
void * kv_overrides; // pointer to vector containing overrides
|
||||
void * custom_quants; // pointer to vector containing custom quantization rules
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
#include <regex>
|
||||
#include <mutex>
|
||||
#include <fstream>
|
||||
#include <filesystem>
|
||||
|
||||
//
|
||||
// quantization
|
||||
@@ -39,6 +40,18 @@ static void zeros(std::ofstream & file, size_t n) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Make sure the directory that will hold `filepath` exists before the file
 * is opened for writing.
 *
 * If `filepath` contains a parent component, that directory is created
 * together with any missing ancestors. A bare file name (no parent
 * component) results in no filesystem changes.
 *
 * If the directory cannot be created, a message is written to stderr and
 * the process exits with EXIT_FAILURE.
 * NOTE(review): this terminates the whole process rather than reporting the
 * error to the caller - confirm that is acceptable for every caller.
 *
 * @param filepath path of the output file that is about to be written
 */
static void ensure_output_directory(const std::string & filepath) {
    // An empty parent path is equivalent to has_parent_path() == false.
    const std::filesystem::path parent = std::filesystem::path(filepath).parent_path();
    if (parent.empty()) {
        return;
    }

    // Non-throwing overload: the outcome is inspected through `ec`.
    std::error_code ec;
    std::filesystem::create_directories(parent, ec);
    if (ec) {
        fprintf(stderr, "Failed to create directory '%s': %s\n", parent.string().c_str(), ec.message().c_str());
        exit(EXIT_FAILURE);
    }
}
|
||||
|
||||
struct quantize_state_internal {
|
||||
const llama_model & model;
|
||||
const llama_model_quantize_params * params;
|
||||
@@ -1039,8 +1052,21 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
}
|
||||
|
||||
const size_t align = GGUF_DEFAULT_ALIGNMENT;
|
||||
|
||||
ensure_output_directory(fname_out);
|
||||
|
||||
struct gguf_context * ctx_out = gguf_init_empty();
|
||||
|
||||
// Early exit if partial_requant is enabled and output file already exists
|
||||
if (params->partial_requant && !params->keep_split) {
|
||||
std::ifstream test_file(fname_out);
|
||||
if (test_file) {
|
||||
LLAMA_LOG_INFO("%s: output file %s exists, skipping\n", __func__, fname_out.c_str());
|
||||
gguf_free(ctx_out);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// copy the KV pairs from the input file
|
||||
gguf_set_kv (ctx_out, ml.meta);
|
||||
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
|
||||
@@ -1179,6 +1205,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
|
||||
int cur_split = -1;
|
||||
std::ofstream fout;
|
||||
std::vector<bool> split_skipped(n_split, false);
|
||||
auto close_ofstream = [&]() {
|
||||
// Write metadata and close file handler
|
||||
if (fout.is_open()) {
|
||||
@@ -1202,6 +1229,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
fname = std::string(split_path);
|
||||
}
|
||||
|
||||
if (params->partial_requant) {
|
||||
std::ifstream test_file(fname);
|
||||
if (test_file) {
|
||||
LLAMA_LOG_INFO("%s: split file %s exists, skipping\n", __func__, fname.c_str());
|
||||
split_skipped[cur_split] = true;
|
||||
fout = std::ofstream();
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
ensure_output_directory(fname);
|
||||
fout = std::ofstream(fname, std::ios::binary);
|
||||
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
|
||||
const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]);
|
||||
@@ -1219,6 +1257,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
new_ofstream(weight->idx);
|
||||
}
|
||||
|
||||
if (params->partial_requant && split_skipped[cur_split]) {
|
||||
const std::string name = ggml_get_name(tensor);
|
||||
gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), tensor->type);
|
||||
gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), tensor->data, ggml_nbytes(tensor));
|
||||
continue;
|
||||
}
|
||||
|
||||
const std::string name = ggml_get_name(tensor);
|
||||
|
||||
if (!ml.use_mmap) {
|
||||
@@ -1511,7 +1556,7 @@ QuantizationDone:;
|
||||
total_size_org += ggml_nbytes(tensor);
|
||||
total_size_new += new_size;
|
||||
|
||||
if (!params->dry_run) {
|
||||
if (!params->dry_run && !split_skipped[cur_split]) {
|
||||
// update the gguf meta data as we go
|
||||
gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
|
||||
gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);
|
||||
|
||||
@@ -4414,6 +4414,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
|
||||
/*.ignore_imatrix_rules =*/ false,
|
||||
/*.only_repack =*/ false,
|
||||
/*.dry_run =*/ false,
|
||||
/*.partial_requant =*/ false,
|
||||
/*.imatrix =*/ nullptr,
|
||||
/*.kv_overrides =*/ nullptr,
|
||||
/*.custom_quants =*/ nullptr,
|
||||
|
||||
Reference in New Issue
Block a user