mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-26 16:14:10 +00:00
Llama-quantize: Partial requant feature (#1313)
* Partial Requant feature for llama-quantize - Inspired by the recently portcopied --dry-run feature. - Allows to partially requantize a split quantized .gguf by requantizing only the missing splits in the destination directory. - Works both for GGUF which are split tensors by tensors, or by group of several tensors (though this one is not very much tested beyond 2 tensors by split). - Vibe coded. * Create output directory if it doesn't exist in llama-quantize * Create output directory if it doesn't exist in gguf-split * Add exit when directory fails to be created on Windows * Use std::filesystem * cleanup
This commit is contained in:
@@ -151,7 +151,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
|
||||
//
|
||||
[[noreturn]]
|
||||
static void usage(const char * executable) {
|
||||
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--hide-imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--ffn-gate-inp-type] [--attn-q-type] [--attn-k-type] [--attn-v-type] [--attn-qkv-type] [--attn-output-type] [--ffn-gate-type] [--ffn-down-type] [--ffn-up-type] [--keep-split] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
|
||||
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--hide-imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--ffn-gate-inp-type] [--attn-q-type] [--attn-k-type] [--attn-v-type] [--attn-qkv-type] [--attn-output-type] [--ffn-gate-type] [--ffn-down-type] [--ffn-up-type] [--keep-split] [--partial-requant] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
|
||||
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
|
||||
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
|
||||
printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
|
||||
@@ -175,6 +175,7 @@ static void usage(const char * executable) {
|
||||
printf(" --ffn-down-type ggml_type: use this ggml_type for the ffn_down tensor.\n");
|
||||
printf(" --ffn-up-type ggml_type: use this ggml_type for the ffn_up tensor.\n\n");
|
||||
printf(" --keep-split: will generate quantized model in the same shards as input\n");
|
||||
printf(" --partial-requant: quantize only missing split files in the split quantized .gguf destination directory\n");
|
||||
printf(" --override-kv KEY=TYPE:VALUE\n");
|
||||
printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n\n");
|
||||
printf("Note: --include-weights and --exclude-weights cannot be used together\n");
|
||||
@@ -466,6 +467,8 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
} else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
|
||||
params.keep_split = true;
|
||||
} else if (strcmp(argv[arg_idx], "--partial-requant") == 0) {
|
||||
params.partial_requant = true;
|
||||
} else {
|
||||
usage(argv[0]);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user