mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-05-01 03:41:53 +00:00
Add ability to hide imatrix details in llama-quantize
This commit is contained in:
@@ -142,11 +142,12 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
|
|||||||
//
|
//
|
||||||
[[noreturn]]
|
[[noreturn]]
|
||||||
static void usage(const char * executable) {
|
static void usage(const char * executable) {
|
||||||
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--attn-q-type] [--attn-k-type] [--attn-v-type] [--attn-qkv-type] [--attn-output-type] [--ffn-gate-type] [--ffn-down-type] [--ffn-up-type] [--keep-split] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
|
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--hide-imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--attn-q-type] [--attn-k-type] [--attn-v-type] [--attn-qkv-type] [--attn-output-type] [--ffn-gate-type] [--ffn-down-type] [--ffn-up-type] [--keep-split] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
|
||||||
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
|
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
|
||||||
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
|
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
|
||||||
printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
|
printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
|
||||||
printf(" --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
|
printf(" --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
|
||||||
|
printf(" --hide-imatrix: do not store imatrix details in the quantized model\n");
|
||||||
printf(" --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
|
printf(" --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
|
||||||
printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
|
printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
|
||||||
printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor.\n");
|
printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor.\n");
|
||||||
@@ -337,6 +338,8 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
std::vector<std::string> repack_patterns;
|
std::vector<std::string> repack_patterns;
|
||||||
|
|
||||||
|
bool hide_imatrix = false;
|
||||||
|
|
||||||
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
|
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
|
||||||
if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
|
if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
|
||||||
params.quantize_output_tensor = false;
|
params.quantize_output_tensor = false;
|
||||||
@@ -429,6 +432,8 @@ int main(int argc, char ** argv) {
|
|||||||
} else {
|
} else {
|
||||||
usage(argv[0]);
|
usage(argv[0]);
|
||||||
}
|
}
|
||||||
|
} else if (strcmp(argv[arg_idx], "--hide-imatrix") == 0) {
|
||||||
|
hide_imatrix = true;
|
||||||
} else if (strcmp(argv[arg_idx], "--include-weights") == 0) {
|
} else if (strcmp(argv[arg_idx], "--include-weights") == 0) {
|
||||||
if (arg_idx < argc-1) {
|
if (arg_idx < argc-1) {
|
||||||
included_weights.emplace_back(argv[++arg_idx]);
|
included_weights.emplace_back(argv[++arg_idx]);
|
||||||
@@ -469,7 +474,11 @@ int main(int argc, char ** argv) {
|
|||||||
llama_model_kv_override kvo;
|
llama_model_kv_override kvo;
|
||||||
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE);
|
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE);
|
||||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
|
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
|
||||||
strncpy(kvo.val_str, imatrix_file.c_str(), 127);
|
if (hide_imatrix) {
|
||||||
|
strncpy(kvo.val_str, "top_secret", 127);
|
||||||
|
} else {
|
||||||
|
strncpy(kvo.val_str, imatrix_file.c_str(), 127);
|
||||||
|
}
|
||||||
kvo.val_str[127] = '\0';
|
kvo.val_str[127] = '\0';
|
||||||
kv_overrides.emplace_back(std::move(kvo));
|
kv_overrides.emplace_back(std::move(kvo));
|
||||||
}
|
}
|
||||||
@@ -477,7 +486,11 @@ int main(int argc, char ** argv) {
|
|||||||
llama_model_kv_override kvo;
|
llama_model_kv_override kvo;
|
||||||
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_DATASET);
|
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_DATASET);
|
||||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
|
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
|
||||||
strncpy(kvo.val_str, imatrix_dataset.c_str(), 127);
|
if (hide_imatrix) {
|
||||||
|
strncpy(kvo.val_str, "top_secret", 127);
|
||||||
|
} else {
|
||||||
|
strncpy(kvo.val_str, imatrix_dataset.c_str(), 127);
|
||||||
|
}
|
||||||
kvo.val_str[127] = '\0';
|
kvo.val_str[127] = '\0';
|
||||||
kv_overrides.emplace_back(std::move(kvo));
|
kv_overrides.emplace_back(std::move(kvo));
|
||||||
}
|
}
|
||||||
@@ -486,7 +499,11 @@ int main(int argc, char ** argv) {
|
|||||||
llama_model_kv_override kvo;
|
llama_model_kv_override kvo;
|
||||||
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES);
|
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES);
|
||||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
|
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
|
||||||
kvo.val_i64 = imatrix_data.size();
|
if (hide_imatrix) {
|
||||||
|
kvo.val_i64 = 0;
|
||||||
|
} else {
|
||||||
|
kvo.val_i64 = imatrix_data.size();
|
||||||
|
}
|
||||||
kv_overrides.emplace_back(std::move(kvo));
|
kv_overrides.emplace_back(std::move(kvo));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -494,7 +511,11 @@ int main(int argc, char ** argv) {
|
|||||||
llama_model_kv_override kvo;
|
llama_model_kv_override kvo;
|
||||||
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS);
|
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS);
|
||||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
|
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
|
||||||
kvo.val_i64 = m_last_call;
|
if (hide_imatrix) {
|
||||||
|
kvo.val_i64 = 0;
|
||||||
|
} else {
|
||||||
|
kvo.val_i64 = m_last_call;
|
||||||
|
}
|
||||||
kv_overrides.emplace_back(std::move(kvo));
|
kv_overrides.emplace_back(std::move(kvo));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user