Mirror of https://github.com/ikawrakow/ik_llama.cpp.git
q2_K: allow it to detect ternary nets and quantize accordingly
@@ -255,6 +255,8 @@ int main(int argc, char ** argv) {
     for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
         if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
             params.quantize_output_tensor = false;
+        } else if (strcmp(argv[arg_idx], "--ignore-imatrix-rules") == 0) {
+            params.ignore_imatrix_rules = true;
         } else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) {
            if (arg_idx < argc-1) {
                 params.output_tensor_type = parse_ggml_type(argv[++arg_idx]);
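The new --ignore-imatrix-rules flag slots into the existing option loop ahead of the positional arguments, so (assuming the usual invocation of the quantize example; the exact binary name depends on the build) a call along the lines of "llama-quantize --ignore-imatrix-rules model-f16.gguf model-q2k_s.gguf Q2_K_S" bypasses both importance-matrix checks below: the coarse ftype check in the next hunk and the per-tensor check further down.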
@@ -409,11 +411,12 @@ int main(int argc, char ** argv) {
         }
     }

-    if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
+    if (!params.ignore_imatrix_rules && imatrix_data.empty() &&
+        (params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
          params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S ||
          params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S ||
          params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
-         params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && imatrix_data.empty()) {
+         params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M)) {
         fprintf(stderr, "\n==========================================================================================================\n");
         fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
         fprintf(stderr, "==========================================================================================================\n\n\n");
@@ -1995,7 +1995,52 @@ void quantize_row_q2_K_ref(const float * restrict x, block_q2_K * restrict y, in

     const float q4scale = 15.f;

+    // Detect TriNet
+    {
+        int n = k;
+        float max = 0;
+        for (int j = 0; j < n; ++j) {
+            float ax = fabsf(x[j]);
+            max = MAX(max, ax);
+        }
+        float mse0 = 0, mse = 0;
+        for (int j = 0; j < n; ++j) {
+            int l = x[j] < -0.5f*max ? -1 : x[j] < 0.5f*max ? 0 : 1;
+            mse0 += x[j]*x[j];
+            float diff = x[j] - max*l;
+            mse += diff*diff;
+        }
+        if (mse < 0.1f*mse0) {
+            // yes, most likely trinet
+            for (int ibl = 0; ibl < nb; ++ibl) {
+                y[ibl].d = GGML_FP32_TO_FP16(max);
+                y[ibl].dmin = GGML_FP32_TO_FP16(max);
+                for (int ib = 0; ib < QK_K/16; ++ib) y[ibl].scales[ib] = 1 | (1 << 4);
+                const float * xb = x + QK_K * ibl;
+                for (int j = 0; j < QK_K; ++j) {
+                    L[j] = xb[j] < -0.5f*max ? 0 : xb[j] < 0.5f*max ? 1 : 2;
+                }
+                uint8_t * qs = y[ibl].qs;
+                for (int j = 0; j < QK_K; j += 128) {
+                    for (int l = 0; l < 32; ++l) {
+                        qs[l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
+                    }
+                    qs += 32;
+                }
+            }
+            return;
+        }
+    }
+
     for (int i = 0; i < nb; i++) {
+        //{
+        //    float max = x[0], min = x[0];
+        //    for (int j = 1; j < 256; ++j) {
+        //        max = MAX(x[j], max);
+        //        min = MIN(x[j], min);
+        //    }
+        //    printf("%s: max = %g, min = %g\n", __func__, (double)max, (double)min);
+        //}
         float max_scale = 0; // as we are deducting the min, scales are always positive
         float max_min = 0;
         for (int j = 0; j < QK_K/16; ++j) {
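The heuristic above is worth spelling out: take the row's absolute maximum, snap every weight to the nearest of {-max, 0, +max}, and declare the row ternary when the snapping error stays below 10% of the row's total energy. A minimal standalone sketch of the same predicate (a hypothetical helper in plain C, not part of the actual ggml code):

    #include <math.h>
    #include <stdbool.h>

    // Hypothetical standalone version of the detection heuristic used above:
    // a row is treated as ternary if rounding each weight to the nearest of
    // {-max, 0, +max} keeps the squared error under 10% of the row's energy.
    static bool row_looks_ternary(const float * x, int n, float * out_max) {
        float max = 0;
        for (int j = 0; j < n; ++j) {
            float ax = fabsf(x[j]);
            if (ax > max) max = ax;
        }
        float mse0 = 0, mse = 0;
        for (int j = 0; j < n; ++j) {
            // nearest of -max/0/+max; the decision thresholds sit at +/- max/2
            int l = x[j] < -0.5f*max ? -1 : x[j] < 0.5f*max ? 0 : 1;
            mse0 += x[j]*x[j];
            float diff = x[j] - max*l;
            mse  += diff*diff;
        }
        *out_max = max;
        return mse < 0.1f*mse0; // same 10% threshold as the commit
    }

The encoding then exploits how q2_K dequantizes: each weight comes back as d*(scales[ib] & 0xF)*q - dmin*(scales[ib] >> 4). Setting every scales byte to 1 | (1 << 4) and both d and dmin to max turns that expression into max*q - max, so the stored 2-bit values q = 0, 1, 2 decode exactly to -max, 0, +max. The packing loop writes four 2-bit values per byte at offsets j, j+32, j+64, j+96, which is the standard q2_K layout, so existing dequantization and matmul kernels read these blocks back unchanged.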
@@ -359,6 +359,7 @@ extern "C" {
         bool only_copy;            // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
         bool pure;                 // quantize all tensors to the default type
         bool keep_split;           // quantize to the same number of shards
+        bool ignore_imatrix_rules; // If set to true, the built-in rules for refusing to quantize into certain quants without imatrix are ignored
         void * imatrix;            // pointer to importance matrix data
         void * kv_overrides;       // pointer to vector containing overrides
     } llama_model_quantize_params;
@@ -16071,12 +16071,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             }
         }
     }
-    if ((new_type == GGML_TYPE_IQ2_XXS ||
+    if (!params->ignore_imatrix_rules && !imatrix &&
+        (new_type == GGML_TYPE_IQ2_XXS ||
          new_type == GGML_TYPE_IQ2_XS ||
          new_type == GGML_TYPE_IQ2_S ||
          new_type == GGML_TYPE_IQ1_S ||
         (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
-        (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
+        (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0))) {
         LLAMA_LOG_ERROR("\n\n============================================================\n");
         LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
         LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
@@ -16441,6 +16442,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.only_copy            =*/ false,
         /*.pure                 =*/ false,
         /*.keep_split           =*/ false,
+        /*.ignore_imatrix_rules =*/ false,
         /*.imatrix              =*/ nullptr,
         /*.kv_overrides         =*/ nullptr,
     };
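For callers driving quantization through the C API rather than the quantize tool, the new field is opt-in from the defaults. A minimal sketch, assuming placeholder file names and eliding error handling:

    #include "llama.h"

    int main(void) {
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype                = LLAMA_FTYPE_MOSTLY_Q2_K_S;
        params.ignore_imatrix_rules = true; // accept the low-bit quant without an imatrix
        // llama_model_quantize returns 0 on success
        return (int) llama_model_quantize("model-f16.gguf", "model-q2k_s.gguf", &params);
    }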