iq2_kt - this is better

Using blocks of 32 weights and 16 bits per group of 8 weights,
it beats iq2_xxs in terms of PPL by a significant margin.
It is 0.0625 bpw larger, but even if we go down to 15 bits per
group of 8 (i.e., 0.0625 bpw less than iq2_xxs), PPL is still
lower.
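For intuition, the bit budget implied by these numbers works out as in the back-of-the-envelope C++ sketch below. The 4-bit scale per block of 32 weights is an assumption inferred from the stated figures (16 bits per group of 8 is 2.0 bpw, and iq2_kt sits 0.0625 bpw above iq2_xxs's 2.0625 bpw), not read from the code.

#include <cstdio>

int main() {
    // 16 bits per group of 8 weights -> 2.0 bpw for the quantized values.
    constexpr double group_bpw = 16.0 / 8;
    // Assumed: one 4-bit scale per block of 32 weights -> 0.125 bpw overhead.
    constexpr double scale_bpw = 4.0 / 32;
    constexpr double iq2_kt_bpw  = group_bpw + scale_bpw; // 2.125 bpw
    constexpr double iq2_xxs_bpw = 2.0625;                // known size of iq2_xxs
    std::printf("iq2_kt : %.4f bpw (+%.4f vs iq2_xxs)\n",
                iq2_kt_bpw, iq2_kt_bpw - iq2_xxs_bpw);
    // The 15-bit variant mentioned above: 15/8 + 0.125 = 2.0 bpw,
    // i.e. 0.0625 bpw below iq2_xxs.
    std::printf("15-bit : %.4f bpw\n", 15.0 / 8 + scale_bpw);
}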
Author: Iwan Kawrakow
Date:   2024-11-06 20:49:56 +02:00
parent 766fa600c8
commit 36e9c922b8
6 changed files with 388 additions and 127 deletions


@@ -15775,7 +15775,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     if (name.find("attn_v.weight") != std::string::npos) {
         if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ4_K;
         else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_K;
-        else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+        //else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_KT) new_type = GGML_TYPE_IQ2_K;
+        else {
+            new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+        }
         ++qs.i_attention_wv;
     }
     else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
@@ -15784,7 +15787,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     else if (name.find("attn_qkv.weight") != std::string::npos) {
         new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_K;
     }
-    else if (name.find("ffn_down") != std::string::npos) {
+    else if (name.find("ffn_down") != std::string::npos) { // && ftype != LLAMA_FTYPE_MOSTLY_IQ2_KT) {
         if (qs.i_ffn_down < qs.n_ffn_down/8) {
             new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
         }