From 39ef8eeb9deb49e5696414918f73dc620d34d0bf Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Thu, 17 Jul 2025 11:25:36 +0300 Subject: [PATCH] Two things * Use iq2_kl instead of q2_K in the default recipes * Fine-tune the iq2_ks block scales --- ggml/src/iqk/iqk_quantize.cpp | 113 ++++++++-------------------------- src/llama.cpp | 18 +++--- 2 files changed, 35 insertions(+), 96 deletions(-) diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp index b38cc51f..7de1aa5e 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -1179,8 +1179,6 @@ void quantize_row_iq2_ks_impl(const float * x, void * vy, int n_per_row, const f constexpr int kBlockSize = 32; constexpr int kMax_i1 = 3*kBlockSize/4; constexpr int kMin_i3 = kBlockSize/4; - //constexpr int kNtry = 5; - //constexpr float kStep = 1.f; ggml_half * dptr = (ggml_half *)vy; *dptr = GGML_FP32_TO_FP16(0.f); @@ -1226,83 +1224,6 @@ void quantize_row_iq2_ks_impl(const float * x, void * vy, int n_per_row, const f sw[ib] += weight[j]; pairs[j] = {xb[j], j}; } - //float amax = 0, max = 0; - //for (int j = 0; j < kBlockSize; ++j) { - // float ax = fabsf(xb[j]); - // if (ax > amax) { - // amax = ax; max = xb[j]; - // } - //} - //if (!amax) { - // scales[ib] = 0; - // continue; - //} - //float d = kNtry > 0 ? 
-max/iq2nl_values[0] : max/iq2nl_values[0]; - //float id = 1/d; - //float sumqx_p = 0, sumq2_p = 0; - //float sumqx_m = 0, sumq2_m = 0; - //for (int j = 0; j < kBlockSize; ++j) { - // float w = weight[j]; - // float al = id*xb[j]; - // int l = best_index_iq2nl(iq2nl_values, al); - // float q = iq2nl_values[l]; - // sumqx_p += w*q*xb[j]; - // sumq2_p += w*q*q; - // l = best_index_iq2nl(iq2nl_values, -al); - // q = iq2nl_values[l]; - // sumqx_m += w*q*xb[j]; - // sumq2_m += w*q*q; - //} - //d = sumqx_p/sumq2_p; - //float best = d*sumqx_p; - //if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) { - // d = sumqx_m/sumq2_m; best = d*sumqx_m; - //} - //bool is_shifted = false; - //for (int itry = -kNtry; itry <= kNtry; ++itry) { - // id = (kStep*itry + iq2nl_values[0])/max; - // sumqx_p = sumq2_p = 0; - // sumqx_m = sumq2_m = 0; - // for (int j = 0; j < kBlockSize; ++j) { - // float w = weight[j]; - // float al = id*xb[j]; - // int l = best_index_iq2nl(iq2nl_values, al); - // float q = iq2nl_values[l]; - // sumqx_p += w*q*xb[j]; - // sumq2_p += w*q*q; - // l = best_index_iq2nl(iq2nl_values, -al); - // q = iq2nl_values[l]; - // sumqx_m += w*q*xb[j]; - // sumq2_m += w*q*q; - // } - // if (sumq2_p > 0 && sumqx_p*sumqx_p > best*sumq2_p) { - // d = sumqx_p/sumq2_p; best = d * sumqx_p; is_shifted = false; - // } - // if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) { - // d = sumqx_m/sumq2_m; best = d * sumqx_m; is_shifted = false; - // } - // id = (kStep*itry + shifted_values[0])/max; - // sumqx_p = sumq2_p = 0; - // sumqx_m = sumq2_m = 0; - // for (int j = 0; j < kBlockSize; ++j) { - // float w = weight[j]; - // float al = id*xb[j]; - // int l = best_index_iq2nl(shifted_values, al); - // float q = shifted_values[l]; - // sumqx_p += w*q*xb[j]; - // sumq2_p += w*q*q; - // l = best_index_iq2nl(shifted_values, -al); - // q = shifted_values[l]; - // sumqx_m += w*q*xb[j]; - // sumq2_m += w*q*q; - // } - // if (sumq2_p > 0 && sumqx_p*sumqx_p > best*sumq2_p) { - // d = 
sumqx_p/sumq2_p; best = d * sumqx_p; is_shifted = true; - // } - // if (sumq2_m > 0 && sumqx_m*sumqx_m > best*sumq2_m) { - // d = sumqx_m/sumq2_m; best = d * sumqx_m; is_shifted = true; - // } - //} std::sort(pairs.begin(), pairs.end()); sumx[0] = sumw[0] = 0; for (int j = 0; j < kBlockSize; ++j) { @@ -1368,19 +1289,37 @@ void quantize_row_iq2_ks_impl(const float * x, void * vy, int n_per_row, const f auto Ls = all_Ls + ibl*(QK_K/kBlockSize); for (int ib = 0; ib < QK_K/kBlockSize; ++ib) { int ls = Ls[ib]; + const int8_t * block_values = y[ibl].extra & (1 << ib) ? shifted_values : iq2nl_values; + const float * xb = xbl + kBlockSize*ib; + int lsmin = std::max(ls-1, 0); + int lsmax = std::min(ls+1, 31); + if (quant_weights) { + const float * qw = quant_weights + ibl*QK_K + ib*kBlockSize; + for (int j = 0; j < kBlockSize; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); + } else { + for (int j = 0; j < kBlockSize; ++j) weight[j] = 0.25f*sigma2 + xb[j]*xb[j]; + } + int best_ls = ls; + float best_score = std::numeric_limits<float>::max(); + for (int ils = lsmin; ils <= lsmax; ++ils) { + float dl = d * (ils - 16); + float idl = std::abs(dl) > 1e-13f ? 1/dl : 0.f; + float score = 0; + for (int j = 0; j < 32; ++j) { + int ibest = best_index_iq2nl(block_values, idl*xb[j]); + float diff = xb[j] - dl*block_values[ibest]; + score += weight[j]*diff*diff; + } + if (score < best_score) { + best_score = score; best_ls = ils; + } + } + ls = best_ls; y[ibl].scales[ib/2] |= ((ls & 0xf) << 4*(ib%2)); y[ibl].extra |= ((ls >> 4) << (8 + ib)); ls -= 16; float dl = d * ls; if (dl) { - const int8_t * block_values = y[ibl].extra & (1 << ib) ?
shifted_values : iq2nl_values; - const float * xb = xbl + kBlockSize*ib; - if (quant_weights) { - const float * qw = quant_weights + ibl*QK_K + ib*kBlockSize; - for (int j = 0; j < kBlockSize; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]); - } else { - for (int j = 0; j < kBlockSize; ++j) weight[j] = 0.25f*sigma2 + xb[j]*xb[j]; - } float idl = 1/dl; uint8_t * qs = y[ibl].qs + 32*(ib/4); for (int j = 0; j < 32; ++j) { diff --git a/src/llama.cpp b/src/llama.cpp index 0d29f24a..5af8e30d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -19419,9 +19419,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_KT) { if (name.find("attn_v.weight") != std::string::npos) { - if (qs.model.hparams.n_expert >= 4 || qs.model.hparams.n_gqa() >= 4) new_type = GGML_TYPE_IQ4_K; + if (qs.model.hparams.n_expert >= 4 || qs.model.hparams.n_gqa() >= 4) new_type = GGML_TYPE_IQ4_KS; else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ3_K; - else new_type = GGML_TYPE_Q2_K; + else new_type = GGML_TYPE_IQ2_KL; ++qs.i_attention_wv; } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k") != std::string::npos) { @@ -19439,7 +19439,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = GGML_TYPE_IQ3_K; } else if (name.find("_shexp.weight") != std::string::npos) { - new_type = GGML_TYPE_IQ4_K; + new_type = GGML_TYPE_IQ4_KS; } else if (name.find("ffn_down") != std::string::npos) { auto [i_layer, n_layer] = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str()); @@ -19458,9 +19458,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n ftype == LLAMA_FTYPE_MOSTLY_IQ2_M_R4) { bool is_iq2_m = ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M_R4; if (name.find("attn_v.weight") != std::string::npos) { - if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ4_K; + if 
(qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ4_KS; else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_K; - else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || is_iq2_m ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; + else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || is_iq2_m ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ2_KL; ++qs.i_attention_wv; } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k") != std::string::npos) { @@ -19474,7 +19474,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (name.find("ffn_down") != std::string::npos) { if (qs.i_ffn_down < qs.n_ffn_down/8) { - new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || is_iq2_m ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; + new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || is_iq2_m ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ2_KL; } ++qs.i_ffn_down; } @@ -19505,7 +19505,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_K) { - new_type = qs.model.hparams.n_gqa() >= 2 ? GGML_TYPE_IQ4_K : GGML_TYPE_IQ3_K; + new_type = qs.model.hparams.n_gqa() >= 2 ? GGML_TYPE_IQ4_KS : GGML_TYPE_IQ3_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_K_R4) { new_type = qs.model.hparams.n_gqa() >= 2 ? GGML_TYPE_IQ4_K_R4 : GGML_TYPE_IQ3_K_R4; @@ -19523,7 +19523,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_KT) { //new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_IQ4_K : qs.model.hparams.n_gqa() >= 2 ? GGML_TYPE_IQ3_K // : !qs.has_imatrix ? GGML_TYPE_IQ3_K : GGML_TYPE_IQ3_KT; - new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_IQ4_K : GGML_TYPE_IQ3_K; + new_type = qs.model.hparams.n_gqa() >= 4 ? 
GGML_TYPE_IQ4_KS : GGML_TYPE_IQ3_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_KT) { //new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_IQ5_K : qs.model.hparams.n_gqa() >= 2 ? GGML_TYPE_IQ4_K @@ -19592,7 +19592,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_IQ3_XXS) new_type = GGML_TYPE_IQ3_S; else if (new_type == GGML_TYPE_Q2_K_R4 || new_type == GGML_TYPE_IQ3_XXS_R4) new_type = GGML_TYPE_IQ3_K_R4; else if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_IQ3_S) new_type = GGML_TYPE_Q4_K; - else if (new_type == GGML_TYPE_IQ3_K) new_type = GGML_TYPE_IQ4_K; + else if (new_type == GGML_TYPE_IQ3_K) new_type = GGML_TYPE_IQ4_KS; else if (new_type == GGML_TYPE_IQ3_KS) new_type = GGML_TYPE_IQ4_KS; else if (new_type == GGML_TYPE_IQ2_KL) new_type = GGML_TYPE_IQ4_KS; else if (new_type == GGML_TYPE_IQ3_S_R4) new_type = GGML_TYPE_Q4_K_R4;