From e640a9ed8888ae6563d98c24004ee178377e54e7 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Date: Sat, 12 Oct 2024 09:52:49 +0300
Subject: [PATCH] iq2k: Try make_qx_quants for the scale

Slightly better for LLaMA-3.1, Gemma-2, slightly worse for
Qwen2.5
---
 ggml/src/ggml-cuda/convert.cu |  8 ++--
 ggml/src/iqk/iqk_quantize.cpp | 76 ++++++++++++++++++++++++++++++-----
 2 files changed, 69 insertions(+), 15 deletions(-)

diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu
index 62dd52a2..4ec136fa 100644
--- a/ggml/src/ggml-cuda/convert.cu
+++ b/ggml/src/ggml-cuda/convert.cu
@@ -729,10 +729,10 @@ static __global__ void dequantize_block_iq2_k(const void * __restrict__ vx, dst_
     int il    = tid%16; // 0...15
     dst_t * y = yy + i*QK_K + 128*ib128 + 2*il;
     const float d = (float)x[i].d;
-    const float dl1 = d * (2*((x[i].scales[4*ib128+0] >> 4*(il/8)) & 0xf) - 15);
-    const float dl2 = d * (2*((x[i].scales[4*ib128+1] >> 4*(il/8)) & 0xf) - 15);
-    const float dl3 = d * (2*((x[i].scales[4*ib128+2] >> 4*(il/8)) & 0xf) - 15);
-    const float dl4 = d * (2*((x[i].scales[4*ib128+3] >> 4*(il/8)) & 0xf) - 15);
+    const float dl1 = d * (((x[i].scales[4*ib128+0] >> 4*(il/8)) & 0xf) - 8);
+    const float dl2 = d * (((x[i].scales[4*ib128+1] >> 4*(il/8)) & 0xf) - 8);
+    const float dl3 = d * (((x[i].scales[4*ib128+2] >> 4*(il/8)) & 0xf) - 8);
+    const float dl4 = d * (((x[i].scales[4*ib128+3] >> 4*(il/8)) & 0xf) - 8);
     const uint8_t * qs = x[i].qs + 32*ib128 + 2*il;
     const int16_t extra = x[i].extra >> (8*ib128 + (il/8));
     for (int j = 0; j < 2; ++j) {
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index 430b629f..6cc6cef0 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -30,6 +30,50 @@ inline int nearest_int(float fval) {
     return (i & 0x007fffff) - 0x00400000;
 }
 
+float make_qx_quants(int n, int nmax, const float * x, int8_t * L, const float * qw) { // fits x[i] ~ scale*(L[i] - nmax) using weights qw[i]; fills L[0..n) and returns scale
+    float max = 0; // signed value of the largest-magnitude element of x
+    float amax = 0; // largest |x[i]| seen so far
+    for (int i = 0; i < n; ++i) {
+        float ax = fabsf(x[i]);
+        if (ax > amax) { amax = ax; max = x[i]; }
+    }
+    if (!amax) { // all zero
+        for (int i = 0; i < n; ++i) L[i] = 0; // note: stores 0 here, not the nmax offset used on the normal path
+        return 0.f;
+    }
+    float iscale = -nmax / max; // initial inverse scale: maps the largest-magnitude element to -nmax
+    float sumlx = 0; // accumulates qw[i]*x[i]*l
+    float suml2 = 0; // accumulates qw[i]*l*l
+    for (int i = 0; i < n; ++i) {
+        int l = nearest_int(iscale * x[i]);
+        l = std::max(-nmax, std::min(nmax-1, l)); // clamp to [-nmax, nmax-1]
+        L[i] = l + nmax; // stored with an offset of nmax, i.e. in [0, 2*nmax-1]
+        sumlx += qw[i]*x[i]*l;
+        suml2 += qw[i]*l*l;
+    }
+    float scale = suml2 ? sumlx/suml2 : 0.0f; // weighted least-squares scale for the current l values; 0 when suml2 is 0
+    float best = scale * sumlx; // equals sumlx^2/suml2 when suml2 != 0; the figure of merit maximized below
+    for (int is = -9; is <= 9; ++is) { // try 18 perturbed inverse scales around the initial one
+        if (is == 0) continue; // is == 0 is the initial guess, already evaluated above
+        iscale = -(nmax + 0.1f*is) / max; // largest-magnitude element now targets -(nmax + 0.1*is) before rounding/clamping
+        sumlx = suml2 = 0;
+        for (int i = 0; i < n; ++i) {
+            int l = nearest_int(iscale * x[i]);
+            l = std::max(-nmax, std::min(nmax-1, l));
+            sumlx += qw[i]*x[i]*l;
+            suml2 += qw[i]*l*l;
+        }
+        if (suml2 > 0 && sumlx*sumlx > best*suml2) { // same as sumlx^2/suml2 > best, written without a division
+            for (int i = 0; i < n; ++i) { // candidate wins: recompute and store its quants
+                int l = nearest_int(iscale * x[i]);
+                L[i] = nmax + std::max(-nmax, std::min(nmax-1, l));
+            }
+            scale = sumlx/suml2; best = scale*sumlx; // adopt the improved scale and its figure of merit
+        }
+    }
+    return scale;
+}
+
 struct IQ1BNQuantizer {
     int8_t L[QK_IQ1BN];
     void quantize_one_row_1bn(const float * src, block_iq1_bn * y, int n_per_row, const float * imatrix);
@@ -507,6 +551,8 @@ void quantize_row_iq2_k_impl(const float * x, void * vy, int n_per_row, const fl
     float scales[QK_K/kBlockSize];
     float weight[kBlockSize];
     float sumx[kBlockSize+1], sumw[kBlockSize+1];
+    float sw[QK_K/kBlockSize];
+    int8_t Ls[QK_K/kBlockSize];
 
     std::array<std::pair<float,int>, kBlockSize> pairs;
 
@@ -524,7 +570,7 @@ void quantize_row_iq2_k_impl(const float * x, void * vy, int n_per_row, const fl
 
         uint16_t extra = 0;
 
-        float max_abs_scale = 0;
+        float max_abs_scale = 0, max_scale = 0;
 
         for (int ib = 0; ib < QK_K/kBlockSize; ++ib) {
             const float * xb = xbl + kBlockSize*ib;
@@ -534,7 +580,11 @@ void quantize_row_iq2_k_impl(const float * x, void * vy, int n_per_row, const fl
             } else {
                 for (int j = 0; j < kBlockSize; ++j) weight[j] = 0.25f*sigma2 + xb[j]*xb[j];
             }
-            for (int j = 0; j < kBlockSize; ++j) pairs[j] = {xb[j], j};
+            sw[ib] = 0;
+            for (int j = 0; j < kBlockSize; ++j) {
+                sw[ib] += weight[j];
+                pairs[j] = {xb[j], j};
+            }
             std::sort(pairs.begin(), pairs.end());
             sumx[0] = sumw[0] = 0;
             for (int j = 0; j < kBlockSize; ++j) {
@@ -583,21 +633,25 @@ void quantize_row_iq2_k_impl(const float * x, void * vy, int n_per_row, const fl
             if (is_shifted) extra |= (1 << ib);
 
             float abs_scale = fabsf(scales[ib]);
-            max_abs_scale = MAX(max_abs_scale, abs_scale);
+            if (abs_scale > max_abs_scale) {
+                max_abs_scale = abs_scale;
+                max_scale = scales[ib];
+            }
         }
 
         if (!max_abs_scale) continue;
+        float d = make_qx_quants(QK_K/kBlockSize, 8, scales, Ls, sw);
+        if (!d) continue;
 
-        float d = max_abs_scale/15;
+        //float d = -max_scale/8;
         y[ibl].extra = extra;
         float id = 1/d;
 
         float sumqx = 0, sumq2 = 0;
         for (int ib = 0; ib < QK_K/kBlockSize; ++ib) {
-            int ls = nearest_int(0.5f*(id*scales[ib]+15));
-            ls = MAX(0, MIN(15, ls));
-            y[ibl].scales[ib/2] |= (ls << 4*(ib%2));
-            ls = 2*ls - 15;
+            int ls = nearest_int(id*scales[ib]);
+            ls = std::max(-8, std::min(7, ls));
+            y[ibl].scales[ib/2] |= ((ls + 8) << 4*(ib%2));
             float dl = d * ls;
             if (dl) {
                 const int8_t * block_values = y[ibl].extra & (1 << ib) ? shifted_values : iq2nl_values;
@@ -623,7 +677,7 @@ void quantize_row_iq2_k_impl(const float * x, void * vy, int n_per_row, const fl
                 }
             }
         }
-        y[ibl].d = GGML_FP32_TO_FP16(1.025f*(sumq2 > 0 ? sumqx/sumq2 : d));
+        y[ibl].d = GGML_FP32_TO_FP16(1.030f*(sumq2 > 0 ? sumqx/sumq2 : d));
 
     }
 }
@@ -665,8 +719,8 @@ void dequantize_row_iq2_k(const block_iq2_k  * GGML_RESTRICT x, float * GGML_RES
 
         int shift = 0;
         for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            float dl1 = d * (2*(x[i].scales[ib32] & 0xf) - 15);
-            float dl2 = d * (2*(x[i].scales[ib32] >>  4) - 15);
+            float dl1 = d * ((x[i].scales[ib32] & 0xf) - 8);
+            float dl2 = d * ((x[i].scales[ib32] >>  4) - 8);
             const int8_t * values1 = extra & 1 ? iq2nl_values + 4 : iq2nl_values;
             const int8_t * values2 = extra & 2 ? iq2nl_values + 4 : iq2nl_values;
             extra >>= 2;