From 4608f0cc6d67d25e7938c96c0ebff54cd4bd4a1e Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Date: Sun, 10 Nov 2024 17:21:32 +0200
Subject: [PATCH] iq2_kt: SOTA

With this change we arrive at the following perplexities (PPL):
PPL(LLaMA-3.1-8B-Instruct, 8192) = 9.2406
PPL(LLaMA-2-7B,            4096) = 6.4179
---
 ggml/src/iqk/iqk_quantize.cpp | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index c7fdae36..f8300136 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -3751,10 +3751,7 @@ void quantize_row_iq2_kt_impl(const float * x, void * vy, int n_per_row, const f
 
         memset(&y[ibl], 0, sizeof(block_iq2_kt));
 
-        auto qs = (uint16_t *)y[ibl].ql;
-
         const float * xbl = x + ibl*Q::kSuperBlockSize;
-
         auto scales = all_scales + ibl*Q::kNblock;
 
         for (int ib = 0; ib < Q::kNblock; ++ib) {
@@ -3765,13 +3762,11 @@ void quantize_row_iq2_kt_impl(const float * x, void * vy, int n_per_row, const f
                 float ax = std::abs(xb[j]);
                 amax = std::max(amax, ax);
             }
-            float d = amax/96.f;
-            quantizer.find_best_match(d, xb, weight, best_idx);
-            auto pair = quantizer.find_best_scale(xb, weight, best_idx);
-            scales[ib] = pair.first;
-
-            for (int j = 0; j < Q::kNg; ++j) qs[j] = best_idx[j];
-            qs += Q::kNg;
+            quantizer.find_best_match( amax/96.f, xb, weight, best_idx);
+            auto [dp, score_p] = quantizer.find_best_scale(xb, weight, best_idx);
+            quantizer.find_best_match(-amax/96.f, xb, weight, best_idx);
+            auto [dm, score_m] = quantizer.find_best_scale(xb, weight, best_idx);
+            scales[ib] = score_p > score_m ? dp : dm;
 
             float abs_scale = std::abs(scales[ib]);
             if (abs_scale > amax_scale) {