From a4b41b4870c39d81b7bd06a29e19474df28afb7e Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Date: Wed, 2 Oct 2024 12:02:57 +0300
Subject: [PATCH] q6_0: slightly better kv-cache result

Better than q8_0+q4_0, but not as good as q8_0+iq4_nl
---
 examples/llama-bench/llama-bench.cpp |  3 +++
 ggml/src/ggml-quants.c               | 10 +++++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index fc77be50..9e4fd266 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -327,6 +327,9 @@ static ggml_type ggml_type_from_name(const std::string & s) {
     if (s == "iq4_nl") {
         return GGML_TYPE_IQ4_NL;
     }
+    if (s == "q6_0") {
+        return GGML_TYPE_Q6_0;
+    }
 
     return GGML_TYPE_COUNT;
 }
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 1de1eb06..f5fff22e 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -870,12 +870,15 @@ void quantize_row_q6_0_ref(const float * restrict x, block_q6_0 * restrict y, in
         const float d  = max / -32;
         const float id = d ? 1.0f/d : 0.0f;
 
-        y[i].d = GGML_FP32_TO_FP16(d);
+        //y[i].d = GGML_FP32_TO_FP16(d);
         memset(y[i].qh, 0, qk/4);
 
+        float sumqx = 0, sumq2 = 0;
         for (int j = 0; j < qk/2; ++j) {
             const float x0 = x[i*qk + 0    + j]*id;
             const float x1 = x[i*qk + qk/2 + j]*id;
+            const float w0 = x0*x0;
+            const float w1 = x1*x1;
 
             const uint8_t xi0 = MIN(63, (int8_t)(x0 + 32.5f));
             const uint8_t xi1 = MIN(63, (int8_t)(x1 + 32.5f));
@@ -885,7 +888,12 @@ void quantize_row_q6_0_ref(const float * restrict x, block_q6_0 * restrict y, in
             const uint8_t h = (xi0 >> 4) | ((xi1 >> 4) << 2);
             y[i].qh[j%(qk/4)] |= (h << 4*(j/(qk/4)));
 
+            const float q0 = (float)xi0 - 32.f;
+            const float q1 = (float)xi1 - 32.f;
+            sumqx += w0*x[i*qk + j]*q0 + w1*x[i*qk + qk/2 + j]*q1;
+            sumq2 += w0*q0*q0 + w1*q1*q1;
         }
+        y[i].d = sumq2 > 0 ? GGML_FP32_TO_FP16(sumqx/sumq2) : GGML_FP32_TO_FP16(d);
     }
 }