From a4b41b4870c39d81b7bd06a29e19474df28afb7e Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Wed, 2 Oct 2024 12:02:57 +0300
Subject: [PATCH] q6_0: slightly better kv-cache result

Better than q8_0+q4_0, but not as good as q8_0+iq4_nl
---
 examples/llama-bench/llama-bench.cpp |  3 +++
 ggml/src/ggml-quants.c               | 10 +++++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index fc77be50..9e4fd266 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -327,6 +327,9 @@ static ggml_type ggml_type_from_name(const std::string & s) {
     if (s == "iq4_nl") {
         return GGML_TYPE_IQ4_NL;
     }
+    if (s == "q6_0") {
+        return GGML_TYPE_Q6_0;
+    }
 
     return GGML_TYPE_COUNT;
 }
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 1de1eb06..f5fff22e 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -870,12 +870,15 @@ void quantize_row_q6_0_ref(const float * restrict x, block_q6_0 * restrict y, in
         const float d = max / -32;
         const float id = d ? 1.0f/d : 0.0f;
 
-        y[i].d = GGML_FP32_TO_FP16(d);
+        //y[i].d = GGML_FP32_TO_FP16(d);
         memset(y[i].qh, 0, qk/4);
 
+        float sumqx = 0, sumq2 = 0;
         for (int j = 0; j < qk/2; ++j) {
             const float x0 = x[i*qk + 0 + j]*id;
             const float x1 = x[i*qk + qk/2 + j]*id;
+            const float w0 = x0*x0;
+            const float w1 = x1*x1;
             const uint8_t xi0 = MIN(63, (int8_t)(x0 + 32.5f));
             const uint8_t xi1 = MIN(63, (int8_t)(x1 + 32.5f));
 
@@ -885,7 +888,12 @@ void quantize_row_q6_0_ref(const float * restrict x, block_q6_0 * restrict y, in
 
             const uint8_t h = (xi0 >> 4) | ((xi1 >> 4) << 2);
             y[i].qh[j%(qk/4)] |= (h << 4*(j/(qk/4)));
+            const float q0 = (float)xi0 - 32.f;
+            const float q1 = (float)xi1 - 32.f;
+            sumqx += w0*x[i*qk + j]*q0 + w1*x[i*qk + qk/2 + j]*q1;
+            sumq2 += w0*q0*q0 + w1*q1*q1;
         }
+        y[i].d = sumq2 > 0 ? GGML_FP32_TO_FP16(sumqx/sumq2) : GGML_FP32_TO_FP16(d);
     }
 }
 