From e640a9ed8888ae6563d98c24004ee178377e54e7 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Sat, 12 Oct 2024 09:52:49 +0300 Subject: [PATCH] iq2k: Try make_qx_quants for the scale Slightly better for LLaMA-3.1, Gemma-2, slightly worse for Qwen2.5 --- ggml/src/ggml-cuda/convert.cu | 8 ++-- ggml/src/iqk/iqk_quantize.cpp | 76 ++++++++++++++++++++++++++++++----- 2 files changed, 69 insertions(+), 15 deletions(-) diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu index 62dd52a2..4ec136fa 100644 --- a/ggml/src/ggml-cuda/convert.cu +++ b/ggml/src/ggml-cuda/convert.cu @@ -729,10 +729,10 @@ static __global__ void dequantize_block_iq2_k(const void * __restrict__ vx, dst_ int il = tid%16; // 0...15 dst_t * y = yy + i*QK_K + 128*ib128 + 2*il; const float d = (float)x[i].d; - const float dl1 = d * (2*((x[i].scales[4*ib128+0] >> 4*(il/8)) & 0xf) - 15); - const float dl2 = d * (2*((x[i].scales[4*ib128+1] >> 4*(il/8)) & 0xf) - 15); - const float dl3 = d * (2*((x[i].scales[4*ib128+2] >> 4*(il/8)) & 0xf) - 15); - const float dl4 = d * (2*((x[i].scales[4*ib128+3] >> 4*(il/8)) & 0xf) - 15); + const float dl1 = d * (((x[i].scales[4*ib128+0] >> 4*(il/8)) & 0xf) - 8); + const float dl2 = d * (((x[i].scales[4*ib128+1] >> 4*(il/8)) & 0xf) - 8); + const float dl3 = d * (((x[i].scales[4*ib128+2] >> 4*(il/8)) & 0xf) - 8); + const float dl4 = d * (((x[i].scales[4*ib128+3] >> 4*(il/8)) & 0xf) - 8); const uint8_t * qs = x[i].qs + 32*ib128 + 2*il; const int16_t extra = x[i].extra >> (8*ib128 + (il/8)); for (int j = 0; j < 2; ++j) { diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp index 430b629f..6cc6cef0 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -30,6 +30,50 @@ inline int nearest_int(float fval) { return (i & 0x007fffff) - 0x00400000; } +float make_qx_quants(int n, int nmax, const float * x, int8_t * L, const float * qw) { + float max = 0; + float amax = 0; + for (int i = 0; i < n; ++i) { + float ax = fabsf(x[i]); + if (ax > amax) { amax = ax; max = x[i]; } + } + if (!amax) { // all zero + for (int i = 0; i < n; ++i) L[i] = 0; + return 0.f; + } + float iscale = -nmax / max; + float sumlx = 0; + float suml2 = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale * x[i]); + l = std::max(-nmax, std::min(nmax-1, l)); + L[i] = l + nmax; + sumlx += qw[i]*x[i]*l; + suml2 += qw[i]*l*l; + } + float scale = suml2 ? sumlx/suml2 : 0.0f; + float best = scale * sumlx; + for (int is = -9; is <= 9; ++is) { + if (is == 0) continue; + iscale = -(nmax + 0.1f*is) / max; + sumlx = suml2 = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale * x[i]); + l = std::max(-nmax, std::min(nmax-1, l)); + sumlx += qw[i]*x[i]*l; + suml2 += qw[i]*l*l; + } + if (suml2 > 0 && sumlx*sumlx > best*suml2) { + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale * x[i]); + L[i] = nmax + std::max(-nmax, std::min(nmax-1, l)); + } + scale = sumlx/suml2; best = scale*sumlx; + } + } + return scale; +} + struct IQ1BNQuantizer { int8_t L[QK_IQ1BN]; void quantize_one_row_1bn(const float * src, block_iq1_bn * y, int n_per_row, const float * imatrix); @@ -507,6 +551,8 @@ void quantize_row_iq2_k_impl(const float * x, void * vy, int n_per_row, const fl float scales[QK_K/kBlockSize]; float weight[kBlockSize]; float sumx[kBlockSize+1], sumw[kBlockSize+1]; + float sw[QK_K/kBlockSize]; + int8_t Ls[QK_K/kBlockSize]; std::array, kBlockSize> pairs; @@ -524,7 +570,7 @@ void quantize_row_iq2_k_impl(const float * x, void * vy, int n_per_row, const fl uint16_t extra = 0; - float max_abs_scale = 0; + float max_abs_scale = 0, max_scale = 0; for (int ib = 0; ib < QK_K/kBlockSize; ++ib) { const float * xb = xbl + kBlockSize*ib; @@ -534,7 +580,11 @@ void quantize_row_iq2_k_impl(const float * x, void * vy, int n_per_row, const fl } else { for (int j = 0; j < kBlockSize; ++j) weight[j] = 0.25f*sigma2 + xb[j]*xb[j]; } - for (int j = 0; j < kBlockSize; ++j) pairs[j] = {xb[j], j}; + sw[ib] = 0; + for (int j = 0; j < kBlockSize; ++j) { + sw[ib] += weight[j]; + pairs[j] = {xb[j], j}; + } std::sort(pairs.begin(), pairs.end()); sumx[0] = sumw[0] = 0; for (int j = 0; j < kBlockSize; ++j) { @@ -583,21 +633,25 @@ void quantize_row_iq2_k_impl(const float * x, void * vy, int n_per_row, const fl if (is_shifted) extra |= (1 << ib); float abs_scale = fabsf(scales[ib]); - max_abs_scale = MAX(max_abs_scale, abs_scale); + if (abs_scale > max_abs_scale) { + max_abs_scale = abs_scale; + max_scale = scales[ib]; + } } if (!max_abs_scale) continue; + float d = make_qx_quants(QK_K/kBlockSize, 8, scales, Ls, sw); + if (!d) continue; - float d = max_abs_scale/15; + //float d = -max_scale/8; y[ibl].extra = extra; float id = 1/d; float sumqx = 0, sumq2 = 0; for (int ib = 0; ib < QK_K/kBlockSize; ++ib) { - int ls = nearest_int(0.5f*(id*scales[ib]+15)); - ls = MAX(0, MIN(15, ls)); - y[ibl].scales[ib/2] |= (ls << 4*(ib%2)); - ls = 2*ls - 15; + int ls = nearest_int(id*scales[ib]); + ls = std::max(-8, std::min(7, ls)); + y[ibl].scales[ib/2] |= ((ls + 8) << 4*(ib%2)); float dl = d * ls; if (dl) { const int8_t * block_values = y[ibl].extra & (1 << ib) ? shifted_values : iq2nl_values; @@ -623,7 +677,7 @@ void quantize_row_iq2_k_impl(const float * x, void * vy, int n_per_row, const fl } } } - y[ibl].d = GGML_FP32_TO_FP16(1.025f*(sumq2 > 0 ? sumqx/sumq2 : d)); + y[ibl].d = GGML_FP32_TO_FP16(1.030f*(sumq2 > 0 ? sumqx/sumq2 : d)); } } @@ -665,8 +719,8 @@ void dequantize_row_iq2_k(const block_iq2_k * GGML_RESTRICT x, float * GGML_RES int shift = 0; for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { - float dl1 = d * (2*(x[i].scales[ib32] & 0xf) - 15); - float dl2 = d * (2*(x[i].scales[ib32] >> 4) - 15); + float dl1 = d * ((x[i].scales[ib32] & 0xf) - 8); + float dl2 = d * ((x[i].scales[ib32] >> 4) - 8); const int8_t * values1 = extra & 1 ? iq2nl_values + 4 : iq2nl_values; const int8_t * values2 = extra & 2 ? iq2nl_values + 4 : iq2nl_values; extra >>= 2;