From 70815ec5b2e93e78784714d8a0b0a46d20109ced Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Sun, 24 Nov 2024 11:29:37 +0200 Subject: [PATCH] iq2k: quantization improvement I was not using the correct scale sign to compute mse when checking the solution with the sign flipped. iq4_kss is now almost on par with the 4-bit Trellis. --- ggml/src/ggml-cuda/convert.cu | 2 +- ggml/src/iqk/iqk_quantize.cpp | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu index 3f23ad4d..84120ac7 100644 --- a/ggml/src/ggml-cuda/convert.cu +++ b/ggml/src/ggml-cuda/convert.cu @@ -715,7 +715,7 @@ static __global__ void dequantize_block_iq4_kss(const void * __restrict__ vx, ds int64_t ii = blockIdx.x; int64_t row = (QK_K * ii) / n_per_row; const char * cx = (const char *)vx + row * row_size; - float scale = *(const float *)cx; + float scale = *(const float *)cx * 1.01f; const block_iq4_kss * x = (const block_iq4_kss *)(cx + sizeof(float)); const int64_t i = ii - (row*n_per_row)/QK_K; diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp index b2b1f819..0891c4f2 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -2898,7 +2898,7 @@ static void quantize_row_iq4_kss_impl(int n_per_row, const float * x, char * cy, vm |= (best_index_iq4nl(values, -al) << 4*j); } vp = prune_iq4ks(vp, values, xv, wv, this_d); - vm = prune_iq4ks(vm, values, xv, wv, this_d); + vm = prune_iq4ks(vm, values, xv, wv, -this_d); for (int j = 0; j < 4; ++j) { float w = wv[j]; float q = values[(vp >> 4*j) & 0xf]; @@ -2938,7 +2938,7 @@ static void quantize_row_iq4_kss_impl(int n_per_row, const float * x, char * cy, vm |= (best_index_iq4nl(shifted_values, -al) << 4*j); } vp = prune_iq4ks(vp, shifted_values, xv, wv, this_d); - vm = prune_iq4ks(vm, shifted_values, xv, wv, this_d); + vm = prune_iq4ks(vm, shifted_values, xv, wv, -this_d); for (int j = 0; j < 4; ++j) { float w = wv[j]; float q 
= shifted_values[(vp >> 4*j) & 0xf]; @@ -3002,8 +3002,8 @@ static void quantize_row_iq4_kss_impl(int n_per_row, const float * x, char * cy, vp |= (best_index_iq4nl( values, al) << 4*j); vm |= (best_index_iq4nl(shifted_values, al) << 4*j); } - vp = prune_iq4ks(vp, values, xv, wv, dl); - vm = prune_iq4ks(vm, shifted_values, xv, wv, dl); + vp = prune_iq4ks(vp, values, xv, wv, dl); + vm = prune_iq4ks(vm, shifted_values, xv, wv, dl); for (int j = 0; j < 4; ++j) { float w = wv[j]; float q = values[(vp >> 4*j) & 0xf];