iq4_kss attempt - not as good as original

This commit is contained in:
Iwan Kawrakow
2024-11-29 11:46:10 +02:00
parent 3a9926b932
commit 93e2c97a8b
6 changed files with 1423 additions and 70 deletions

File diff suppressed because it is too large — Load Diff

View File

@@ -435,9 +435,10 @@ typedef struct {
static_assert(sizeof(block_iq4_ks) == QK_K/32 + QK_K/2, "wrong iq4_ks block size/padding");
typedef struct {
uint32_t qs[QK_K/8];
uint16_t scales;
uint8_t qs[QK_K/2];
} block_iq4_kss;
static_assert(sizeof(block_iq4_kss) == QK_K/8*sizeof(uint32_t), "wrong iq4_kss block size/padding");
static_assert(sizeof(block_iq4_kss) == QK_K/2 + sizeof(uint16_t), "wrong iq4_kss block size/padding");
typedef struct {
ggml_half d;

View File

@@ -715,28 +715,19 @@ static __global__ void dequantize_block_iq4_kss(const void * __restrict__ vx, ds
int64_t ii = blockIdx.x;
int64_t row = (QK_K * ii) / n_per_row;
const char * cx = (const char *)vx + row * row_size;
float scale = *(const float *)cx;
const block_iq4_kss * x = (const block_iq4_kss *)(cx + sizeof(float));
const int64_t i = ii - (row*n_per_row)/QK_K;
float scale = (float)*(const half *)cx;
const block_iq4_kss * x = (const block_iq4_kss *)(cx + sizeof(half));
const int8_t * values = iq4k_values + 16;
const int64_t i = ii - (row*n_per_row)/QK_K;
const int64_t tid = threadIdx.x;
const int64_t il = tid/8; // 0...3
const int64_t ib = tid%8; // 0...7
dst_t * y = yy + ii*QK_K + 32*ib + 4*il;
const uint32_t * q4 = x[i].qs + 4*ib;
uint32_t s32 = (q4[0] & 0x00010001) | ((q4[1] & 0x00010001) << 2) | ((q4[2] & 0x00010001) << 4) | ((q4[3] & 0x00010001) << 6);
uint8_t ls = (s32 | (s32 >> 15)) & 0xff;
const float d = scale * ((ls & 254) - 127);
const int8_t * values = iq4k_values + ((ls & 1) << 4);
uint32_t aux32[2];
aux32[0] = q4[il] & 0xfffefffe;
aux32[0] ^= (aux32[0] >> 1);
aux32[1] = ((aux32[0] >> 4) & 0x0f0f0f0f);
aux32[0] &= 0x0f0f0f0f;
const uint8_t * aux8 = (const uint8_t *)aux32;
dst_t * y = yy + ii*QK_K + 4*tid;
const uint8_t * qs = x[i].qs + 4*tid;
float d1 = scale * (((x[i].scales >> (4*(tid/16)+0)) & 0xf) + 1);
float d2 = scale * (((x[i].scales >> (4*(tid/16)+8)) & 0xf) + 1);
for (int j = 0; j < 4; ++j) {
y[j+ 0] = d * values[aux8[j+0]];
y[j+16] = d * values[aux8[j+4]];
y[j ] = d1 * values[qs[j] & 0xf];
y[j+QK_K/2] = d2 * values[qs[j] >> 4];
}
}

View File

@@ -245,6 +245,8 @@ __device__ __forceinline__ float vec_dot_iq4_ks_q8_1(
__device__ __forceinline__ float vec_dot_iq4_kss_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
return 0.f;
float scale = *(const float *)vbq;
const block_iq4_kss * bq4 = (const block_iq4_kss *)((const char *)vbq + sizeof(float)) + kbx;
const uint8_t * all_values = (const uint8_t *)iq4k_values;

View File

@@ -1085,7 +1085,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot = vec_dot_iq4_kss_q8_k,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
.row_meta_size = 4,
.row_meta_size = 2,
},
[GGML_TYPE_Q8_K] = {
.type_name = "q8_K",

View File

@@ -2757,7 +2757,376 @@ uint16_t prune_iq4ks(uint16_t v, const int8_t * values, const float * x, const f
q4[jbest] = bestq;
return (q4[0] | (q4[1] << 4) | (q4[2] << 8) | (q4[3] << 12));
}
static void quantize_row_iq4_kss_impl(int n_per_row, const float * x, char * cy,
// Binary search for the entry of the sorted array `values` (length n) that is
// nearest to x. Ties between the two bracketing entries go to the lower index.
template <typename T>
static inline int best_index(int n, const T * values, float x) {
    if (x <= values[0]) return 0;
    if (x >= values[n-1]) return n-1;
    int lo = 0, hi = n - 1;
    // Invariant: values[lo] <= x < values[hi]; shrink until they are adjacent.
    while (hi - lo > 1) {
        const int mid = (lo + hi)/2;
        if (x < values[mid]) hi = mid;
        else lo = mid;
    }
    const float dist_down = x - values[hi-1];
    const float dist_up   = values[hi] - x;
    return dist_down < dist_up ? hi - 1 : hi;
}
// Quantize one row of n_per_row floats into the iq4_kss layout:
// a single fp16 super-scale for the whole row, followed by n_per_row/QK_K
// block_iq4_kss blocks, each holding a 4-bit integer scale per 64-element
// sub-block (stored as l-1, so l is in 1..16) plus 4-bit indices into the
// non-linear iq4k_values grid, packed two per byte.
//
//   xr            : input row, n_per_row floats
//   cy            : destination row buffer (ggml_half super-scale + blocks)
//   quant_weights : optional importance weights (imatrix), may be NULL
//   weights       : caller scratch, n_per_row floats
//   quants        : caller scratch, n_per_row 4-bit indices (as int8)
//   scales        : caller scratch, n_per_row/64 per-sub-block scales
void quantize_row_iq4_kss_impl(int n_per_row, const float * xr, char * cy,
        const float * quant_weights,
        float * weights, int8_t * quants,
        float * scales) {
    constexpr int kBlockSize = 64;
    auto values = iq4k_values + 16;   // second (shifted) half of the 4-bit grid table
    ggml_half * dh = (ggml_half *)cy;
    dh[0] = GGML_FP32_TO_FP16(0.f);
    block_iq4_kss * y = (block_iq4_kss *)(dh + 1);
    std::memset(y, 0, (n_per_row/QK_K)*sizeof(block_iq4_kss));
    // Pass 1: absolute maximum of each 64-element sub-block.
    float max_amax = 0;
    for (int ib = 0; ib < n_per_row/kBlockSize; ++ib) {
        auto xb = xr + ib*kBlockSize;
        float amax = 0;
        for (int j = 0; j < kBlockSize; ++j) amax = std::max(amax, std::abs(xb[j]));
        scales[ib] = amax;
        max_amax = std::max(amax, max_amax);
    }
    if (!max_amax) {
        return; // all-zero row: super-scale stays 0, blocks stay zeroed
    }
    // Turn the per-block amax into an integer scale in 1..16 relative to the row max.
    float idm = 16/max_amax;
    for (int ib = 0; ib < n_per_row/kBlockSize; ++ib) {
        int l = nearest_int(idm*scales[ib]);
        l = std::max(1, std::min(16, l));
        scales[ib] = l;
    }
    // Statistics over the scale-normalized values: signed value of largest
    // magnitude (max/amax) and the row variance (sigma2).
    float max = 0, amax = 0, sigma2 = 0;
    for (int ib = 0; ib < n_per_row/kBlockSize; ++ib) {
        auto xb = xr + ib*kBlockSize;
        float norm = 1/scales[ib];
        for (int j = 0; j < kBlockSize; ++j) {
            sigma2 += xb[j]*xb[j];
            float xs = norm*xb[j];
            float axs = std::abs(xs);
            if (axs > amax) {
                amax = axs; max = xs;
            }
        }
    }
    sigma2 *= 1.f/n_per_row;
    // Importance weights: imatrix-based when available, variance-regularized otherwise.
    if (quant_weights) {
        for (int j = 0; j < n_per_row; ++j) weights[j] = quant_weights[j] * sqrt(sigma2 + xr[j]*xr[j]);
    } else {
        for (int j = 0; j < n_per_row; ++j) weights[j] = 0.25f*sigma2 + xr[j]*xr[j];
    }
    // Grid search over candidate inverse scales around values[0]/max, keeping the
    // d that maximizes the weighted correlation sumqx^2/sumq2.
    float best = 0, d = max/values[0];
    for (int itry = -9; itry <= 9; ++itry) {
        float id = (values[0] + itry)/max;
        float sumqx = 0, sumq2 = 0;
        for (int ib = 0; ib < n_per_row/kBlockSize; ++ib) {
            auto xb = xr + ib*kBlockSize;
            auto wb = weights + ib*kBlockSize;
            float norm = 1/scales[ib];
            for (int j = 0; j < kBlockSize; ++j) {
                int idx = best_index_iq4nl(values, id*norm*xb[j]);
                float q = values[idx]*scales[ib];
                sumqx += wb[j]*q*xb[j];
                sumq2 += wb[j]*q*q;
            }
        }
        if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
            d = sumqx/sumq2; best = d*sumqx;
        }
    }
    dh[0] = GGML_FP32_TO_FP16(d);
    if (!d) return;
    // Initial quantization with the selected super-scale.
    float id = 1/d;
    for (int ib = 0; ib < n_per_row/kBlockSize; ++ib) {
        auto xb = xr + ib*kBlockSize;
        float norm = 1/scales[ib];
        for (int j = 0; j < kBlockSize; ++j) {
            int idx = best_index_iq4nl(values, id*norm*xb[j]);
            quants[ib*kBlockSize+j] = idx;
        }
    }
    // Up to 3 refinement rounds: for each sub-block try the neighboring integer
    // scales l-1 and l+1, keep whichever gives the lowest weighted MSE, then
    // re-quantize and re-fit d. Stop early if no block scale changed.
    for (int itry = 0; itry < 3; ++itry) {
        id = 1/d;
        int nchanged = 0;
        for (int ib = 0; ib < n_per_row/kBlockSize; ++ib) {
            auto xb = xr + ib*kBlockSize;
            auto wb = weights + ib*kBlockSize;
            float best_mse = 0;
            for (int j = 0; j < kBlockSize; ++j) {
                float q = scales[ib]*values[quants[ib*kBlockSize+j]];
                float diff = xb[j] - d*q;
                best_mse += wb[j]*diff*diff;
            }
            int l = nearest_int(scales[ib]);
            if (l > 1) {
                float try_scale = l-1;
                float norm = 1/try_scale;
                float this_mse = 0;
                for (int j = 0; j < kBlockSize; ++j) {
                    int idx = best_index_iq4nl(values, id*norm*xb[j]);
                    float q = values[idx]*try_scale;
                    float diff = xb[j] - d*q;
                    this_mse += wb[j]*diff*diff;
                }
                if (this_mse < best_mse) {
                    best_mse = this_mse; scales[ib] = try_scale;
                    ++nchanged;
                }
            }
            if (l < 16) {
                float try_scale = l+1;
                float norm = 1/try_scale;
                float this_mse = 0;
                for (int j = 0; j < kBlockSize; ++j) {
                    int idx = best_index_iq4nl(values, id*norm*xb[j]);
                    float q = values[idx]*try_scale;
                    float diff = xb[j] - d*q;
                    this_mse += wb[j]*diff*diff;
                }
                if (this_mse < best_mse) {
                    best_mse = this_mse; scales[ib] = try_scale;
                    ++nchanged;
                }
            }
        }
        if (nchanged == 0) break;
        // Re-quantize with the updated scales and re-fit the super-scale d.
        float sumqx = 0, sumq2 = 0;
        for (int ib = 0; ib < n_per_row/kBlockSize; ++ib) {
            auto xb = xr + ib*kBlockSize;
            auto wb = weights + ib*kBlockSize;
            float norm = 1/scales[ib];
            for (int j = 0; j < kBlockSize; ++j) {
                int idx = best_index_iq4nl(values, id*norm*xb[j]);
                quants[ib*kBlockSize+j] = idx;
                float q = values[idx]*scales[ib];
                sumqx += wb[j]*q*xb[j];
                sumq2 += wb[j]*q*q;
            }
        }
        d = sumqx/sumq2;
    }
    // Pack the result: 4-bit scales (stored as l-1) into y[ibl].scales and the
    // 4-bit quant indices two per byte (low nibble = first half, high = second).
    for (int ibl = 0; ibl < n_per_row/QK_K; ++ibl) {
        y[ibl].scales = 0;
        auto qs = y[ibl].qs;
        auto qb = quants + ibl*QK_K;
        for (int ib = 0; ib < QK_K/kBlockSize; ++ib) {
            int l = nearest_int(scales[ibl*(QK_K/kBlockSize)+ib]);
            if (l < 1 || l > 16) {
                // Fixed: print the scale actually being packed; the previous
                // scales[ib] printed the wrong block's scale for ibl > 0.
                printf("Oops: scale = %g, l = %d\n", scales[ibl*(QK_K/kBlockSize)+ib], l);
                GGML_ABORT("fatal error");
            }
            y[ibl].scales |= ((l-1) << 4*ib);
        }
        for (int j = 0; j < QK_K/2; ++j) qs[j] = qb[j] | (qb[j+QK_K/2] << 4);
    }
    dh[0] = GGML_FP32_TO_FP16(d);
}
static void quantize_row_iq4_kss_impl_old(int n_per_row, const float * x, char * cy,
float * all_scales, float * weight,
const int8_t * values,
const float * quant_weights,
@@ -2768,7 +3137,7 @@ static void quantize_row_iq4_kss_impl(int n_per_row, const float * x, char * cy,
constexpr int block_size = 32;
float * dptr = (float *)cy;
*dptr = 0;
dptr[0] = 0;
block_iq4_kss * y = (block_iq4_kss *)(dptr + 1);
const int8_t * shifted_values = values + 16;
@@ -3043,17 +3412,16 @@ void prune_iq4ks_to_iq4kss(int n_per_row, const uint16_t * table, const char * c
}
size_t quantize_iq4_kss(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
constexpr int kBlockSize = 32; //128;
GGML_ASSERT(n_per_row%QK_K == 0);
auto row_size = ggml_row_size(GGML_TYPE_IQ4_KSS, n_per_row);
auto row_size_ks = ggml_row_size(GGML_TYPE_IQ4_KS, n_per_row);
std::vector<char> work(row_size_ks);
std::vector<float> all_scales(n_per_row/kBlockSize);
float weight[kBlockSize];
auto row_size = ggml_row_size(GGML_TYPE_IQ4_KSS, n_per_row);
std::vector<float> weights(n_per_row);
std::vector<float> scales(n_per_row/64);
//std::vector<float> all_steps;
std::vector<int8_t> quants(n_per_row);
auto qrow = (char *)dst;
auto table = scramble_table();
for (int row = 0; row < nrows; ++row) {
quantize_row_iq4_kss_impl(n_per_row, src, qrow, all_scales.data(), weight, iq4k_values, imatrix, table, 7);
//quantize_row_iq4_kss_impl(n_per_row, src, qrow, imatrix, weights.data(), quants.data(), all_steps);
quantize_row_iq4_kss_impl(n_per_row, src, qrow, imatrix, weights.data(), quants.data(), scales.data());
src += n_per_row;
qrow += row_size;
}
@@ -3069,43 +3437,43 @@ void quantize_row_iq4_kss(const float * x, void * y, int64_t k) {
}
void dequantize_row_iq4_kss(const block_iq4_kss * x, float * y, int64_t k) {
const float * dptr = (const float *)x;
const float d = *dptr;
x = (const block_iq4_kss *)(dptr + 1);
uint16_t aux16[8];
const uint8_t * aux8 = (const uint8_t *)aux16;
for (int ibl = 0; ibl < k/QK_K; ++ibl) {
auto qs = (const uint16_t *)x[ibl].qs;
for (int ib = 0; ib < QK_K/32; ++ib) {
//uint8_t ls = ((qs[0] >> 30) | ((qs[1] >> 28) & 0x0c) | ((qs[2] >> 26) & 0x30) | ((qs[3] >> 24) & 0xc0));
//const int8_t * values = iq4k_values + ((ls & 1) << 4);
//const float dl = d * ((ls & 254) - 127);
//for (int k = 0; k < 4; ++k) {
// uint16_t vl = qs[k] & 0x7fff;
// vl ^= (vl << 1);
// uint16_t vh = (qs[k] >> 15) & 0x7fff;
// vh ^= (vh << 1);
// for (int j = 0; j < 4; ++j) {
// y[4*k + j + 0] = dl*values[(vl >> 4*j) & 0xf];
// y[4*k + j + 16] = dl*values[(vh >> 4*j) & 0xf];
// }
//}
int16_t ls = 0;
for (int k = 0; k < 8; ++k) {
aux16[k] = qs[k] & 0xfffe;
aux16[k] ^= (aux16[k] >> 1);
ls |= (qs[k] & 1) << k;
const ggml_half * dh = (const ggml_half *)x;
const float d = GGML_FP16_TO_FP32(dh[0]);
x = (const block_iq4_kss *)(dh + 1);
const int8_t * values = iq4k_values + 16;
int nblock = k/QK_K;
float scales[QK_K/64];
for (int ib = 0; ib < nblock; ++ib) {
for (int k = 0; k < QK_K/64; ++k) scales[k] = d*(((x[ib].scales >> 4*k) & 0xf) + 1);
auto qs = x[ib].qs;
for (int is = 0; is < 2; ++is) {
for (int j = 0; j < 64; ++j) {
y[j ] = scales[is+0] * values[qs[j] & 0xf];
y[j+QK_K/2] = scales[is+2] * values[qs[j] >> 4];
}
const int8_t * values = iq4k_values + ((ls & 1) << 4);
float dl = d * ((ls & 254) - 127);
for (int j = 0; j < 16; ++j) {
y[j+ 0] = dl * values[aux8[j] & 0xf];
y[j+16] = dl * values[aux8[j] >> 4];
}
y += 32;
qs += 8;
y += 64;
qs += 64;
}
y += QK_K/2;
}
//const float * dptr = (const float *)x;
//const float d = *dptr;
//const int8_t * int_values = (const int8_t *)(dptr + 1);
//const uint8_t * qs = (const uint8_t *)int_values + 16;
//int nblock = k/QK_K;
//for (int ib = 0; ib < nblock; ++ib) {
// for (int j = 0; j < QK_K/2; ++j) {
// y[j ] = d * int_values[qs[j] & 0xf];
// y[j+QK_K/2] = d * int_values[qs[j] >> 4];
// }
// qs += QK_K/2;
// y += QK_K;
//}
//for (int j = 0; j < k/2; ++j) {
// y[j ] = d * int_values[qs[j] & 0xf];
// y[j+k/2] = d * int_values[qs[j] >> 4];
//}
}
void vec_dot_iq4_kss_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) {