New IQ2_KT, IQ3_KT and IQ4_KT, V2 (#529)

* New iq4_kt trellis

The new trellis generates int8_t values via
sum_as_int8_t[(ka * idx + kb) & 0x3f3f3f3f] - 126.
CUDA dequantize works.
AVX2 case Ny > 32 works, and we get 273 t/s for L3-8B.
PPL is on par or even slightly lower than original QTIP trellis.

* Something is not working with the AVX2 dot product

* New iq4_kt: CUDA MMVQ

* New iq4_kt: CUDA MMQ

* For now have only iq4_kt use the new trellis

* Fix iq2_kt that got broken along the way

* New iq4_kt: AVX2 dot product finally works

We get 13.6 t/s vs 8.4 t/s with the f16 trellis and f32 arithmetic.
Still somewhat slower than other quants, but no longer pathetic.

* New iq4_kt: fix vanilla AVX2

* New iq4_kt: NEON implementation

We get very respectable PP-512 = 120 t/s.
TG-128 is pathetic at 5.3 t/s, so 20+% slower than the f16 variant.

* New iq4_kt: slightly faster NEON

* New iq4_kt: slightly faster NEON

* New iq4_kt: faster NEON

We are now at 9.4 t/s, up from 6.6 t/s for the f16 trellis.

* Minor

* New iq4_kt trellis: not working Metal implementation

* Remove the extra 4 bytes of row meta data that is no longer used

* Cleanup

* Adding forgotten file

* Switching iq2_kt to new trellis - CUDA MMQ

* New iq2_kt: CUDA GEMV

* New iq2_kt: AVX2 dequantize

* New iq2_kt: AVX2 GEMM/GEMV

* Adding forgotten file

* New iq2_kt: NEON GEMM/GEMV

* New iq2_kt: slightly faster NEON GEMM

* New iq2_kt: Metal - very slow.

It seems Apple Silicon cannot quickly add 4 8-bit ints.
Or I don't know how to do it - but I didn't find anything
in the Metal Shading Language Specification.
So, performance is quite a bit worse than the original trellis.

* Add missing break

* Trying @louiehelm's multiplier

* CPU

* iq3_kt: use integer trellis + CUDA dequantize and MMVQ

* iq3_kt: MMQ

* iq3_kt: AVX2 GEMM

* iq3_kt: AVX2 GEMV

* The trellis quants now need super-blocks of 256, so we need a check

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
Kawrakow
2025-06-18 16:20:54 +03:00
committed by GitHub
parent c410cc72bb
commit d85c64428e
16 changed files with 1668 additions and 132 deletions

View File

@@ -7397,7 +7397,7 @@ void dequantize_row_ms_i2s(const void * vx, float * y, int64_t k) {
}
namespace {
template <int block_size, int group_size, int num_bits, bool is_abs = false>
template <int block_size, int group_size, int num_bits, bool is_abs = false, bool is_int = false>
class QuantizerIQKT {
static_assert(group_size == 8 || group_size == 4);
static_assert(block_size >= 8 && block_size%8 == 0);
@@ -7408,7 +7408,7 @@ public:
constexpr static int kNg = kBlockSize/kGroupSize;
constexpr static int kNblock = kSuperBlockSize/kBlockSize;
constexpr static int kNumVal = 1 << num_bits; // i.e, 16 bits per group of 8
constexpr static float kScale = 31.75f;
constexpr static float kScale = is_int ? 1.f : 31.75f;
constexpr static bool kVerbose = false;
QuantizerIQKT(int num_clusters, int num_neighbours, int offset = 4096);
@@ -7419,17 +7419,32 @@ public:
inline float find_best_inverse_scale(const float * xb, const float * weight, const int * best_idx) const;
static inline void set_values(uint32_t i, float * result, float scale, int offset = 4096) {
constexpr uint32_t ka = 89226354;
constexpr uint32_t kb = 64248484;
constexpr uint32_t kmask = 0x8fff8fff;
constexpr uint32_t km32 = 0x3b603b60;
uint32_t x = i + offset;
for (int k = 0; k < kGroupSize; ++k) {
x = ka*x + kb;
uint32_t s = (x & kmask) ^ km32;
float val = GGML_FP16_TO_FP32(s & 65535) + GGML_FP16_TO_FP32(s >> 16);
if constexpr (is_abs) result[k] = scale*std::abs(val);
else result[k] = scale*val;
if constexpr (is_int) {
constexpr uint32_t ka = 0xCBAC1FED;
uint32_t s;
auto i8 = (const int8_t *)&s;
for (int k = 0; k < kGroupSize; ++k) {
x = ka*x;
s = x & 0x3f3f3f3f;
if constexpr (is_abs) {
result[k] = scale*std::abs(i8[0] + i8[1] + i8[2] + i8[3] - 126.f);
} else {
result[k] = scale*(i8[0] + i8[1] + i8[2] + i8[3] - 126.f);
}
}
} else {
constexpr uint32_t ka = 89226354;
constexpr uint32_t kb = 64248484;
constexpr uint32_t kmask = 0x8fff8fff;
constexpr uint32_t km32 = 0x3b603b60;
for (int k = 0; k < kGroupSize; ++k) {
x = ka*x + kb;
uint32_t s = (x & kmask) ^ km32;
float val = GGML_FP16_TO_FP32(s & 65535) + GGML_FP16_TO_FP32(s >> 16);
if constexpr (is_abs) result[k] = scale*std::abs(val);
else result[k] = scale*val;
}
}
}
@@ -7478,14 +7493,15 @@ private:
float m_mid[4*kGroupSize];
};
template <int block_size, int group_size, int num_bits, bool is_abs>
QuantizerIQKT<block_size, group_size, num_bits, is_abs>::QuantizerIQKT(int num_clusters, int num_neighbours, int offset) {
template <int block_size, int group_size, int num_bits, bool is_abs, bool is_int>
QuantizerIQKT<block_size, group_size, num_bits, is_abs, is_int>::QuantizerIQKT(int num_clusters, int num_neighbours, int offset) {
m_values.resize(kNumVal*kGroupSize);
float * data = m_values.data();
for (int i = 0; i < kNumVal; ++i) {
set_values(i, data, kScale, offset);
data += kGroupSize;
}
if (num_clusters == 0) return;
// Make 128 clusters.
// Note: we get a slightly better result by using 64 clusters
// at the expense of almost doubling the quantization time.
@@ -7494,8 +7510,8 @@ QuantizerIQKT<block_size, group_size, num_bits, is_abs>::QuantizerIQKT(int num_c
m_in_cluster = finalize_clusters(num_neighbours, m_values, m_clusters, m_c_values);
}
template <int block_size, int group_size, int num_bits, bool is_abs>
std::pair<float, float> QuantizerIQKT<block_size, group_size, num_bits, is_abs>::find_best_scale(
template <int block_size, int group_size, int num_bits, bool is_abs, bool is_int>
std::pair<float, float> QuantizerIQKT<block_size, group_size, num_bits, is_abs, is_int>::find_best_scale(
const float * xb, const float * weight, const int * best_idx) const {
float sumqx = 0, sumq2 = 0;
#ifdef __AVX2__
@@ -7527,8 +7543,8 @@ std::pair<float, float> QuantizerIQKT<block_size, group_size, num_bits, is_abs>:
return sumq2 > 0 ? std::make_pair(sumqx/sumq2, sumqx*sumqx/sumq2) : std::make_pair(0.f, 0.f);
}
template <int block_size, int group_size, int num_bits, bool is_abs>
float QuantizerIQKT<block_size, group_size, num_bits, is_abs>::find_best_inverse_scale(
template <int block_size, int group_size, int num_bits, bool is_abs, bool is_int>
float QuantizerIQKT<block_size, group_size, num_bits, is_abs, is_int>::find_best_inverse_scale(
const float * xb, const float * weight, const int * best_idx) const {
float sumqx = 0, sumx2 = 0;
#ifdef __AVX2__
@@ -7560,8 +7576,8 @@ float QuantizerIQKT<block_size, group_size, num_bits, is_abs>::find_best_inverse
return sumx2 > 0 ? sumqx/sumx2 : 0.f;
}
template <int block_size, int group_size, int num_bits, bool is_abs>
void QuantizerIQKT<block_size, group_size, num_bits, is_abs>::find_best_match(float d, const float * xb, const float * weight, int * best_idx) const {
template <int block_size, int group_size, int num_bits, bool is_abs, bool is_int>
void QuantizerIQKT<block_size, group_size, num_bits, is_abs, is_int>::find_best_match(float d, const float * xb, const float * weight, int * best_idx) const {
if (!d) {
std::memset(best_idx, 0, kNg*sizeof(int));
return;
@@ -7739,8 +7755,8 @@ void QuantizerIQKT<block_size, group_size, num_bits, is_abs>::find_best_match(fl
#endif
}
template <int block_size, int group_size, int num_bits, bool is_abs>
std::vector<std::vector<int>> QuantizerIQKT<block_size, group_size, num_bits, is_abs>::finalize_clusters(int num_neighbours,
template <int block_size, int group_size, int num_bits, bool is_abs, bool is_int>
std::vector<std::vector<int>> QuantizerIQKT<block_size, group_size, num_bits, is_abs, is_int>::finalize_clusters(int num_neighbours,
const std::vector<float>& values, const std::vector<float>& clusters, std::vector<std::vector<float>>& c_values) {
int ncluster = clusters.size()/kGroupSize;
std::vector<std::vector<int>> p_in_cluster(ncluster);
@@ -7826,8 +7842,8 @@ std::vector<std::vector<int>> QuantizerIQKT<block_size, group_size, num_bits, is
return p_in_cluster;
}
template <int block_size, int group_size, int num_bits, bool is_abs>
std::vector<float> QuantizerIQKT<block_size, group_size, num_bits, is_abs>::cluster_points(const std::vector<float>& points, int ncluster, int niter, float * mid) {
template <int block_size, int group_size, int num_bits, bool is_abs, bool is_int>
std::vector<float> QuantizerIQKT<block_size, group_size, num_bits, is_abs, is_int>::cluster_points(const std::vector<float>& points, int ncluster, int niter, float * mid) {
constexpr int ndim = kGroupSize;
GGML_ASSERT(points.size() % ndim == 0);
int npoint = points.size() / ndim;
@@ -7995,7 +8011,7 @@ std::vector<float> QuantizerIQKT<block_size, group_size, num_bits, is_abs>::clus
// ========================================== iq2_kt ====================================================
using QuantizerIQ2KT = QuantizerIQKT<32, 8, 16>;
using QuantizerIQ2KT = QuantizerIQKT<32, 8, 16, false, true>;
const QuantizerIQ2KT& iq2kt_quantizer() {
static std::mutex mutex;
@@ -8006,7 +8022,7 @@ const QuantizerIQ2KT& iq2kt_quantizer() {
}
void quantize_row_iq2_kt_impl(const float * x, void * vy, int n_per_row, const float * quant_weights, float * all_scales, float * all_weights,
float * qtmp) {
int * all_idx) {
constexpr float kSigmaScale = 2.0f;
using Q = QuantizerIQ2KT;
@@ -8025,6 +8041,11 @@ void quantize_row_iq2_kt_impl(const float * x, void * vy, int n_per_row, const f
Q::set_weights(kSigmaScale, nblock, x, quant_weights, all_weights);
float amax_row = 0;
for (int j = 0; j < n_per_row; ++j) {
amax_row = std::max(amax_row, std::abs(x[j]));
}
float amax_scale = 0, max_scale = 0;
for (int ibl = 0; ibl < nblock; ++ibl) {
@@ -8042,9 +8063,10 @@ void quantize_row_iq2_kt_impl(const float * x, void * vy, int n_per_row, const f
float ax = std::abs(xb[j]);
amax = std::max(amax, ax);
}
quantizer.find_best_match( amax/96.f, xb, weight, best_idx);
float scale_0 = std::max(90.f, 124.f*amax/amax_row);
quantizer.find_best_match( amax/scale_0, xb, weight, best_idx);
auto [dp, score_p] = quantizer.find_best_scale(xb, weight, best_idx);
quantizer.find_best_match(-amax/96.f, xb, weight, best_idx + Q::kNg);
quantizer.find_best_match(-amax/scale_0, xb, weight, best_idx + Q::kNg);
auto [dm, score_m] = quantizer.find_best_scale(xb, weight, best_idx + Q::kNg);
auto idx = best_idx;
@@ -8052,12 +8074,7 @@ void quantize_row_iq2_kt_impl(const float * x, void * vy, int n_per_row, const f
else {
scales[ib] = dm; idx += Q::kNg;
}
auto qt = qtmp + ibl*Q::kSuperBlockSize + ib*Q::kBlockSize;
for (int ig = 0; ig < Q::kNg; ++ig) {
auto q = quantizer.values() + idx[ig]*Q::kGroupSize;
for (int j = 0; j < Q::kGroupSize; ++j) qt[j] = q[j];
qt += Q::kGroupSize;
}
for (int ig = 0; ig < Q::kNg; ++ig) all_idx[(ibl*Q::kSuperBlockSize + ib*Q::kBlockSize)/Q::kGroupSize + ig] = idx[ig];
float abs_scale = std::abs(scales[ib]);
if (abs_scale > amax_scale) {
@@ -8080,20 +8097,22 @@ void quantize_row_iq2_kt_impl(const float * x, void * vy, int n_per_row, const f
float sumqx = 0, sumq2 = 0;
for (int ibl = 0; ibl < nblock; ++ibl) {
const float * xb = x + ibl*Q::kSuperBlockSize;
const float * qb = qtmp + ibl*Q::kSuperBlockSize;
const float * wb = all_weights + ibl*Q::kSuperBlockSize;
auto scales = all_scales + ibl*Q::kNblock;
for (int ib = 0; ib < Q::kNblock; ++ib) {
int ls = best_index_iq4nl(iq4k_values, id*scales[ib]);
float dl = iq4k_values[ls];
for (int j = 0; j < Q::kBlockSize; ++j) {
float q = dl*qb[j];
sumqx += wb[j]*xb[j]*q;
sumq2 += wb[j]*q*q;
for (int ig = 0; ig < Q::kNg; ++ig) {
auto qb = quantizer.values() + Q::kGroupSize*all_idx[(ibl*Q::kSuperBlockSize + ib*Q::kBlockSize)/Q::kGroupSize + ig];
for (int j = 0; j < Q::kGroupSize; ++j) {
int jj = ig*Q::kGroupSize + j;
float q = dl*qb[j];
sumqx += wb[jj]*xb[jj]*q;
sumq2 += wb[jj]*q*q;
}
}
xb += Q::kBlockSize;
wb += Q::kBlockSize;
qb += Q::kBlockSize;
}
}
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
@@ -8129,6 +8148,26 @@ void quantize_row_iq2_kt_impl(const float * x, void * vy, int n_per_row, const f
float dl = d*ls;
quantizer.find_best_match(dl, xb, weight, best_idx);
auto prev_idx = all_idx + (ibl*Q::kSuperBlockSize + ib*Q::kBlockSize)/Q::kGroupSize;
float mse1 = 0, mse2 = 0;
for (int ig = 0; ig < Q::kNg; ++ig) {
auto q1 = quantizer.values() + Q::kGroupSize*prev_idx[ig];
auto q2 = quantizer.values() + Q::kGroupSize*best_idx[ig];
for (int j = 0; j < Q::kGroupSize; ++j) {
int jj = ig*Q::kGroupSize + j;
float diff1 = xb[jj] - dl*q1[j];
float diff2 = xb[jj] - dl*q2[j];
mse1 += weight[jj]*diff1*diff1;
mse2 += weight[jj]*diff2*diff2;
}
}
if (mse1 < mse2) {
for (int ig = 0; ig < Q::kNg; ++ig) best_idx[ig] = prev_idx[ig];
} else {
for (int ig = 0; ig < Q::kNg; ++ig) prev_idx[ig] = best_idx[ig];
}
for (int j = 0; j < Q::kNg; ++j) {
qs[j] = best_idx[j];
auto xl = xb + Q::kGroupSize*j;
@@ -8196,10 +8235,10 @@ size_t quantize_iq2_kt(const float * src, void * dst, int64_t nrows, int64_t n_p
auto row_size = ggml_row_size(GGML_TYPE_IQ2_KT, n_per_row);
std::vector<float> scales(n_per_row/QuantizerIQ2KT::kBlockSize);
std::vector<float> weights(n_per_row);
std::vector<float> xtmp(n_per_row);
std::vector<int> idx(n_per_row/QuantizerIQ2KT::kGroupSize);
char * qrow = (char *)dst;
for (int64_t row = 0; row < nrows; ++row) {
quantize_row_iq2_kt_impl(src, (void *)qrow, n_per_row, imatrix, scales.data(), weights.data(), xtmp.data());
quantize_row_iq2_kt_impl(src, (void *)qrow, n_per_row, imatrix, scales.data(), weights.data(), idx.data());
src += n_per_row;
qrow += row_size;
}
@@ -8209,7 +8248,7 @@ size_t quantize_iq2_kt(const float * src, void * dst, int64_t nrows, int64_t n_p
void dequantize_row_iq2_kt(const block_iq2_kt * x, float * y, int64_t k) {
assert(k % QuantizerIQ2KT::kSuperBlockSize == 0);
#ifdef __AVX2__
if (iqk_dequantize_ktquants(GGML_TYPE_IQ2_KT, k, x, 0, y, 0, 1)) return;
//if (iqk_dequantize_ktquants(GGML_TYPE_IQ2_KT, k, x, 0, y, 0, 1)) return;
#endif
const int nb = k / QuantizerIQ2KT::kSuperBlockSize;
const float * dptr = (const float *)x;
@@ -8254,7 +8293,7 @@ void vec_dot_iq2_kt_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx
namespace {
using QuantizerIQ3KT = QuantizerIQKT<32, 8, 16, true>;
using QuantizerIQ3KT = QuantizerIQKT<32, 8, 16, true, true>;
const QuantizerIQ3KT& iq3kt_quantizer() {
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
@@ -8465,7 +8504,7 @@ size_t quantize_iq3_kt(const float * src, void * dst, int64_t nrows, int64_t n_p
void dequantize_row_iq3_kt(const block_iq3_kt * x, float * y, int64_t k) {
#ifdef __AVX2__
if (iqk_dequantize_ktquants(GGML_TYPE_IQ3_KT, k, x, 0, y, 0, 1)) return;
//if (iqk_dequantize_ktquants(GGML_TYPE_IQ3_KT, k, x, 0, y, 0, 1)) return;
#endif
using Q = QuantizerIQ3KT;
constexpr int kNumGroups = Q::kSuperBlockSize/Q::kGroupSize;
@@ -8521,7 +8560,7 @@ void vec_dot_iq3_kt_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx
namespace{
using QuantizerIQ4KT = QuantizerIQKT<32, 4, 15>;
using QuantizerIQ4KT = QuantizerIQKT<32, 4, 15, false, true>;
const QuantizerIQ4KT& iq4kt_quantizer(bool with_offset = false) {
static std::mutex mutex;
@@ -8536,6 +8575,14 @@ const QuantizerIQ4KT& iq4kt_quantizer(bool with_offset = false) {
return *quantizer1;
}
const QuantizerIQ4KT& iq4kt_dequantizer() {
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
static std::unique_ptr<QuantizerIQ4KT> dequantizer;
if (!dequantizer) dequantizer = std::make_unique<QuantizerIQ4KT>(0, 0, 4096);
return *dequantizer;
}
void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const float * quant_weights, float * all_scales, float * all_weights) {
constexpr float kSigmaScale = 2.0f;
@@ -8546,7 +8593,7 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f
float * dptr = (float *)vy;
block_iq4_kt * y = (block_iq4_kt *)(dptr + 2);
block_iq4_kt * y = (block_iq4_kt *)(dptr + 1);
auto& quantizer1 = iq4kt_quantizer();
auto& quantizer2 = iq4kt_quantizer(true);
@@ -8555,13 +8602,10 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f
Q::set_weights(kSigmaScale, nblock, x, quant_weights, all_weights);
float amax_row = 0, row_av = 0;
float amax_row = 0;
for (int j = 0; j < n_per_row; ++j) {
row_av += x[j];
amax_row = std::max(amax_row, std::abs(x[j]));
}
row_av /= n_per_row;
dptr[1] = row_av;
if (!amax_row) {
dptr[0] = 0.f;
std::memset(y, 0, nblock*sizeof(block_iq4_kt));
@@ -8584,7 +8628,7 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f
const float * weight = all_weights + ibl*Q::kSuperBlockSize + ib*Q::kBlockSize;
float amax = 0;
for (int j = 0; j < Q::kBlockSize; ++j) {
xaux[j] = xbl[ib*Q::kBlockSize+j] - row_av;
xaux[j] = xbl[ib*Q::kBlockSize+j];
float ax = std::abs(xaux[j]);
amax = std::max(amax, ax);
}
@@ -8593,7 +8637,7 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f
continue;
}
float best = 0;
float scale_0 = std::max(92.f, 127.f*amax/amax_row);
float scale_0 = std::max(90.f, 124.f*amax/amax_row);
for (int itry = -kNtry; itry <= kNtry; ++itry) {
quantizer1.find_best_match( amax/(8.f*itry + scale_0), xaux, weight, best_idx);
auto [dp, score_p] = quantizer1.find_best_scale(xaux, weight, best_idx);
@@ -8664,7 +8708,7 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f
for (int ib = 0; ib < Q::kNblock; ++ib) {
auto& quantizer = y[ibl].qs[ib] & 1 ? quantizer2 : quantizer1;
const float * weight = all_weights + ibl*Q::kSuperBlockSize + ib*Q::kBlockSize;
for (int j = 0; j < Q::kBlockSize; ++j) xaux[j] = xbl[ib*Q::kBlockSize+j] - row_av;
for (int j = 0; j < Q::kBlockSize; ++j) xaux[j] = xbl[ib*Q::kBlockSize+j];
int ls = nearest_int(id*scales[ib]);
ls = std::min(ls, 63);
*(uint8_t *)(shb + ib) = ((ls + 64) << 1) | (shb[ib] & 1);
@@ -8724,7 +8768,7 @@ size_t quantize_iq4_kt(const float * src, void * dst, int64_t nrows, int64_t n_p
void dequantize_row_iq4_kt(const block_iq4_kt * x, float * y, int64_t k) {
#ifdef __AVX2__
if (iqk_dequantize_ktquants(GGML_TYPE_IQ4_KT, k, x, 0, y, 0, 1)) return;
//if (iqk_dequantize_ktquants(GGML_TYPE_IQ4_KT, k, x, 0, y, 0, 1)) return;
#endif
using Q = QuantizerIQ4KT;
assert(k % Q::kSuperBlockSize == 0);
@@ -8732,23 +8776,20 @@ void dequantize_row_iq4_kt(const block_iq4_kt * x, float * y, int64_t k) {
const int nb = k / Q::kSuperBlockSize;
const float * dptr = (const float *)x;
const float d = dptr[0] * Q::kScale;
const float row_av = dptr[1];
x = (const block_iq4_kt *)(dptr + 2);
auto& deq = iq4kt_quantizer();
x = (const block_iq4_kt *)(dptr + 1);
auto& deq = iq4kt_dequantizer();
for (int ibl = 0; ibl < nb; ++ibl) {
auto shb = x[ibl].qs;
auto ql = (const uint8_t *)(shb + Q::kNblock);
auto qh = ql + kNumGroups;
for (int ib = 0; ib < Q::kNblock; ++ib) {
int offset = shb[ib] & 1 ? 32768 + 4096 : 4096;
//auto& deq = shb[ib] & 1 ? deq2 : deq1;
int ls = int((shb[ib] & 0xff) >> 1) - 64;
float sl = d * ls;
for (int ig = 0; ig < Q::kNg; ++ig) {
int jj = ib*Q::kNg+ig;
uint16_t idx = ql[jj] | ((qh[jj%(kNumGroups/2)] << (8 - 4*(jj/(kNumGroups/2)))) & 0xf00) | (((shb[ib] >> (8 + 3*ig)) & 7) << 12);
deq.set_values(idx, y, sl, offset);
for (int j = 0; j < Q::kGroupSize; ++j) y[j] += row_av;
y += Q::kGroupSize;
}
}