iq4_kt: very slightly better

at the expense of much longer quantization time.
2026-02-25 15:44:10 +00:00 · 2024-11-15 12:27:10 +02:00
parent 1be0a9e0d7
commit ab1cef30e7
2 changed files with 57 additions and 28 deletions
--- a/ggml/src/ggml-cuda/convert.cu
+++ b/ggml/src/ggml-cuda/convert.cu
@@ -432,9 +432,11 @@ static __global__ void dequantize_block_iq4_kt(const void * __restrict__ vx, dst
    const int ib32 = ib/4;
    const int ig = ib%4;
    const int jj = ib32*8 + 2*ig;
-    uint32_t idx1 = ql[jj+0] + ((qh[(jj+0)%(kNumGroups/2)] << (8 - 4*((jj+0)/(kNumGroups/2)))) & 0xf00) + (((shb[ib32] >> (8 + 6*ig+0)) & 7) << 12) + 4096;
-    uint32_t idx2 = ql[jj+1] + ((qh[(jj+1)%(kNumGroups/2)] << (8 - 4*((jj+1)/(kNumGroups/2)))) & 0xf00) + (((shb[ib32] >> (8 + 6*ig+3)) & 7) << 12) + 4096;
-    const float dl = scale * ((const int8_t *)(shb + ib32))[0];
+    uint32_t offset = shb[ib32] & 1 ? 4096 + 32768 : 4096;
+    uint32_t idx1 = ql[jj+0] + ((qh[(jj+0)%(kNumGroups/2)] << (8 - 4*((jj+0)/(kNumGroups/2)))) & 0xf00) + (((shb[ib32] >> (8 + 6*ig+0)) & 7) << 12) + offset;
+    uint32_t idx2 = ql[jj+1] + ((qh[(jj+1)%(kNumGroups/2)] << (8 - 4*((jj+1)/(kNumGroups/2)))) & 0xf00) + (((shb[ib32] >> (8 + 6*ig+3)) & 7) << 12) + offset;
+    int ls = ((shb[ib32] & 0xff) >> 1) - 64;
+    const float dl = scale * ls;
    uint32_t s[2];
    const half * h = (const half *)s;
    for (int j = 0; j < 4; ++j) {
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -3165,19 +3165,19 @@ public:
    constexpr static float kScale = 31.75f;
    constexpr static bool kVerbose = false;

-    QuantizerIQKT(int num_clusters, int num_neighbours);
+    QuantizerIQKT(int num_clusters, int num_neighbours, int offset = 4096);
    const float * values() const { return m_values.data(); }

    inline void find_best_match(float d, const float * xb, const float * weight, int * best_idx) const;
    inline std::pair<float, float> find_best_scale(const float * xb, const float * weight, const int * best_idx) const;
    inline float find_best_inverse_scale(const float * xb, const float * weight, const int * best_idx) const;

-    static inline void set_values(uint32_t i, float * result, float scale) {
+    static inline void set_values(uint32_t i, float * result, float scale, int offset = 4096) {
        constexpr uint32_t ka = 89226354;
        constexpr uint32_t kb = 64248484;
        constexpr uint32_t kmask = 0x8fff8fff;
        constexpr uint32_t km32 = 0x3b603b60;
-        uint32_t x = i + 4096;
+        uint32_t x = i + offset;
        for (int k = 0; k < kGroupSize; ++k) {
            x = ka*x + kb;
            uint32_t s = (x & kmask) ^ km32;
@@ -3219,11 +3219,11 @@ private:
 };

 template <int block_size, int group_size, int num_bits>
-QuantizerIQKT<block_size, group_size, num_bits>::QuantizerIQKT(int num_clusters, int num_neighbours) {
+QuantizerIQKT<block_size, group_size, num_bits>::QuantizerIQKT(int num_clusters, int num_neighbours, int offset) {
    m_values.resize(kNumVal*kGroupSize);
    float * data = m_values.data();
    for (int i = 0; i < kNumVal; ++i) {
-        set_values(i, data, kScale);
+        set_values(i, data, kScale, offset);
        data += kGroupSize;
    }
    // Make 128 clusters.
@@ -4255,17 +4255,23 @@ namespace{

 using QuantizerIQ4KT = QuantizerIQKT<32, 4, 15>;

-const QuantizerIQ4KT& iq4kt_quantizer() {
+const QuantizerIQ4KT& iq4kt_quantizer(bool with_offset = false) {
    static std::mutex mutex;
    std::lock_guard<std::mutex> lock(mutex);
-    static std::unique_ptr<QuantizerIQ4KT> quantizer;
-    if (!quantizer) quantizer = std::make_unique<QuantizerIQ4KT>(625, 6);
-    return *quantizer;
+    static std::unique_ptr<QuantizerIQ4KT> quantizer1;
+    static std::unique_ptr<QuantizerIQ4KT> quantizer2;
+    if (with_offset) {
+        if (!quantizer2) quantizer2 = std::make_unique<QuantizerIQ4KT>(625, 6, 4096+32768);
+        return *quantizer2;
+    }
+    if (!quantizer1) quantizer1 = std::make_unique<QuantizerIQ4KT>(625, 6, 4096);
+    return *quantizer1;
 }

 void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const float * quant_weights, float * all_scales, float * all_weights) {

    constexpr float kSigmaScale = 2.0f;
+    constexpr int kNtry = 2;
    using Q = QuantizerIQ4KT;

    static_assert(Q::kNumVal%8 == 0);
@@ -4274,7 +4280,8 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f

    block_iq4_kt * y = (block_iq4_kt *)(dptr + 2);

-    auto& quantizer = iq4kt_quantizer();
+    auto& quantizer1 = iq4kt_quantizer();
+    auto& quantizer2 = iq4kt_quantizer(true);

    int nblock = n_per_row / Q::kSuperBlockSize;

@@ -4319,19 +4326,40 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f
            }
            float best = 0;
            float scale_0 = std::max(92.f, 127.f*amax/amax_row);
-            for (int itry = -2; itry <= 2; ++itry) {
-                quantizer.find_best_match( amax/(8.f*itry + scale_0), xaux, weight, best_idx);
-                auto [dp, score_p] = quantizer.find_best_scale(xaux, weight, best_idx);
+            for (int itry = -kNtry; itry <= kNtry; ++itry) {
+                quantizer1.find_best_match( amax/(8.f*itry + scale_0), xaux, weight, best_idx);
+                auto [dp, score_p] = quantizer1.find_best_scale(xaux, weight, best_idx);
                if (score_p > best) {
                    best = score_p; scales[ib] = dp;
                }
-                quantizer.find_best_match(-amax/(8.f*itry + scale_0), xaux, weight, best_idx);
-                auto [dm, score_m] = quantizer.find_best_scale(xaux, weight, best_idx);
+                quantizer1.find_best_match(-amax/(8.f*itry + scale_0), xaux, weight, best_idx);
+                auto [dm, score_m] = quantizer1.find_best_scale(xaux, weight, best_idx);
                if (score_m > best) {
                    best = score_m; scales[ib] = dm;
                }
            }

+            quantizer2.find_best_match(scales[ib], xaux, weight, best_idx);
+            auto [d, score] = quantizer2.find_best_scale(xaux, weight, best_idx);
+            if (score > best) {
+                scales[ib] = d;
+                y[ibl].qs[ib] = 1;
+            }
+            bool with_offset = false;
+            for (int itry = -kNtry; itry <= kNtry; ++itry) {
+                quantizer2.find_best_match( amax/(8.f*itry + scale_0), xaux, weight, best_idx);
+                auto [dp, score_p] = quantizer2.find_best_scale(xaux, weight, best_idx);
+                if (score_p > best) {
+                    best = score_p; scales[ib] = dp; with_offset = true;
+                }
+                quantizer2.find_best_match(-amax/(8.f*itry + scale_0), xaux, weight, best_idx);
+                auto [dm, score_m] = quantizer2.find_best_scale(xaux, weight, best_idx);
+                if (score_m > best) {
+                    best = score_m; scales[ib] = dm; with_offset = true;
+                }
+            }
+            if (with_offset) y[ibl].qs[ib] = 1;
+
            float abs_scale = std::abs(scales[ib]);
            if (abs_scale > amax_scale) {
                amax_scale = abs_scale;
@@ -4341,7 +4369,7 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f

    }

-    float d = -max_scale/128;
+    float d = -max_scale/64;

    dptr[0] = d;
    if (!d) return;
@@ -4366,11 +4394,12 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f
            auto scales = all_scales + ibl*Q::kNblock;

            for (int ib = 0; ib < Q::kNblock; ++ib) {
+                auto& quantizer = y[ibl].qs[ib] & 1 ? quantizer2 : quantizer1;
                const float * weight = all_weights + ibl*Q::kSuperBlockSize + ib*Q::kBlockSize;
                for (int j = 0; j < Q::kBlockSize; ++j) xaux[j] = xbl[ib*Q::kBlockSize+j] - row_av;
                int ls = nearest_int(id*scales[ib]);
-                ls = std::min(ls, 127);
-                *(int8_t *)(shb + ib) = ls;
+                ls = std::min(ls, 63);
+                *(uint8_t *)(shb + ib) = ((ls + 64) << 1) | (shb[ib] & 1);
                float dl = d*ls;
                quantizer.find_best_match(dl, xaux, weight, best_idx);

@@ -4387,11 +4416,6 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f
                        sumq2 += wl[k]*q*q;
                    }
                }
-                //ls += 128;
-                //qs[2*ib+0] = uint64_t(best_idx[0]) | (uint64_t(best_idx[1]) << 15) | (uint64_t(best_idx[2]) << 30) | (uint64_t(best_idx[3]) << 45) |
-                //    (uint64_t(ls & 0x0f) << 60);
-                //qs[2*ib+1] = uint64_t(best_idx[4]) | (uint64_t(best_idx[5]) << 15) | (uint64_t(best_idx[6]) << 30) | (uint64_t(best_idx[7]) << 45) |
-                //    (uint64_t(ls & 0xf0) << 56);
            }
        }
        if (sumq2 > 0) {
@@ -4445,11 +4469,14 @@ void dequantize_row_iq4_kt(const block_iq4_kt * x, float * y, int64_t k) {
        auto ql = (const uint8_t *)(shb + Q::kNblock);
        auto qh = ql + kNumGroups;
        for (int ib = 0; ib < Q::kNblock; ++ib) {
-            float sl = d * ((const int8_t *)(shb + ib))[0];
+            int offset = shb[ib] & 1 ? 32768 + 4096 : 4096;
+            //auto& deq = shb[ib] & 1 ? deq2 : deq1;
+            int ls = int((shb[ib] & 0xff) >> 1) - 64;
+            float sl = d * ls;
            for (int ig = 0; ig < Q::kNg; ++ig) {
                int jj = ib*Q::kNg+ig;
                uint16_t idx = ql[jj] | ((qh[jj%(kNumGroups/2)] << (8 - 4*(jj/(kNumGroups/2)))) & 0xf00) | (((shb[ib] >> (8 + 3*ig)) & 7) << 12);
-                deq.set_values(idx, y, sl);
+                deq.set_values(idx, y, sl, offset);
                for (int j = 0; j < Q::kGroupSize; ++j) y[j] += row_av;
                y += Q::kGroupSize;
            }