diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu
index 0855d101..0c529303 100644
--- a/ggml/src/ggml-cuda/convert.cu
+++ b/ggml/src/ggml-cuda/convert.cu
@@ -432,9 +432,11 @@ static __global__ void dequantize_block_iq4_kt(const void * __restrict__ vx, dst
     const int ib32 = ib/4;
     const int ig = ib%4;
     const int jj = ib32*8 + 2*ig;
-    uint32_t idx1 = ql[jj+0] + ((qh[(jj+0)%(kNumGroups/2)] << (8 - 4*((jj+0)/(kNumGroups/2)))) & 0xf00) + (((shb[ib32] >> (8 + 6*ig+0)) & 7) << 12) + 4096;
-    uint32_t idx2 = ql[jj+1] + ((qh[(jj+1)%(kNumGroups/2)] << (8 - 4*((jj+1)/(kNumGroups/2)))) & 0xf00) + (((shb[ib32] >> (8 + 6*ig+3)) & 7) << 12) + 4096;
-    const float dl = scale * ((const int8_t *)(shb + ib32))[0];
+    uint32_t offset = shb[ib32] & 1 ? 4096 + 32768 : 4096;
+    uint32_t idx1 = ql[jj+0] + ((qh[(jj+0)%(kNumGroups/2)] << (8 - 4*((jj+0)/(kNumGroups/2)))) & 0xf00) + (((shb[ib32] >> (8 + 6*ig+0)) & 7) << 12) + offset;
+    uint32_t idx2 = ql[jj+1] + ((qh[(jj+1)%(kNumGroups/2)] << (8 - 4*((jj+1)/(kNumGroups/2)))) & 0xf00) + (((shb[ib32] >> (8 + 6*ig+3)) & 7) << 12) + offset;
+    int ls = ((shb[ib32] & 0xff) >> 1) - 64;
+    const float dl = scale * ls;
     uint32_t s[2];
     const half * h = (const half *)s;
     for (int j = 0; j < 4; ++j) {
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index 8d7376a8..3e266b6b 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -3165,19 +3165,19 @@ public:
     constexpr static float kScale = 31.75f;
     constexpr static bool kVerbose = false;
 
-    QuantizerIQKT(int num_clusters, int num_neighbours);
+    QuantizerIQKT(int num_clusters, int num_neighbours, int offset = 4096);
     const float * values() const { return m_values.data(); }
 
     inline void find_best_match(float d, const float * xb, const float * weight, int * best_idx) const;
     inline std::pair<float, float> find_best_scale(const float * xb, const float * weight, const int * best_idx) const;
     inline float find_best_inverse_scale(const float * xb, const float * weight, const int * best_idx) const;
 
-    static inline void set_values(uint32_t i, float * result, float scale) {
+    static inline void set_values(uint32_t i, float * result, float scale, int offset = 4096) {
         constexpr uint32_t ka = 89226354;
         constexpr uint32_t kb = 64248484;
         constexpr uint32_t kmask = 0x8fff8fff;
         constexpr uint32_t km32 = 0x3b603b60;
-        uint32_t x = i + 4096;
+        uint32_t x = i + offset;
         for (int k = 0; k < kGroupSize; ++k) {
             x = ka*x + kb;
             uint32_t s = (x & kmask) ^ km32;
@@ -3219,11 +3219,11 @@ private:
 };
 
 template <int block_size, int group_size, int num_bits>
-QuantizerIQKT<block_size, group_size, num_bits>::QuantizerIQKT(int num_clusters, int num_neighbours) {
+QuantizerIQKT<block_size, group_size, num_bits>::QuantizerIQKT(int num_clusters, int num_neighbours, int offset) {
     m_values.resize(kNumVal*kGroupSize);
     float * data = m_values.data();
     for (int i = 0; i < kNumVal; ++i) {
-        set_values(i, data, kScale);
+        set_values(i, data, kScale, offset);
         data += kGroupSize;
     }
     // Make 128 clusters.
@@ -4255,17 +4255,23 @@ namespace{
 
 using QuantizerIQ4KT = QuantizerIQKT<32, 4, 15>;
 
-const QuantizerIQ4KT& iq4kt_quantizer() {
+const QuantizerIQ4KT& iq4kt_quantizer(bool with_offset = false) {
     static std::mutex mutex;
     std::lock_guard<std::mutex> lock(mutex);
-    static std::unique_ptr<QuantizerIQ4KT> quantizer;
-    if (!quantizer) quantizer = std::make_unique<QuantizerIQ4KT>(625, 6);
-    return *quantizer;
+    static std::unique_ptr<QuantizerIQ4KT> quantizer1;
+    static std::unique_ptr<QuantizerIQ4KT> quantizer2;
+    if (with_offset) {
+        if (!quantizer2) quantizer2 = std::make_unique<QuantizerIQ4KT>(625, 6, 4096+32768);
+        return *quantizer2;
+    }
+    if (!quantizer1) quantizer1 = std::make_unique<QuantizerIQ4KT>(625, 6, 4096);
+    return *quantizer1;
 }
 
 void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const float * quant_weights, float * all_scales, float * all_weights) {
 
     constexpr float kSigmaScale = 2.0f;
+    constexpr int kNtry = 2;
     using Q = QuantizerIQ4KT;
 
     static_assert(Q::kNumVal%8 == 0);
@@ -4274,7 +4280,8 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f
 
     block_iq4_kt * y = (block_iq4_kt *)(dptr + 2);
 
-    auto& quantizer = iq4kt_quantizer();
+    auto& quantizer1 = iq4kt_quantizer();
+    auto& quantizer2 = iq4kt_quantizer(true);
 
     int nblock = n_per_row / Q::kSuperBlockSize;
 
@@ -4319,19 +4326,40 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f
             }
             float best = 0;
             float scale_0 = std::max(92.f, 127.f*amax/amax_row);
-            for (int itry = -2; itry <= 2; ++itry) {
-                quantizer.find_best_match( amax/(8.f*itry + scale_0), xaux, weight, best_idx);
-                auto [dp, score_p] = quantizer.find_best_scale(xaux, weight, best_idx);
+            for (int itry = -kNtry; itry <= kNtry; ++itry) {
+                quantizer1.find_best_match( amax/(8.f*itry + scale_0), xaux, weight, best_idx);
+                auto [dp, score_p] = quantizer1.find_best_scale(xaux, weight, best_idx);
                 if (score_p > best) {
                     best = score_p; scales[ib] = dp;
                 }
-                quantizer.find_best_match(-amax/(8.f*itry + scale_0), xaux, weight, best_idx);
-                auto [dm, score_m] = quantizer.find_best_scale(xaux, weight, best_idx);
+                quantizer1.find_best_match(-amax/(8.f*itry + scale_0), xaux, weight, best_idx);
+                auto [dm, score_m] = quantizer1.find_best_scale(xaux, weight, best_idx);
                 if (score_m > best) {
                     best = score_m; scales[ib] = dm;
                 }
             }
 
+            quantizer2.find_best_match(scales[ib], xaux, weight, best_idx);
+            auto [d, score] = quantizer2.find_best_scale(xaux, weight, best_idx);
+            if (score > best) {
+                scales[ib] = d;
+                y[ibl].qs[ib] = 1;
+            }
+            bool with_offset = false;
+            for (int itry = -kNtry; itry <= kNtry; ++itry) {
+                quantizer2.find_best_match( amax/(8.f*itry + scale_0), xaux, weight, best_idx);
+                auto [dp, score_p] = quantizer2.find_best_scale(xaux, weight, best_idx);
+                if (score_p > best) {
+                    best = score_p; scales[ib] = dp; with_offset = true;
+                }
+                quantizer2.find_best_match(-amax/(8.f*itry + scale_0), xaux, weight, best_idx);
+                auto [dm, score_m] = quantizer2.find_best_scale(xaux, weight, best_idx);
+                if (score_m > best) {
+                    best = score_m; scales[ib] = dm; with_offset = true;
+                }
+            }
+            if (with_offset) y[ibl].qs[ib] = 1;
+
             float abs_scale = std::abs(scales[ib]);
             if (abs_scale > amax_scale) {
                 amax_scale = abs_scale;
@@ -4341,7 +4369,7 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f
 
     }
 
-    float d = -max_scale/128;
+    float d = -max_scale/64;
 
     dptr[0] = d;
     if (!d) return;
@@ -4366,11 +4394,12 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f
             auto scales = all_scales + ibl*Q::kNblock;
 
             for (int ib = 0; ib < Q::kNblock; ++ib) {
+                auto& quantizer = y[ibl].qs[ib] & 1 ? quantizer2 : quantizer1;
                 const float * weight = all_weights + ibl*Q::kSuperBlockSize + ib*Q::kBlockSize;
                 for (int j = 0; j < Q::kBlockSize; ++j) xaux[j] = xbl[ib*Q::kBlockSize+j] - row_av;
                 int ls = nearest_int(id*scales[ib]);
-                ls = std::min(ls, 127);
-                *(int8_t *)(shb + ib) = ls;
+                ls = std::min(ls, 63);
+                *(uint8_t *)(shb + ib) = ((ls + 64) << 1) | (shb[ib] & 1);
                 float dl = d*ls;
                 quantizer.find_best_match(dl, xaux, weight, best_idx);
 
@@ -4387,11 +4416,6 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f
                         sumq2 += wl[k]*q*q;
                     }
                 }
-                //ls += 128;
-                //qs[2*ib+0] = uint64_t(best_idx[0]) | (uint64_t(best_idx[1]) << 15) | (uint64_t(best_idx[2]) << 30) | (uint64_t(best_idx[3]) << 45) |
-                //    (uint64_t(ls & 0x0f) << 60);
-                //qs[2*ib+1] = uint64_t(best_idx[4]) | (uint64_t(best_idx[5]) << 15) | (uint64_t(best_idx[6]) << 30) | (uint64_t(best_idx[7]) << 45) |
-                //    (uint64_t(ls & 0xf0) << 56);
             }
         }
         if (sumq2 > 0) {
@@ -4445,11 +4469,14 @@ void dequantize_row_iq4_kt(const block_iq4_kt * x, float * y, int64_t k) {
         auto ql = (const uint8_t *)(shb + Q::kNblock);
         auto qh = ql + kNumGroups;
         for (int ib = 0; ib < Q::kNblock; ++ib) {
-            float sl = d * ((const int8_t *)(shb + ib))[0];
+            int offset = shb[ib] & 1 ? 32768 + 4096 : 4096;
+            //auto& deq = shb[ib] & 1 ? deq2 : deq1;
+            int ls = int((shb[ib] & 0xff) >> 1) - 64;
+            float sl = d * ls;
             for (int ig = 0; ig < Q::kNg; ++ig) {
                 int jj = ib*Q::kNg+ig;
                 uint16_t idx = ql[jj] | ((qh[jj%(kNumGroups/2)] << (8 - 4*(jj/(kNumGroups/2)))) & 0xf00) | (((shb[ib] >> (8 + 3*ig)) & 7) << 12);
-                deq.set_values(idx, y, sl);
+                deq.set_values(idx, y, sl, offset);
                 for (int j = 0; j < Q::kGroupSize; ++j) y[j] += row_av;
                 y += Q::kGroupSize;
             }