Fix KT quantization yet again (#1321)

* Fix KT quantization yet again
* Add same 1e-16f check for all quants in iqk_quantize.cpp
* Fixes for k-quants
* Also this one
2026-04-22 23:49:23 +00:00 · 2026-02-25 18:07:12 +01:00
parent c77ec4b8b8
commit 216f44363f
2 changed files with 54 additions and 16 deletions
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -1916,7 +1916,7 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
        if (x[i] < min) min = x[i];
        if (x[i] > max) max = x[i];
    }
-    if (max == min) {
+    if (max - min < 1e-10f) {
        for (int i = 0; i < n; ++i) L[i] = 0;
        *the_min = 0;
        return 0.f;
@@ -1971,7 +1971,7 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f
        sum_x += w * x[i];
    }
    if (min > 0) min = 0;
-    if (max == min) {
+    if (max - min < 1e-10f) {
        for (int i = 0; i < n; ++i) L[i] = 0;
        *the_min = -min;
        return 0.f;
@@ -2218,7 +2218,7 @@ static float make_qkx3_quants(int n, int nmax, const float * restrict x, const f
    if (min > 0) {
        min = 0;
    }
-    if (max <= min) {
+    if (max - min < 1e-10f) {
        memset(L, 0, n);
        *the_min = -min;
        return 0.f;
@@ -2340,7 +2340,7 @@ static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t *
    for (int i = 0; i < n; ++i) {
        max = MAX(max, x[i]);
    }
-    if (!max) { // all zero
+    if (max < 1e-16f) { // all zero
        for (int i = 0; i < n; ++i) { L[i] = 0; }
        return 0.f;
    }
@@ -2733,6 +2733,10 @@ void quantize_row_q4_K_ref(const float * restrict x, block_q4_K * restrict y, in
            float av_x = sqrtf(sum_x2/32);
            for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
            scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false);
+            if (isnan(scales[j])) {
+                printf("Oops: NaN scale\n");
+                GGML_ABORT("Fatal error");
+            }
            float scale = scales[j];
            if (scale > max_scale) {
                max_scale = scale;
@@ -2846,10 +2850,18 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
            for (int l = 0; l < 32; ++l) sumw += weights[l];
            sw[j] = sumw;
            scales[j] = make_qkx3_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
+            if (isnan(scales[j])) {
+                printf("%s: got NaN scale\n", __func__);
+                GGML_ABORT("Fatal error");
+            }
        }

        float d_block = make_qp_quants(QK_K/32, 63, scales, Ls, sw);
        float m_block = make_qp_quants(QK_K/32, 63, mins,   Lm, sw);
+        if (isnan(d_block) || isnan(m_block)) {
+            printf("%s: d_block = %g, m_block = %g\n", __func__, (double)d_block, (double)m_block);
+            GGML_ABORT("Fatal error");
+        }
        for (int j = 0; j < QK_K/32; ++j) {
            uint8_t ls = Ls[j];
            uint8_t lm = Lm[j];
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -1053,9 +1053,16 @@ void quantize_row_iq2_k_impl(const float * x, void * vy, int n_per_row, const fl
                for (int j = 0; j < kBlockSize; ++j) weight[j] = 0.25f*sigma2 + xb[j]*xb[j];
            }
            sw[ib] = 0;
+            float amax = 0;
            for (int j = 0; j < kBlockSize; ++j) {
                sw[ib] += weight[j];
                pairs[j] = {xb[j], j};
+                float ax = std::abs(xb[j]);
+                amax = std::max(amax, ax);
+            }
+            if (amax < 1e-16f) {
+                scales[ib] = 0;
+                continue;
            }
            std::sort(pairs.begin(), pairs.end());
            sumx[0] = sumw[0] = 0;
@@ -1269,9 +1276,16 @@ void quantize_row_iq2_ks_impl(const float * x, void * vy, int n_per_row, const f
                for (int j = 0; j < kBlockSize; ++j) weight[j] = 0.25f*sigma2 + xb[j]*xb[j];
            }
            sw[ib] = 0;
+            float amax = 0;
            for (int j = 0; j < kBlockSize; ++j) {
                sw[ib] += weight[j];
                pairs[j] = {xb[j], j};
+                float ax = std::abs(xb[j]);
+                amax = std::max(amax, ax);
+            }
+            if (amax < 1e-16f) {
+                scales[ib] = 0;
+                continue;
            }
            //float amax = 0, max = 0;
            //for (int j = 0; j < kBlockSize; ++j) {
@@ -1678,7 +1692,7 @@ void quantize_row_iq2_kl_impl(const float * x, void * vy, int n_per_row, const f
                    amax = ax; max = xb[j];
                }
            }
-            if (!amax) {
+            if (amax < 1e-16f) {
                scales[ib] = 0;
                continue;
            }
@@ -1929,7 +1943,7 @@ static void quantize_row_iq3_k_impl(const float * x, void * vy, int n_per_row, c
                    amax = ax; max = xb[j];
                }
            }
-            if (amax < 1e-9f) {
+            if (amax < 1e-16f) {
                scales[ib] = 0;
                continue;
            }
@@ -2216,7 +2230,7 @@ static void quantize_row_iq3_ks_impl(const int super_block_size, const int block
                    amax = ax; max = xb[j];
                }
            }
-            if (amax < 1e-9f) {
+            if (amax < 1e-16f) {
                scales[ib] = 0;
                continue;
            }
@@ -2544,7 +2558,7 @@ static void quantize_row_iq4_k_impl_bs16(const int super_block_size, const int b
                amax = ax; max = xb[j];
            }
        }
-        if (!amax) {
+        if (amax < 1e-16f) {
            scales[ib] = 0;
            continue;
        }
@@ -2862,7 +2876,7 @@ void quantize_row_iq5_k_impl(const float * x, void * vy, int n_per_row, const fl
                    amax = ax; max = xb[j];
                }
            }
-            if (!amax) {
+            if (amax < 1e-16f) {
                scales[ib] = 0;
                continue;
            }
@@ -3216,7 +3230,7 @@ void quantize_row_iq6_k_impl(const float * x, void * vy, int n_per_row, const fl
                    amax = ax; max = xb[j];
                }
            }
-            if (!amax) {
+            if (amax < 1e-16f) {
                scales[ib] = 0;
                continue;
            }
@@ -3918,7 +3932,7 @@ static void quantize_row_iq4_k_impl_bs128(const int super_block_size, const int
                    amax = ax; max = xb[j];
                }
            }
-            if (!amax) {
+            if (amax < 1e-16f) {
                scales[ib] = 0;
                continue;
            }
@@ -4167,7 +4181,7 @@ static void quantize_row_iq5_ks_impl(const int super_block_size, const int block
                    amax = ax; max = xb[j];
                }
            }
-            if (amax < 1e-15f) {
+            if (amax < 1e-16f) {
                scales[ib] = 0;
                continue;
            }
@@ -4470,7 +4484,7 @@ static void quantize_row_iq4_kss_impl(int n_per_row, const float * x, char * cy,
                    amax = ax; max = xb[j];
                }
            }
-            if (!amax) {
+            if (amax < 1e-16f) {
                scales[ib] = 0;
                continue;
            }
@@ -8733,6 +8747,11 @@ void quantize_row_iq1_kt_impl(const float * x, void * vy, int n_per_row, const f
                float ax = std::abs(xb[j]);
                amax = std::max(amax, ax);
            }
+            if (amax < 1e-16f) {
+                scales[ib] = 0.0f;
+                for (int ig = 0; ig < Q::kNg; ++ig) all_idx[(ibl*Q::kSuperBlockSize + ib*Q::kBlockSize)/Q::kGroupSize + ig] = 0;
+                continue;
+            }
            float scale_0 = std::max(90.f, 124.f*amax/amax_row);
            quantizer.find_best_match( amax/scale_0, xb, weight, best_idx);
            auto [dp, score_p] = quantizer.find_best_scale(xb, weight, best_idx);
@@ -8998,6 +9017,11 @@ void quantize_row_iq2_kt_impl(const float * x, void * vy, int n_per_row, const f
                float ax = std::abs(xb[j]);
                amax = std::max(amax, ax);
            }
+            if (amax < 1e-16f) {
+                scales[ib] = 0.0f;
+                for (int ig = 0; ig < Q::kNg; ++ig) all_idx[(ibl*Q::kSuperBlockSize + ib*Q::kBlockSize)/Q::kGroupSize + ig] = 0;
+                continue;
+            }
            float scale_0 = std::max(90.f, 124.f*amax/amax_row);
            quantizer.find_best_match( amax/scale_0, xb, weight, best_idx);
            auto [dp, score_p] = quantizer.find_best_scale(xb, weight, best_idx);
@@ -9289,8 +9313,10 @@ void quantize_row_iq3_kt_impl(const float * x, void * vy, int n_per_row, const f
                xaux[j] = ax;
                amax = std::max(amax, ax);
            }
-            scales[ib] = 0;
-            if (!amax) continue;
+            if (amax < 1e-16f) {
+                scales[ib] = 0.0f;
+                continue;
+            }

            //quantizer.find_best_match(amax/96.f, xaux, weight, best_idx+Q::kNg);
            //scales[ib] = quantizer.find_best_scale(xaux, weight, best_idx+Q::kNg).first;
@@ -9577,7 +9603,7 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f
                float ax = std::abs(xaux[j]);
                amax = std::max(amax, ax);
            }
-            if (!amax) {
+            if (amax < 1e-16f) {
                scales[ib] = 0;
                continue;
            }