iq2_kt - this is better

Using blocks of 32 and 16 bits per group of 8 weights
it beats iq2_xxs in terms of PPL by a significant margin.
It is 0.0625 bpw larger, but even if we go to 15 bits per
group of 8 (so 0.0625 bpw less than iq2_xxs), PPL is still
lower.
This commit is contained in:
Iwan Kawrakow
2024-11-06 20:49:56 +02:00
parent 766fa600c8
commit 36e9c922b8
6 changed files with 388 additions and 127 deletions

View File

@@ -498,7 +498,7 @@ static std::vector<float> cluster_points(const std::vector<float>& points, int n
}
static void analyze_x_v2(const char * name, int nrows, int n_per_row, const float * values, float& tot_mse, float& tot_mse_q, float& tot_elements) {
-constexpr int kNumVal = 1 << 16;
+constexpr int kNumVal = 1 << 15;
constexpr int kBlockSize = 32;
constexpr int kGroupSize = 8;
constexpr int kNg = kBlockSize/kGroupSize;
@@ -508,7 +508,7 @@ static void analyze_x_v2(const char * name, int nrows, int n_per_row, const floa
static std::vector<std::vector<int>> p_in_cluster;
if (codes.empty()) {
codes = make_values(kNumVal, kGroupSize, 31.75f);
-clusters = cluster_points(codes, kGroupSize, kNumVal/1024, 200);
+clusters = cluster_points(codes, kGroupSize, kNumVal/512, 200);
if (clusters.empty()) { printf("Oops\n"); exit(1); }
int ncluster = clusters.size()/kGroupSize;
p_in_cluster.resize(ncluster);
@@ -623,7 +623,7 @@ static void analyze_x_v2(const char * name, int nrows, int n_per_row, const floa
sigma2 /= n_per_row;
for (int ib = 0; ib < n_per_row/kBlockSize; ++ib) {
auto xb = xr + kBlockSize*ib;
-for (int i = 0; i < kBlockSize; ++i) weight[i] = 0.25f*sigma2 + xb[i]*xb[i];
+//for (int i = 0; i < kBlockSize; ++i) weight[i] = 0.25f*sigma2 + xb[i]*xb[i];
float d = find_best_scale(kBlockSize, xb, weight.data(), iq4k_values, 5);
float id = d ? 1/d : 0.f;
#ifdef __AVX2__