iq4_xxs: this looks very viable compared to iq4_xs

At the same 4.25 bpw, PPL is always better, and for some models it is significantly better. I'll rename it to iq4_ks and keep it.
2026-04-27 01:49:28 +00:00 · 2024-10-08 16:07:00 +03:00
parent 1dd6c40c15
commit 975292b6b9
3 changed files with 9 additions and 9 deletions
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@@ -442,10 +442,10 @@ typedef struct {
 static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");

 typedef struct {
-    uint8_t  scales[QK_K/64];
+    uint8_t  scales[QK_K/32];
    uint8_t  qs[QK_K/2];
 } block_iq4_xxs;
-static_assert(sizeof(block_iq4_xxs) == QK_K/64 + QK_K/2, "wrong iq4_xxs block size/padding");
+static_assert(sizeof(block_iq4_xxs) == QK_K/32 + QK_K/2, "wrong iq4_xxs block size/padding");

 typedef struct {
    ggml_half d;
--- a/ggml/src/ggml-cuda/convert.cu
+++ b/ggml/src/ggml-cuda/convert.cu
@@ -626,15 +626,15 @@ static __global__ void dequantize_block_iq4_xxs(const void * __restrict__ vx, ds
    const int64_t i   = ii - (row*n_per_row)/QK_K;

    const int64_t tid = threadIdx.x;
-    const int64_t ib = tid/8; // 0...3
-    const int64_t il = tid%8; // 0...7
-    dst_t * y = yy + ii*QK_K + 64*ib + 4*il;
-    const uint8_t  * q4 = x[i].qs + 32*ib + 4*il;
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
+    dst_t * y = yy + ii*QK_K + 32*ib + 4*il;
+    const uint8_t  * q4 = x[i].qs + 16*ib + 4*il;
    const float d = scale * ((x[i].scales[ib] & 254) - 127);
    const int8_t * values = iq4k_values + ((x[i].scales[ib] & 1) << 4);
    for (int j = 0; j < 4; ++j) {
        y[j+ 0] = d * values[q4[j] & 0xf];
-        y[j+32] = d * values[q4[j] >>  4];
+        y[j+16] = d * values[q4[j] >>  4];
    }
 }

--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -2340,7 +2340,7 @@ void quantize_row_iq4_xxs(const float * x, void * y, int64_t k) {

 size_t quantize_iq4_xxs(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
    //printf("============ %s(%d, %d)\n", __func__, int(nrows), int(n_per_row));
-    constexpr int kBlockSize = 64; //128;
+    constexpr int kBlockSize = 32; //128;
    GGML_ASSERT(n_per_row%QK_K == 0);
    auto row_size = ggml_row_size(GGML_TYPE_IQ4_XXS, n_per_row);
    char * qrow = (char *)dst;
@@ -2355,7 +2355,7 @@ size_t quantize_iq4_xxs(const float * src, void * dst, int64_t nrows, int64_t n_
 }

 void dequantize_row_iq4_xxs(const block_iq4_xxs * x, float * y, int64_t k) {
-    constexpr int kBlockSize = 64; //128;
+    constexpr int kBlockSize = 32; //128;
    GGML_ASSERT(k%QK_K == 0);
    const float * dptr = (const float *)x;
    float d = *dptr;