iq4_xxs: scalar CPU dot product

Also fix the breakage I introduced with the dedicated quantization portion of the work buffer, which affected multiplications that are not done via iqk_mul_mat.
2026-03-07 04:20:03 +00:00 · 2024-10-08 17:11:42 +03:00
parent 81bd33213d
commit 834af69e47
2 changed files with 47 additions and 2 deletions
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -13279,7 +13279,7 @@ static void ggml_compute_forward_mul_mat_one_chunk(
        return;
    }

-    const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+    const void * wdata = (src1->type == vec_dot_type) ? src1->data : (char *)params->wdata + params->wsize - params->qsize + GGML_MAX_NAME;
    const size_t row_size = ggml_row_size(vec_dot_type, ne10);

    assert(ne12 % ne02 == 0);
@@ -13534,6 +13534,11 @@ IQK_MulMat_Not_Available2:;
 UseGgmlGemm2:;
 #endif

+    if (ith == 0) {
+        atomic_store(&params->shared->current_chunk, nth);
+    }
+    ggml_barrier(params->shared);
+
    // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers)
    const int64_t nr0 = ne0;

--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -2376,6 +2376,46 @@ void dequantize_row_iq4_xxs(const block_iq4_xxs * x, float * y, int64_t k) {
    }
 }

-void  vec_dot_iq4_xxs_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+void  vec_dot_iq4_xxs_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { // Scalar dot product of an iq4_xxs row (vx) with a q8_K row (vy) over n values; the result is written to *s.
+    constexpr int kBlockSize = 32; // number of values covered by one scales[] byte in the loops below
+//#if GGML_USE_IQK_MULMAT
+//    if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ4_XXS, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) {
+//        return;
+//    }
+//#endif
+    GGML_ASSERT(n%QK_K == 0); // n must be a whole number of QK_K-sized blocks
+    GGML_ASSERT(nrc == 1); // only nrc == 1 is handled by this implementation
+    GGML_UNUSED(bs); // bs, bx and by are not used here
+    GGML_UNUSED(bx);
+    GGML_UNUSED(by);
+    const float * dptr = (const float *)vx; // vx is read as one leading float followed by the blocks
+    const float d = *dptr; // scale applied to every block of this row (see db below)
+    //printf("%s: n = %d, d = %g\n", __func__, n, d);
+    const block_iq4_xxs * x = (const block_iq4_xxs *)(dptr + 1); // blocks start right after the leading float
+    const block_q8_K    * y = (const block_q8_K    *)vy;
+    int nblock = n/QK_K;
+    float sumf = 0;
+    for (int ibl = 0; ibl < nblock; ++ibl) { // one iteration per QK_K-sized block
+        //int sumi = 0;
+        auto qy = y[ibl].qs;
+        auto qx = x[ibl].qs;
+        float db = d * y[ibl].d; // row scale times the scale of the matching q8_K block
+        for (int ib = 0; ib < QK_K/kBlockSize; ++ib) { // one iteration per group of kBlockSize values
+            float dl = db * ((x[ibl].scales[ib] & 254) - 127); // bit 0 is masked off; the remaining bits minus 127 give the group scale
+            //int ls = (x[ibl].scales[ib] & 254) - 127;
+            const int8_t * values = iq4k_values + ((x[ibl].scales[ib] & 1) << 4); // bit 0 selects offset 0 or 16 into iq4k_values (presumably two 16-entry tables; confirm at its definition)
+            int suml = 0;
+            for (int j = 0; j < kBlockSize/2; ++j) { // each qx byte packs two 4-bit indices into values
+                suml += qy[j               ] * values[qx[j] & 0xf] // low nibble pairs with the first half of the group
+                      + qy[j + kBlockSize/2] * values[qx[j] >>  4]; // high nibble pairs with the second half of the group
+            }
+            sumf += dl * suml; // apply the group scale to the integer sum and accumulate in float
+            //sumi += ls * suml;
+            qy += kBlockSize; // advance past the kBlockSize q8 values just consumed
+            qx += kBlockSize/2; // advance past the kBlockSize/2 packed bytes just consumed
+        }
+        //sumf += d * y[ibl].d * sumi;
+    }
+    *s = sumf; // single output value
 }