From 834af69e47bc80961c0955baf69fb2b7c2267014 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Tue, 8 Oct 2024 17:11:42 +0300 Subject: [PATCH] iq4_xxs: scalar CPU dot product Also fix the breakage I caused with the dedicated work buffer quantization portion when the multiplication is not done via iqk_mul_mat. --- ggml/src/ggml.c | 7 +++++- ggml/src/iqk/iqk_quantize.cpp | 42 ++++++++++++++++++++++++++++++++++- 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 44b5ad54..c69a9109 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -13279,7 +13279,7 @@ static void ggml_compute_forward_mul_mat_one_chunk( return; } - const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const void * wdata = (src1->type == vec_dot_type) ? src1->data : (char *)params->wdata + params->wsize - params->qsize + GGML_MAX_NAME; const size_t row_size = ggml_row_size(vec_dot_type, ne10); assert(ne12 % ne02 == 0); @@ -13534,6 +13534,11 @@ IQK_MulMat_Not_Available2:; UseGgmlGemm2:; #endif + if (ith == 0) { + atomic_store(&params->shared->current_chunk, nth); + } + ggml_barrier(params->shared); + // This is the size of the first dimension of the result, so we can iterate that way. 
(see the ASSERT above, these are the same numbers) const int64_t nr0 = ne0; diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp index 835b4ea0..5cf0a152 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -2376,6 +2376,46 @@ void dequantize_row_iq4_xxs(const block_iq4_xxs * x, float * y, int64_t k) { } } -void vec_dot_iq4_xxs_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void vec_dot_iq4_xxs_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) { + constexpr int kBlockSize = 32; +//#if GGML_USE_IQK_MULMAT +// if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ4_XXS, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) { +// return; +// } +//#endif + GGML_ASSERT(n%QK_K == 0); + GGML_ASSERT(nrc == 1); + GGML_UNUSED(bs); + GGML_UNUSED(bx); + GGML_UNUSED(by); + const float * dptr = (const float *)vx; + const float d = *dptr; + //printf("%s: n = %d, d = %g\n", __func__, n, d); + const block_iq4_xxs * x = (const block_iq4_xxs *)(dptr + 1); + const block_q8_K * y = (const block_q8_K *)vy; + int nblock = n/QK_K; + float sumf = 0; + for (int ibl = 0; ibl < nblock; ++ibl) { + //int sumi = 0; + auto qy = y[ibl].qs; + auto qx = x[ibl].qs; + float db = d * y[ibl].d; + for (int ib = 0; ib < QK_K/kBlockSize; ++ib) { + float dl = db * ((x[ibl].scales[ib] & 254) - 127); + //int ls = (x[ibl].scales[ib] & 254) - 127; + const int8_t * values = iq4k_values + ((x[ibl].scales[ib] & 1) << 4); + int suml = 0; + for (int j = 0; j < kBlockSize/2; ++j) { + suml += qy[j ] * values[qx[j] & 0xf] + + qy[j + kBlockSize/2] * values[qx[j] >> 4]; + } + sumf += dl * suml; + //sumi += ls * suml; + qy += kBlockSize; + qx += kBlockSize/2; + } + //sumf += d * y[ibl].d * sumi; + } + *s = sumf; }