iq4_xxs: scalar CPU dot product

Also fix the breakage I caused with the dedicated quantization
portion of the work buffer when the multiplication is not done
via iqk_mul_mat.
This commit is contained in:
Iwan Kawrakow
2024-10-08 17:11:42 +03:00
parent 81bd33213d
commit 834af69e47
2 changed files with 47 additions and 2 deletions

View File

@@ -13279,7 +13279,7 @@ static void ggml_compute_forward_mul_mat_one_chunk(
return;
}
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
const void * wdata = (src1->type == vec_dot_type) ? src1->data : (char *)params->wdata + params->wsize - params->qsize + GGML_MAX_NAME;
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
assert(ne12 % ne02 == 0);
@@ -13534,6 +13534,11 @@ IQK_MulMat_Not_Available2:;
UseGgmlGemm2:;
#endif
if (ith == 0) {
atomic_store(&params->shared->current_chunk, nth);
}
ggml_barrier(params->shared);
// This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers)
const int64_t nr0 = ne0;

View File

@@ -2376,6 +2376,46 @@ void dequantize_row_iq4_xxs(const block_iq4_xxs * x, float * y, int64_t k) {
}
}
void vec_dot_iq4_xxs_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
void vec_dot_iq4_xxs_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) {
constexpr int kBlockSize = 32;
//#if GGML_USE_IQK_MULMAT
// if (iqk_mul_mat(1, 1, n, GGML_TYPE_IQ4_XXS, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) {
// return;
// }
//#endif
GGML_ASSERT(n%QK_K == 0);
GGML_ASSERT(nrc == 1);
GGML_UNUSED(bs);
GGML_UNUSED(bx);
GGML_UNUSED(by);
const float * dptr = (const float *)vx;
const float d = *dptr;
//printf("%s: n = %d, d = %g\n", __func__, n, d);
const block_iq4_xxs * x = (const block_iq4_xxs *)(dptr + 1);
const block_q8_K * y = (const block_q8_K *)vy;
int nblock = n/QK_K;
float sumf = 0;
for (int ibl = 0; ibl < nblock; ++ibl) {
//int sumi = 0;
auto qy = y[ibl].qs;
auto qx = x[ibl].qs;
float db = d * y[ibl].d;
for (int ib = 0; ib < QK_K/kBlockSize; ++ib) {
float dl = db * ((x[ibl].scales[ib] & 254) - 127);
//int ls = (x[ibl].scales[ib] & 254) - 127;
const int8_t * values = iq4k_values + ((x[ibl].scales[ib] & 1) << 4);
int suml = 0;
for (int j = 0; j < kBlockSize/2; ++j) {
suml += qy[j ] * values[qx[j] & 0xf]
+ qy[j + kBlockSize/2] * values[qx[j] >> 4];
}
sumf += dl * suml;
//sumi += ls * suml;
qy += kBlockSize;
qx += kBlockSize/2;
}
//sumf += d * y[ibl].d * sumi;
}
*s = sumf;
}