Interleave 8 rows (Q8_0, IQ4_XS) (#178)

* Try interleaving 8 rows for iq4_xs

On Zen4, PP-512 goes up from ~260 t/s to 288 t/s for L3-8B.
TG-128 reaches maximum performance at just 2 threads and is slightly
higher than with 4 interleaved rows (14.48 t/s vs 13.11 t/s @ 2 threads
and 14.28 t/s @ 4 threads).
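
For orientation: interleaving N rows means storing block b of rows
r..r+N-1 contiguously, so the gemm kernel can load the same block
position from all 8 rows with one sequential read. A minimal sketch of
the layout on plain float data (illustrative only; the actual repacking
in this PR operates on quantized iq4_xs/q8_0 blocks and also shuffles
nibbles within each block, see the diff below):

    #include <cstddef>
    #include <vector>

    // Illustrative sketch only: interleave 8 rows at block granularity so
    // that block b of rows r..r+7 becomes contiguous. repack_iq4_xs /
    // repack_q8_0 below do the same traversal on quantized blocks.
    std::vector<float> interleave_rows_8(const std::vector<float> & src,
                                         size_t nrows, size_t n_per_row,
                                         size_t block_size = 32) {
        std::vector<float> dst(src.size());
        const size_t nblock = n_per_row / block_size;
        size_t out = 0;
        for (size_t row = 0; row < nrows; row += 8) {
            for (size_t b = 0; b < nblock; ++b) {
                for (size_t k = 0; k < 8; ++k) {            // the 8 interleaved rows
                    for (size_t i = 0; i < block_size; ++i) {
                        dst[out++] = src[(row + k)*n_per_row + b*block_size + i];
                    }
                }
            }
        }
        return dst;
    }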

* Try interleaving 8 iq4_xs rows

It is also faster on AVX2.

This is the NEON implementation. It is a tiny bit faster than
4 interleaved rows (~0.5%).

So, this looks like a winner given the Zen4/AVX2 improvement
without an associated NEON regression.

* Cleanup

* 8-rows interleaved q8_0 (AVX2)

* 8-rows interleaved q8_0 (Zen4)

* 8-rows interleaved q8_0 (Zen4) - slightly better

PP-512 is now 284 t/s compared to 257 t/s with 4 interleaved rows.
TG-128 reaches a peak of 8.16 t/s at just 2 threads compared
to 7.95 t/s @ 4 threads before.

* 8-rows interleaved q8_0 (NEON)

PP-512 is slightly better (138 t/s vs 132.5 t/s), while TG-128 is about
the same.

* FA: repack Q8_0 to Q8_0_R8

* Remove special purpose mul_mat_q8_0_r4_q8_1_128 (Zen4)

* FA: repack Q8_0 to Q8_0_R8 (NEON)

Very slightly faster than the general purpose gemm, slightly
slower than the D = 128 special case gemm mul_mat_q8_0_r4_q8_0_128.
Still, mul_mat_q8_0_r4_q8_0_128 gets removed: we simply don't have
enough vector registers to hold 8 interleaved rows, so there is
no point in keeping the special purpose implementation.
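
To make the register pressure concrete, here is a back-of-the-envelope
count (my illustration, not code from this commit; it assumes the
standard 32-register AArch64 NEON file):

    // Sketch of the NEON register budget for 8 interleaved Q8_0 rows.
    constexpr int kVecRegs     = 32;  // AArch64 NEON: v0..v31
    constexpr int kBytesPerReg = 16;  // 128 bits per vector register
    constexpr int kQK8_0       = 32;  // int8 values per Q8_0 block
    constexpr int kRows        = 8;   // interleaved rows

    // One block column of all 8 rows: 8*32 = 256 bytes = 16 registers,
    // i.e. half the register file before counting the activation tile
    // and the accumulators a D = 128 special case would keep resident.
    constexpr int kRegsForLhs = kRows * kQK8_0 / kBytesPerReg;
    static_assert(kRegsForLhs == 16, "LHS alone takes half of v0..v31");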

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Kawrakow authored on 2025-01-27 16:50:07 +02:00, committed by GitHub
parent 814d3e054c
commit d9c4ea48d1

6 changed files with 437 additions and 431 deletions


@@ -3709,63 +3709,63 @@ void vec_dot_q4_0_r4_q8_0(int n, float * s, size_t bs, const void * vx, size_t b
 //
 // ========================================= q8_0_r4
 //
-void quantize_row_q8_0_r4_ref(const float * x, block_q8_0_x4 * y, int64_t k) {
+void quantize_row_q8_0_r4_ref(const float * x, block_q8_0_r8 * y, int64_t k) {
     // we assume we are called with 4 rows
-    quantize_q8_0_r4(x, (void *)y, 4, k/4, nullptr);
+    quantize_q8_0_r4(x, (void *)y, 8, k/8, nullptr);
 }
 void quantize_row_q8_0_r4(const float * x, void * y, int64_t k) {
     // we assume we are called with 4 rows
-    quantize_q8_0_r4(x, y, 4, k/4, nullptr);
+    quantize_q8_0_r4(x, y, 8, k/8, nullptr);
 }
-static void repack_q8_0(int nrows, int n_per_row, const block_q8_0 * x, block_q8_0_x4 * y) {
-    GGML_ASSERT(nrows%4 == 0);
+static void repack_q8_0(int nrows, int n_per_row, const block_q8_0 * x, block_q8_0_r8 * y) {
+    GGML_ASSERT(nrows%8 == 0);
     GGML_ASSERT(n_per_row%QK8_0 == 0);
     int nblock = n_per_row/QK8_0;
-    const block_q8_0 * x4[4];
-    for (int row = 0; row < nrows; row += 4) {
-        for (int k = 0; k < 4; ++k) x4[k] = x + nblock*k;
+    const block_q8_0 * x8[8];
+    for (int row = 0; row < nrows; row += 8) {
+        for (int k = 0; k < 8; ++k) x8[k] = x + nblock*k;
         for (int ib = 0; ib < nblock; ++ib) {
-            for (int k = 0; k < 4; ++k) y[ib].d[k] = x4[k][ib].d;
+            for (int k = 0; k < 8; ++k) y[ib].d[k] = x8[k][ib].d;
             for (int l = 0; l < 4; ++l) {
-                for (int k = 0; k < 4; ++k) for (int i = 0; i < 4; ++i) {
-                    y[ib].qs[32*l+4*k+i+ 0] = x4[k][ib].qs[i+4*l+ 0];
-                    y[ib].qs[32*l+4*k+i+16] = x4[k][ib].qs[i+4*l+16];
+                for (int k = 0; k < 8; ++k) for (int i = 0; i < 4; ++i) {
+                    y[ib].qs[32*l+4*k+i+ 0] = x8[k][ib].qs[i+4*l+ 0];
+                    y[ib].qs[32*l+4*k+i+128] = x8[k][ib].qs[i+4*l+16];
                }
            }
        }
-        x += 4*nblock;
+        x += 8*nblock;
        y += nblock;
    }
}
size_t quantize_q8_0_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
-    GGML_ASSERT(nrows%4 == 0);
+    GGML_ASSERT(nrows%8 == 0);
     auto row_size_0 = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
-    std::vector<char> qtmp(4*row_size_0);
+    std::vector<char> qtmp(8*row_size_0);
     char * qrow = (char *)dst;
-    for (int row = 0; row < nrows; row += 4) {
-        quantize_q8_0(src, qtmp.data(), 4, n_per_row, imatrix);
-        repack_q8_0(4, n_per_row, (const block_q8_0 *)qtmp.data(), (block_q8_0_x4 *)qrow);
-        src += 4*n_per_row;
-        qrow += 4*row_size_0;
+    for (int row = 0; row < nrows; row += 8) {
+        quantize_q8_0(src, qtmp.data(), 8, n_per_row, imatrix);
+        repack_q8_0(8, n_per_row, (const block_q8_0 *)qtmp.data(), (block_q8_0_r8 *)qrow);
+        src += 8*n_per_row;
+        qrow += 8*row_size_0;
     }
     return nrows*row_size_0;
 }
-void dequantize_row_q8_0_r4(const block_q8_0_x4 * x, float * y, int64_t k) {
+void dequantize_row_q8_0_r4(const block_q8_0_r8 * x, float * y, int64_t k) {
     // we assume we are called with 4 rows
-    int n_per_row = k/4;
+    int n_per_row = k/8;
     int nb = n_per_row/QK8_0;
-    float * yk[4];
-    for (int k = 0; k < 4; ++k) yk[k] = y + k*n_per_row;
+    float * yk[8];
+    for (int k = 0; k < 8; ++k) yk[k] = y + k*n_per_row;
     for (int ib = 0; ib < nb; ++ib) {
-        for (int k = 0; k < 4; ++k) {
+        for (int k = 0; k < 8; ++k) {
             float scale = GGML_FP16_TO_FP32(x[ib].d[k]);
             for (int l = 0; l < 4; ++l) for (int i = 0; i < 4; ++i) {
-                yk[k][QK8_0*ib+4*l+i+ 0] = scale * x[ib].qs[QK8_0*l+4*k+i+ 0];
-                yk[k][QK8_0*ib+4*l+i+16] = scale * x[ib].qs[QK8_0*l+4*k+i+16];
+                yk[k][QK8_0*ib+4*l+i+ 0] = scale * x[ib].qs[32*l+4*k+i+ 0];
+                yk[k][QK8_0*ib+4*l+i+16] = scale * x[ib].qs[32*l+4*k+i+128];
             }
         }
    }
@@ -3987,93 +3987,77 @@ void vec_dot_q6_0_r4_q8_0(int n, float * s, size_t bs, const void * vx, size_t b
 //
 void quantize_row_iq4_xs_r4_ref(const float * x, block_iq4_xs_r4 * y, int64_t k) {
-    quantize_iq4_xs_r4(x, (void *)y, 4, k/4, nullptr);
+    quantize_iq4_xs_r4(x, (void *)y, 8, k/8, nullptr);
 }
 void quantize_row_iq4_xs_r4(const float * x, void * y, int64_t k) {
-    quantize_iq4_xs_r4(x, y, 4, k/4, nullptr);
+    quantize_iq4_xs_r4(x, y, 8, k/8, nullptr);
 }
 static void repack_iq4_xs(int nrows, int n_per_row, const block_iq4_xs * x, block_iq4_xs_r4 * y) {
-    GGML_ASSERT(nrows%4 == 0);
+    GGML_ASSERT(nrows%8 == 0);
     GGML_ASSERT(n_per_row%QK_K == 0);
     int nblock = n_per_row/QK_K;
-    const block_iq4_xs * x4[4];
-    for (int row = 0; row < nrows; row += 4) {
-        for (int k = 0; k < 4; ++k) x4[k] = x + nblock*k;
+    const block_iq4_xs * x8[8];
+    for (int row = 0; row < nrows; row += 8) {
+        for (int k = 0; k < 8; ++k) x8[k] = x + nblock*k;
         for (int ibl = 0; ibl < nblock; ++ibl) {
-            std::memset(y[ibl].scales_l, 0, QK_K/16);
-            std::memset(y[ibl].scales_h, 0, QK_K/32);
-            for (int k = 0; k < 4; ++k) {
-                y[ibl].d[k] = x4[k][ibl].d;
+            std::memset(y[ibl].scales_l, 0, QK_K/8);
+            std::memset(y[ibl].scales_h, 0, QK_K/16);
+            for (int k = 0; k < 8; ++k) {
+                y[ibl].d[k] = x8[k][ibl].d;
                 for (int ib = 0; ib < QK_K/32; ++ib) {
-                    uint8_t sl = (x4[k][ibl].scales_l[ib/2] >> 4*(ib%2)) & 0xf;
-                    uint8_t sh = (x4[k][ibl].scales_h >> 2*ib) & 3;
-                    int i = 4*ib + k;
-                    y[ibl].scales_l[i%16] |= (sl << 4*(i/16));
-                    y[ibl].scales_h[i%8 ] |= (sh << 2*(i/8));
-                }
-            }
-            for (int ib = 0; ib < QK_K/32; ++ib) {
-                for (int k = 0; k < 4; ++k) for (int i = 0; i < 4; ++i) {
-                    y[ibl].qs[64*ib+4*k+i+ 0] = (x4[k][ibl].qs[16*ib+i+0] & 0xf) | ((x4[k][ibl].qs[16*ib+i+ 8] & 0x0f) << 4); // 0....3 + 8...11 from each row
-                    y[ibl].qs[64*ib+4*k+i+16] = (x4[k][ibl].qs[16*ib+i+0] >> 4) | ((x4[k][ibl].qs[16*ib+i+ 8] & 0xf0)); // 16...19 + 24...27 from each row
-                    y[ibl].qs[64*ib+4*k+i+32] = (x4[k][ibl].qs[16*ib+i+4] & 0xf) | ((x4[k][ibl].qs[16*ib+i+12] & 0x0f) << 4); // 4....7 + 12...15 from each row
-                    y[ibl].qs[64*ib+4*k+i+48] = (x4[k][ibl].qs[16*ib+i+4] >> 4) | ((x4[k][ibl].qs[16*ib+i+12] & 0xf0)); // 20...23 + 28...31 from each row
+                    uint8_t sl = (x8[k][ibl].scales_l[ib/2] >> 4*(ib%2)) & 0xf;
+                    uint8_t sh = (x8[k][ibl].scales_h >> 2*ib) & 3;
+                    int i = 8*ib + k;
+                    y[ibl].scales_l[i%32] |= (sl << 4*(i/32));
+                    y[ibl].scales_h[i%16] |= (sh << 2*(i/16));
+                    for (int i = 0; i < 4; ++i) {
+                        y[ibl].qs[128*ib+4*k+i+ 0] = (x8[k][ibl].qs[16*ib+i+0] & 0xf) | ((x8[k][ibl].qs[16*ib+i+ 4] & 0xf) << 4);
+                        y[ibl].qs[128*ib+4*k+i+32] = (x8[k][ibl].qs[16*ib+i+8] & 0xf) | ((x8[k][ibl].qs[16*ib+i+12] & 0xf) << 4);
+                        y[ibl].qs[128*ib+4*k+i+64] = (x8[k][ibl].qs[16*ib+i+0] >> 4) | ((x8[k][ibl].qs[16*ib+i+ 4] >> 4) << 4);
+                        y[ibl].qs[128*ib+4*k+i+96] = (x8[k][ibl].qs[16*ib+i+8] >> 4) | ((x8[k][ibl].qs[16*ib+i+12] >> 4) << 4);
+                    }
                 }
             }
         }
-        x += 4*nblock;
+        x += 8*nblock;
         y += nblock;
     }
 }
 size_t quantize_iq4_xs_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
-    GGML_ASSERT(nrows%4 == 0);
+    GGML_ASSERT(nrows%8 == 0);
     GGML_ASSERT(n_per_row%QK_K == 0);
     char * qcur = (char *)dst;
     auto row_size = ggml_row_size(GGML_TYPE_IQ4_XS, n_per_row);
-    std::vector<char> qtmp(4*row_size);
-    for (int row = 0; row < nrows; row += 4) {
-        quantize_iq4_xs(src, (void *)qtmp.data(), 4, n_per_row, imatrix);
-        repack_iq4_xs(4, n_per_row, (const block_iq4_xs *)qtmp.data(), (block_iq4_xs_r4 *)qcur);
-        qcur += 4*row_size;
-        src += 4*n_per_row;
+    std::vector<char> qtmp(8*row_size);
+    for (int row = 0; row < nrows; row += 8) {
+        quantize_iq4_xs(src, (void *)qtmp.data(), 8, n_per_row, imatrix);
+        repack_iq4_xs(8, n_per_row, (const block_iq4_xs *)qtmp.data(), (block_iq4_xs_r4 *)qcur);
+        qcur += 8*row_size;
+        src += 8*n_per_row;
     }
     return nrows*row_size;
 }
 void dequantize_row_iq4_xs_r4(const block_iq4_xs_r4 * x, float * y, int64_t k) {
-    auto n_per_row = k/4;
-    float * y4[4] = {y, y + n_per_row, y + 2*n_per_row, y + 3*n_per_row};
+    auto n_per_row = k/8;
+    float * y8[8];
+    for (int k = 0; k < 8; ++k) y8[k] = y + n_per_row*k;
     int nblock = n_per_row/QK_K;
     for (int ibl = 0; ibl < nblock; ++ibl) {
-        for (int k = 0; k < 4; ++k) {
+        for (int k = 0; k < 8; ++k) {
             const float d = GGML_FP16_TO_FP32(x[ibl].d[k]);
             for (int ib = 0; ib < QK_K/32; ++ib) {
-                int is = 4*ib + k;
-                float dl = d * ((((x[ibl].scales_l[is%16] >> 4*(is/16)) & 0xf) | (((x[ibl].scales_h[is%8] >> 2*(is/8)) & 3) << 4)) - 32);
-                for (int i = 0; i < 4; ++i) {
-                    y4[k][QK_K*ibl+32*ib+i+ 0] = dl * iq4k_values[x[ibl].qs[64*ib+4*k+i+ 0] & 0xf];
-                    y4[k][QK_K*ibl+32*ib+i+ 8] = dl * iq4k_values[x[ibl].qs[64*ib+4*k+i+ 0] >> 4];
-                    y4[k][QK_K*ibl+32*ib+i+16] = dl * iq4k_values[x[ibl].qs[64*ib+4*k+i+16] & 0xf];
-                    y4[k][QK_K*ibl+32*ib+i+24] = dl * iq4k_values[x[ibl].qs[64*ib+4*k+i+16] >> 4];
-                    y4[k][QK_K*ibl+32*ib+i+ 4] = dl * iq4k_values[x[ibl].qs[64*ib+4*k+i+32] & 0xf];
-                    y4[k][QK_K*ibl+32*ib+i+12] = dl * iq4k_values[x[ibl].qs[64*ib+4*k+i+32] >> 4];
-                    y4[k][QK_K*ibl+32*ib+i+20] = dl * iq4k_values[x[ibl].qs[64*ib+4*k+i+48] & 0xf];
-                    y4[k][QK_K*ibl+32*ib+i+28] = dl * iq4k_values[x[ibl].qs[64*ib+4*k+i+48] >> 4];
+                int is = 8*ib + k;
+                float dl = d * ((((x[ibl].scales_l[is%32] >> 4*(is/32)) & 0xf) | (((x[ibl].scales_h[is%16] >> 2*(is/16)) & 3) << 4)) - 32);
+                for (int l = 0; l < 4; ++l) for (int i = 0; i < 4; ++i) {
+                    y8[k][QK_K*ibl+32*ib+8*l+i+0] = dl * iq4k_values[x[ibl].qs[128*ib+4*k+i+32*l] & 0xf];
+                    y8[k][QK_K*ibl+32*ib+8*l+i+4] = dl * iq4k_values[x[ibl].qs[128*ib+4*k+i+32*l] >> 4];
                 }
             }
         }
-        //dequantize_row_iq4_xs(x + ib, ytmp, QK_K);
-        //for (int k = 0; k < 4; ++k) {
-        //    for (int l = 0; l < 16; ++l) {
-        //        for (int i = 0; i < 4; ++i) {
-        //            //y4[k][ib*kBlockSize + i + 16*(l%4) + 4*(l/4)] = ytmp[16*l + 4*k + i];
-        //            y4[k][ib*kBlockSize + i + 8*(l%8) + 4*(l/8)] = ytmp[16*l + 4*k + i];
-        //        }
-        //    }
-        //}
     }
 }
@@ -6063,7 +6047,7 @@ void iqk_repack_tensor(struct ggml_tensor * tensor) {
         { GGML_TYPE_IQ3_K, { GGML_TYPE_IQ3_K_R4, 4, (Repack::repack_func)repack_iq3_k} },
         { GGML_TYPE_IQ4_K, { GGML_TYPE_IQ4_K_R4, 4, (Repack::repack_func)repack_iq4_k} },
         { GGML_TYPE_IQ5_K, { GGML_TYPE_IQ5_K_R4, 4, (Repack::repack_func)repack_iq5_k} },
-        { GGML_TYPE_IQ4_XS, { GGML_TYPE_IQ4_XS_R4, 4, (Repack::repack_func)repack_iq4_xs} },
+        { GGML_TYPE_IQ4_XS, { GGML_TYPE_IQ4_XS_R4, 8, (Repack::repack_func)repack_iq4_xs} },
         { GGML_TYPE_IQ4_KS, { GGML_TYPE_IQ4_KS_R4, 4, (Repack::repack_func)repack_iq4_ks} },
         { GGML_TYPE_IQ4_NL, { GGML_TYPE_IQ4_NL_R4, 4, (Repack::repack_func)repack_iq4_nl} },
         { GGML_TYPE_IQ2_BN, { GGML_TYPE_IQ2_BN_R4, 4, (Repack::repack_func)repack_iq2_bn} },
@@ -6080,7 +6064,7 @@ void iqk_repack_tensor(struct ggml_tensor * tensor) {
         { GGML_TYPE_Q4_0, { GGML_TYPE_Q4_0_R4, 4, (Repack::repack_func)repack_q4_0} },
         { GGML_TYPE_Q5_0, { GGML_TYPE_Q5_0_R4, 4, (Repack::repack_func)repack_q5_0} },
         { GGML_TYPE_Q6_0, { GGML_TYPE_Q6_0_R4, 4, (Repack::repack_func)repack_q6_0} },
-        { GGML_TYPE_Q8_0, { GGML_TYPE_Q8_0_R4, 4, (Repack::repack_func)repack_q8_0} },
+        { GGML_TYPE_Q8_0, { GGML_TYPE_Q8_0_R4, 8, (Repack::repack_func)repack_q8_0} },
         { GGML_TYPE_Q8_K, { GGML_TYPE_Q8_K_R8, 8, (Repack::repack_func)repack_q8_k} },
 #ifdef __AVX512BF16__
         { GGML_TYPE_BF16, { GGML_TYPE_BF16_R16, 16, (Repack::repack_func)repack_bf16<ggml_bf16_t>}},