mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-04-27 01:49:28 +00:00
Interleave 8 rows (Q8_0, IQ4_XS) (#178)
* Try interleaving 8 rows for iq4_xs On Zen4, PP-512 goes up from ~260 t/s to 288 t/s for L3-8B. TG-128 reaches max. performance at 2 threads and is slightly higher than 4 interleaved rows (14.48 t/s vs 13.11 t/s @ 2 threads and 14.28 t/s @ 4 threads). * Try interleaving 8 iq4_xs rows It is also faster on AVX2. This is the NEON implementation. It is tiny bit faster than 4 interleaved rows (~0.5%). So, this looks like a winner given the Zen4/AVX2 improvement without associated NEON regression. * Cleanup * 8-rows interleaved q8_0 (AVX2) * 8-rows interleaved q8_0 (Zen4) * 8-rows interleaved q8_0 (Zen4) - slightly better PP-512 is now 284 t/s compared to 257 t/s for 4-rows interleaved. TG-128 reaches peak of 8.16 t/s at just 2 threads compared to 7.95 t/s @ 4 threads before. * 8-rows interleaved q8_0 (NEON) PP-512 is slightly better (138 t/s vs 132.5 t/s), TG-128 is about the same. * FA: repack Q8_0 to Q8_0_R8 * Remove special purpose mul_mat_q8_0_r4_q8_1_128 (Zen4) * FA: repack Q8_0 to Q8_0_R8 (NEON) Very slightly faster than the general purpose gemm, slightly slower than the D = 128 special case gemm mul_mat_q8_0_r4_q8_0_128. Still removing mul_mat_q8_0_r4_q8_0_128 as we simply don't have enough vector registers to hold 8 interleaved rows, so there is no point to have the special purpose implementation. --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
@@ -3709,63 +3709,63 @@ void vec_dot_q4_0_r4_q8_0(int n, float * s, size_t bs, const void * vx, size_t b
|
||||
//
|
||||
// ========================================= q8_0_r4
|
||||
//
|
||||
void quantize_row_q8_0_r4_ref(const float * x, block_q8_0_x4 * y, int64_t k) {
|
||||
void quantize_row_q8_0_r4_ref(const float * x, block_q8_0_r8 * y, int64_t k) {
|
||||
// we assume we are called with 4 rows
|
||||
quantize_q8_0_r4(x, (void *)y, 4, k/4, nullptr);
|
||||
quantize_q8_0_r4(x, (void *)y, 8, k/8, nullptr);
|
||||
}
|
||||
|
||||
void quantize_row_q8_0_r4(const float * x, void * y, int64_t k) {
|
||||
// we assume we are called with 4 rows
|
||||
quantize_q8_0_r4(x, y, 4, k/4, nullptr);
|
||||
quantize_q8_0_r4(x, y, 8, k/8, nullptr);
|
||||
}
|
||||
|
||||
static void repack_q8_0(int nrows, int n_per_row, const block_q8_0 * x, block_q8_0_x4 * y) {
|
||||
GGML_ASSERT(nrows%4 == 0);
|
||||
static void repack_q8_0(int nrows, int n_per_row, const block_q8_0 * x, block_q8_0_r8 * y) {
|
||||
GGML_ASSERT(nrows%8 == 0);
|
||||
GGML_ASSERT(n_per_row%QK8_0 == 0);
|
||||
int nblock = n_per_row/QK8_0;
|
||||
const block_q8_0 * x4[4];
|
||||
for (int row = 0; row < nrows; row += 4) {
|
||||
for (int k = 0; k < 4; ++k) x4[k] = x + nblock*k;
|
||||
const block_q8_0 * x8[8];
|
||||
for (int row = 0; row < nrows; row += 8) {
|
||||
for (int k = 0; k < 8; ++k) x8[k] = x + nblock*k;
|
||||
for (int ib = 0; ib < nblock; ++ib) {
|
||||
for (int k = 0; k < 4; ++k) y[ib].d[k] = x4[k][ib].d;
|
||||
for (int k = 0; k < 8; ++k) y[ib].d[k] = x8[k][ib].d;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
for (int k = 0; k < 4; ++k) for (int i = 0; i < 4; ++i) {
|
||||
y[ib].qs[32*l+4*k+i+ 0] = x4[k][ib].qs[i+4*l+ 0];
|
||||
y[ib].qs[32*l+4*k+i+16] = x4[k][ib].qs[i+4*l+16];
|
||||
for (int k = 0; k < 8; ++k) for (int i = 0; i < 4; ++i) {
|
||||
y[ib].qs[32*l+4*k+i+ 0] = x8[k][ib].qs[i+4*l+ 0];
|
||||
y[ib].qs[32*l+4*k+i+128] = x8[k][ib].qs[i+4*l+16];
|
||||
}
|
||||
}
|
||||
}
|
||||
x += 4*nblock;
|
||||
x += 8*nblock;
|
||||
y += nblock;
|
||||
}
|
||||
}
|
||||
|
||||
size_t quantize_q8_0_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
|
||||
GGML_ASSERT(nrows%4 == 0);
|
||||
GGML_ASSERT(nrows%8 == 0);
|
||||
auto row_size_0 = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
|
||||
std::vector<char> qtmp(4*row_size_0);
|
||||
std::vector<char> qtmp(8*row_size_0);
|
||||
char * qrow = (char *)dst;
|
||||
for (int row = 0; row < nrows; row += 4) {
|
||||
quantize_q8_0(src, qtmp.data(), 4, n_per_row, imatrix);
|
||||
repack_q8_0(4, n_per_row, (const block_q8_0 *)qtmp.data(), (block_q8_0_x4 *)qrow);
|
||||
src += 4*n_per_row;
|
||||
qrow += 4*row_size_0;
|
||||
for (int row = 0; row < nrows; row += 8) {
|
||||
quantize_q8_0(src, qtmp.data(), 8, n_per_row, imatrix);
|
||||
repack_q8_0(8, n_per_row, (const block_q8_0 *)qtmp.data(), (block_q8_0_r8 *)qrow);
|
||||
src += 8*n_per_row;
|
||||
qrow += 8*row_size_0;
|
||||
}
|
||||
return nrows*row_size_0;
|
||||
}
|
||||
|
||||
void dequantize_row_q8_0_r4(const block_q8_0_x4 * x, float * y, int64_t k) {
|
||||
void dequantize_row_q8_0_r4(const block_q8_0_r8 * x, float * y, int64_t k) {
|
||||
// we assume we are called with 4 rows
|
||||
int n_per_row = k/4;
|
||||
int n_per_row = k/8;
|
||||
int nb = n_per_row/QK8_0;
|
||||
float * yk[4];
|
||||
for (int k = 0; k < 4; ++k) yk[k] = y + k*n_per_row;
|
||||
float * yk[8];
|
||||
for (int k = 0; k < 8; ++k) yk[k] = y + k*n_per_row;
|
||||
for (int ib = 0; ib < nb; ++ib) {
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
float scale = GGML_FP16_TO_FP32(x[ib].d[k]);
|
||||
for (int l = 0; l < 4; ++l) for (int i = 0; i < 4; ++i) {
|
||||
yk[k][QK8_0*ib+4*l+i+ 0] = scale * x[ib].qs[QK8_0*l+4*k+i+ 0];
|
||||
yk[k][QK8_0*ib+4*l+i+16] = scale * x[ib].qs[QK8_0*l+4*k+i+16];
|
||||
yk[k][QK8_0*ib+4*l+i+ 0] = scale * x[ib].qs[32*l+4*k+i+ 0];
|
||||
yk[k][QK8_0*ib+4*l+i+16] = scale * x[ib].qs[32*l+4*k+i+128];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3987,93 +3987,77 @@ void vec_dot_q6_0_r4_q8_0(int n, float * s, size_t bs, const void * vx, size_t b
|
||||
//
|
||||
|
||||
void quantize_row_iq4_xs_r4_ref(const float * x, block_iq4_xs_r4 * y, int64_t k) {
|
||||
quantize_iq4_xs_r4(x, (void *)y, 4, k/4, nullptr);
|
||||
quantize_iq4_xs_r4(x, (void *)y, 8, k/8, nullptr);
|
||||
}
|
||||
|
||||
void quantize_row_iq4_xs_r4(const float * x, void * y, int64_t k) {
|
||||
quantize_iq4_xs_r4(x, y, 4, k/4, nullptr);
|
||||
quantize_iq4_xs_r4(x, y, 8, k/8, nullptr);
|
||||
}
|
||||
|
||||
static void repack_iq4_xs(int nrows, int n_per_row, const block_iq4_xs * x, block_iq4_xs_r4 * y) {
|
||||
GGML_ASSERT(nrows%4 == 0);
|
||||
GGML_ASSERT(nrows%8 == 0);
|
||||
GGML_ASSERT(n_per_row%QK_K == 0);
|
||||
int nblock = n_per_row/QK_K;
|
||||
const block_iq4_xs * x4[4];
|
||||
for (int row = 0; row < nrows; row += 4) {
|
||||
for (int k = 0; k < 4; ++k) x4[k] = x + nblock*k;
|
||||
const block_iq4_xs * x8[8];
|
||||
for (int row = 0; row < nrows; row += 8) {
|
||||
for (int k = 0; k < 8; ++k) x8[k] = x + nblock*k;
|
||||
for (int ibl = 0; ibl < nblock; ++ibl) {
|
||||
std::memset(y[ibl].scales_l, 0, QK_K/16);
|
||||
std::memset(y[ibl].scales_h, 0, QK_K/32);
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
y[ibl].d[k] = x4[k][ibl].d;
|
||||
std::memset(y[ibl].scales_l, 0, QK_K/8);
|
||||
std::memset(y[ibl].scales_h, 0, QK_K/16);
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
y[ibl].d[k] = x8[k][ibl].d;
|
||||
for (int ib = 0; ib < QK_K/32; ++ib) {
|
||||
uint8_t sl = (x4[k][ibl].scales_l[ib/2] >> 4*(ib%2)) & 0xf;
|
||||
uint8_t sh = (x4[k][ibl].scales_h >> 2*ib) & 3;
|
||||
int i = 4*ib + k;
|
||||
y[ibl].scales_l[i%16] |= (sl << 4*(i/16));
|
||||
y[ibl].scales_h[i%8 ] |= (sh << 2*(i/8));
|
||||
}
|
||||
}
|
||||
for (int ib = 0; ib < QK_K/32; ++ib) {
|
||||
for (int k = 0; k < 4; ++k) for (int i = 0; i < 4; ++i) {
|
||||
y[ibl].qs[64*ib+4*k+i+ 0] = (x4[k][ibl].qs[16*ib+i+0] & 0xf) | ((x4[k][ibl].qs[16*ib+i+ 8] & 0x0f) << 4); // 0....3 + 8...11 from each row
|
||||
y[ibl].qs[64*ib+4*k+i+16] = (x4[k][ibl].qs[16*ib+i+0] >> 4) | ((x4[k][ibl].qs[16*ib+i+ 8] & 0xf0)); // 16...19 + 24...27 from each row
|
||||
y[ibl].qs[64*ib+4*k+i+32] = (x4[k][ibl].qs[16*ib+i+4] & 0xf) | ((x4[k][ibl].qs[16*ib+i+12] & 0x0f) << 4); // 4....7 + 12...15 from each row
|
||||
y[ibl].qs[64*ib+4*k+i+48] = (x4[k][ibl].qs[16*ib+i+4] >> 4) | ((x4[k][ibl].qs[16*ib+i+12] & 0xf0)); // 20...23 + 28...31 from each row
|
||||
uint8_t sl = (x8[k][ibl].scales_l[ib/2] >> 4*(ib%2)) & 0xf;
|
||||
uint8_t sh = (x8[k][ibl].scales_h >> 2*ib) & 3;
|
||||
int i = 8*ib + k;
|
||||
y[ibl].scales_l[i%32] |= (sl << 4*(i/32));
|
||||
y[ibl].scales_h[i%16] |= (sh << 2*(i/16));
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
y[ibl].qs[128*ib+4*k+i+ 0] = (x8[k][ibl].qs[16*ib+i+0] & 0xf) | ((x8[k][ibl].qs[16*ib+i+ 4] & 0xf) << 4);
|
||||
y[ibl].qs[128*ib+4*k+i+32] = (x8[k][ibl].qs[16*ib+i+8] & 0xf) | ((x8[k][ibl].qs[16*ib+i+12] & 0xf) << 4);
|
||||
y[ibl].qs[128*ib+4*k+i+64] = (x8[k][ibl].qs[16*ib+i+0] >> 4) | ((x8[k][ibl].qs[16*ib+i+ 4] >> 4) << 4);
|
||||
y[ibl].qs[128*ib+4*k+i+96] = (x8[k][ibl].qs[16*ib+i+8] >> 4) | ((x8[k][ibl].qs[16*ib+i+12] >> 4) << 4);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
x += 4*nblock;
|
||||
x += 8*nblock;
|
||||
y += nblock;
|
||||
}
|
||||
}
|
||||
|
||||
size_t quantize_iq4_xs_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
|
||||
GGML_ASSERT(nrows%4 == 0);
|
||||
GGML_ASSERT(nrows%8 == 0);
|
||||
GGML_ASSERT(n_per_row%QK_K == 0);
|
||||
char * qcur = (char *)dst;
|
||||
auto row_size = ggml_row_size(GGML_TYPE_IQ4_XS, n_per_row);
|
||||
std::vector<char> qtmp(4*row_size);
|
||||
for (int row = 0; row < nrows; row += 4) {
|
||||
quantize_iq4_xs(src, (void *)qtmp.data(), 4, n_per_row, imatrix);
|
||||
repack_iq4_xs(4, n_per_row, (const block_iq4_xs *)qtmp.data(), (block_iq4_xs_r4 *)qcur);
|
||||
qcur += 4*row_size;
|
||||
src += 4*n_per_row;
|
||||
std::vector<char> qtmp(8*row_size);
|
||||
for (int row = 0; row < nrows; row += 8) {
|
||||
quantize_iq4_xs(src, (void *)qtmp.data(), 8, n_per_row, imatrix);
|
||||
repack_iq4_xs(8, n_per_row, (const block_iq4_xs *)qtmp.data(), (block_iq4_xs_r4 *)qcur);
|
||||
qcur += 8*row_size;
|
||||
src += 8*n_per_row;
|
||||
}
|
||||
return nrows*row_size;
|
||||
}
|
||||
|
||||
void dequantize_row_iq4_xs_r4(const block_iq4_xs_r4 * x, float * y, int64_t k) {
|
||||
auto n_per_row = k/4;
|
||||
float * y4[4] = {y, y + n_per_row, y + 2*n_per_row, y + 3*n_per_row};
|
||||
auto n_per_row = k/8;
|
||||
float * y8[8];
|
||||
for (int k = 0; k < 8; ++k) y8[k] = y + n_per_row*k;
|
||||
int nblock = n_per_row/QK_K;
|
||||
for (int ibl = 0; ibl < nblock; ++ibl) {
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
const float d = GGML_FP16_TO_FP32(x[ibl].d[k]);
|
||||
for (int ib = 0; ib < QK_K/32; ++ib) {
|
||||
int is = 4*ib + k;
|
||||
float dl = d * ((((x[ibl].scales_l[is%16] >> 4*(is/16)) & 0xf) | (((x[ibl].scales_h[is%8] >> 2*(is/8)) & 3) << 4)) - 32);
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
y4[k][QK_K*ibl+32*ib+i+ 0] = dl * iq4k_values[x[ibl].qs[64*ib+4*k+i+ 0] & 0xf];
|
||||
y4[k][QK_K*ibl+32*ib+i+ 8] = dl * iq4k_values[x[ibl].qs[64*ib+4*k+i+ 0] >> 4];
|
||||
y4[k][QK_K*ibl+32*ib+i+16] = dl * iq4k_values[x[ibl].qs[64*ib+4*k+i+16] & 0xf];
|
||||
y4[k][QK_K*ibl+32*ib+i+24] = dl * iq4k_values[x[ibl].qs[64*ib+4*k+i+16] >> 4];
|
||||
y4[k][QK_K*ibl+32*ib+i+ 4] = dl * iq4k_values[x[ibl].qs[64*ib+4*k+i+32] & 0xf];
|
||||
y4[k][QK_K*ibl+32*ib+i+12] = dl * iq4k_values[x[ibl].qs[64*ib+4*k+i+32] >> 4];
|
||||
y4[k][QK_K*ibl+32*ib+i+20] = dl * iq4k_values[x[ibl].qs[64*ib+4*k+i+48] & 0xf];
|
||||
y4[k][QK_K*ibl+32*ib+i+28] = dl * iq4k_values[x[ibl].qs[64*ib+4*k+i+48] >> 4];
|
||||
int is = 8*ib + k;
|
||||
float dl = d * ((((x[ibl].scales_l[is%32] >> 4*(is/32)) & 0xf) | (((x[ibl].scales_h[is%16] >> 2*(is/16)) & 3) << 4)) - 32);
|
||||
for (int l = 0; l < 4; ++l) for (int i = 0; i < 4; ++i) {
|
||||
y8[k][QK_K*ibl+32*ib+8*l+i+0] = dl * iq4k_values[x[ibl].qs[128*ib+4*k+i+32*l] & 0xf];
|
||||
y8[k][QK_K*ibl+32*ib+8*l+i+4] = dl * iq4k_values[x[ibl].qs[128*ib+4*k+i+32*l] >> 4];
|
||||
}
|
||||
}
|
||||
}
|
||||
//dequantize_row_iq4_xs(x + ib, ytmp, QK_K);
|
||||
//for (int k = 0; k < 4; ++k) {
|
||||
// for (int l = 0; l < 16; ++l) {
|
||||
// for (int i = 0; i < 4; ++i) {
|
||||
// //y4[k][ib*kBlockSize + i + 16*(l%4) + 4*(l/4)] = ytmp[16*l + 4*k + i];
|
||||
// y4[k][ib*kBlockSize + i + 8*(l%8) + 4*(l/8)] = ytmp[16*l + 4*k + i];
|
||||
// }
|
||||
// }
|
||||
//}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6063,7 +6047,7 @@ void iqk_repack_tensor(struct ggml_tensor * tensor) {
|
||||
{ GGML_TYPE_IQ3_K, { GGML_TYPE_IQ3_K_R4, 4, (Repack::repack_func)repack_iq3_k} },
|
||||
{ GGML_TYPE_IQ4_K, { GGML_TYPE_IQ4_K_R4, 4, (Repack::repack_func)repack_iq4_k} },
|
||||
{ GGML_TYPE_IQ5_K, { GGML_TYPE_IQ5_K_R4, 4, (Repack::repack_func)repack_iq5_k} },
|
||||
{ GGML_TYPE_IQ4_XS, { GGML_TYPE_IQ4_XS_R4, 4, (Repack::repack_func)repack_iq4_xs} },
|
||||
{ GGML_TYPE_IQ4_XS, { GGML_TYPE_IQ4_XS_R4, 8, (Repack::repack_func)repack_iq4_xs} },
|
||||
{ GGML_TYPE_IQ4_KS, { GGML_TYPE_IQ4_KS_R4, 4, (Repack::repack_func)repack_iq4_ks} },
|
||||
{ GGML_TYPE_IQ4_NL, { GGML_TYPE_IQ4_NL_R4, 4, (Repack::repack_func)repack_iq4_nl} },
|
||||
{ GGML_TYPE_IQ2_BN, { GGML_TYPE_IQ2_BN_R4, 4, (Repack::repack_func)repack_iq2_bn} },
|
||||
@@ -6080,7 +6064,7 @@ void iqk_repack_tensor(struct ggml_tensor * tensor) {
|
||||
{ GGML_TYPE_Q4_0, { GGML_TYPE_Q4_0_R4, 4, (Repack::repack_func)repack_q4_0} },
|
||||
{ GGML_TYPE_Q5_0, { GGML_TYPE_Q5_0_R4, 4, (Repack::repack_func)repack_q5_0} },
|
||||
{ GGML_TYPE_Q6_0, { GGML_TYPE_Q6_0_R4, 4, (Repack::repack_func)repack_q6_0} },
|
||||
{ GGML_TYPE_Q8_0, { GGML_TYPE_Q8_0_R4, 4, (Repack::repack_func)repack_q8_0} },
|
||||
{ GGML_TYPE_Q8_0, { GGML_TYPE_Q8_0_R4, 8, (Repack::repack_func)repack_q8_0} },
|
||||
{ GGML_TYPE_Q8_K, { GGML_TYPE_Q8_K_R8, 8, (Repack::repack_func)repack_q8_k} },
|
||||
#ifdef __AVX512BF16__
|
||||
{ GGML_TYPE_BF16, { GGML_TYPE_BF16_R16, 16, (Repack::repack_func)repack_bf16<ggml_bf16_t>}},
|
||||
|
||||
Reference in New Issue
Block a user