Process up to 16 columns per kernel call for q8_k_r8

This brings PP-512 (prompt processing of 512 tokens) performance up to 389 t/s (tokens per second).
This commit is contained in:
Iwan Kawrakow
2025-01-27 12:39:56 +02:00
parent f1c114d477
commit fac48faa21

View File

@@ -7991,6 +7991,9 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) {
mm.funcs[5] = mul_mat_q8_k_r8_q8_k<6>;
mm.funcs[6] = mul_mat_q8_k_r8_q8_k<7>;
mm.funcs[7] = mul_mat_q8_k_r8_q8_k<8>;
#ifdef HAVE_FANCY_SIMD
mm.func16 = mul_mat_q8_k_r8_q8_k<16>;
#endif
expected_typeB = GGML_TYPE_Q8_KR8;
break;
case GGML_TYPE_IQ4_K_R4: