Process up to 16 columns per kernel call for q8_k_r8

This brings PP-512 up to 389 t/s.
2026-02-24 23:24:13 +00:00 · 2025-01-27 12:39:56 +02:00
parent f1c114d477
commit fac48faa21
1 changed file with 3 additions and 0 deletions
--- a/ggml/src/iqk/iqk_mul_mat.cpp
+++ b/ggml/src/iqk/iqk_mul_mat.cpp
@@ -7991,6 +7991,9 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) {
            mm.funcs[5] = mul_mat_q8_k_r8_q8_k<6>;
            mm.funcs[6] = mul_mat_q8_k_r8_q8_k<7>;
            mm.funcs[7] = mul_mat_q8_k_r8_q8_k<8>;
+#ifdef HAVE_FANCY_SIMD
+            mm.func16 = mul_mat_q8_k_r8_q8_k<16>;
+#endif
            expected_typeB = GGML_TYPE_Q8_KR8;
            break;
        case GGML_TYPE_IQ4_K_R4: