q8_KV_r8: don't use nrc_y = 16 on Zen4

Not using the nrc_y = 16 kernel is faster on Zen4: 350 t/s. It is not clear why. This is much better than the 290 t/s we had before, but still slower than the 370 t/s for q8_k_r8.
2026-04-30 03:11:51 +00:00 · 2025-02-18 16:39:45 +02:00
parent 2b9526c8b6
commit e08e292bea
1 changed file with 3 additions and 3 deletions
--- a/ggml/src/iqk/iqk_mul_mat.cpp
+++ b/ggml/src/iqk/iqk_mul_mat.cpp
@@ -9367,9 +9367,9 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) {
            mm.funcs[5] = mul_mat_q8_KV_r8_q8_KV<6>;
            mm.funcs[6] = mul_mat_q8_KV_r8_q8_KV<7>;
            mm.funcs[7] = mul_mat_q8_KV_r8_q8_KV<8>;
-#ifdef HAVE_FANCY_SIMD
+//#ifdef HAVE_FANCY_SIMD
-            mm.func16 = mul_mat_q8_KV_r8_q8_KV<16>;
+//            mm.func16 = mul_mat_q8_KV_r8_q8_KV<16>;
-#endif
+//#endif
            expected_typeB = GGML_TYPE_Q8_KV;
            break;
        case GGML_TYPE_IQ4_K_R4: