iqk_mul_mat: fp16 tweaks

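Use 4x3 tiling on CPUs that support AVX2 but not AVX512 (these have only 16 vector registers):
mul_mat_f16_f32_T<4> is now registered only when __AVX512F__ is defined.
This works best on the Ryzen-5975WX.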
This commit is contained in:
Kawrakow
2024-06-07 15:21:16 +03:00
parent b4ecd2dce6
commit f2ced256b4

View File

@@ -2434,7 +2434,9 @@ bool MulMat::set_mul_mat(int typeA, int ne00, MulMat& mm, int& row_size_q8, int
mm.funcs[0] = mul_mat_f16_f32_T<1>;
mm.funcs[1] = mul_mat_f16_f32_T<2>;
mm.funcs[2] = mul_mat_f16_f32_T<3>;
#ifdef __AVX512F__
mm.funcs[3] = mul_mat_f16_f32_T<4>;
#endif
row_size_q8 = ggml_row_size(GGML_TYPE_F32, ne00);
return true;
}