iqk_mul_mat: fp16 implementation for AVX2

This simple implementation beats jart's tinyBLAS by a
small margin: 143 t/s vs 137 t/s for PP-512. TG is
4.75 t/s, which is exactly the same as ggml.
This commit is contained in:
Iwan Kawrakow
2024-06-07 14:23:32 +03:00
parent 8e072bbba3
commit bc659e7de1
2 changed files with 235 additions and 4 deletions

View File

@@ -51,6 +51,7 @@
#include "sgemm.h"
#include "ggml-impl.h"
#include "ggml-quants.h"
#include "iqk_mul_mat.h"
#ifdef _MSC_VER
#define NOINLINE __declspec(noinline)
@@ -865,6 +866,12 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
if (Ctype != GGML_TYPE_F32)
return false;
if (task == GGML_TASK_TYPE_COMPUTE && k >= 256 && Atype == GGML_TYPE_F16 && Btype == GGML_TYPE_F32) {
if (iqk_mul_mat(m, n, k, Atype, A, B, (float *)C, ldc, ith, nth)) {
return true;
}
}
switch (Atype) {
case GGML_TYPE_F32: {