iqk_mul_mat: fp16 for Arm

The fp16 Arm path is about 2% slower than tinyBLAS; the cause has not been identified yet.
2026-02-28 09:04:10 +00:00 · 2024-06-10 08:16:52 +02:00
parent 6ec0fcc5c7
commit baf6aaa31b
2 changed files with 113 additions and 3 deletions
--- a/sgemm.cpp
+++ b/sgemm.cpp
@@ -866,10 +866,20 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
    if (Ctype != GGML_TYPE_F32)
        return false;

-    if (task == GGML_TASK_TYPE_COMPUTE && k >= 256 && Atype == GGML_TYPE_F16 && Btype == GGML_TYPE_F32) {
-        if (iqk_mul_mat(m, n, k, Atype, A, B, (float *)C, ldc, ith, nth)) {
-            return true;
+    if (task == GGML_TASK_TYPE_COMPUTE && k >= 256 && Atype == GGML_TYPE_F16) {
+#if defined __AVX2__ && defined __FMA__
+        if (Btype == GGML_TYPE_F32) {
+            if (iqk_mul_mat(m, n, k, Atype, A, B, (float *)C, ldc, ith, nth)) {
+                return true;
+            }
        }
+#elif defined __ARM_FEATURE_FP16_VECTOR_ARITHMETIC && defined __ARM_FEATURE_FMA
+        if (Btype == GGML_TYPE_F16) {
+            if (iqk_mul_mat(m, n, k, Atype, A, B, (float *)C, ldc, ith, nth)) {
+                return true;
+            }
+        }
+#endif
    }

    switch (Atype) {