WIP KQ binary mask

2026-02-25 23:54:10 +00:00 · 2024-08-28 16:42:49 +02:00
parent 316345c535
commit 97dbc16e86
1 changed files with 45 additions and 14 deletions
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -2074,23 +2074,54 @@ static inline float ggml_vec_add_f32_f32(const int n, const float * x, float * y
    }
    return max;
 }
-#else
-// TODO
+#elif __ARM_NEON
 static inline float ggml_vec_add_f32_f16(const int n, const ggml_half * x, float * y, float slope) {
-    GGML_UNUSED(n);
-    GGML_UNUSED(x);
-    GGML_UNUSED(y);
-    GGML_UNUSED(slope);
-    GGML_ASSERT(false);
-    return 0.f;
+    float32x4_t vslope = vdupq_n_f32(slope);
+    float32x4_t vmax = vdupq_n_f32(-INFINITY);
+    for (int j = 0; j < n/4; ++j) {
+        float32x4_t val = vmlaq_f32(vld1q_f32(y + 4*j), vslope, vcvt_f32_f16(vld1_f16((const float16_t *)x + 4*j)));
+        vmax = vmaxq_f32(vmax, val);
+        vst1q_f32(y + 4*j, val);
+    }
+    float max = vmaxvq_f32(vmax);
+    for (int i = 4*(n/4); i < n; ++i) {
+        y[i] += slope*x[i];
+        max = MAX(max, y[i]);
+    }
+    return max;
 }
 static inline float ggml_vec_add_f32_f32(const int n, const float * x, float * y, float slope) {
-    GGML_UNUSED(n);
-    GGML_UNUSED(x);
-    GGML_UNUSED(y);
-    GGML_UNUSED(slope);
-    GGML_ASSERT(false);
-    return 0.f;
+    float32x4_t vslope = vdupq_n_f32(slope);
+    float32x4_t vmax = vdupq_n_f32(-INFINITY);
+    for (int j = 0; j < n/4; ++j) {
+        float32x4_t val = vmlaq_f32(vld1q_f32(y + 4*j), vslope, vld1q_f32(x + 4*j));
+        vmax = vmaxq_f32(vmax, val);
+        vst1q_f32(y + 4*j, val);
+    }
+    float max = vmaxvq_f32(vmax);
+    for (int i = 4*(n/4); i < n; ++i) {
+        y[i] += slope*x[i];
+        max = MAX(max, y[i]);
+    }
+    return max;
+}
+#else
+// TODO add AVX2
+static inline float ggml_vec_add_f32_f16(const int n, const ggml_half * x, float * y, float slope) {
+    float max = -INFINITY;
+    for (int i = 0; i < n; ++i) {
+        y[i] += slope * GGML_FP16_TO_FP32(x[i]);
+        max = MAX(max, y[i]);
+    }
+    return max;
+}
+static inline float ggml_vec_add_f32_f32(const int n, const float * x, float * y, float slope) {
+    float max = -INFINITY;
+    for (int i = 0; i < n; ++i) {
+        y[i] += slope * x[i];
+        max = MAX(max, y[i]);
+    }
+    return max;
 }
 #endif