From 16b8d3d229446dd52b6e48bf77bb47035aebfb43 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Date: Tue, 27 Aug 2024 19:45:57 +0300
Subject: [PATCH] WIP

---
 ggml/src/ggml.c | 54 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 46 insertions(+), 8 deletions(-)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index cebac584..adff7669 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -2043,6 +2043,38 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)
 inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i]*y[i];   }
 inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i]/y[i];   }
 
+static inline float ggml_vec_add_f32_f16(const int n, const ggml_half * x, float * y, float slope) {
+    __m512 vslope = _mm512_set1_ps(slope);
+    __m512 vmax = _mm512_set1_ps(-INFINITY);
+    for (int j = 0; j < n/16; ++j) {
+        __m512 v = _mm512_fmadd_ps(vslope, _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)x + j)), _mm512_loadu_ps(y + 16*j));
+        _mm512_storeu_ps(y + 16*j, v);
+        vmax = _mm512_max_ps(vmax, v);
+    }
+    float max = _mm512_reduce_max_ps(vmax);
+    for (int i = 16*(n/16); i < n; ++i) {
+        y[i] += slope*GGML_FP16_TO_FP32(x[i]);
+        max = MAX(max, y[i]);
+    }
+    return max;
+}
+
+static inline float ggml_vec_add_f32_f32(const int n, const float * x, float * y, float slope) {
+    __m512 vslope = _mm512_set1_ps(slope);
+    __m512 vmax = _mm512_set1_ps(-INFINITY);
+    for (int j = 0; j < n/16; ++j) {
+        __m512 v = _mm512_fmadd_ps(vslope, _mm512_loadu_ps(x + 16*j), _mm512_loadu_ps(y + 16*j));
+        _mm512_storeu_ps(y + 16*j, v);
+        vmax = _mm512_max_ps(vmax, v);
+    }
+    float max = _mm512_reduce_max_ps(vmax);
+    for (int i = 16*(n/16); i < n; ++i) {
+        y[i] += slope*x[i];
+        max = MAX(max, y[i]);
+    }
+    return max;
+}
+
 static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
@@ -13782,17 +13814,23 @@ static void ggml_compute_forward_softcap_max_f32(
 
         ggml_vec_cpy_softcap_f32(nc, sp, wp, values[2], values[0]*values[3]);
 
+        float max = -INFINITY;
         if (mp_f32) {
             if (use_f16) {
-                for (int i = 0; i < nc; ++i) {
-                    wp[i] += slope*GGML_FP16_TO_FP32(mp_f16[i]);
-                }
+                max = ggml_vec_add_f32_f16(nc, mp_f16, wp, slope);
+                //for (int i = 0; i < nc; ++i) {
+                //    wp[i] += slope*GGML_FP16_TO_FP32(mp_f16[i]);
+                //}
             } else {
-                for (int i = 0; i < nc; ++i) {
-                    wp[i] += slope*mp_f32[i];
-                }
+                max = ggml_vec_add_f32_f32(nc, mp_f32, wp, slope);
+                //for (int i = 0; i < nc; ++i) {
+                //    wp[i] += slope*mp_f32[i];
+                //}
             }
         }
+        else {
+            ggml_vec_max_f32(nc, &max, wp);
+        }
 
 #ifndef NDEBUG
         for (int i = 0; i < nc; ++i) {
@@ -13801,8 +13839,8 @@ static void ggml_compute_forward_softcap_max_f32(
         }
 #endif
 
-        float max = -INFINITY;
-        ggml_vec_max_f32(nc, &max, wp);
+        //float max = -INFINITY;
+        //ggml_vec_max_f32(nc, &max, wp);
 
         ggml_float sum = ggml_vec_soft_max_f32(nc, dp, wp, max);
         assert(sum > 0.0);