mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-25 23:54:10 +00:00
Simdified gelu
Gives ~1% speedup for Gemma2-9b prompt processing on AVX512/AVX2. It looks like the gelu operation is memory bound on my CPU's after SIMD-ifying it. By not using the 128 kb gelu lookup table we gain a small advantage. On the M2-Max the lookup table is slightly faster than the SIMD version, so left the lookup table for ARM_NEON.
This commit is contained in:
129
ggml/src/ggml.c
129
ggml/src/ggml.c
@@ -2436,44 +2436,14 @@ inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x
|
||||
inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
|
||||
inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
|
||||
|
||||
static const float GELU_COEF_A = 0.044715f;
|
||||
static const float GELU_QUICK_COEF = -1.702f;
|
||||
static const float GELU_COEF_A = 0.044715f;
|
||||
static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
||||
|
||||
inline static float ggml_gelu_f32(float x) {
|
||||
return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
|
||||
}
|
||||
|
||||
inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
||||
const uint16_t * i16 = (const uint16_t *) x;
|
||||
for (int i = 0; i < n; ++i) {
|
||||
y[i] = ggml_table_gelu_f16[i16[i]];
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef GGML_GELU_FP16
|
||||
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
|
||||
uint16_t t;
|
||||
for (int i = 0; i < n; ++i) {
|
||||
if (x[i] <= -10.0f) {
|
||||
y[i] = 0.0f;
|
||||
} else if (x[i] >= 10.0f) {
|
||||
y[i] = x[i];
|
||||
} else {
|
||||
ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
|
||||
memcpy(&t, &fp16, sizeof(uint16_t));
|
||||
y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]);
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
|
||||
for (int i = 0; i < n; ++i) {
|
||||
y[i] = ggml_gelu_f32(x[i]);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
inline static float ggml_gelu_quick_f32(float x) {
|
||||
return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
|
||||
}
|
||||
@@ -2566,6 +2536,20 @@ inline static float32x4_t ggml_v_softcap(float32x4_t x, float32x4_t s_before, fl
|
||||
return vmulq_f32(th, s_after);
|
||||
}
|
||||
|
||||
|
||||
// Slower than lookup on my M2-Max
|
||||
inline static float32x4_t ggml_v_gelu(float32x4_t x, float32x4_t c1, float32x4_t c2) {
|
||||
const float32x4_t one = vdupq_n_f32(1.0f);
|
||||
//float32x4_t arg = vaddq_f32(one, vmulq_f32(vmulq_f32(x, x), c1));
|
||||
float32x4_t arg = vfmaq_f32(one, c1, vmulq_f32(x, x));
|
||||
arg = vmulq_f32(arg, vmulq_f32(x, c2));
|
||||
float32x4_t exp_arg = ggml_v_expf(arg);
|
||||
float32x4_t gelu = vmulq_f32(x, vdivq_f32(exp_arg, vaddq_f32(exp_arg, one)));
|
||||
uint32x4_t mask = vcgtq_f32(x, vdupq_n_f32(10.f));
|
||||
return vbslq_f32(mask, x, gelu);
|
||||
//return vreinterpretq_f32_u32(vorrq_u32(vandq_u32(vreinterpretq_u32_f32(x), mask), vbicq_u32(vreinterpretq_u32_f32(gelu), mask)));
|
||||
}
|
||||
|
||||
#elif defined(__AVX512F__) && defined(__AVX512DQ__)
|
||||
|
||||
// adapted from arm limited optimized routine
|
||||
@@ -2622,6 +2606,15 @@ inline static __m512 ggml_v_softcap(__m512 x, __m512 s_before, __m512 s_after) {
|
||||
return _mm512_mul_ps(th, s_after);
|
||||
}
|
||||
|
||||
inline static __m512 ggml_v_gelu(__m512 x, __m512 c1, __m512 c2) {
|
||||
const __m512 one = _mm512_set1_ps(1.0f);
|
||||
__m512 arg = _mm512_fmadd_ps(x, _mm512_mul_ps(c1, x), one);
|
||||
//__m512 arg = _mm512_add_ps(one, _mm512_mul_ps(_mm512_mul_ps(x, x), c1));
|
||||
arg = _mm512_mul_ps(arg, _mm512_mul_ps(c2, x));
|
||||
__m512 exp_arg = ggml_v_expf(arg);
|
||||
return _mm512_mul_ps(x, _mm512_div_ps(exp_arg, _mm512_add_ps(exp_arg, one)));
|
||||
}
|
||||
|
||||
#elif defined(__AVX2__) && defined(__FMA__)
|
||||
|
||||
// adapted from arm limited optimized routine
|
||||
@@ -2690,6 +2683,16 @@ inline static __m256 ggml_v_softcap(__m256 x, float s_before, float s_after) {
|
||||
return _mm256_mul_ps(th, _mm256_set1_ps(s_after));
|
||||
}
|
||||
|
||||
inline static __m256 ggml_v_gelu(__m256 x, __m256 c1, __m256 c2) {
|
||||
const __m256 one = _mm256_set1_ps(1.0f);
|
||||
const __m256 mask = _mm256_cmp_ps(x, _mm256_set1_ps(10.f), _CMP_GT_OQ);
|
||||
__m256 arg = _mm256_add_ps(one, _mm256_mul_ps(_mm256_mul_ps(x, x), c1));
|
||||
arg = _mm256_mul_ps(arg, _mm256_mul_ps(x, c2));
|
||||
__m256 exp_arg = ggml_v_expf(arg);
|
||||
__m256 gelu = _mm256_mul_ps(x, _mm256_div_ps(exp_arg, _mm256_add_ps(exp_arg, one)));
|
||||
return _mm256_or_ps(_mm256_and_ps(mask, x), _mm256_andnot_ps(mask, gelu));
|
||||
}
|
||||
|
||||
#elif defined(__SSE2__) // __AVX2__ / __ARM_NEON
|
||||
|
||||
#if defined(__FMA__)
|
||||
@@ -2845,6 +2848,70 @@ static void ggml_vec_softcap_f32(const int n, float * x, float s_before, float s
|
||||
}
|
||||
}
|
||||
|
||||
inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
||||
const uint16_t * i16 = (const uint16_t *) x;
|
||||
for (int i = 0; i < n; ++i) {
|
||||
y[i] = ggml_table_gelu_f16[i16[i]];
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// On my AVX512 (Ryzen-7950X) and AVX2 (Ryzen-5975WX) computing gelu directly
|
||||
// via SIMD instructions is faster than the fp16-based lookup table.
|
||||
// On my M2-Max CPU the lookup table is slightly faster than the SIMD version,
|
||||
// hence we use the SIMD version only if GGML_GELU_FP16 is not defined.
|
||||
// We do not run into numerical issues for large or small arguments because
|
||||
// 0.5f * (1 + tanhf(arg))
|
||||
// is computed as
|
||||
// exp(2*arg)/(exp(2*arg) + 1)
|
||||
// The ggml_v_expf functions flushes to zero for large enough negative
|
||||
// arguments, so the above becomes zero. ggml_v_expf returns INFINITY
|
||||
// for large positive arguments, so we would get a NaN if we did nothing. But in the
|
||||
// ggml_v_gelu SIMD implementations we override the gelu result with the
|
||||
// input argument when the argument is greater than 10, so it is all good.
|
||||
//
|
||||
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
|
||||
int i = 0;
|
||||
#if defined(__AVX512F__) && defined(__AVX512DQ__)
|
||||
__m512 c1 = _mm512_set1_ps(GELU_COEF_A);
|
||||
__m512 c2 = _mm512_set1_ps(2.f*SQRT_2_OVER_PI);
|
||||
for (; i + 15 < n; i += 16) {
|
||||
_mm512_storeu_ps(y + i, ggml_v_gelu(_mm512_loadu_ps(x + i), c1, c2));
|
||||
}
|
||||
#elif defined __AVX2__ && defined __FMA__
|
||||
__m256 c1 = _mm256_set1_ps(GELU_COEF_A);
|
||||
__m256 c2 = _mm256_set1_ps(2.f*SQRT_2_OVER_PI);
|
||||
for (; i + 7 < n; i += 8) {
|
||||
_mm256_storeu_ps(y + i, ggml_v_gelu(_mm256_loadu_ps(x + i), c1, c2));
|
||||
}
|
||||
#endif
|
||||
#ifdef GGML_GELU_FP16
|
||||
uint16_t t;
|
||||
for (; i < n; ++i) {
|
||||
if (x[i] <= -10.0f) {
|
||||
y[i] = 0.0f;
|
||||
} else if (x[i] >= 10.0f) {
|
||||
y[i] = x[i];
|
||||
} else {
|
||||
ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
|
||||
memcpy(&t, &fp16, sizeof(uint16_t));
|
||||
y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]);
|
||||
}
|
||||
}
|
||||
#else
|
||||
#if defined __ARM_NEON
|
||||
float32x4_t c1 = vdupq_n_f32(GELU_COEF_A);
|
||||
float32x4_t c2 = vdupq_n_f32(2.f*SQRT_2_OVER_PI);
|
||||
for (; i + 3 < n; i += 4) {
|
||||
vst1q_f32(y + i, ggml_v_gelu(vld1q_f32(x + i), c1, c2));
|
||||
}
|
||||
#endif
|
||||
for (; i < n; ++i) {
|
||||
y[i] = ggml_gelu_f32(x[i]);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
|
||||
int i = 0;
|
||||
ggml_float sum = 0;
|
||||
|
||||
Reference in New Issue
Block a user