Mirror of https://github.com/ikawrakow/ik_llama.cpp.git, synced 2026-04-29 02:41:47 +00:00
soft_cap_max: initial CPU version of fused softcap + soft_max
With this vanilla CPU implementation I'm already getting a ~3% speedup for Gemma-2-9b and a prompt of 8192 tokens.
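For context, the fused op composes the two steps it replaces in the diff below: ggml_softcap (s_after * tanh(s_before * x)) followed by ggml_soft_max_ext (scale, add mask, softmax). Here is a minimal single-row sketch of that composition, assuming a straight fusion of the two; the function name softcap_max_row, the NULL-mask handling, and the omission of ALiBi (the max_bias path) are my illustrations, not the actual ggml kernel.

    /* Hypothetical sketch, not the actual ggml kernel: it simply composes
       ggml_softcap with the scale/mask/softmax of ggml_soft_max_ext over
       one row of KQ logits. For the softcap call in the diff,
       s_before = 1/f_attn_logit_softcapping and s_after = f_attn_logit_softcapping. */
    #include <math.h>
    #include <stddef.h>

    static void softcap_max_row(float * x, const float * mask, size_t n,
                                float scale, float s_before, float s_after) {
        float max_val = -INFINITY;
        for (size_t i = 0; i < n; ++i) {
            // single pass: softcap, then the scale + mask step of soft_max_ext
            x[i] = scale * s_after * tanhf(x[i] * s_before) + (mask ? mask[i] : 0.0f);
            if (x[i] > max_val) max_val = x[i];
        }
        float sum = 0.0f;
        for (size_t i = 0; i < n; ++i) {
            x[i] = expf(x[i] - max_val); // subtract row max for numerical stability
            sum += x[i];
        }
        const float inv_sum = 1.0f / sum;
        for (size_t i = 0; i < n; ++i) {
            x[i] *= inv_sum;             // normalize to probabilities
        }
    }

The point of the fusion is to make one pass over the KQ logits instead of two (the unfused path materializes the softcapped tensor, which soft_max then re-reads), which is presumably where the ~3% shows up at an 8192-token prompt.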
@@ -8324,10 +8324,12 @@ static struct ggml_tensor * llm_build_kqv(
         }
 
         if (hparams.attn_soft_cap) {
-            kq = ggml_softcap(ctx, kq, 1.0f / hparams.f_attn_logit_softcapping, hparams.f_attn_logit_softcapping);
+            //kq = ggml_softcap(ctx, kq, 1.0f / hparams.f_attn_logit_softcapping, hparams.f_attn_logit_softcapping);
+            kq = ggml_softcap_max(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias,
+                    1.0f / hparams.f_attn_logit_softcapping, hparams.f_attn_logit_softcapping);
+        } else {
+            kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
         }
-
-        kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
         cb(kq, "kq_soft_max_ext", il);
 
         GGML_ASSERT(kv.size == n_ctx);