mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-27 16:44:21 +00:00
Adding SWIGLU unary op (#65)
* Adding GGML_UNARY_OP_SWIGLU This commit implements the ggml op and CPU compute forward. I see ~3-4% speedup of PP-512 for Phi-3.5-mini. * GGML_UNARY_OP_SWIGLU: CUDA implementation I observe ~12% speedup for PP-512(Phi-3.5-mini). * GGML_UNARY_OP_SWIGLU: Metal implementation We get ~2% speedup for PP-512(Phi-3.5-mini). * GGML_UNARY_OP_SWIGLU: minor improvement on Metal * GGML_UNARY_OP_SWIGLU: cleanup --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
@@ -8111,16 +8111,8 @@ static struct ggml_tensor * llm_build_ffn(
|
||||
} break;
|
||||
case LLM_FFN_SWIGLU:
|
||||
{
|
||||
// Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf
|
||||
int64_t split_point = cur->ne[0] / 2;
|
||||
struct ggml_tensor * x0 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], 0));
|
||||
struct ggml_tensor * x1 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
|
||||
|
||||
x0 = ggml_silu(ctx, x0);
|
||||
cb(cur, "ffn_silu", il);
|
||||
|
||||
cur = ggml_mul(ctx, x0, x1);
|
||||
cb(cur, "ffn_mul", il);
|
||||
cur = ggml_swiglu(ctx, cur);
|
||||
cb(cur, "ffn_swiglu", il);
|
||||
} break;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user