From fe767a45ac143c75ffbda9a314ee5a7169dac662 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Sat, 26 Oct 2024 19:22:02 +0300 Subject: [PATCH] Use fused mul - unary op also for MoE models --- src/llama.cpp | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 1384123a..a55254c0 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8326,22 +8326,24 @@ static struct ggml_tensor * llm_build_moe_ffn( ggml_tensor * gate = llm_build_lora_mm_id(lctx, ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] cb(gate, "ffn_moe_gate", il); - switch (type_op) { - case LLM_FFN_SILU: - { - gate = ggml_silu(ctx, gate); - cb(gate, "ffn_moe_silu", il); - } break; - case LLM_FFN_GELU: - { - gate = ggml_gelu(ctx, gate); - cb(gate, "ffn_moe_gelu", il); - } break; - default: - GGML_ABORT("fatal error"); - } + // This replaces the commented-out code below for SILU and GELU; note that any other type_op is now treated as GELU instead of aborting, and the ffn_moe_silu / ffn_moe_gelu callbacks are no longer issued + ggml_tensor * par = ggml_fused_mul_unary(ctx, gate, up, type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : GGML_UNARY_OP_GELU); - ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens] + //switch (type_op) { + // case LLM_FFN_SILU: + // { + // gate = ggml_silu(ctx, gate); + // cb(gate, "ffn_moe_silu", il); + // } break; + // case LLM_FFN_GELU: + // { + // gate = ggml_gelu(ctx, gate); + // cb(gate, "ffn_moe_gelu", il); + // } break; + // default: + // GGML_ABORT("fatal error"); + //} + //ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens] cb(par, "ffn_moe_gate_par", il); ggml_tensor * experts = llm_build_lora_mm_id(lctx, ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]