Mirror of https://github.com/ikawrakow/ik_llama.cpp.git (synced 2026-04-28 02:11:50 +00:00)
Fused MoE ffn_up and ffn_gate (#229)
* Fusing MoE up * unary(gate)

* Fusing MoE up * unary(gate): CUDA

  We get ~13% speedup for PP-512 and ~2% for TG-128 for DeepSeek-Lite

* On CUDA also fuse MoE down * (up * unary(gate)) in case the MUL_MAT_ID op for the down experts is the next op in the graph.

* Command line option to enable fused MoE up*unary(gate)

* Add fmoe option to llama-bench

* Adding forgotten gelu, relu, silu on ARM

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
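For context, a hedged sketch of the expert-FFN graph pattern the fusion targets, with silu as the unary op. This is not code from the commit; the tensor names ffn_up_exps, ffn_gate_exps, cur, and selected_experts are illustrative only.

    // Before: two MUL_MAT_ID ops plus an elementwise unary and a multiply.
    struct ggml_tensor * up   = ggml_mul_mat_id(ctx, ffn_up_exps,   cur, selected_experts);
    struct ggml_tensor * gate = ggml_mul_mat_id(ctx, ffn_gate_exps, cur, selected_experts);
    cur = ggml_mul(ctx, up, ggml_silu(ctx, gate));

    // After: a single GGML_OP_MOE_FUSED_UP_GATE op computes up * silu(gate).
    cur = ggml_moe_up_gate(ctx, ffn_up_exps, ffn_gate_exps, cur, selected_experts,
                           GGML_UNARY_OP_SILU);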
@@ -567,6 +567,7 @@ extern "C" {
         GGML_OP_MUL_MAT,
         GGML_OP_MUL_MAT_ID,
         GGML_OP_OUT_PROD,
+        GGML_OP_MOE_FUSED_UP_GATE,
 
         GGML_OP_SCALE,
         GGML_OP_SET,
@@ -1320,6 +1321,15 @@ extern "C" {
             struct ggml_tensor  * b,
             struct ggml_tensor  * ids);
 
+    // MoE up + gate + unary
+    GGML_API struct ggml_tensor * ggml_moe_up_gate(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * as_up,
+            struct ggml_tensor  * as_gate,
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * ids,
+            enum   ggml_unary_op  op);
+
     // A: m columns, n rows,
     // B: p columns, n rows,
     // result is m columns, p rows
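Per the commit message, on CUDA the backend can additionally fuse the down projection when the MUL_MAT_ID for the down experts is the next op in the graph. A hedged sketch of that graph shape follows (ffn_down_exps is illustrative); since the extra fusion is a backend-side pattern match, the graph-building code needs nothing beyond the ggml_moe_up_gate call.

    struct ggml_tensor * fused = ggml_moe_up_gate(ctx, ffn_up_exps, ffn_gate_exps,
                                                  cur, selected_experts,
                                                  GGML_UNARY_OP_SILU);
    // The MUL_MAT_ID for the down experts immediately follows the fused op,
    // making it eligible for the CUDA down * (up * unary(gate)) fusion.
    cur = ggml_mul_mat_id(ctx, ffn_down_exps, fused, selected_experts);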