Fused mul + multi_add op (#858)

* Adding fused mul+multi_add + CPU implementation
* fused mul+multi_add: CUDA
* fused mul+multi_add: command line argument to disable it

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2026-04-25 08:59:30 +00:00 · 2025-10-24 07:40:35 +03:00
parent 483cea527d
commit db3ba4999f
15 changed files with 211 additions and 38 deletions
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -50,6 +50,7 @@ llm_build_context::llm_build_context(
        fused_moe_up_gate(cparams.fused_moe_up_gate),
        grouped_expert_routing(cparams.grouped_expert_routing),
        fused_up_gate    (cparams.fused_up_gate),
+        fused_mmad       (cparams.fused_mmad),
        min_experts      (cparams.min_experts),
        thresh_experts   (cparams.thresh_experts),
        pooling_type     (cparams.pooling_type),
@@ -941,6 +942,11 @@ llm_expert_gating_func_type   gating_op,
    }

    if (!weight_before_ffn) {
+        if (lctx.cparams.fused_mmad) {
+            experts = ggml_mul_multi_add(ctx, experts, weights);
+            cb(experts, "ffn_moe_weighted", il);
+            return experts;
+        }
        experts = ggml_mul(ctx, experts, weights);
        cb(experts, "ffn_moe_weighted", il);
    }