Fused mul + multi_add op (#858)

* Adding fused mul+multi_add op and its CPU implementation

* fused mul+multi_add: CUDA implementation

* fused mul+multi_add: command line argument to disable it

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
Kawrakow
2025-10-24 07:40:35 +03:00
committed by GitHub
parent 483cea527d
commit db3ba4999f
15 changed files with 211 additions and 38 deletions

View File

@@ -50,6 +50,7 @@ llm_build_context::llm_build_context(
fused_moe_up_gate(cparams.fused_moe_up_gate),
grouped_expert_routing(cparams.grouped_expert_routing),
fused_up_gate (cparams.fused_up_gate),
fused_mmad (cparams.fused_mmad),
min_experts (cparams.min_experts),
thresh_experts (cparams.thresh_experts),
pooling_type (cparams.pooling_type),
@@ -941,6 +942,11 @@ llm_expert_gating_func_type gating_op,
}
if (!weight_before_ffn) {
if (lctx.cparams.fused_mmad) {
experts = ggml_mul_multi_add(ctx, experts, weights);
cb(experts, "ffn_moe_weighted", il);
return experts;
}
experts = ggml_mul(ctx, experts, weights);
cb(experts, "ffn_moe_weighted", il);
}