Merge ffn_up and ffn_gate experts tensors (#1137)

* WIP - not working

* WIP - not working

* WIP - GPT-OSS working

However, the implementation is extremely stupid. The only way I could correctly
repack the up/gate experts is to copy up and gate into host buffers, repack
them into another host buffer, and copy the result back into the ffn_up_gate_exps
tensor (a rough sketch is included at the end of this commit note).
This is going to be very slow for giant 500 GB models.

My attempts to do this via a compute graph on the backend holding
the tensors were unsuccessful.

For GPT-OSS-20B I see ~6-7% better prompt processing (PP) performance when
using the original ik_llama.cpp fused_up_gate CUDA implementation, and ~10%
better when using the small batch size implementation.

Other models are not working yet on CUDA as I need to fix the
fused mul-unary implementation.
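
For illustration, here is a minimal sketch of the host-buffer repack described
above. It is not the actual code in this commit: the helper name, the
per-expert [up | gate] layout, and the contiguity/type assumptions are all
hypothetical, and only serve to show the three-copy (read, read, write) pattern.

```cpp
#include <cstring>
#include <vector>
#include "ggml.h"
#include "ggml-backend.h"

// Hypothetical sketch of the host-side repack: copy the up and gate expert
// tensors into host buffers, interleave them per expert into a third buffer,
// then write the result back into the merged ffn_up_gate_exps tensor.
static void repack_up_gate_exps(ggml_tensor * up_exps, ggml_tensor * gate_exps, ggml_tensor * up_gate_exps) {
    GGML_ASSERT(up_exps->type == gate_exps->type);

    const size_t n_expert   = (size_t) up_exps->ne[2];                 // experts live in dim 2
    const size_t up_bytes   = ggml_nbytes(up_exps)   / n_expert;       // bytes per expert (up)
    const size_t gate_bytes = ggml_nbytes(gate_exps) / n_expert;       // bytes per expert (gate)
    GGML_ASSERT(ggml_nbytes(up_gate_exps) == (up_bytes + gate_bytes)*n_expert);

    std::vector<uint8_t> up_host(up_bytes*n_expert), gate_host(gate_bytes*n_expert);
    std::vector<uint8_t> merged((up_bytes + gate_bytes)*n_expert);

    // 1. backend -> host copies of the two source tensors
    ggml_backend_tensor_get(up_exps,   up_host.data(),   0, up_host.size());
    ggml_backend_tensor_get(gate_exps, gate_host.data(), 0, gate_host.size());

    // 2. repack: for each expert, up rows first, then gate rows (assumed layout)
    for (size_t e = 0; e < n_expert; ++e) {
        uint8_t * dst = merged.data() + e*(up_bytes + gate_bytes);
        std::memcpy(dst,            up_host.data()   + e*up_bytes,   up_bytes);
        std::memcpy(dst + up_bytes, gate_host.data() + e*gate_bytes, gate_bytes);
    }

    // 3. host -> backend copy into the merged tensor
    ggml_backend_tensor_set(up_gate_exps, merged.data(), 0, merged.size());
}
```

The two reads plus one write over the full tensor data are what makes this
slow for very large models; doing the repack directly on the backend holding
the tensors would avoid the host round trip, but as noted above that did not
work yet.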

* WIP

* WIP - Qwen3-MoE (and hopefully all others) working

But when I say "working" here and in the previous commit,
I mean PP is working. Token generation (TG) is still broken.

* WIP: TG seems to be working

* Minor

* Add command line option to merge experts up/gate

* Add merge up/gate command line parameter to llama-bench

* Turn off merge_up_gate_exps if split mode is graph

It is not yet implemented for that case (a rough sketch of the guard is shown below).
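
A minimal sketch of what such a guard could look like, purely for illustration;
the struct, field names, and the way the split mode is represented below are
assumptions, not the commit's actual identifiers:

```cpp
#include <cstdio>

// Hypothetical parameter holder; in practice these would live in the
// existing command line / context parameters.
struct merge_params {
    bool merge_up_gate_exps = false;
    bool split_mode_graph   = false; // stand-in for the actual split-mode setting
};

// Merged up/gate experts are not implemented for the "graph" split mode,
// so fall back to separate up/gate tensors and warn the user.
static void validate_merge_up_gate(merge_params & params) {
    if (params.merge_up_gate_exps && params.split_mode_graph) {
        std::fprintf(stderr, "warning: merged up/gate experts are not implemented for split mode graph, disabling\n");
        params.merge_up_gate_exps = false;
    }
}
```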

* When there is no bias, allow merging up/gate together with tensor overrides (rough sketch below)
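
For illustration, the condition this bullet describes could be sketched roughly
as below; all names are assumptions chosen for readability, not the commit's
actual code:

```cpp
// Hypothetical helper: merging up/gate experts together with tensor overrides
// is only allowed when the experts carry no bias tensors.
static bool can_merge_up_gate(bool merge_requested, bool has_tensor_overrides, bool has_expert_bias) {
    if (!merge_requested) {
        return false;
    }
    // with overrides in place, the merged path is only taken without biases
    return !has_tensor_overrides || !has_expert_bias;
}
```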

* Arghh, we need to increase the context size again

* Cleanup
Kawrakow
2026-01-12 18:30:53 +02:00
committed by GitHub
parent bf0c6c57bb
commit c03c2d7cc6
16 changed files with 505 additions and 134 deletions


@@ -910,7 +910,8 @@ ggml_tensor * llm_build_context::llm_build_moe_ffn(
bool scale_w,
float w_scale,
llm_expert_gating_func_type gating_op,
const llm_build_cb & cb, int il, ggml_cgraph * graph, bool add_input) {
const llm_build_cb & cb, int il, ggml_cgraph * graph, bool add_input,
ggml_tensor * up_gate_exps, ggml_tensor * up_gate_exps_b) {
auto input = cur;
@@ -1025,6 +1026,19 @@ llm_expert_gating_func_type gating_op,
bool can_use_fmoe = type_op == LLM_FFN_SILU || type_op == LLM_FFN_GELU || type_op == LLM_FFN_SWIGLU_OAI_MOE;
ggml_tensor * par;
if (can_use_fmoe && up_gate_exps) {
if (up_gate_exps_b) {
par = ggml_moe_up_gate_ext(ctx, up_gate_exps, nullptr, cur, selected_experts, up_gate_exps_b, nullptr,
type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU :
type_op == LLM_FFN_GELU ? GGML_UNARY_OP_GELU : GGML_UNARY_OP_SWIGLU_OAI);
} else {
GGML_ASSERT(type_op != LLM_FFN_SWIGLU_OAI_MOE);
par = ggml_moe_up_gate(ctx, up_gate_exps, nullptr, cur, selected_experts,
type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : GGML_UNARY_OP_GELU);
}
} else {
GGML_ASSERT(!up_gate_exps && !up_gate_exps_b);
if (can_use_fmoe && lctx.cparams.fused_moe_up_gate && up_exps->type == gate_exps->type) {
if (up_exps_b || gate_exps_b) {
par = ggml_moe_up_gate_ext(ctx, up_exps, gate_exps, cur, selected_experts, up_exps_b, gate_exps_b,
@@ -1069,6 +1083,7 @@ llm_expert_gating_func_type gating_op,
GGML_ABORT("fatal error");
}
}
}
cb(par, "ffn_moe_gate_par", il);
@@ -1130,7 +1145,8 @@ ggml_tensor * llm_build_context::llm_build_std_moe_ffn(ggml_context * ctx, llama
float w_scale,
llm_expert_gating_func_type gating_op,
llm_ffn_op_type type_op_shexp,
const llm_build_cb & cb, int il, ggml_cgraph * graph, bool add_input) {
const llm_build_cb & cb, int il, ggml_cgraph * graph, bool add_input,
ggml_tensor * up_gate_exps, ggml_tensor * up_gate_exps_b) {
auto split_up_exps = (ggml_split_tensor_t *)up_exps->extra;
auto split_gate_exps = (ggml_split_tensor_t *)gate_exps->extra;
@@ -1164,7 +1180,7 @@ llm_expert_gating_func_type gating_op,
the_exp_probs_b,
n_expert, n_expert_used,
type_op, norm_w, scale_w, w_scale,
gating_op, cb, il, graph, false);
gating_op, cb, il, graph, false, up_gate_exps, up_gate_exps_b);
cb(routed_out, "routed_out", il);
if (add_input) {
routed_out = ggml_add(ctx, routed_out, input);
@@ -4047,7 +4063,8 @@ ggml_cgraph * llm_build_context::build_qwen3moe() {
n_expert, n_expert_used,
LLM_FFN_SILU, true, false, 0.0f,
LLM_EXPERT_GATING_FUNC_SOFTMAX,
LLM_FFN_SILU, cb, il, gf, true);
LLM_FFN_SILU, cb, il, gf, true,
model.layers[il].ffn_up_gate_exps);
//printf("%s: ffn = %s(%s)\n", __func__, cur->name, ggml_op_name(cur->op));
@@ -8410,7 +8427,8 @@ ggml_cgraph * llm_build_context::build_openai_moe() {
n_expert, n_expert_used,
LLM_FFN_SWIGLU_OAI_MOE, false, false, 0.0f,
LLM_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT,
LLM_FFN_SWIGLU_OAI_MOE, cb, il, gf, true);
LLM_FFN_SWIGLU_OAI_MOE, cb, il, gf, true,
model.layers[il].ffn_up_gate_exps, model.layers[il].ffn_up_gate_exps_b);
cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);