Slightly faster TG for split mode "graph" (#1057)

* Rearrange graph nodes so that we can do graph portions that are the same on 2 or more GPUs at the same time.
* Separate graph compute implementation for split mode graph
* This is better

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2026-04-26 09:29:27 +00:00 · 2025-12-12 07:54:37 +01:00
parent 6a0e72aeae
commit 0698501ae2
4 changed files with 183 additions and 91 deletions
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -1228,6 +1228,7 @@ llm_expert_gating_func_type   gating_op,
            cur = ggml_cast(ctx, cur, GGML_TYPE_F16);
            cb(cur, "ffn_out_f16", il_cb);
        }
+        ggml_build_forward_expand(graph, routed_out);
        results.push_back(cur);
    }
    GGML_ASSERT(!results.empty());
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4771,6 +4771,9 @@ struct llama_context * llama_new_context_with_model(
        LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXX Setting only active experts offload\n");
        ggml_backend_sched_set_only_active_experts(ctx->sched, true);
    }
+    if (model->split_mode == LLAMA_SPLIT_MODE_GRAPH) {
+        ggml_backend_sched_set_split_mode_graph(ctx->sched, true);
+    }

    return ctx;
 }