Better PP (prompt processing) performance with split mode "graph" and 3+ GPUs (#1069)

* This should do the trick for PP

* Command line option to set the max extra VRAM that the scheduler can use (usage sketch below)

* Fix a bug and clean up

* Looks like with this change it is working with tensor overrides

* Nah, it is not working

* OK, this seems to be working

* Disable split scheduling with tensor overrides

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Authored by Kawrakow on 2025-12-17 07:40:25 +01:00, committed by GitHub
parent 8ccceff4e9 · commit 51eea5715f
6 changed files with 100 additions and 45 deletions
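For reference, a minimal sketch of how an application could opt into both features through the public API. Everything used below appears in the diff that follows (split_mode, LLAMA_SPLIT_MODE_GRAPH, the new max_extra_alloc context parameter) or in the standard llama.h API; the model path and the value 512 are placeholders, and treating the value as MiB is an assumption based on the default of 256.

#include "llama.h"

int main() {
    llama_backend_init();

    // load the model with split mode "graph" so the scheduler can split
    // the compute graph across 3+ GPUs
    llama_model_params mparams = llama_model_default_params();
    mparams.split_mode = LLAMA_SPLIT_MODE_GRAPH;
    llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path
    if (!model) return 1;

    // allow the scheduler more extra VRAM than the default of 256
    // (unit assumed to be MiB; see the first hunk below)
    llama_context_params cparams = llama_context_default_params();
    cparams.max_extra_alloc = 512; // placeholder value

    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (!ctx) { llama_free_model(model); return 1; }

    // ... run inference as usual ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}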


@@ -4022,6 +4022,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_seq_max         =*/ 1,
         /*.n_threads         =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
         /*.n_threads_batch   =*/ GGML_DEFAULT_N_THREADS,
+        /*.max_extra_alloc   =*/ 256,
         /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
         /*.pooling_type      =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
         /*.attention_type    =*/ LLAMA_ATTENTION_TYPE_UNSPECIFIED,
@@ -4774,6 +4775,7 @@ struct llama_context * llama_new_context_with_model(
     }
     if (model->split_mode == LLAMA_SPLIT_MODE_GRAPH && !model->has_tensor_overrides()) {
         ggml_backend_sched_set_split_mode_graph(ctx->sched, true);
+        ggml_backend_sched_set_max_extra_alloc(ctx->sched, params.max_extra_alloc);
     }
     return ctx;
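As the last commit message notes, split scheduling is disabled when tensor overrides are in use. A hypothetical helper (not part of the API, shown for illustration only) restating the gate from the hunk above:

static bool graph_split_scheduling_active(enum llama_split_mode split_mode, bool has_tensor_overrides) {
    // graph split scheduling is enabled only when the model was loaded with
    // LLAMA_SPLIT_MODE_GRAPH and no tensor overrides are in effect
    return split_mode == LLAMA_SPLIT_MODE_GRAPH && !has_tensor_overrides;
}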