Better PP (prompt processing) performance with split mode "graph" and 3+ GPUs (#1069)

* This should do the trick for PP

* Command line option to set the max extra VRAM that the scheduler can use (usage sketch below)

* Fix a bug and clean up

* Looks like with this change it is working with tensor overrides

* Nah, it is not working

* OK, this seems to be working

* Disable split scheduling with tensor overrides

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Authored by Kawrakow on 2025-12-17 07:40:25 +01:00, committed by GitHub
parent 8ccceff4e9 · commit 51eea5715f
6 changed files with 100 additions and 45 deletions
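For reference, a minimal sketch of how an application could opt into both features through the public API. Everything used below appears in the diff that follows (split_mode, LLAMA_SPLIT_MODE_GRAPH, the new max_extra_alloc context parameter) or in the standard llama.h API; the model path and the value 512 are placeholders, and treating the value as MiB is an assumption based on the default of 256.

#include "llama.h"

int main() {
    llama_backend_init();

    // load the model with split mode "graph" so the scheduler can split
    // the compute graph across 3+ GPUs
    llama_model_params mparams = llama_model_default_params();
    mparams.split_mode = LLAMA_SPLIT_MODE_GRAPH;
    llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path
    if (!model) return 1;

    // allow the scheduler more extra VRAM than the default of 256
    // (unit assumed to be MiB; see the first hunk below)
    llama_context_params cparams = llama_context_default_params();
    cparams.max_extra_alloc = 512; // placeholder value

    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (!ctx) { llama_free_model(model); return 1; }

    // ... run inference as usual ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}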


@@ -4022,6 +4022,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_seq_max         =*/ 1,
         /*.n_threads         =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
         /*.n_threads_batch   =*/ GGML_DEFAULT_N_THREADS,
+        /*.max_extra_alloc   =*/ 256,
         /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
         /*.pooling_type      =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
         /*.attention_type    =*/ LLAMA_ATTENTION_TYPE_UNSPECIFIED,
@@ -4774,6 +4775,7 @@ struct llama_context * llama_new_context_with_model(
     }
     if (model->split_mode == LLAMA_SPLIT_MODE_GRAPH && !model->has_tensor_overrides()) {
         ggml_backend_sched_set_split_mode_graph(ctx->sched, true);
+        ggml_backend_sched_set_max_extra_alloc(ctx->sched, params.max_extra_alloc);
     }
     return ctx;
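As the last commit message notes, split scheduling is disabled when tensor overrides are in use. A hypothetical helper (not part of the API, shown for illustration only) restating the gate from the hunk above:

static bool graph_split_scheduling_active(enum llama_split_mode split_mode, bool has_tensor_overrides) {
    // graph split scheduling is enabled only when the model was loaded with
    // LLAMA_SPLIT_MODE_GRAPH and no tensor overrides are in effect
    return split_mode == LLAMA_SPLIT_MODE_GRAPH && !has_tensor_overrides;
}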