mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-03-04 19:10:03 +00:00
Slightly faster TG for split mode "graph" (#1057)
* Rearrange graph nodes so that we can process graph portions that are the same on 2 or more GPUs at the same time.
* Separate graph compute implementation for split mode "graph".
* This is better.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
@@ -4771,6 +4771,9 @@ struct llama_context * llama_new_context_with_model(
|
||||
LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXX Setting only active experts offload\n");
|
||||
ggml_backend_sched_set_only_active_experts(ctx->sched, true);
|
||||
}
|
||||
if (model->split_mode == LLAMA_SPLIT_MODE_GRAPH) {
|
||||
ggml_backend_sched_set_split_mode_graph(ctx->sched, true);
|
||||
}
|
||||
|
||||
return ctx;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user