mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-11 08:50:11 +00:00
add split-mode-graph-scheduling parameter (#1068)
Use -smgs or --split-mode-graph-scheduling on the command line to bypass the automatic disabling of split mode graph scheduling when tensor overrides are used.

Co-authored-by: Kawrakow <iwankawrakow@gmail.com>
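For illustration, here is a minimal sketch of how an application could opt in through the C API rather than the CLI flag, using the field and functions that appear in the diff below. The model path and the loading boilerplate are assumptions for the example, not part of this commit:

    // Sketch: force split mode graph scheduling (SMGS) from application code.
    // Assumes the usual llama.h entry points; error handling omitted.
    #include "llama.h"

    int main() {
        llama_model_params mparams = llama_model_default_params();
        mparams.split_mode = LLAMA_SPLIT_MODE_GRAPH;      // multi-GPU graph split mode
        llama_model * model = llama_load_model_from_file("model.gguf", mparams); // illustrative path

        llama_context_params cparams = llama_context_default_params();
        cparams.split_mode_graph_scheduling = true;       // what -smgs / --split-mode-graph-scheduling sets
        llama_context * ctx = llama_new_context_with_model(model, cparams);

        // ... run inference ...

        llama_free(ctx);
        llama_free_model(model);
        return 0;
    }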
@@ -4054,6 +4054,7 @@ struct llama_context_params llama_context_default_params() {
         /*.thtesh_experts             =*/ 0.0f,
         /*.only_active_experts        =*/ false,
         /*.k_cache_hadamard           =*/ false,
+        /*.split_mode_graph_scheduling=*/ false,
         /*.abort_callback             =*/ nullptr,
         /*.abort_callback_data        =*/ nullptr,
         /*.offload_policy             =*/ nullptr,
@@ -4342,6 +4343,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.rope_cache       = params.rope_cache;
     cparams.graph_reuse      = params.graph_reuse;
     cparams.k_cache_hadamard = params.k_cache_hadamard;
+    cparams.split_mode_graph_scheduling = params.split_mode_graph_scheduling;
     cparams.min_experts      = params.min_experts;
     cparams.thresh_experts   = params.thresh_experts;
     cparams.cuda_params      = params.cuda_params;
@@ -4430,6 +4432,7 @@ struct llama_context * llama_new_context_with_model(
     LLAMA_LOG_INFO("%s: rope_cache    = %d\n", __func__, cparams.rope_cache);
     LLAMA_LOG_INFO("%s: graph_reuse   = %d\n", __func__, cparams.graph_reuse);
     LLAMA_LOG_INFO("%s: k_cache_hadam = %d\n", __func__, cparams.k_cache_hadamard);
+    LLAMA_LOG_INFO("%s: split_mode_graph_scheduling = %d\n", __func__, cparams.split_mode_graph_scheduling);
     LLAMA_LOG_INFO("%s: ser           = %d, %g\n", __func__, cparams.min_experts, cparams.thresh_experts);
     LLAMA_LOG_INFO("%s: freq_base     = %.1f\n", __func__, cparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale    = %g\n", __func__, cparams.rope_freq_scale);
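With the flag enabled, the context setup log should therefore contain a line like the following (illustrative output derived from the format string above; 1 means enabled):

    llama_new_context_with_model: split_mode_graph_scheduling = 1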
@@ -4773,10 +4776,14 @@ struct llama_context * llama_new_context_with_model(
             LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXX Setting only active experts offload\n");
             ggml_backend_sched_set_only_active_experts(ctx->sched, true);
         }
-        if (model->split_mode == LLAMA_SPLIT_MODE_GRAPH && !model->has_tensor_overrides()) {
+        if (model->split_mode == LLAMA_SPLIT_MODE_GRAPH && (!model->has_tensor_overrides() || cparams.split_mode_graph_scheduling)) {
             ggml_backend_sched_set_split_mode_graph(ctx->sched, true);
             ggml_backend_sched_set_max_extra_alloc(ctx->sched, params.max_extra_alloc);
         }
+        if (model->has_tensor_overrides() && cparams.split_mode_graph_scheduling) {
+            LLAMA_LOG_INFO("XXXXXXXX Split mode graph scheduling is FORCED despite tensor overrides, per user choice.\n");
+            LLAMA_LOG_INFO("XXXXXXXX Inference may not work correctly: not every combination of SMGS and tensor overrides is supported.\n");
+        }
     }

     return ctx;
 }
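Restated outside the surrounding function, the condition the patched block implements is: enable SMGS when the model uses graph split mode and either no tensor overrides are present or the user forced scheduling. A sketch, with a hypothetical helper name (the argument names mirror the diff):

    // Hypothetical predicate capturing the patched condition. The warning branch
    // above fires exactly when has_tensor_overrides && force_smgs, i.e. when the
    // user overrides the old always-disable behavior.
    static bool smgs_enabled(bool is_graph_split, bool has_tensor_overrides, bool force_smgs) {
        return is_graph_split && (!has_tensor_overrides || force_smgs);
    }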