Better PP performance with split mode "graph" and 3+ GPUs (#1069)

* This should do the trick for PP

* Command line option to set max. extra VRAM that the scheduler can use

* Fix bug and cleanup

* Looks like with this change it is working with tensor overrides

* Nah, it is not working

* OK, this seems to be working

* Disable split scheduling with tensor overrides

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
Kawrakow
2025-12-17 07:40:25 +01:00
committed by GitHub
parent 75de0528c3
commit 5585ac2aa8
6 changed files with 100 additions and 45 deletions

View File

@@ -757,6 +757,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.defrag_thold = std::stof(argv[i]);
return true;
}
if (arg == "--max-extra-alloc" || arg == "-mea") {
CHECK_ARG
params.max_extra_alloc_MiB = std::stoi(argv[i]);
return true;
}
if (arg == "--samplers") {
CHECK_ARG
const auto sampler_names = string_split(argv[i], ";");
@@ -2218,6 +2223,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "parallel" });
options.push_back({ "*", "-dt, --defrag-thold N", "KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold });
options.push_back({ "*", "-mea, --max-extra-alloc", "Max extra VRAM allocation per GPU (default: %d)", params.max_extra_alloc_MiB});
options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel });
options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences });
options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" });
@@ -3109,7 +3115,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.min_experts = params.min_experts;
cparams.thresh_experts = params.thresh_experts;
cparams.only_active_experts = params.only_active_exps;
cparams.k_cache_hadamard = params.k_cache_hadamard;
cparams.max_extra_alloc = params.max_extra_alloc_MiB;
cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
@@ -4049,6 +4055,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
fprintf(stream, "use_thp: %s # default: false\n", params.use_thp ? "true" : "false");
fprintf(stream, "validate_quants: %s # default: false\n", params.validate_quants ? "true" : "false");
fprintf(stream, "merge_qkv: %s # default: false\n", params.merge_qkv ? "true" : "false");
fprintf(stream, "max_extra_alloc: %d # default: 256\n", params.max_extra_alloc_MiB);
fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);

View File

@@ -167,6 +167,7 @@ struct gpt_params {
float yarn_beta_slow = -1.0f; // YaRN high correction dim
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold
int32_t max_extra_alloc_MiB = 256; // additional VRAM per GPU the scheduler may allocate for more efficient compute graph evaluation
ggml_backend_sched_eval_callback cb_eval = nullptr;
void * cb_eval_user_data = nullptr;