Better PP performance with split mode "graph" and 3+ GPUs (#1069)

* This should do the trick for PP
* Command line option to set max. extra VRAM that the scheduler can use
* Fix bug and cleanup
* Looks like with this change it is working with tensor overrides
* Nah, it is not working
* OK, this seems to be working
* Disable split scheduling with tensor overrides

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2026-01-26 17:20:01 +00:00 · 2025-12-17 07:40:25 +01:00
parent 75de0528c3
commit 5585ac2aa8
6 changed files with 100 additions and 45 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -757,6 +757,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        params.defrag_thold = std::stof(argv[i]);
        return true;
    }
+    if (arg == "--max-extra-alloc" || arg == "-mea") {
+        CHECK_ARG
+        params.max_extra_alloc_MiB = std::stoi(argv[i]);
+        return true;
+    }
    if (arg == "--samplers") {
        CHECK_ARG
        const auto sampler_names = string_split(argv[i], ";");
@@ -2218,6 +2223,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param

    options.push_back({ "parallel" });
    options.push_back({ "*",           "-dt,   --defrag-thold N",       "KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold });
+    options.push_back({ "*",           "-mea,  --max-extra-alloc",      "Max extra VRAM allocation per GPU (default: %d)", params.max_extra_alloc_MiB});
    options.push_back({ "*",           "-np,   --parallel N",           "number of parallel sequences to decode (default: %d)", params.n_parallel });
    options.push_back({ "*",           "-ns,   --sequences N",          "number of sequences to decode (default: %d)", params.n_sequences });
    options.push_back({ "*",           "-cb,   --cont-batching",        "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" });
@@ -3109,7 +3115,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    cparams.min_experts       = params.min_experts;
    cparams.thresh_experts    = params.thresh_experts;
    cparams.only_active_experts = params.only_active_exps;
-    cparams.k_cache_hadamard  = params.k_cache_hadamard;
+    cparams.max_extra_alloc   = params.max_extra_alloc_MiB;

    cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
    cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
@@ -4049,6 +4055,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "use_thp: %s # default: false\n", params.use_thp ? "true" : "false");
    fprintf(stream, "validate_quants: %s # default: false\n", params.validate_quants ? "true" : "false");
    fprintf(stream, "merge_qkv: %s # default: false\n", params.merge_qkv ? "true" : "false");
+    fprintf(stream, "max_extra_alloc: %d # default: 256\n", params.max_extra_alloc_MiB);
    fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
    fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
    fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
--- a/common/common.h
+++ b/common/common.h
@@ -167,6 +167,7 @@ struct gpt_params {
    float   yarn_beta_slow        =  -1.0f; // YaRN high correction dim
    int32_t yarn_orig_ctx         =     0; // YaRN original context length
    float   defrag_thold          = -1.0f; // KV cache defragmentation threshold
+    int32_t max_extra_alloc_MiB   = 256;   // additional VRAM per GPU the scheduler may allocate for more efficient compute graph evaluation

    ggml_backend_sched_eval_callback cb_eval = nullptr;
    void * cb_eval_user_data                 = nullptr;