Be able to set a max. number of GPUs to be used in split mode graph (#1051)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>

Author:    Kawrakow
Date:      2025-12-11 07:22:53 +01:00
Committed: GitHub
Parent:    a2efa22f10
Commit:    22863cf9c9

6 changed files with 67 additions and 19 deletions

common/common.cpp

@@ -1267,6 +1267,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 #endif // GGML_USE_CUDA_SYCL_VULKAN
         return true;
     }
+    else if (arg == "--max-gpu") {
+        CHECK_ARG
+        params.max_gpu = std::stoi(argv[i]);
+        return true;
+    }
     if (arg == "--split-mode" || arg == "-sm") {
         CHECK_ARG
         std::string arg_next = argv[i];
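A note on the pattern above: CHECK_ARG is defined elsewhere in common.cpp and is not part of this diff. Assuming the stock llama.cpp definition, it advances the argument index and bails out on a missing value, which is what lets the new branch read the token after --max-gpu; a minimal sketch of that assumption:

    // Assumed upstream definition, not shown in this diff:
    #define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; }

    // With it, "--max-gpu 2" parses as: CHECK_ARG steps i to the value
    // token, std::stoi(argv[i]) yields 2, and params.max_gpu stores it.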
@@ -2265,6 +2270,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
                                         "Example: CUDA0,CUDA1,RPC[192.168.0.1:8080]\n" });
         options.push_back({ "*", "-mg, --main-gpu i", "the GPU to use for the model (with split-mode = none),\n"
                                  "or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu });
+        options.push_back({ "*", "--max-gpu i", "max. number of GPUs to use at a time with split mode 'graph' (default: %d)", params.max_gpu });
     }
     options.push_back({ "model" });
@@ -2973,6 +2979,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     mparams.mla          = params.mla_attn;
     mparams.rpc_servers  = params.rpc_servers.c_str();
     mparams.main_gpu     = params.main_gpu;
+    mparams.max_gpu      = params.max_gpu;
     mparams.split_mode   = params.split_mode;
     mparams.tensor_split = params.tensor_split;
     mparams.use_mmap     = params.use_mmap;
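Together with the parsing hunk, this plumbs the flag end to end: CLI argument, gpt_params field, llama_model_params field. A minimal flow sketch (the invocation "llama-cli -sm graph --max-gpu 2" is illustrative, not taken from this commit):

    // Sketch of the plumbing added in this commit (names from the hunks above):
    gpt_params params;                 // gpt_params_find_arg() sets params.max_gpu = 2
    llama_model_params mparams = llama_model_params_from_gpt_params(params);
    // mparams.max_gpu == 2, visible to the backend when split_mode is "graph"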
@@ -4173,6 +4180,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     }
     fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
+    fprintf(stream, "max_gpu: %d # default: 0\n", params.max_gpu);
     fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
     fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);

common/common.h

@@ -154,6 +154,7 @@ struct gpt_params {
     int32_t n_gpu_layers       = -1;   // number of layers to store in VRAM (-1 - use default)
     int32_t n_gpu_layers_draft = -1;   // number of layers to store in VRAM for the draft model (-1 - use default)
     int32_t main_gpu           = 0;    // the GPU that is used for scratch and small tensors
+    int32_t max_gpu            = 0;    // max number of GPUs to use at a time for split mode "graph"
     float   tensor_split[128]  = {0};  // how split tensors should be distributed across GPUs
     int32_t grp_attn_n         = 1;    // group-attention factor
     int32_t grp_attn_w         = 512;  // group-attention width
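The header change is the last hunk shown; the remaining four changed files (the backend side) are not included on this page. As a purely hypothetical sketch of how a max_gpu cap could be honored when scheduling a graph split, assuming the default of 0 means "no cap" (every name below is invented for illustration, not this commit's code):

    // Hypothetical illustration only; not code from this commit.
    #include <cstdint>
    #include <vector>

    // Clamp the list of candidate GPUs for one graph split:
    // max_gpu <= 0 (the default) leaves the list untouched.
    static std::vector<int> select_split_gpus(std::vector<int> gpu_ids, int32_t max_gpu) {
        if (max_gpu > 0 && (int32_t) gpu_ids.size() > max_gpu) {
            gpu_ids.resize(max_gpu);   // use at most max_gpu devices at a time
        }
        return gpu_ids;
    }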