Allow setting a maximum number of GPUs to use in split mode "graph" (#1051)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2026-04-24 16:39:45 +00:00 · 2025-12-11 07:22:53 +01:00
parent 6a5a707ac0
commit 9484d150d8
6 changed files with 67 additions and 19 deletions
--- a/common/common.h
+++ b/common/common.h
@@ -154,6 +154,7 @@ struct gpt_params {
    int32_t n_gpu_layers          =    -1; // number of layers to store in VRAM (-1 - use default)
    int32_t n_gpu_layers_draft    =    -1; // number of layers to store in VRAM for the draft model (-1 - use default)
    int32_t main_gpu              =     0; // the GPU that is used for scratch and small tensors
+    int32_t max_gpu               =     0; // max number of GPUs to use at a time for split mode "graph"
    float   tensor_split[128]     =   {0}; // how split tensors should be distributed across GPUs
    int32_t grp_attn_n            =     1; // group-attention factor
    int32_t grp_attn_w            =   512; // group-attention width