Be able to set a max. number of GPUs to be used in split mode graph (#1051)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>

Author:    Kawrakow
Date:      2025-12-11 07:22:53 +01:00
Committed: GitHub
Parent:    a2efa22f10
Commit:    22863cf9c9

6 changed files with 67 additions and 19 deletions

common/common.cpp

@@ -1267,6 +1267,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 #endif // GGML_USE_CUDA_SYCL_VULKAN
         return true;
     }
+    else if (arg == "--max-gpu") {
+        CHECK_ARG
+        params.max_gpu = std::stoi(argv[i]);
+        return true;
+    }
     if (arg == "--split-mode" || arg == "-sm") {
         CHECK_ARG
         std::string arg_next = argv[i];
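A note on the pattern above: CHECK_ARG is defined elsewhere in common.cpp and is not part of this diff. Assuming the stock llama.cpp definition, it advances the argument index and bails out on a missing value, which is what lets the new branch read the token after --max-gpu; a minimal sketch of that assumption:

    // Assumed upstream definition, not shown in this diff:
    #define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; }

    // With it, "--max-gpu 2" parses as: CHECK_ARG steps i to the value
    // token, std::stoi(argv[i]) yields 2, and params.max_gpu stores it.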
@@ -2265,6 +2270,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
                                         "Example: CUDA0,CUDA1,RPC[192.168.0.1:8080]\n" });
         options.push_back({ "*", "-mg, --main-gpu i", "the GPU to use for the model (with split-mode = none),\n"
                                  "or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu });
+        options.push_back({ "*", "--max-gpu i", "max. number of GPUs to use at a time with split mode 'graph' (default: %d)", params.max_gpu });
     }
     options.push_back({ "model" });
@@ -2973,6 +2979,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     mparams.mla          = params.mla_attn;
     mparams.rpc_servers  = params.rpc_servers.c_str();
     mparams.main_gpu     = params.main_gpu;
+    mparams.max_gpu      = params.max_gpu;
     mparams.split_mode   = params.split_mode;
     mparams.tensor_split = params.tensor_split;
     mparams.use_mmap     = params.use_mmap;
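Together with the parsing hunk, this plumbs the flag end to end: CLI argument, gpt_params field, llama_model_params field. A minimal flow sketch (the invocation "llama-cli -sm graph --max-gpu 2" is illustrative, not taken from this commit):

    // Sketch of the plumbing added in this commit (names from the hunks above):
    gpt_params params;                 // gpt_params_find_arg() sets params.max_gpu = 2
    llama_model_params mparams = llama_model_params_from_gpt_params(params);
    // mparams.max_gpu == 2, visible to the backend when split_mode is "graph"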
@@ -4173,6 +4180,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     }
     fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
+    fprintf(stream, "max_gpu: %d # default: 0\n", params.max_gpu);
     fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
     fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);

common/common.h

@@ -154,6 +154,7 @@ struct gpt_params {
     int32_t n_gpu_layers       = -1;   // number of layers to store in VRAM (-1 - use default)
     int32_t n_gpu_layers_draft = -1;   // number of layers to store in VRAM for the draft model (-1 - use default)
     int32_t main_gpu           = 0;    // the GPU that is used for scratch and small tensors
+    int32_t max_gpu            = 0;    // max number of GPUs to use at a time for split mode "graph"
     float   tensor_split[128]  = {0};  // how split tensors should be distributed across GPUs
     int32_t grp_attn_n         = 1;    // group-attention factor
     int32_t grp_attn_w         = 512;  // group-attention width
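The header change is the last hunk shown; the remaining four changed files (the backend side) are not included on this page. As a purely hypothetical sketch of how a max_gpu cap could be honored when scheduling a graph split, assuming the default of 0 means "no cap" (every name below is invented for illustration, not this commit's code):

    // Hypothetical illustration only; not code from this commit.
    #include <cstdint>
    #include <vector>

    // Clamp the list of candidate GPUs for one graph split:
    // max_gpu <= 0 (the default) leaves the list untouched.
    static std::vector<int> select_split_gpus(std::vector<int> gpu_ids, int32_t max_gpu) {
        if (max_gpu > 0 && (int32_t) gpu_ids.size() > max_gpu) {
            gpu_ids.resize(max_gpu);   // use at most max_gpu devices at a time
        }
        return gpu_ids;
    }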