Mirror of https://github.com/ikawrakow/ik_llama.cpp.git
Be able to set a max. number of GPUs to be used in split mode graph (#1051)
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
@@ -1267,6 +1267,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 #endif // GGML_USE_CUDA_SYCL_VULKAN
         return true;
     }
+    else if (arg == "--max-gpu") {
+        CHECK_ARG
+        params.max_gpu = std::stoi(argv[i]);
+        return true;
+    }
     if (arg == "--split-mode" || arg == "-sm") {
         CHECK_ARG
         std::string arg_next = argv[i];
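For orientation, here is a minimal standalone sketch of the parsing pattern this hunk follows: CHECK_ARG advances the argument index and flags a missing value, then std::stoi converts the flag's argument. The struct and function names below are hypothetical, and the try/catch around std::stoi is an addition for illustration, not part of the hunk above.

    #include <cstdint>
    #include <cstdio>
    #include <exception>
    #include <string>

    struct Params {
        int32_t max_gpu = 0; // 0 is assumed to mean "no explicit limit"
    };

    // Returns true if argv[i] was consumed as "--max-gpu"; sets invalid when
    // the flag's value is missing or not an integer.
    static bool parse_max_gpu(int argc, char ** argv, int & i, Params & params, bool & invalid) {
        if (std::string(argv[i]) != "--max-gpu") {
            return false;
        }
        if (++i >= argc) { invalid = true; return true; } // roughly what CHECK_ARG does
        try {
            params.max_gpu = std::stoi(argv[i]);
        } catch (const std::exception &) {
            invalid = true; // std::stoi throws on non-numeric input
        }
        return true;
    }

    int main(int argc, char ** argv) {
        Params params;
        bool invalid = false;
        for (int i = 1; i < argc; ++i) {
            parse_max_gpu(argc, argv, i, params, invalid);
        }
        if (invalid) {
            std::fprintf(stderr, "error: --max-gpu expects an integer value\n");
            return 1;
        }
        std::printf("max_gpu = %d\n", params.max_gpu);
        return 0;
    }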
@@ -2265,6 +2270,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
                         "Example: CUDA0,CUDA1,RPC[192.168.0.1:8080]\n" });
     options.push_back({ "*", "-mg, --main-gpu i", "the GPU to use for the model (with split-mode = none),\n"
                         "or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu });
+    options.push_back({ "*", "--max-gpu i", "max. number of GPUs to use at a time with split mode 'graph', (default: %d)", params.max_gpu });
     }

     options.push_back({ "model" });
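With the option registered, --help lists --max-gpu next to -mg/--main-gpu. Assuming the fork accepts "graph" as a -sm value, as the help text suggests, a hypothetical invocation (binary name and model path are placeholders) would look like:

    ./llama-cli -m model.gguf -sm graph --max-gpu 2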
@@ -2973,6 +2979,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     mparams.mla          = params.mla_attn;
     mparams.rpc_servers  = params.rpc_servers.c_str();
     mparams.main_gpu     = params.main_gpu;
+    mparams.max_gpu      = params.max_gpu;
     mparams.split_mode   = params.split_mode;
     mparams.tensor_split = params.tensor_split;
     mparams.use_mmap     = params.use_mmap;
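This hunk is pure plumbing: the user-facing value is copied verbatim into the model-creation parameters. A minimal sketch of that pass-through pattern, with hypothetical struct names standing in for gpt_params and llama_model_params:

    #include <cstdint>

    struct user_params  { int32_t max_gpu = 0; }; // stands in for gpt_params
    struct model_params { int32_t max_gpu = 0; }; // stands in for llama_model_params

    // Mirrors "mparams.max_gpu = params.max_gpu" above: the default 0
    // propagates unchanged, so "no limit" stays the default downstream.
    static model_params make_model_params(const user_params & p) {
        model_params mp;
        mp.max_gpu = p.max_gpu;
        return mp;
    }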
@@ -4173,6 +4180,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     }
     fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
+    fprintf(stream, "max_gpu: %d # default: 0\n", params.max_gpu);
     fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
     fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
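Given the format string, a run started with --max-gpu 2 would emit the dump line:

    max_gpu: 2 # default: 0

and the insertion point keeps the dump's keys in alphabetical order (main_gpu, max_gpu, min_keep, mirostat, ...).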
@@ -154,6 +154,7 @@ struct gpt_params {
     int32_t n_gpu_layers       = -1;  // number of layers to store in VRAM (-1 - use default)
     int32_t n_gpu_layers_draft = -1;  // number of layers to store in VRAM for the draft model (-1 - use default)
     int32_t main_gpu           = 0;   // the GPU that is used for scratch and small tensors
+    int32_t max_gpu            = 0;   // max number of GPUs to use at a time for split mode "graph"
     float   tensor_split[128]  = {0}; // how split tensors should be distributed across GPUs
     int32_t grp_attn_n         = 1;   // group-attention factor
     int32_t grp_attn_w         = 512; // group-attention width
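The diff does not show how the backend consumes max_gpu; given the default of 0 and the help text, the natural reading is that 0 means "use all available GPUs" and a positive value caps how many participate in split mode "graph". A sketch of that assumed consumer-side logic (function name hypothetical):

    #include <algorithm>
    #include <cstdint>

    // Assumed semantics, not shown in this diff: cap the device count when
    // max_gpu is positive; leave it unchanged when max_gpu is 0 (the default).
    static int32_t effective_gpu_count(int32_t n_devices, int32_t max_gpu) {
        return max_gpu > 0 ? std::min(n_devices, max_gpu) : n_devices;
    }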