Be able to set a max. number of GPUs to be used in split mode graph (#1051)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Author: Kawrakow
Date: 2025-12-11 07:22:53 +01:00 (committed by GitHub)
Commit: 9484d150d8 (parent: 6a5a707ac0)
6 changed files with 67 additions and 19 deletions
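For context, a hedged usage sketch (the binary name and model path below are illustrative, not from the commit; the options come from this change and the fork's existing -sm/--split-mode handling): to cap each layer at two GPUs while running split mode "graph", an invocation could look like

./llama-server -m model.gguf -sm graph --max-gpu 2

The default of 0 keeps the current behavior of using all configured GPUs; the adjustment only kicks in for 0 < max_gpu < number of devices.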


@@ -1267,6 +1267,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
#endif // GGML_USE_CUDA_SYCL_VULKAN
return true;
}
else if (arg == "--max-gpu") {
CHECK_ARG
params.max_gpu = std::stoi(argv[i]);
return true;
}
if (arg == "--split-mode" || arg == "-sm") {
CHECK_ARG
std::string arg_next = argv[i];
@@ -2265,6 +2270,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
"Example: CUDA0,CUDA1,RPC[192.168.0.1:8080]\n" });
options.push_back({ "*", "-mg, --main-gpu i", "the GPU to use for the model (with split-mode = none),\n"
"or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu });
options.push_back({ "*", "--max-gpu i", "max. number of GPUs to use at a time with split mode 'graph', (default: %d)", params.max_gpu });
}
options.push_back({ "model" });
@@ -2973,6 +2979,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
mparams.mla = params.mla_attn;
mparams.rpc_servers = params.rpc_servers.c_str();
mparams.main_gpu = params.main_gpu;
mparams.max_gpu = params.max_gpu;
mparams.split_mode = params.split_mode;
mparams.tensor_split = params.tensor_split;
mparams.use_mmap = params.use_mmap;
@@ -4173,6 +4180,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
}
fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
fprintf(stream, "max_gpu: %d # default: 0\n", params.max_gpu);
fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);


@@ -154,6 +154,7 @@ struct gpt_params {
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
int32_t max_gpu = 0; // max number of GPUs to use at a time for split mode "graph"
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
int32_t grp_attn_n = 1; // group-attention factor
int32_t grp_attn_w = 512; // group-attention width


@@ -362,6 +362,7 @@ extern "C" {
// LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
// LLAMA_SPLIT_LAYER: ignored
int32_t main_gpu;
int32_t max_gpu;
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
const float * tensor_split;
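Since max_gpu is now a field of the public llama_model_params struct, it can also be set programmatically. A minimal sketch, assuming the fork still exposes the pre-refactor C API (llama_backend_init / llama_load_model_from_file / llama_free_model) and that LLAMA_SPLIT_MODE_GRAPH is declared in llama.h; the model path and GPU count are made up:

#include "llama.h"

int main() {
    llama_backend_init();
    llama_model_params mparams = llama_model_default_params();
    mparams.split_mode = LLAMA_SPLIT_MODE_GRAPH; // the fork's "graph" split mode
    mparams.max_gpu    = 2;                      // new field: use at most 2 GPUs per layer
    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model) llama_free_model(model);
    llama_backend_free();
    return model ? 0 : 1;
}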


@@ -268,15 +268,14 @@ static std::vector<int> create_split(int nr, int granularity, const std::vector<
}
while (sum > nchunk) {
last_split = 0;
float best_err = std::numeric_limits<float>::max();
float best_err = 0;
int ibest = -1;
for (int i = 0; i < (int)splits.size(); ++i) {
if (result[i] > 0) {
float p = splits[i] - last_split;
float n_want = p*nchunk;
float err = std::abs(n_want - result[i] + 1);
//float err = std::abs(n_want - result[i] + 1) + std::abs(p - 1.f*mem_used[i]/tot_memory_used)*nchunk;
if (err < best_err) {
float err = result[i] - n_want;
if (err > best_err) {
best_err = err; ibest = i;
}
}
@@ -288,14 +287,13 @@ static std::vector<int> create_split(int nr, int granularity, const std::vector<
}
while (sum < nchunk) {
last_split = 0;
float best_err = std::numeric_limits<float>::max();
float best_err = 0;
int ibest = -1;
for (int i = 0; i < (int)splits.size(); ++i) {
float p = splits[i] - last_split;
float n_want = p*nchunk;
float err = std::abs(n_want - result[i] - 1);
//float err = std::abs(n_want - result[i] - 1) + std::abs(p - 1.f*mem_used[i]/tot_memory_used)*nchunk;
if (err < best_err) {
float err = n_want - result[i];
if (err > best_err) {
best_err = err; ibest = i;
}
last_split = splits[i];
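For reference, a toy illustration (my own sketch, not part of the commit) of the new rebalancing criterion in the two loops above: instead of minimizing an absolute error, the row whose integer chunk count overshoots its ideal share the most gives one chunk back (and, symmetrically, the row with the largest deficit receives one). With the cumulative-split bookkeeping omitted and made-up numbers:

#include <cstdio>
#include <vector>

int main() {
    int nchunk = 10;                                  // chunks to distribute
    std::vector<float> n_want = {3.3f, 3.3f, 3.4f};   // ideal fractional shares per device
    std::vector<int>   result = {4, 4, 3};            // rounded shares, sum = 11 > nchunk
    int sum = 11;
    while (sum > nchunk) {
        float best_err = 0; int ibest = -1;
        for (int i = 0; i < (int)result.size(); ++i) {
            if (result[i] > 0) {
                float err = result[i] - n_want[i];    // signed overshoot, as in the new code
                if (err > best_err) { best_err = err; ibest = i; }
            }
        }
        if (ibest < 0) break;
        --result[ibest]; --sum;
    }
    for (auto r : result) printf("%d ", r);           // prints: 3 4 3
    printf("\n");
    return 0;
}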
@@ -2804,6 +2802,29 @@ static void prepare_split_tensors(int split_dim, ggml_context * ctx, ggml_tensor
}
}
static void adjust_split(std::vector<float> & split, const std::vector<size_t> & mem_used, int max_gpu) {
if (max_gpu < 1 || max_gpu >= int(split.size()) || split.size() != mem_used.size()) {
return;
}
size_t tot_mem_used = 1;
for (auto & mem : mem_used) tot_mem_used += mem;
for (int i = split.size() - 1; i > 0; --i) split[i] -= split[i-1];
std::vector<std::pair<float, int>> sorted(split.size());
for (int i = 0; i < int(split.size()); ++i) {
float mem_ideal = split[i]*tot_mem_used;
float err = mem_ideal - mem_used[i];
sorted[i] = {err, i};
}
std::partial_sort(sorted.begin(), sorted.begin() + max_gpu, sorted.end(), std::greater<std::pair<float,int>>{});
for (auto & p : split) p = 0;
for (int j = 0; j < max_gpu; ++j) split[sorted[j].second] = 1;
float sum = 0;
for (auto & p : split) {
sum += p/max_gpu;
p = sum;
}
}
bool create_tensors_helper::create_tensors() {
const auto tn = LLM_TN(model.arch);
bool use_mmap_buffer = true;
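To see what the new adjust_split helper produces, here is a minimal standalone driver (my own sketch; the adjust_split body is copied verbatim from the hunk above, the GPU count and memory numbers are made up). With max_gpu = 2 it keeps only the two devices whose assigned memory lags their ideal share the most and renormalizes the cumulative split between them:

#include <algorithm>
#include <cstdio>
#include <functional>
#include <utility>
#include <vector>

static void adjust_split(std::vector<float> & split, const std::vector<size_t> & mem_used, int max_gpu) {
    if (max_gpu < 1 || max_gpu >= int(split.size()) || split.size() != mem_used.size()) {
        return;
    }
    size_t tot_mem_used = 1;
    for (auto & mem : mem_used) tot_mem_used += mem;
    for (int i = split.size() - 1; i > 0; --i) split[i] -= split[i-1];  // cumulative -> per-device fractions
    std::vector<std::pair<float, int>> sorted(split.size());
    for (int i = 0; i < int(split.size()); ++i) {
        float mem_ideal = split[i]*tot_mem_used;
        float err = mem_ideal - mem_used[i];          // positive = device is below its ideal share
        sorted[i] = {err, i};
    }
    std::partial_sort(sorted.begin(), sorted.begin() + max_gpu, sorted.end(), std::greater<std::pair<float,int>>{});
    for (auto & p : split) p = 0;
    for (int j = 0; j < max_gpu; ++j) split[sorted[j].second] = 1;
    float sum = 0;
    for (auto & p : split) {                          // back to a cumulative split over the chosen devices
        sum += p/max_gpu;
        p = sum;
    }
}

int main() {
    std::vector<float>  split    = {0.25f, 0.50f, 0.75f, 1.00f};  // 4 GPUs, equal cumulative split
    std::vector<size_t> mem_used = {100, 200, 50, 150};           // made-up bytes already placed per GPU
    adjust_split(split, mem_used, /*max_gpu=*/2);
    for (auto p : split) printf("%.2f ", p);   // prints: 0.50 0.50 1.00 1.00
    printf("\n");                              // i.e. this layer is split between GPU 0 and GPU 2 only
    return 0;
}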
@@ -2936,23 +2957,36 @@ bool create_tensors_helper::create_tensors() {
throw std::runtime_error("unknown architecture");
}
if (model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN) {
printf("================================ max_gpu = %d\n", model.max_gpu);
std::vector<size_t> mem_used(model.splits.size(), 0);
const auto & hparams = model.hparams;
int gqa_ratio = hparams.n_head() / hparams.n_head_kv();
//printf("GQA ratio: %d\n", gqa_ratio);
auto cur_splits = model.splits;
int adjust_step = std::max(1, int(model.layers.size() / (2*model.splits.size())));
for (int il = 0; il < int(model.layers.size()); ++il) {
if (ggml_backend_buft_is_host(model.buft_layer[il].buft_matrix)) {
LLAMA_LOG_INFO("%s: not splitting layer %d because buffer type is host\n", __func__, il);
continue;
}
if (model.max_gpu > 0 && model.max_gpu < int(model.splits.size()) && il % adjust_step == 0) {
cur_splits = model.splits;
adjust_split(cur_splits, mem_used, model.max_gpu);
printf("Adjusted split at layer %2d:", il);
float last_split = 0;
for (auto & p : cur_splits) {
printf(" %g", p - last_split);
last_split = p;
}
printf("\n");
}
auto & layer = model.layers[il];
auto ctx_split = ctx_for_layer_split(il);
if (layer.attn_norm) {
auto split = create_split(ggml_nrows(layer.attn_norm), -1, model.splits, mem_used);
auto split = create_split(ggml_nrows(layer.attn_norm), -1, cur_splits, mem_used);
prepare_split_tensors(-1, ctx_split, layer.attn_norm, layer.split_attn_norm, split, mem_used);
}
if (layer.rope_freqs) {
auto split = create_split(ggml_nrows(layer.rope_freqs), -1, model.splits, mem_used);
auto split = create_split(ggml_nrows(layer.rope_freqs), -1, cur_splits, mem_used);
prepare_split_tensors(-1, ctx_split, layer.rope_freqs, layer.split_rope_freqs, split, mem_used);
}
if (layer.wo && layer.wq && layer.wk && layer.wv) {
@@ -2962,7 +2996,7 @@ bool create_tensors_helper::create_tensors() {
if (tt.blck_size > attn_granularity) attn_granularity = tt.blck_size;
}
GGML_ASSERT(attn_granularity % hparams.n_embd_head_k == 0);
auto split = create_split(layer.wo->ne[0], attn_granularity, model.splits, mem_used);
auto split = create_split(layer.wo->ne[0], attn_granularity, cur_splits, mem_used);
prepare_split_tensors(0, ctx_split, layer.wo, layer.split_wo, split, mem_used);
prepare_split_tensors(1, ctx_split, layer.wq, layer.split_wq, split, mem_used);
if (layer.bo) {
@@ -2990,7 +3024,7 @@ bool create_tensors_helper::create_tensors() {
if (layer.ffn_norm) {
if (auto it = split_tensors.find(layer.ffn_norm); it != split_tensors.end()) {
auto split = create_split(ggml_nrows(layer.ffn_norm), -1, model.splits, mem_used);
auto split = create_split(ggml_nrows(layer.ffn_norm), -1, cur_splits, mem_used);
prepare_split_tensors(-1, ctx_split, layer.ffn_norm, layer.split_ffn_norm, split, mem_used);
}
}
@@ -3005,7 +3039,7 @@ bool create_tensors_helper::create_tensors() {
auto tt = ggml_internal_get_type_traits(layer.ffn_down->type);
if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size;
}
auto split = create_split(layer.ffn_down->ne[0], ffn_granularity, model.splits, mem_used);
auto split = create_split(layer.ffn_down->ne[0], ffn_granularity, cur_splits, mem_used);
prepare_split_tensors(0, ctx_split, layer.ffn_down, layer.split_ffn_down, split, mem_used);
prepare_split_tensors(1, ctx_split, layer.ffn_up, layer.split_ffn_up, split, mem_used);
prepare_split_tensors(1, ctx_split, layer.ffn_gate, layer.split_ffn_gate, split, mem_used);
@@ -3024,7 +3058,7 @@ bool create_tensors_helper::create_tensors() {
auto tt = ggml_internal_get_type_traits(layer.ffn_down_shexp->type);
if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size;
}
auto split = create_split(layer.ffn_down_shexp->ne[0], ffn_granularity, model.splits, mem_used);
auto split = create_split(layer.ffn_down_shexp->ne[0], ffn_granularity, cur_splits, mem_used);
prepare_split_tensors(0, ctx_split, layer.ffn_down_shexp, layer.split_ffn_down_shexp, split, mem_used);
prepare_split_tensors(1, ctx_split, layer.ffn_up_shexp, layer.split_ffn_up_shexp, split, mem_used);
prepare_split_tensors(1, ctx_split, layer.ffn_gate_shexp, layer.split_ffn_gate_shexp, split, mem_used);
@@ -3043,7 +3077,7 @@ bool create_tensors_helper::create_tensors() {
auto tt = ggml_internal_get_type_traits(layer.ffn_down_exps->type);
if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size;
}
auto split = create_split(layer.ffn_down_exps->ne[0], ffn_granularity, model.splits, mem_used);
auto split = create_split(layer.ffn_down_exps->ne[0], ffn_granularity, cur_splits, mem_used);
//printf("split(%2d):", il); for (auto & s : split) printf(" %d", s); printf("\n");
prepare_split_tensors(0, ctx_split, layer.ffn_down_exps, layer.split_ffn_down_exps, split, mem_used);
prepare_split_tensors(1, ctx_split, layer.ffn_up_exps, layer.split_ffn_up_exps, split, mem_used);
@@ -3053,13 +3087,13 @@ bool create_tensors_helper::create_tensors() {
if (layer.ffn_gate_inp) {
if (auto it = split_tensors.find(layer.ffn_gate_inp); it != split_tensors.end()) {
auto shared_split = create_split(ggml_nrows(layer.ffn_gate_inp), -1, model.splits, mem_used);
auto shared_split = create_split(ggml_nrows(layer.ffn_gate_inp), -1, cur_splits, mem_used);
prepare_split_tensors(-1, ctx_split, layer.ffn_gate_inp, layer.split_ffn_gate_inp, shared_split, mem_used);
}
}
if (layer.ffn_exp_probs_b) {
if (auto it = split_tensors.find(layer.ffn_exp_probs_b); it != split_tensors.end()) {
auto shared_split = create_split(ggml_nrows(layer.ffn_exp_probs_b), -1, model.splits, mem_used);
auto shared_split = create_split(ggml_nrows(layer.ffn_exp_probs_b), -1, cur_splits, mem_used);
prepare_split_tensors(-1, ctx_split, layer.ffn_exp_probs_b, layer.split_ffn_exp_probs_b, shared_split, mem_used);
}
}
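One note on scale (illustrative numbers, not from the commit): the re-adjustment shown earlier in this file runs every adjust_step = max(1, n_layer / (2 * n_splits)) layers, so for a model with 61 layers over 4 configured GPUs the split is re-balanced at layers 0, 7, 14, ... (adjust_step = max(1, 61/8) = 7), with mem_used tracking how much has been placed on each device in the meantime.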


@@ -350,6 +350,7 @@ struct llama_model {
llama_split_mode split_mode;
int main_gpu;
int max_gpu = 0; // max. number of GPUs to use per layer for split mode "graph"
int n_gpu_layers;
std::vector<rpc_device> rpc_servers;


@@ -1742,6 +1742,7 @@ static bool llm_load_tensors(
int mla_attn,
enum llama_split_mode split_mode,
int main_gpu,
int max_gpu,
const float * tensor_split,
bool use_mlock,
bool validate_quants,
@@ -1763,6 +1764,7 @@ static bool llm_load_tensors(
model.split_mode = split_mode;
model.main_gpu = main_gpu;
model.max_gpu = max_gpu;
model.n_gpu_layers = n_gpu_layers;
const int n_layer = hparams.n_layer;
@@ -2138,7 +2140,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
#endif
if (!llm_load_tensors(
ml, model, params.n_gpu_layers, params.mla, params.split_mode, params.main_gpu, params.tensor_split,
ml, model, params.n_gpu_layers, params.mla, params.split_mode, params.main_gpu, params.max_gpu, params.tensor_split,
params.use_mlock, params.validate_quants,
params.progress_callback, params.progress_callback_user_data
)) {
@@ -3985,6 +3987,7 @@ struct llama_model_params llama_model_default_params() {
/*.mla =*/ 0,
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
/*.main_gpu =*/ 0,
/*.max_gpu =*/ 0,
/*.tensor_split =*/ nullptr,
/*.rpc_servers =*/ nullptr,
/*.progress_callback =*/ nullptr,