From b4e1d916c5ec7e75ea3c124dd090425a99fc613f Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Mon, 25 May 2026 08:16:45 +0300 Subject: [PATCH] Per GPU fit margin (#1872) --- common/common.cpp | 26 ++++++++++++++++++++++++++ common/common.h | 1 + include/llama.h | 2 ++ src/llama.cpp | 28 ++++++++++++++++++++++++---- 4 files changed, 53 insertions(+), 4 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 2ee4e772..ffb8d5fd 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -693,6 +693,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (!params.tensor_buft_overrides.empty()) { params.tensor_buft_overrides.push_back({nullptr, nullptr}); } + if (!params.fit_margin_array.empty()) { + params.fit_margin_array.push_back(-1); /* a negative GPU index terminates the pair list */ + params.fit_margin_array.push_back(0); /* dummy margin so the terminator is a full (gpu, margin) pair */ + } if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) { throw std::runtime_error(string_format( @@ -1945,6 +1949,23 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } return true; } + if (arg == "--gpu-fit-margin" || arg == "-gfm") { + CHECK_ARG + auto p = string_split_pairs(argv[i], ','); + if (p.empty()) { + fprintf(stderr, "error: invalid GPU fit margin argument: %s\n", argv[i]); + invalid_param = true; + } else { + auto cur_size = params.fit_margin_array.size(); + params.fit_margin_array.resize(cur_size + 2*p.size()); + for (auto & pair : p) { + params.fit_margin_array[cur_size+0] = pair.first; + params.fit_margin_array[cur_size+1] = pair.second; + cur_size += 2; + } + } + return true; + } if (arg == "-cuda" || arg == "--cuda-params") { CHECK_ARG params.cuda_params = argv[i]; @@ -4070,6 +4091,11 @@ struct llama_model_params common_model_params_to_llama(const gpt_params & params if (!mparams.flash_attn && ggml_is_quantized(mparams.type_v)) { throw std::runtime_error("Quantized V cache cannot be used without flash attention"); } + if (!params.fit_margin_array.empty()) 
{ + GGML_ASSERT(params.fit_margin_array.size() % 2 == 0 && "Fit margin array does not have even number of elements"); + GGML_ASSERT(params.fit_margin_array[params.fit_margin_array.size()-2] == -1 && "Fit margin array is not correctly terminated"); + mparams.fit_margin_array = params.fit_margin_array.data(); + } return mparams; } diff --git a/common/common.h b/common/common.h index abcc1203..bc68ca0f 100644 --- a/common/common.h +++ b/common/common.h @@ -351,6 +351,7 @@ struct gpt_params { std::vector kv_overrides; std::vector tensor_buft_overrides; std::vector> offload_policy; + std::vector<int> fit_margin_array; bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply) std::vector lora_adapters; // lora adapter path with user defined scale diff --git a/include/llama.h b/include/llama.h index 9e712156..f89d82ef 100644 --- a/include/llama.h +++ b/include/llama.h @@ -399,6 +399,8 @@ extern "C" { // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices() const float * tensor_split; + const int * fit_margin_array; /* optional per-GPU fit margins: (GPU index, margin in MiB) pairs ended by a negative GPU index; nullptr means no per-GPU override */ + // comma separated list of RPC servers to use for offloading const char * rpc_servers; diff --git a/src/llama.cpp b/src/llama.cpp index b293cc72..aa7d649a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3293,6 +3293,7 @@ static bool llm_load_tensors( int n_ubatch, int amb, int fit_margin, + const int * fit_margin_array, int worst_case_tokens, bool flash_attn, bool use_mlock, @@ -3364,6 +3365,23 @@ static bool llm_load_tensors( model.mtp = mtp; size_t mem_margin = fit_margin > 0 ? 
size_t(fit_margin)*1024*1024 : k_default_mem_margin; + auto get_mem_margin = [mem_margin, fit_margin_array, func = __func__] (int gpu) { /* fit margin in bytes for device index gpu: the last matching pair with a non-negative margin wins, otherwise mem_margin */ + size_t result = mem_margin; + bool have_margin_override = false; + if (fit_margin_array) { + for (int i = 0; ; i += 2) { + if (fit_margin_array[i] < 0) break; + if (fit_margin_array[i] == gpu && fit_margin_array[i+1] >= 0) { + result = size_t(fit_margin_array[i+1])*1024*1024; + have_margin_override = true; + } + } + } + if (have_margin_override) { + LLAMA_LOG_INFO("%s: using %0.2f MiB as fit margin for GPU %d\n", func, result/1024./1024., gpu); + } + return result; + }; const int n_layer = hparams.n_layer; int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0); @@ -3382,10 +3400,11 @@ static bool llm_load_tensors( std::vector device_mem(model.devices.size()); for (int i = 0; i < int(device_mem.size()); ++i) { device_mem[i] = llama_get_device_memory(model, model.devices[i]); - if (device_mem[i] > mem_margin) { - device_mem[i] -= mem_margin; + auto this_margin = get_mem_margin(i); + if (device_mem[i] > this_margin) { + device_mem[i] -= this_margin; } else { - LLAMA_LOG_WARN("Free memory %zu MiB on device %d is less the %zu MiB safety margin\n", device_mem[i]/(1024*1024), model.devices[i], mem_margin/(1024*1024)); + LLAMA_LOG_WARN("Free memory %zu MiB on device %d is less the %zu MiB safety margin\n", device_mem[i]/(1024*1024), model.devices[i], this_margin/(1024*1024)); device_mem[i] = 0; } } @@ -4071,7 +4090,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam if (!llm_load_tensors( ml, model, params.n_gpu_layers, params.mla, params.split_mode, params.main_gpu, params.max_gpu, params.tensor_split, params.type_k, params.type_v, params.extra_output_type, - params.max_ctx_size, params.n_seq_max, params.n_ubatch, params.amb, params.fit_margin, + params.max_ctx_size, params.n_seq_max, params.n_ubatch, params.amb, params.fit_margin, 
params.fit_margin_array, params.worst_graph_tokens, params.flash_attn, params.use_mlock, params.validate_quants, params.mtp, params.fit, params.dry_run, params.progress_callback, params.progress_callback_user_data @@ -6203,6 +6222,7 @@ struct llama_model_params llama_model_default_params() { /*.n_last_v =*/ -1, /*.extra_output_type =*/ GGML_TYPE_COUNT, /*.tensor_split =*/ nullptr, + /*.fit_margin_array =*/ nullptr, /*.rpc_servers =*/ nullptr, /*.progress_callback =*/ nullptr, /*.progress_callback_user_data =*/ nullptr,