mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-05-27 08:24:30 +00:00
Per GPU fit margin (#1872)
This commit is contained in:
@@ -693,6 +693,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
if (!params.tensor_buft_overrides.empty()) {
|
||||
params.tensor_buft_overrides.push_back({nullptr, nullptr});
|
||||
}
|
||||
if (!params.fit_margin_array.empty()) {
|
||||
params.fit_margin_array.push_back(-1);
|
||||
params.fit_margin_array.push_back(0);
|
||||
}
|
||||
|
||||
if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
|
||||
throw std::runtime_error(string_format(
|
||||
@@ -1945,6 +1949,23 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
||||
}
|
||||
return true;
|
||||
}
|
||||
if (arg == "--gpu-fit-margin" || arg == "-gfm") {
|
||||
CHECK_ARG
|
||||
auto p = string_split_pairs<int,int>(argv[i], ',');
|
||||
if (p.empty()) {
|
||||
fprintf(stderr, "error: invalid GPU split margin argument: %s\n", argv[i]);
|
||||
invalid_param = true;
|
||||
} else {
|
||||
auto cur_size = params.fit_margin_array.size();
|
||||
params.fit_margin_array.resize(cur_size + 2*p.size());
|
||||
for (auto & pair : p) {
|
||||
params.fit_margin_array[cur_size+0] = pair.first;
|
||||
params.fit_margin_array[cur_size+1] = pair.second;
|
||||
cur_size += 2;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
if (arg == "-cuda" || arg == "--cuda-params") {
|
||||
CHECK_ARG
|
||||
params.cuda_params = argv[i];
|
||||
@@ -4070,6 +4091,11 @@ struct llama_model_params common_model_params_to_llama(const gpt_params & params
|
||||
if (!mparams.flash_attn && ggml_is_quantized(mparams.type_v)) {
|
||||
throw std::runtime_error("Quantized V cache cannot be used without flash attention");
|
||||
}
|
||||
if (!params.fit_margin_array.empty()) {
|
||||
GGML_ASSERT(params.fit_margin_array.size() % 2 == 0 && "Fit margin array does not have even number of elements");
|
||||
GGML_ASSERT(params.fit_margin_array[params.fit_margin_array.size()-2] == -1 && "Fit margin array is not correctly termionated");
|
||||
mparams.fit_margin_array = params.fit_margin_array.data();
|
||||
}
|
||||
|
||||
return mparams;
|
||||
}
|
||||
|
||||
@@ -351,6 +351,7 @@ struct gpt_params {
|
||||
std::vector<llama_model_kv_override> kv_overrides;
|
||||
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
|
||||
std::vector<std::pair<int,int>> offload_policy;
|
||||
std::vector<int> fit_margin_array;
|
||||
|
||||
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
|
||||
std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
|
||||
|
||||
@@ -399,6 +399,8 @@ extern "C" {
|
||||
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
|
||||
const float * tensor_split;
|
||||
|
||||
const int * fit_margin_array;
|
||||
|
||||
// comma separated list of RPC servers to use for offloading
|
||||
const char * rpc_servers;
|
||||
|
||||
|
||||
@@ -3293,6 +3293,7 @@ static bool llm_load_tensors(
|
||||
int n_ubatch,
|
||||
int amb,
|
||||
int fit_margin,
|
||||
const int * fit_margin_array,
|
||||
int worst_case_tokens,
|
||||
bool flash_attn,
|
||||
bool use_mlock,
|
||||
@@ -3364,6 +3365,23 @@ static bool llm_load_tensors(
|
||||
model.mtp = mtp;
|
||||
|
||||
size_t mem_margin = fit_margin > 0 ? size_t(fit_margin)*1024*1024 : k_default_mem_margin;
|
||||
// Returns the fit margin, in bytes, to use for the GPU with index `gpu`.
// The result starts as `mem_margin` and is replaced when `fit_margin_array`
// holds an entry for this GPU. The array is read as flat (gpu index, margin)
// pairs; the margin is multiplied by 1024*1024 and is reported as MiB in the log.
// NOTE(review): the init-capture `n_gpu` is not referenced anywhere in the body below —
// confirm whether a bounds check on the GPU index was intended.
auto get_mem_margin = [mem_margin, fit_margin_array, n_gpu = int(model.devices.size()), func = __func__] (int gpu) {
|
||||
size_t result = mem_margin;
|
||||
bool have_margin_override = false;
|
||||
// A null array means no per-GPU overrides: the default margin is returned as is.
if (fit_margin_array) {
|
||||
// The loop has no upper bound; it relies on a pair whose first element is negative to stop.
for (int i = 0; ; i += 2) {
|
||||
if (fit_margin_array[i] < 0) break;
|
||||
// Entries with a negative margin are ignored. There is no break on a match,
// so when several pairs name the same GPU the last one wins.
if (fit_margin_array[i] == gpu && fit_margin_array[i+1] >= 0) {
|
||||
result = size_t(fit_margin_array[i+1])*1024*1024;
|
||||
have_margin_override = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Log only when an override was applied; the default margin is returned silently.
if (have_margin_override) {
|
||||
LLAMA_LOG_INFO("%s: using %0.2f MiB as fit margin for GPU %d\n", func, result/1024./1024., gpu);
|
||||
}
|
||||
return result;
|
||||
};
|
||||
|
||||
const int n_layer = hparams.n_layer;
|
||||
int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
|
||||
@@ -3382,10 +3400,11 @@ static bool llm_load_tensors(
|
||||
std::vector<size_t> device_mem(model.devices.size());
|
||||
for (int i = 0; i < int(device_mem.size()); ++i) {
|
||||
device_mem[i] = llama_get_device_memory(model, model.devices[i]);
|
||||
if (device_mem[i] > mem_margin) {
|
||||
device_mem[i] -= mem_margin;
|
||||
auto this_margin = get_mem_margin(i);
|
||||
if (device_mem[i] > this_margin) {
|
||||
device_mem[i] -= this_margin;
|
||||
} else {
|
||||
LLAMA_LOG_WARN("Free memory %zu MiB on device %d is less the %zu MiB safety margin\n", device_mem[i]/(1024*1024), model.devices[i], mem_margin/(1024*1024));
|
||||
LLAMA_LOG_WARN("Free memory %zu MiB on device %d is less the %zu MiB safety margin\n", device_mem[i]/(1024*1024), model.devices[i], this_margin/(1024*1024));
|
||||
device_mem[i] = 0;
|
||||
}
|
||||
}
|
||||
@@ -4071,7 +4090,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
||||
if (!llm_load_tensors(
|
||||
ml, model, params.n_gpu_layers, params.mla, params.split_mode, params.main_gpu, params.max_gpu, params.tensor_split,
|
||||
params.type_k, params.type_v, params.extra_output_type,
|
||||
params.max_ctx_size, params.n_seq_max, params.n_ubatch, params.amb, params.fit_margin,
|
||||
params.max_ctx_size, params.n_seq_max, params.n_ubatch, params.amb, params.fit_margin, params.fit_margin_array,
|
||||
params.worst_graph_tokens, params.flash_attn,
|
||||
params.use_mlock, params.validate_quants, params.mtp, params.fit, params.dry_run,
|
||||
params.progress_callback, params.progress_callback_user_data
|
||||
@@ -6203,6 +6222,7 @@ struct llama_model_params llama_model_default_params() {
|
||||
/*.n_last_v =*/ -1,
|
||||
/*.extra_output_type =*/ GGML_TYPE_COUNT,
|
||||
/*.tensor_split =*/ nullptr,
|
||||
/*.fit_margin_array =*/ nullptr,
|
||||
/*.rpc_servers =*/ nullptr,
|
||||
/*.progress_callback =*/ nullptr,
|
||||
/*.progress_callback_user_data =*/ nullptr,
|
||||
|
||||
Reference in New Issue
Block a user