GPU offload policy (#405)

* Adding GPU offload policy

* Minor

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
Kawrakow
2025-05-12 07:47:46 +03:00
committed by GitHub
parent 504fb890d9
commit 8669c3db2b
7 changed files with 77 additions and 2 deletions

View File

@@ -143,6 +143,7 @@ struct gpt_params {
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
std::vector<llama_model_kv_override> kv_overrides;
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
std::vector<std::pair<int,int>> offload_policy;
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale