Port universal assisted decoding to llama-server (#699)

* port universal assisted decoding to server

* fix calls

* fix LOG_INFO

* fix llama_detokenize call

* use emplace_back
This commit is contained in:
g2mt
2025-08-17 23:22:23 -07:00
committed by GitHub
parent 6b2c84b099
commit 06bed7e01b
5 changed files with 160 additions and 55 deletions

View File

@@ -148,6 +148,8 @@ struct gpt_params {
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
std::vector<std::pair<int,int>> offload_policy;
std::vector<std::pair<std::string, std::string>> replacements_draft; // main to speculative model replacements
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale