Port universal assisted decoding to llama-server (#699)

* port universal assisted decoding to server

* fix calls

* fix LOG_INFO

* fix llama_detokenize call

* use emplace_back
Author: g2mt
Date:   2025-08-17 23:22:23 -07:00 (committed by GitHub)
Parent: 6b2c84b099
Commit: 06bed7e01b

5 changed files with 160 additions and 55 deletions


@@ -910,7 +910,7 @@ struct server_context {
             chat_templates = llama_chat_templates_from_model(model, params.chat_template);
         }
         GGML_ASSERT(chat_templates.template_default.get() != nullptr);
+        // Load draft model for speculative decoding if specified
         if (!params.model_draft.empty()) {
             LOG_INFO("loading draft model", {{"model", params.model_draft}});
@@ -933,8 +933,7 @@ struct server_context {
         }

         if (!llama_speculative_are_compatible(ctx, llama_init_dft.context)) {
-            LOG_ERROR("the draft model is not compatible with the target model", {});
-            return false;
+            LOG_INFO("the draft model is not compatible with the target model. tokens will be translated between the draft and target models.", {});
         }

         const int n_ctx_dft = llama_n_ctx(llama_init_dft.context);
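This hunk is the heart of the port: an incompatible draft vocabulary is no longer fatal, because universal assisted decoding translates candidate tokens between the two models by round-tripping through text; the compatibility check now only logs instead of rejecting the pair. A minimal sketch of the idea, assuming the common-library llama_tokenize/llama_detokenize helpers (the real translation lives inside the llama_speculative_* implementation):

    // Sketch only: convert a draft-model token sequence into target-model
    // tokens by detokenizing with the draft vocabulary and re-tokenizing
    // with the target vocabulary.
    static std::vector<llama_token> translate_dft_to_tgt(
            llama_context * ctx_tgt,
            llama_context * ctx_dft,
            const std::vector<llama_token> & tokens_dft) {
        // decode with the *draft* model's vocabulary ...
        const std::string text = llama_detokenize(ctx_dft, tokens_dft);
        // ... and re-encode with the *target* model's vocabulary
        return llama_tokenize(ctx_tgt, text, /*add_special=*/false, /*parse_special=*/true);
    }

Note that the round trip through text is lossy at token boundaries, which is why the pairing is merely logged rather than validated exactly.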
@@ -1029,11 +1028,15 @@ struct server_context {
                 return;
             }

-            slot.spec = llama_speculative_init(slot.ctx_dft);
+            slot.spec = llama_speculative_init(ctx, slot.ctx_dft);
             if (slot.spec == nullptr) {
                 LOG_ERROR("failed to create speculator", {});
                 return;
             }
+
+            for (auto & pair : params.replacements_draft) {
+                llama_speculative_add_replacement_tgt_dft(slot.spec, pair.first.c_str(), pair.second.c_str());
+            }
         }

         slot.reset();
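The new loop registers user-supplied replacements for tokens whose surface forms differ between the two vocabularies. A hypothetical example of how such pairs could be populated and handed to the per-slot speculator; the token strings below are invented, and params.replacements_draft is assumed to be a vector of (target, draft) string pairs parsed from the command line:

    // Hypothetical illustration of populating and registering replacement
    // pairs; the token strings here are made-up examples.
    std::vector<std::pair<std::string, std::string>> replacements_draft;
    replacements_draft.emplace_back("<|im_start|>", "<s>");   // target -> draft
    replacements_draft.emplace_back("<|im_end|>",   "</s>");

    for (const auto & pair : replacements_draft) {
        llama_speculative_add_replacement_tgt_dft(slot.spec, pair.first.c_str(), pair.second.c_str());
    }

Using emplace_back here, per the last commit-message bullet, constructs each pair in place rather than copying a temporary.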