Port universal assisted decoding to llama-server (#699)

* port universal assisted decoding to server

* fix calls

* fix LOG_INFO

* fix llama_detokenize call

* use emplace_back
Author: g2mt
Date:   2025-08-17 23:22:23 -07:00 (committed by GitHub)
Parent: 6b2c84b099
Commit: 06bed7e01b

5 changed files with 160 additions and 55 deletions


@@ -910,7 +910,7 @@ struct server_context {
             chat_templates = llama_chat_templates_from_model(model, params.chat_template);
         }
         GGML_ASSERT(chat_templates.template_default.get() != nullptr);
+        // Load draft model for speculative decoding if specified
         if (!params.model_draft.empty()) {
             LOG_INFO("loading draft model", {{"model", params.model_draft}});
@@ -933,8 +933,7 @@ struct server_context {
         }

         if (!llama_speculative_are_compatible(ctx, llama_init_dft.context)) {
-            LOG_ERROR("the draft model is not compatible with the target model", {});
-            return false;
+            LOG_INFO("the draft model is not compatible with the target model. tokens will be translated between the draft and target models.", {});
         }

         const int n_ctx_dft = llama_n_ctx(llama_init_dft.context);
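This hunk is the heart of the port: an incompatible draft vocabulary is no longer fatal, because universal assisted decoding translates candidate tokens between the two models by round-tripping through text; the compatibility check now only logs instead of rejecting the pair. A minimal sketch of the idea, assuming the common-library llama_tokenize/llama_detokenize helpers (the real translation lives inside the llama_speculative_* implementation):

    // Sketch only: convert a draft-model token sequence into target-model
    // tokens by detokenizing with the draft vocabulary and re-tokenizing
    // with the target vocabulary.
    static std::vector<llama_token> translate_dft_to_tgt(
            llama_context * ctx_tgt,
            llama_context * ctx_dft,
            const std::vector<llama_token> & tokens_dft) {
        // decode with the *draft* model's vocabulary ...
        const std::string text = llama_detokenize(ctx_dft, tokens_dft);
        // ... and re-encode with the *target* model's vocabulary
        return llama_tokenize(ctx_tgt, text, /*add_special=*/false, /*parse_special=*/true);
    }

Note that the round trip through text is lossy at token boundaries, which is why the pairing is merely logged rather than validated exactly.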
@@ -1029,11 +1028,15 @@ struct server_context {
                 return;
             }

-            slot.spec = llama_speculative_init(slot.ctx_dft);
+            slot.spec = llama_speculative_init(ctx, slot.ctx_dft);
             if (slot.spec == nullptr) {
                 LOG_ERROR("failed to create speculator", {});
                 return;
             }
+
+            for (auto & pair : params.replacements_draft) {
+                llama_speculative_add_replacement_tgt_dft(slot.spec, pair.first.c_str(), pair.second.c_str());
+            }
         }

         slot.reset();
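The new loop registers user-supplied replacements for tokens whose surface forms differ between the two vocabularies. A hypothetical example of how such pairs could be populated and handed to the per-slot speculator; the token strings below are invented, and params.replacements_draft is assumed to be a vector of (target, draft) string pairs parsed from the command line:

    // Hypothetical illustration of populating and registering replacement
    // pairs; the token strings here are made-up examples.
    std::vector<std::pair<std::string, std::string>> replacements_draft;
    replacements_draft.emplace_back("<|im_start|>", "<s>");   // target -> draft
    replacements_draft.emplace_back("<|im_end|>",   "</s>");

    for (const auto & pair : replacements_draft) {
        llama_speculative_add_replacement_tgt_dft(slot.spec, pair.first.c_str(), pair.second.c_str());
    }

Using emplace_back here, per the last commit-message bullet, constructs each pair in place rather than copying a temporary.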