Port universal assisted decoding to llama-server (#699)

* port universal assisted decoding to server

* fix calls

* fix LOG_INFO

* fix llama_detokenize call

* use emplace_back
Author: g2mt
Date: 2025-08-17 23:22:23 -07:00
Committed by: GitHub
parent 6b2c84b099
commit 06bed7e01b
5 changed files with 160 additions and 55 deletions


@@ -13,10 +13,17 @@ struct llama_speculative_params {
     float p_min = 0.75f; // min probability required to accept a token in the draft
 };
 
-struct llama_speculative * llama_speculative_init(struct llama_context * ctx_dft);
+struct llama_speculative * llama_speculative_init(
+        struct llama_context * ctx_tgt,
+        struct llama_context * ctx_dft
+);
 
 void llama_speculative_free(struct llama_speculative * spec);
 
+void llama_speculative_add_replacement_tgt_dft(
+        struct llama_speculative * spec,
+        const char *source, const char *dest);
+
 bool llama_speculative_are_compatible(
         const struct llama_context * ctx_tgt,
         const struct llama_context * ctx_dft);
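
For orientation, here is a minimal usage sketch of the extended API declared above, as it might be wired up in the server. It is an illustration only: the include path, the helper name make_speculative(), and the replacement strings are assumptions, and the two llama_context handles are presumed to have been created elsewhere when the target and draft models are loaded.

// Illustrative sketch, not the server's actual code: exercises only the
// declarations shown in the diff above; names and strings here are placeholders.
#include "speculative.h"

static struct llama_speculative * make_speculative(
        struct llama_context * ctx_tgt,
        struct llama_context * ctx_dft) {
    // the target context is now passed alongside the draft context, so draft
    // output can be mapped back into the target model's vocabulary
    struct llama_speculative * spec = llama_speculative_init(ctx_tgt, ctx_dft);

    // optionally register a text replacement applied when translating between
    // the target and draft vocabularies (placeholder strings)
    llama_speculative_add_replacement_tgt_dft(spec, "<|im_start|>", "<s>");

    // the compatibility check remains available, e.g. for logging whether the
    // two models share a compatible vocabulary
    if (!llama_speculative_are_compatible(ctx_tgt, ctx_dft)) {
        // vocabularies differ: the case universal assisted decoding is meant to handle
    }

    return spec;
}

When the corresponding slot or the server shuts down, the object would be released with llama_speculative_free(spec).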