server: improve speed of speculative decoding (#1119)

* server: improve speed of speculative decoding

Change log:

rpc: add recompute

speculative decoding fix

* Fix n_batch_size not set to context size for draft model

---------

Co-authored-by: firecoperana <firecoperana>
This commit is contained in:
firecoperana
2026-01-10 00:01:22 -06:00
committed by GitHub
parent 6695c6c945
commit c1931663ad
7 changed files with 164 additions and 135 deletions

View File

@@ -98,6 +98,11 @@ struct server_slot {
std::string generated_text;
// idx of draft tokens in the main batch
// non-empty if we went to evaluate draft tokens
// ref: https://github.com/ggml-org/llama.cpp/pull/17808
std::vector<int32_t> i_batch_dft;
std::vector<completion_token_output> generated_token_probs;
common_chat_msg chat_msg;
@@ -122,7 +127,9 @@ struct server_slot {
void prompt_load(server_prompt_cache& prompt_cache, const server_tokens& tokens);
// sampling
llama_token sampled;
llama_token sampled; // in speculative mode, this is the last accepted token
llama_tokens drafted;
struct llama_sampling_params sparams;
llama_sampling_context* ctx_sampling = nullptr;
json json_schema;
@@ -168,6 +175,8 @@ struct server_slot {
void add_token_string(const completion_token_output& token);
int get_n_draft_max() const;
void release();
json get_formated_timings() const;