server: improve speed of speculative decoding (#1119)

* server: improve speed of speculative decoding

Change log:

rpc: add recompute

speculative decoding fix

* Fix n_batch_size not set to context size for draft model

---------

Co-authored-by: firecoperana <firecoperana>
This commit is contained in:
firecoperana
2026-01-10 00:01:22 -06:00
committed by GitHub
parent 6695c6c945
commit c1931663ad
7 changed files with 164 additions and 135 deletions

View File

@@ -98,6 +98,11 @@ struct server_slot {
std::string generated_text;
// idx of draft tokens in the main batch
// non-empty if we went to evaluate draft tokens
// ref: https://github.com/ggml-org/llama.cpp/pull/17808
std::vector<int32_t> i_batch_dft;
std::vector<completion_token_output> generated_token_probs;
common_chat_msg chat_msg;
@@ -122,7 +127,9 @@ struct server_slot {
void prompt_load(server_prompt_cache& prompt_cache, const server_tokens& tokens);
// sampling
llama_token sampled;
llama_token sampled; // in speculative mode, this is the last accepted token
llama_tokens drafted;
struct llama_sampling_params sparams;
llama_sampling_context* ctx_sampling = nullptr;
json json_schema;
@@ -168,6 +175,8 @@ struct server_slot {
void add_token_string(const completion_token_output& token);
int get_n_draft_max() const;
void release();
json get_formated_timings() const;