Port speculative decoding from upstream to llama-server (#645)

* server : integrate speculative decoding (see the control-flow sketch after this list)

* server: Fix field names

* server: fix include, whitespace

* fix compile errors in speculative.cpp

* add llama_sampling_sample_and_accept_n to sampling

* finish porting speculative decoding in server

* port functions from common/speculative, common/sampling

* remove arg

* fix function names

* init params_dft to none

* correct value for n_ctx

* prefix kv cache tensors with model name to avoid conflict

* fix call arguments

* fix spec decoding args

* correct slot.id

* use n_max

* port the rest of sampling funcs

* fix func arguments

* slot.id starts at 1?

* Revert "prefix kv cache tensors with model name to avoid conflict"

This reverts commit fbd5dfd866.

* disable draft logging

* disable logging in speculative.cpp

In mainline these would be LOG_DEBUG, but since ik_llama doesn't support
it, logging is disabled entirely

* add more draft model parameters

* fix

* pass flash_attn

* add speculative params for parity

* set speculative params in launch_slot_with_task instead
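
The one-line commits above do not show the overall control flow, so here is a minimal, self-contained sketch of the greedy draft-then-verify loop the port implements: the draft model proposes up to n_draft tokens, the target model re-generates the same positions, and only the prefix both models agree on is kept (the real code routes this through the ported sampling helpers such as llama_sampling_sample_and_accept_n). The two stand-in lambdas below are toy next-token functions, not ik_llama API calls; names and constants are illustrative only.

```cpp
// Toy demonstration of greedy speculative decoding (draft-then-verify).
// The "models" are stand-in lambdas, not llama.cpp or ik_llama calls.
#include <cstdio>
#include <vector>

using token = int;

int main() {
    const int n_draft   = 16;  // max tokens drafted per step (cf. n_draft in gpt_params)
    const int n_predict = 32;  // total sequence length we want

    // Stand-in next-token functions: the draft model agrees with the target most
    // of the time, but is wrong at every 4th position, so some drafts get rejected.
    auto target_next = [](const std::vector<token> & ctx) {
        return (token) ((ctx.back() * 7 + 3) % 13);
    };
    auto draft_next = [&](const std::vector<token> & ctx) {
        token t = target_next(ctx);
        return ctx.size() % 4 == 0 ? (token) ((t + 1) % 13) : t;
    };

    std::vector<token> ctx = { 1 };  // current sequence

    while ((int) ctx.size() < n_predict) {
        // 1) draft phase: the cheap model proposes up to n_draft tokens
        std::vector<token> draft;
        std::vector<token> tmp = ctx;
        for (int i = 0; i < n_draft && (int) tmp.size() < n_predict; ++i) {
            token t = draft_next(tmp);
            draft.push_back(t);
            tmp.push_back(t);
        }

        // 2) verify phase: the target model re-generates the same positions; the
        //    drafted prefix is accepted as long as the two models agree (greedy).
        //    In the real server this is one batched decode, not a per-token loop.
        int n_accepted = 0;
        for (token t_draft : draft) {
            token t_target = target_next(ctx);
            ctx.push_back(t_target);  // the target's token is always what gets kept
            if (t_target != t_draft) {
                break;                // first mismatch ends the accepted prefix
            }
            n_accepted++;
        }
        printf("accepted %d/%zu drafted tokens, sequence length now %zu\n",
               n_accepted, draft.size(), ctx.size());
    }
    return 0;
}
```

The speedup comes from the verify phase being a single batched decode on the target model: when the draft is accepted, one target pass yields several new tokens instead of one.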

Authored by g2mt on 2025-08-15 21:26:44 -07:00, committed by GitHub
parent 2e2abddaa8, commit b6bc5eedad
8 changed files with 655 additions and 41 deletions


@@ -83,10 +83,13 @@ struct gpt_params {
     int32_t n_threads_batch_draft = -1;
     int32_t n_predict             = -1;    // new tokens to predict
     int32_t n_ctx                 = 0;     // context size
+    int32_t n_ctx_draft           = 0;     // context size for draft model
     int32_t n_batch               = 2048;  // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch              = 512;   // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep                = 0;     // number of tokens to keep from initial prompt
-    int32_t n_draft               = 5;     // number of tokens to draft during speculative decoding
+    int32_t n_draft               = 16;    // number of tokens to draft during speculative decoding
+    int32_t n_draft_min           = 1;     // minimum number of tokens to draft during speculative decoding
+    float   p_draft_min           = 0.8f;  // minimum speculative decoding probability (greedy)
     int32_t n_chunks              = -1;    // max number of chunks to process (-1 = unlimited)
     int32_t n_parallel            = 1;     // number of parallel sequences to decode
     int32_t n_sequences           = 1;     // number of sequences to decode
@@ -207,6 +210,8 @@ struct gpt_params {
     std::string cache_type_k = "f16"; // KV cache data type for the K
     std::string cache_type_v = "f16"; // KV cache data type for the V
+    std::string cache_type_k_draft = ""; // KV cache data type for K for the draft model
+    std::string cache_type_v_draft = ""; // KV cache data type for V for the draft model
 
     // multimodal models (see examples/llava)
     std::string mmproj = ""; // path to multimodal projector
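
The empty-string and zero defaults on the draft-specific fields suggest the usual fall-back convention: when a draft value is not set, reuse the main model's setting (consistent with the "correct value for n_ctx" and "init params_dft to none" commits above, though the exact rules live in the server code, not in these hunks). The snippet below is a self-contained illustration of that reading; the struct and helper are invented for the example and are not the fork's actual code.

```cpp
// Reduced illustration of the new draft-model fields. This is NOT the real
// gpt_params struct; the fallback rules below are an assumption based on the
// "" and 0 defaults in the diff above.
#include <cstdint>
#include <cstdio>
#include <string>

struct draft_params_demo {
    int32_t     n_ctx              = 8192;
    int32_t     n_ctx_draft        = 0;     // 0  -> assume fallback to n_ctx
    int32_t     n_draft            = 16;    // max tokens drafted per step
    int32_t     n_draft_min        = 1;     // smallest draft worth attempting
    float       p_draft_min        = 0.8f;  // confidence floor for greedy drafting
    std::string cache_type_k       = "f16";
    std::string cache_type_v       = "f16";
    std::string cache_type_k_draft = "";    // "" -> assume fallback to cache_type_k
    std::string cache_type_v_draft = "";    // "" -> assume fallback to cache_type_v
};

// Effective settings the draft context would be created with, applying the fallbacks.
static void print_effective_draft_settings(const draft_params_demo & p) {
    const int32_t     n_ctx_dft = p.n_ctx_draft > 0 ? p.n_ctx_draft : p.n_ctx;
    const std::string k_dft     = !p.cache_type_k_draft.empty() ? p.cache_type_k_draft : p.cache_type_k;
    const std::string v_dft     = !p.cache_type_v_draft.empty() ? p.cache_type_v_draft : p.cache_type_v;

    printf("draft n_ctx = %d, K cache = %s, V cache = %s\n", n_ctx_dft, k_dft.c_str(), v_dft.c_str());
    printf("draft %d..%d tokens per step, confidence floor %.2f\n",
           p.n_draft_min, p.n_draft, p.p_draft_min);
}

int main() {
    draft_params_demo p;
    p.cache_type_k_draft = "q8_0";  // e.g. quantize only the draft model's K cache
    print_effective_draft_settings(p);
    return 0;
}
```

p_draft_min reads as a confidence floor for greedy drafting: once the draft model's top-token probability drops below it, stopping the draft early is usually cheaper than asking the target model to verify tokens that are likely to be rejected, and n_draft_min similarly keeps the loop from paying speculative overhead on very short drafts.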