mirror of https://github.com/ikawrakow/ik_llama.cpp.git
spec : add self speculative decoding, ngram and refactor (#1261)
* spec : add self speculative decoding and ngram-mod and refactor

  common : use common_ prefix for common library function
  llama : use LLAMA_TOKEN_NULL
  spec : add self speculative decoding (no draft model required) + refactor
  spec : add ngram-mod
  spec : various improvements to ngram-map + docs
  spec : fix the check-rate logic of ngram-simple
  common : add common_speculative_is_compat()
  spec : simplify time measurement using common_time_meas
  refactor common_sampler_init
  refactor common_token_to_piece
  refactor and fix cur_p bug
  clean up

* spec : remove check rate

* spec: show warnings instead of abort

---------

Co-authored-by: firecoperana <firecoperana>
Co-authored-by: Sascha Rogmann <59577610+srogmann@users.noreply.github.com>
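For context, the sketch below outlines the general idea that lets speculative decoding work without a separate draft model: match the most recent n tokens against an earlier occurrence in the prompt plus generated text, propose the tokens that followed that occurrence as a draft, then verify the draft with the target model in a single batch and keep only the prefix it agrees with. This is an illustrative outline of the technique, not the ngram-mod/ngram-map code added by this commit; the helper name find_ngram_draft and its signature are hypothetical.

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    using llama_token = int32_t;

    // Hypothetical helper (not this commit's code): draft up to n_draft tokens by matching
    // the last n tokens of the sequence against an earlier occurrence and reusing the
    // tokens that followed it.
    static std::vector<llama_token> find_ngram_draft(const std::vector<llama_token> & tokens,
                                                     size_t n, size_t n_draft) {
        if (tokens.size() < n + 1) {
            return {};
        }
        const size_t tail = tokens.size() - n;  // start of the n-gram we try to match
        for (size_t i = tail; i-- > 0; ) {      // scan backwards: prefer the most recent match
            bool match = true;
            for (size_t j = 0; j < n; ++j) {
                if (tokens[i + j] != tokens[tail + j]) {
                    match = false;
                    break;
                }
            }
            if (match) {
                const size_t beg = i + n;  // continuation that followed the earlier occurrence
                const size_t end = std::min(beg + n_draft, tokens.size());
                return std::vector<llama_token>(tokens.begin() + beg, tokens.begin() + end);
            }
        }
        return {};  // no match: fall back to plain, non-speculative decoding for this step
    }

In a real loop the returned draft would be appended to the batch after the last accepted token, decoded once, and truncated at the first position where the target model's sampled token disagrees with the draft.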
@@ -352,7 +352,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
     fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
 
-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> tokens = ::common_tokenize(ctx, params.prompt, true);
 
     const int n_ctx = llama_n_ctx(ctx);
@@ -505,7 +505,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     auto tim1 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
 
-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> tokens = ::common_tokenize(ctx, params.prompt, true);
 
     auto tim2 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
@@ -770,6 +770,8 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
 }
 
 static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
     // Calculates hellaswag score (acc_norm) from prompt
     //
     // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
@@ -803,7 +805,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     size_t hs_task_count = prompt_lines.size()/6;
     fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
 
-    const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
+    const bool is_spm = llama_vocab_type(vocab) == LLAMA_VOCAB_TYPE_SPM;
     fprintf(stderr, "================================= is_spm = %d\n", is_spm);
 
     // The tasks should be randomized so the score stabilizes quickly.
@@ -850,7 +852,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
         for (size_t j = 0; j < 4; j++) {
             hs_cur.ending[j] = prompt_lines[idx*6+2+j];
-            hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true);
+            hs_cur.seq_tokens[j] = ::common_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true);
         }
 
         // determine the common prefix of the endings
@@ -1143,8 +1145,8 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
     fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);
 
     for (auto & task : data) {
-        task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true);
-        task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, true);
+        task.seq_tokens[0] = ::common_tokenize(ctx, task.first + task.choices[0] + task.second, true);
+        task.seq_tokens[1] = ::common_tokenize(ctx, task.first + task.choices[1] + task.second, true);
 
         task.common_prefix = 0;
         for (size_t k = 0; k < task.seq_tokens[0].size(); k++) {
@@ -1159,8 +1161,8 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
             task.seq_tokens[0].size() - task.common_prefix +
             task.seq_tokens[1].size() - task.common_prefix;
 
-        task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], true).size();
-        task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size();
+        task.n_base1 = ::common_tokenize(ctx, task.first + task.choices[0], true).size();
+        task.n_base2 = ::common_tokenize(ctx, task.first + task.choices[1], true).size();
     }
 
     fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
@@ -1366,7 +1368,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
             }
             return false;
         }
-        task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, true));
+        task.seq_tokens.emplace_back(::common_tokenize(ctx, task.question + " " + answer, true));
     }
     auto min_len = task.seq_tokens.front().size();
     for (auto& seq : task.seq_tokens) {
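A note on the change repeated throughout this diff: every call site switches from the ::llama_tokenize wrapper to the common_-prefixed helper, per the "common : use common_ prefix for common library function" item in the commit message. As the hunks above show, only the name changes; callers keep passing the context, the text, and the add-special flag:

    // call shape taken from the hunks above; only the wrapper's name changed
    std::vector<llama_token> tokens = ::common_tokenize(ctx, params.prompt, true);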