server: exclude thinking tokens when finding the slot (#1079)

Refactor find slot; enable by default; fix load prompt; rename variables. Co-authored-by: firecoperana <firecoperana>
2026-04-28 10:21:48 +00:00 · 2025-12-22 02:46:45 -06:00
parent 21fc9322f9
commit 5562605076
8 changed files with 247 additions and 33 deletions
--- a/examples/server/server-common.h
+++ b/examples/server/server-common.h
@@ -171,6 +171,7 @@ std::string tokens_to_str(llama_context* ctx, const llama_tokens& tokens);
 // format incomplete utf-8 multibyte character for output
 std::string tokens_to_output_formatted_string(const llama_context* ctx, const llama_token token);

+
 struct common_prefix {
    size_t first = 0;
    size_t second = 0;
@@ -389,6 +390,7 @@ public:

    size_t get_common_prefix_exact(const server_tokens& b) const;

+    llama_tokens get_text_tokens_exclude_think(const llama_context* ctx, const thinking_tokens& think_token) const;

    common_prefix get_common_prefix(const llama_context* ctx, const server_tokens& b, bool exact = false) const;
    // take first n tokens of tokens list a