server: exclude thinking tokens when finding the slot (#1079)

Refactor find slot; enable by default; fix load prompt; rename variables. Co-authored-by: firecoperana <firecoperana>
2026-01-26 17:20:01 +00:00 · 2025-12-22 02:46:45 -06:00
parent ecabd6acf7
commit 2a633c4357
8 changed files with 247 additions and 33 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -253,6 +253,30 @@ common_webui common_webui_from_name(const std::string& format) {
    }
 }

+// Parse the value of --reasoning-tokens.
+//   "none" / "None" : do not exclude anything
+//   "auto" / "Auto" : exclude everything between <think> and </think>
+//   "BEGIN,END"     : exclude everything between the two user supplied markers
+// Any other value returns a default-constructed thinking_tokens.
+thinking_tokens thinking_tokens_from_string(const std::string& format) {
+    thinking_tokens think_token;
+    const std::string token_string = string_strip(format);
+    if (token_string == "none" || token_string == "None") {
+        think_token.exclude = false;
+        return think_token;
+    }
+    if (token_string == "auto" || token_string == "Auto") {
+        think_token.exclude = true;
+        think_token.begin = "<think>";
+        think_token.end = "</think>";
+        return think_token;
+    }
+    // Use user provided think tokens. Split the stripped value (not the raw
+    // argument) and trim each part so that input such as " [THINK], [/THINK] "
+    // does not leave whitespace inside a marker.
+    const auto start_end = string_split(token_string, ",");
+    if (start_end.size() == 2) {
+        const std::string begin_marker = string_strip(start_end[0]);
+        const std::string end_marker = string_strip(start_end[1]);
+        // An empty marker cannot delimit a span, so such input is ignored and
+        // the defaults stay in place.
+        if (!begin_marker.empty() && !end_marker.empty()) {
+            think_token.exclude = true;
+            think_token.begin = begin_marker;
+            think_token.end = end_marker;
+        }
+    }
+    return think_token;
+}
+
+
 static std::string read_file(const std::string& fname) {
    std::ifstream file(fname);
    if (!file) {
@@ -1745,6 +1769,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        }
        return true;
    }
+    if (arg == "--reasoning-tokens") {
+        CHECK_ARG
+        params.think_tokens = thinking_tokens_from_string(std::string(argv[i]));
+        return true;
+    }
    if (arg == "--reasoning-budget") {
        CHECK_ARG
        params.reasoning_budget = std::stoi(argv[i]);
@@ -2160,6 +2189,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
    options.push_back({ "main",        "       --cfg-negative-prompt-file FNAME",
                                                                        "negative prompt file to use for guidance" });
    options.push_back({ "main",        "       --cfg-scale N",          "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale });
+    options.push_back({ "template" });
    options.push_back({ "main",        "       --jinja",
                                                                        "set custom jinja chat template (default: template taken from model's metadata)\n"
                                                                        "if suffix/prefix are specified, template will be disabled\n"
@@ -2176,7 +2206,15 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
                        "- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`\n"
                        "(default: none)", });
    options.push_back({ "main",      "       --chat-template-kwargs JSON",  "sets additional params for the json template parser"});
-    options.push_back({ "main",      "       --reasoning-budget N",  "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)" });
+    // Adjacent string literals are concatenated verbatim, so each line break in the
+    // description has to be written out explicitly.
+    options.push_back({ "main",      "       --reasoning-budget N",  "controls the amount of thinking allowed.\n"
+                                                                     "currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking\n"
+                                                                     "(default: -1)" });
+    options.push_back({ "main",      "       --reasoning-tokens FORMAT",     "exclude reasoning tokens to select the slot more accurately.\n"
+						                                                                                            "none: include all tokens\n"
+                                                                                                                    "auto: exclude all tokens between <think> and </think>\n"
+						                                                                                            "Or comma separated start and end tokens such as [THINK],[/THINK]\n"
+						                                                                                            "(default: auto)" });
+
    options.push_back({ "main",      "       --no-prefill-assistant",  "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
            "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n" });
    options.push_back({ "grammar" });
--- a/common/common.h
+++ b/common/common.h
@@ -119,6 +119,15 @@ enum common_webui {

 common_webui common_webui_from_name(const std::string& format);

+// Markers that delimit a reasoning span, together with a switch telling whether
+// that span is excluded. The defaults are the same values the "auto" setting of
+// --reasoning-tokens selects.
+struct thinking_tokens {
+    bool        exclude{true};       // false: nothing is excluded
+    std::string begin{"<think>"};    // marker opening the reasoning span
+    std::string end{"</think>"};     // marker closing the reasoning span
+};
+
+// Build a thinking_tokens from the --reasoning-tokens argument: "none" turns
+// exclusion off, "auto" selects <think> / </think>, and "BEGIN,END" selects
+// custom markers. Any other value yields a default-constructed thinking_tokens.
+thinking_tokens thinking_tokens_from_string(const std::string& format);
+
+
 struct model_paths {
    std::string path        = ""; // model local path                                       // NOLINT
    std::string url         = ""; // model url to download                                  // NOLINT
@@ -314,6 +323,7 @@ struct gpt_params {
    std::string system_prompt = "";
    bool enable_chat_template = true;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    thinking_tokens think_tokens;
    int reasoning_budget = -1;
    bool prefill_assistant = true;