Tool calls support from mainline (#723)

* Tool calls support from mainline

* update cmake

* revert api for /completions

* Fix broken thinking process for gpt-oss

* add missing args and fix webui bugs

* add missing args and fix webui bugs (part 2)

* Fix reasoning format error

* add usage

* change default post_sampling_probs to true

* add back generated_text

* Remove server endpoints tests

* add log

* Chat fixes

* Remove logs

* webui: revert extra handling of thinking process

---------

Co-authored-by: firecoperana <firecoperana>
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Author: firecoperana
Date: 2025-09-01 00:38:49 -05:00
Committed-by: GitHub
Parent: 8de297b795
Commit: d7882c3cf8

87 files changed, 13581 insertions(+), 2224 deletions(-)


@@ -15,14 +15,18 @@
#define LOG_NO_FILE_LINE_FUNCTION
#include "log.h"
#include <set>
#include <cmath>
#include <string>
#include <sstream>
#include <string_view>
#include <vector>
#include <random>
#include <thread>
#include <unordered_map>
#include <tuple>
#include <map>
#include <sstream>
#ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\'
@@ -74,6 +78,14 @@ enum dimre_method {
DIMRE_METHOD_MEAN,
};
// reasoning API response format (not to be confused with the chat template's reasoning format)
enum common_reasoning_format {
COMMON_REASONING_FORMAT_NONE,
COMMON_REASONING_FORMAT_AUTO,
COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
};
struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
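
The new common_reasoning_format enum controls whether the server peels thinking-tag contents out of the model output and returns them as `message.reasoning_content`, or leaves them inline. A minimal sketch of the extraction this implies; the tag name and helper are illustrative, not this commit's actual parser:

#include <string>

struct parsed_msg {
    std::string content;
    std::string reasoning_content;
};

// Illustrative only: split "<think>...</think>" out of a response when the
// format asks for it; otherwise pass the text through untouched.
static parsed_msg extract_reasoning(const std::string & text, common_reasoning_format fmt) {
    parsed_msg out;
    const std::string open = "<think>", close = "</think>";
    const size_t b = text.find(open);
    const size_t e = (b == std::string::npos) ? b : text.find(close, b);
    if (fmt == COMMON_REASONING_FORMAT_NONE || b == std::string::npos || e == std::string::npos) {
        out.content = text; // nothing to extract
        return out;
    }
    out.reasoning_content = text.substr(b + open.size(), e - (b + open.size()));
    out.content = text.substr(0, b) + text.substr(e + close.size());
    return out;
}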
@@ -240,13 +252,21 @@ struct gpt_params {
bool use_jinja = false; // NOLINT
std::string system_prompt = "";
bool enable_chat_template = true;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
int reasoning_budget = -1;
bool prefill_assistant = true;
std::vector<std::string> api_keys;
std::string ssl_file_key = "";
std::string ssl_file_cert = "";
bool endpoint_slots = true;
std::map<std::string, std::string> default_template_kwargs;
// "advanced" endpoints are disabled by default for better security
bool webui = true;
bool endpoint_slots = false;
bool endpoint_props = false; // only controls POST requests, not GET
bool endpoint_metrics = false;
bool log_json = false;
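
The "advanced" endpoint defaults above reduce the attack surface: /slots, POST /props, and /metrics now have to be switched on explicitly. A hedged sketch of the gate this implies in the request handlers; the function and its wiring are hypothetical, only the flag semantics come from the fields above:

// Hypothetical gate: refuse advanced endpoints unless explicitly enabled.
static bool allow_endpoint(const gpt_params & params, const std::string & path, bool is_post) {
    if (path == "/slots")   return params.endpoint_slots;
    if (path == "/props")   return !is_post || params.endpoint_props; // GET stays open
    if (path == "/metrics") return params.endpoint_metrics;
    return true; // all other routes are unaffected by these flags
}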
@@ -314,19 +334,24 @@ std::string gpt_params_get_system_info(const gpt_params & params);
//
// String utils
//
std::vector<std::string> string_split(std::string input, char separator);
std::string string_join(const std::vector<std::string> & strs, const std::string & delimiter);
std::string string_join(const std::vector<std::string>& values, const std::string& separator);
std::string string_strip(const std::string & str);
std::string string_get_sortable_timestamp();
void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
static bool string_starts_with(const std::string& str,
const std::string& prefix) { // While we wait for C++20's std::string::starts_with...
return str.rfind(prefix, 0) == 0;
}
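// Editorial note, not part of this diff: rfind(prefix, 0) pins the search
// start to index 0, so a reverse find can only match at the very beginning of
// the string -- a pre-C++20 prefix test rather than a backwards scan. E.g.:
//   std::string s = "llama.cpp";
//   s.rfind("llama", 0) == 0;                 // prefix: matches at index 0
//   s.rfind("cpp",   0) == std::string::npos; // present, but not a prefix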
// Additional string utilities for builder pattern compatibility
bool string_starts_with(const std::string & str, const std::string & prefix);
bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);
std::vector<std::string> string_split(const std::string& str, const std::string& delimiter);
std::vector<std::string> string_split(const std::string& str, char delim);
void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
// While we wait for C++20's std::string::ends_with...
bool string_ends_with(const std::string_view& str, const std::string_view& suffix);
size_t string_find_partial_stop(const std::string_view& str, const std::string_view& stop);
std::string regex_escape(const std::string& s);
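// Editorial sketch of the assumed semantics of string_find_partial_stop, not
// this commit's implementation: with streaming output, a stop sequence can be
// split across two deltas, so the server must hold back any suffix of the
// text that is a prefix of a stop string (needs <algorithm> and <string_view>):
//
//   static size_t find_partial_stop_sketch(std::string_view str, std::string_view stop) {
//       for (size_t len = std::min(str.size(), stop.size()); len > 0; len--) {
//           // does the text end with the first `len` characters of `stop`?
//           if (str.substr(str.size() - len) == stop.substr(0, len)) {
//               return str.size() - len; // hold back the tail from here
//           }
//       }
//       return std::string::npos; // safe to flush the whole buffer
//   }
//
//   find_partial_stop_sketch("Hello, wor", "world") == 7: buffer "wor" until
//   the next token either completes "world" or rules it out.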
template<class T>
static std::vector<T> string_split(const std::string & str, char delim) {
@@ -342,6 +367,22 @@ static std::vector<T> string_split(const std::string & str, char delim) {
return values;
}
template<>
std::vector<std::string> string_split<std::string>(const std::string& input, char separator)
{
std::vector<std::string> parts;
size_t begin_pos = 0;
size_t separator_pos = input.find(separator);
while (separator_pos != std::string::npos) {
std::string part = input.substr(begin_pos, separator_pos - begin_pos);
parts.emplace_back(part);
begin_pos = separator_pos + 1;
separator_pos = input.find(separator, begin_pos);
}
parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
return parts;
}
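// Editorial usage note, not part of this diff: the std::string specialization
// above copies substrings verbatim and preserves empty fields, which the
// stream-based generic template above it would not. For example:
//   string_split<std::string>("a,,b", ',') == {"a", "", "b"}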
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);
@@ -432,52 +473,59 @@ bool llama_should_add_bos_token(const llama_model * model);
//
// Chat template utils
//
//struct common_tool_call {
// std::string name;
// std::string arguments;
// std::string id;
//};
//
//// same as llama_chat_message, but uses std::string
//struct common_chat_msg {
// std::string role;
// std::string content;
// std::vector<common_tool_call> tool_calls;
// std::string reasoning_content = "";
//};
// same as llama_chat_message, but uses std::string
struct llama_chat_msg {
std::string role;
std::string content;
};
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
bool llama_chat_verify_template(const struct llama_model * model, const std::string & tmpl, bool use_jinja);
namespace minja {
class chat_template;
}
typedef minja::chat_template common_chat_template;
struct common_chat_templates {
bool has_explicit_template; // Model had builtin template or template override was specified.
std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
std::unique_ptr<common_chat_template> template_tool_use;
};
// CPP wrapper for llama_chat_apply_template
// If the built-in template is not supported, we default to chatml
// If the custom "tmpl" is not supported, we throw an error
std::string llama_chat_apply_template(
const struct llama_model* model,
const common_chat_template& tmpl,
const std::vector< llama_chat_msg>& chat,
bool add_ass,
bool use_jinja);
// Format a single message, taking into account its position in the chat history
std::string llama_chat_format_single(const struct llama_model* model,
const common_chat_template& tmpl,
const std::vector< llama_chat_msg>& past_msg,
const llama_chat_msg& new_msg,
bool add_ass,
bool use_jinja);
// Returns an example of formatted chat
std::string llama_chat_format_example(const struct llama_model* model,
const common_chat_template& tmpl, bool use_jinja);
common_chat_templates llama_chat_templates_from_model(const struct llama_model* model, const std::string& chat_template_override);
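// Editorial usage sketch, not part of this diff (assumes `model` points to a
// loaded llama_model):
//   std::vector<llama_chat_msg> chat = {
//       { "system", "You are a helpful assistant." },
//       { "user",   "Hello!" },
//   };
//   common_chat_templates tmpls = llama_chat_templates_from_model(model, "");
//   std::string prompt = llama_chat_apply_template(
//       model, *tmpls.template_default, chat, /*add_ass=*/true, /*use_jinja=*/false);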
//// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
//bool llama_chat_verify_template(const struct llama_model* , const std::string& tmpl, bool use_jinja);
//
//namespace minja {
// class chat_template;
//}
//
//typedef minja::chat_template common_chat_template;
//
//struct common_chat_templates {
// bool has_explicit_template; // Model had builtin template or template override was specified.
// std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
// std::unique_ptr<common_chat_template> template_tool_use;
//};
//
//
//// CPP wrapper for llama_chat_apply_template
//// If the built-in template is not supported, we default to chatml
//// If the custom "tmpl" is not supported, we throw an error
//std::string llama_chat_apply_template(
// const struct llama_model* model,
// const common_chat_template& tmpl,
// const std::vector< common_chat_msg>& chat,
// bool add_ass,
// bool use_jinja);
//
//// Format single message, while taking into account the position of that message in chat history
//std::string llama_chat_format_single(const struct llama_model* model,
// const common_chat_template& tmpl,
// const std::vector< common_chat_msg>& past_msg,
// const common_chat_msg& new_msg,
// bool add_ass,
// bool use_jinja);
//
//// Returns an example of formatted chat
//std::string llama_chat_format_example(const struct llama_model* model,
// const common_chat_template& tmpl, bool use_jinja);
//
//common_chat_templates llama_chat_templates_from_model(const struct llama_model* model, const std::string& chat_template_override);
//