Tool calls support from mainline (#723)

* Tool calls support from mainline

* update cmake

* revert api for /completions

* Fix broken thinking process for gpt-oss

* add missing args and fix webui bugs

* add missing args and fix webui bugs2

* Fix reasoning format error

* add usage

* change default post_sampling_probs to true

* add back generated_text

* Remove server endpoints tests

* add log

* Chat fixes

* Remove logs

* webui: revert extra handling of thinking process

---------

Co-authored-by: firecoperana <firecoperana>
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Author: firecoperana
Date: 2025-09-01 00:38:49 -05:00
Committed by: GitHub
Parent: 8de297b795
Commit: d7882c3cf8
87 changed files with 13581 additions and 2224 deletions


@@ -1,37 +1,16 @@
// Chat support with builder pattern for llama.cpp compatibility
// Chat support (incl. tool call grammar constraining & output parsing) w/ generic & custom template handlers.
#pragma once
#include "common.h"
#include <functional>
#include <chrono>
#include <string>
#include <vector>
#include <functional>
#include <map>
// Forward declarations
struct common_chat_templates;
// Basic data structures compatible with original llama.cpp
struct common_string_range {
size_t begin;
size_t end;
common_string_range(size_t begin, size_t end) : begin(begin), end(end) {
if (begin > end) {
throw std::runtime_error("Invalid range");
}
}
// prevent default ctor
common_string_range() = delete;
bool empty() const {
return begin == end;
}
bool operator==(const common_string_range & other) const {
return begin == other.begin && end == other.end;
}
};
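// Illustrative sketch (not part of this header): common_string_range holds a
// begin/end span over parsed text; the constructor rejects begin > end and an
// equal pair counts as empty. The function name below is hypothetical.
inline void example_common_string_range() {
    common_string_range span(5, 9);    // span over positions 5..9 of some parsed text
    common_string_range none(3, 3);    // none.empty() == true
    // common_string_range bad(9, 5);  // would throw std::runtime_error("Invalid range")
    (void) span; (void) none;
}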
struct common_chat_tool_call {
std::string name;
std::string arguments;
@@ -40,10 +19,6 @@ struct common_chat_tool_call {
bool operator==(const common_chat_tool_call & other) const {
return name == other.name && arguments == other.arguments && id == other.id;
}
bool operator!=(const common_chat_tool_call & other) const {
return !(*this == other);
}
};
struct common_chat_msg_content_part {
@@ -64,11 +39,11 @@ struct common_chat_msg {
std::string tool_name;
std::string tool_call_id;
bool empty() const {
return content.empty() && content_parts.empty() && tool_calls.empty() &&
reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
}
template <class T> T to_json_oaicompat() const;
bool empty() const {
return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
}
void ensure_tool_call_ids_set(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
for (auto i = 0u; i < tool_calls.size(); i++) {
if (ids_cache.size() <= i) {
@@ -81,7 +56,6 @@ struct common_chat_msg {
tool_calls[i].id = ids_cache[i];
}
}
bool operator==(const common_chat_msg & other) const {
return role == other.role
&& content == other.content
@@ -91,7 +65,6 @@ struct common_chat_msg {
&& tool_name == other.tool_name
&& tool_call_id == other.tool_call_id;
}
bool operator!=(const common_chat_msg & other) const {
return !(*this == other);
}
@@ -110,10 +83,6 @@ struct common_chat_msg_diff {
&& tool_call_index == other.tool_call_index
&& tool_call_delta == other.tool_call_delta;
}
bool operator!=(const common_chat_msg_diff & other) const {
return !(*this == other);
}
};
struct common_chat_tool {
@@ -131,50 +100,110 @@ enum common_chat_tool_choice {
enum common_chat_format {
COMMON_CHAT_FORMAT_CONTENT_ONLY,
COMMON_CHAT_FORMAT_GENERIC,
COMMON_CHAT_FORMAT_MISTRAL_NEMO,
COMMON_CHAT_FORMAT_LLAMA_3_X,
COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
COMMON_CHAT_FORMAT_DEEPSEEK_R1,
COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
COMMON_CHAT_FORMAT_HERMES_2_PRO,
COMMON_CHAT_FORMAT_COMMAND_R7B,
COMMON_CHAT_FORMAT_GRANITE,
COMMON_CHAT_FORMAT_GPT_OSS,
COMMON_CHAT_FORMAT_KIMI_K2, // Our custom format (keep last for backward compatibility)
COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
};
enum common_reasoning_format {
COMMON_REASONING_FORMAT_NONE,
COMMON_REASONING_FORMAT_AUTO,
COMMON_REASONING_FORMAT_DEEPSEEK,
COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY,
};
struct common_chat_templates_inputs {
std::vector<common_chat_msg> messages;
std::string grammar;
std::string json_schema;
bool add_generation_prompt = true;
bool use_jinja = true;
// Parameters below only supported when use_jinja is true
std::vector<common_chat_tool> tools;
common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
bool parallel_tool_calls = false;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
bool enable_thinking = true;
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
std::map<std::string, std::string> chat_template_kwargs;
bool add_bos = false;
bool add_eos = false;
};
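// Illustrative sketch (not part of this header): filling the inputs for a single
// user turn with one tool. Assumes the mainline common_chat_tool layout
// (name/description/parameters, with parameters holding a JSON-schema string);
// the function and tool names are examples.
inline common_chat_templates_inputs example_chat_inputs() {
    common_chat_msg user_msg;
    user_msg.role    = "user";
    user_msg.content = "What is the weather in Paris?";

    common_chat_tool weather_tool;
    weather_tool.name        = "get_weather";
    weather_tool.description = "Look up the current weather for a city";
    weather_tool.parameters  = R"({"type":"object","properties":{"city":{"type":"string"}},"required":["city"]})";

    common_chat_templates_inputs inputs;
    inputs.messages              = { user_msg };
    inputs.tools                 = { weather_tool };
    inputs.tool_choice           = COMMON_CHAT_TOOL_CHOICE_AUTO;
    inputs.add_generation_prompt = true;
    inputs.use_jinja             = true;   // tools/tool_choice are only honored with jinja templates
    return inputs;
}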
struct common_chat_params {
common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
std::string prompt;
std::string grammar;
bool grammar_lazy = false;
bool thinking_forced_open = false;
std::vector<common_grammar_trigger> grammar_triggers;
std::vector<std::string> preserved_tokens;
std::vector<std::string> additional_stops;
};
struct common_chat_syntax {
common_chat_format format = COMMON_CHAT_FORMAT_KIMI_K2;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO; //COMMON_REASONING_FORMAT_NONE;
common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
// Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
bool reasoning_in_content = false;
bool thinking_forced_open = false;
bool enable_thinking = false;
bool enable_tool_calls = true;
bool reasoning_in_content = false;
bool thinking_forced_open = false;
bool parse_tool_calls = true;
};
// Exception for partial parsing
class common_chat_msg_partial_exception : public std::runtime_error {
public:
common_chat_msg_partial_exception(const std::string & message) : std::runtime_error(message) {}
};
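// Illustrative sketch (not part of this header): a format-specific parser helper
// throws this exception when the generated text stops mid-construct (e.g. an
// unterminated tool-call block); when parsing with is_partial the library can
// treat that as "need more tokens" rather than a hard error. The helper below
// is hypothetical.
inline void example_require_closing_tag(const std::string & text, const std::string & closing_tag) {
    if (text.find(closing_tag) == std::string::npos) {
        throw common_chat_msg_partial_exception("expected closing tag: " + closing_tag);
    }
}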
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
// Bridge functions to integrate with existing ik_llama.cpp system
// TODO: Uncomment and implement during integration phase
// common_chat_msg ik_to_common_msg(const struct ik_chat_msg & ik_msg);
// struct ik_chat_msg common_to_ik_msg(const common_chat_msg & common_msg);
void common_chat_templates_free(struct common_chat_templates * tmpls);
// Format detection from chat template
common_chat_format common_chat_format_detect(const std::string & chat_template);
const char* common_chat_format_name(common_chat_format format);
const char* common_reasoning_format_name(common_reasoning_format format);
struct common_chat_templates_deleter { void operator()(common_chat_templates * tmpls) { common_chat_templates_free(tmpls); } };
// Main parsing function (entry point for original llama.cpp compatibility)
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
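// Illustrative sketch (not part of this header): turning raw generated text into
// a structured message. The format/reasoning values are examples (normally they
// come from common_chat_params and the server settings), and ids_cache is kept
// by the caller for the whole stream so partial re-parses assign stable ids.
inline common_chat_msg example_parse_generation(const std::string & generated_text, bool is_partial,
                                                std::vector<std::string> & ids_cache) {
    common_chat_syntax syntax;
    syntax.format           = COMMON_CHAT_FORMAT_HERMES_2_PRO;   // e.g. detected from the chat template
    syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;  // split <think>...</think> into reasoning_content
    syntax.parse_tool_calls = true;

    common_chat_msg msg = common_chat_parse(generated_text, is_partial, syntax);

    // The id generator is an assumed helper; any scheme works as long as the
    // cache outlives the stream.
    msg.ensure_tool_call_ids_set(ids_cache, [&]() {
        return "call_" + std::to_string(ids_cache.size() + 1);
    });
    return msg;
}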
typedef std::unique_ptr<struct common_chat_templates, common_chat_templates_deleter> common_chat_templates_ptr;
// Forward declare parser class
class common_chat_msg_parser;
common_chat_templates_ptr common_chat_templates_init(
const struct llama_model * model,
const std::string & chat_template_override,
const std::string & bos_token_override = "",
const std::string & eos_token_override = "");
// Format-specific parsing functions (accessible from chat-parser)
void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder);
bool common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant = nullptr);
struct common_chat_params common_chat_templates_apply(
const struct common_chat_templates * tmpls,
const struct common_chat_templates_inputs & inputs);
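// Illustrative sketch (not part of this header): the usual flow from a loaded
// model to a rendered prompt. example_chat_inputs() refers to the sketch above;
// error handling is omitted.
inline common_chat_params example_apply_templates(const llama_model * model) {
    common_chat_templates_ptr tmpls = common_chat_templates_init(model, /* chat_template_override */ "");
    common_chat_params params = common_chat_templates_apply(tmpls.get(), example_chat_inputs());
    // params.prompt is the text to tokenize; params.grammar, grammar_triggers and
    // additional_stops feed the sampler when tool calls are grammar-constrained.
    return params;
}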
// Format single message, while taking into account the position of that message in chat history
std::string common_chat_format_single(
const struct common_chat_templates * tmpls,
const std::vector<common_chat_msg> & past_msg,
const common_chat_msg & new_msg,
bool add_ass,
bool use_jinja);
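// Illustrative sketch (not part of this header): incremental prompting formats
// only the newest message against the existing history instead of re-rendering
// the whole conversation. tmpls is assumed to come from common_chat_templates_init().
inline std::string example_append_user_turn(const common_chat_templates * tmpls,
                                            const std::vector<common_chat_msg> & history,
                                            const std::string & user_text) {
    common_chat_msg new_msg;
    new_msg.role    = "user";
    new_msg.content = user_text;
    // add_ass = true also appends the assistant prompt so generation can start
    return common_chat_format_single(tmpls, history, new_msg, /* add_ass */ true, /* use_jinja */ true);
}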
// Returns an example of formatted chat
std::string common_chat_format_example(
const struct common_chat_templates * tmpls,
bool use_jinja);
const char* common_chat_format_name(common_chat_format format);
const char* common_reasoning_format_name(common_reasoning_format format);
common_reasoning_format common_reasoning_format_from_name(const std::string& format);
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
// Parses a JSON array of messages in OpenAI's chat completion API format.
// T can be std::string containing JSON or nlohmann::ordered_json
template <class T> std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const T & messages);
template <class T> T common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);
// Parses a JSON array of tools in OpenAI's chat completion tool call API format.
// T can be std::string containing JSON or nlohmann::ordered_json
template <class T> std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const T & tools);
template <class T> T common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
template <class T> T common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
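// Illustrative sketch (not part of this header): converting an OpenAI-style
// request body into the internal structures. The JSON literals are examples;
// per the comments above, T = std::string means "a string that contains JSON".
inline void example_parse_oaicompat_request() {
    const std::string messages_json = R"([
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user",   "content": "What is the weather in Paris?"}
    ])";
    const std::string tools_json = R"([
        {"type": "function", "function": {
            "name": "get_weather",
            "description": "Look up the current weather for a city",
            "parameters": {"type": "object", "properties": {"city": {"type": "string"}}}
        }}
    ])";

    std::vector<common_chat_msg>  msgs  = common_chat_msgs_parse_oaicompat(messages_json);
    std::vector<common_chat_tool> tools = common_chat_tools_parse_oaicompat(tools_json);
    common_chat_tool_choice tool_choice = common_chat_tool_choice_parse_oaicompat("auto");
    (void) msgs; (void) tools; (void) tool_choice;
}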