Tool calls support from mainline (#723)

* Tool calls support from mainline

* update cmake

* revert api for /completions

* Fix broken thinking process for gpt-oss

* add missing args and fix webui bugs

* add missing args and fix webui bugs2

* Fix reasoning format error

* add usage

* change default post_sampling_probs to true

* add back generated_text

* Remove server endpoints tests

* add log

* Chat fixes

* Remove logs

* webui: revert extra handling of thinking process

---------

Co-authored-by: firecoperana <firecoperana>
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
firecoperana
2025-09-01 00:38:49 -05:00
committed by GitHub
parent 8de297b795
commit d7882c3cf8
87 changed files with 13581 additions and 2224 deletions

View File

@@ -615,7 +615,7 @@ int main(int argc, char ** argv) {
if (n_past > 0) {
if (is_interacting) {
llama_sampling_reset(ctx_sampling);
llama_sampling_reset(llama_get_model_vocab(model), ctx_sampling);
}
is_interacting = false;
}

View File

@@ -195,7 +195,7 @@ class BuiltinRule:
self.deps = deps or []
# Constraining spaces to prevent model "running away".
SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}'
SPACE_RULE = '| " " | "\\n"{1,2} [ \\t]{0,20}'
PRIMITIVE_RULES = {
'boolean' : BuiltinRule('("true" | "false") space', []),
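For context, a small illustration of what the `SPACE_RULE` change permits; this is not part of the commit, and the regexes below are hand-written approximations of the GBNF alternatives: the old rule accepted at most one newline before optional indentation, the new one accepts up to two.

```python
import re

# Rough regex equivalents of the GBNF space rule, for illustration only:
#   old: space ::= | " " | "\n" [ \t]{0,20}
#   new: space ::= | " " | "\n"{1,2} [ \t]{0,20}
OLD_SPACE = re.compile(r'\A( |\n[ \t]{0,20})?\Z')
NEW_SPACE = re.compile(r'\A( |\n{1,2}[ \t]{0,20})?\Z')

print(bool(OLD_SPACE.match("\n\n    ")))  # False: a blank line was not allowed
print(bool(NEW_SPACE.match("\n\n    ")))  # True: up to two newlines, then indentation
```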

View File

@@ -1,8 +1,8 @@
#include "common.h"
#include "chat.h"
#include "console.h"
#include "llama.h"
#include "chat-template.hpp"
#include "minja/chat-template.hpp"
#include <cassert>
#include <cinttypes>
#include <cmath>
@@ -119,12 +119,11 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v
LOG_TEE("%s", text);
}
static std::string chat_add_and_format(struct llama_model * model, common_chat_templates &chat_templates, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
llama_chat_msg new_msg{role, content};
auto formatted = llama_chat_format_single(model,
*chat_templates.template_default, chat_msgs, new_msg, role == "user", g_params->use_jinja);
static std::string chat_add_and_format(struct llama_model * model, common_chat_templates &chat_templates, std::vector<common_chat_msg> & chat_msgs, std::string role, std::string content) {
common_chat_msg new_msg{role, content};
auto formatted = common_chat_format_single(&chat_templates, chat_msgs, new_msg, role == "user", g_params->use_jinja);
chat_msgs.push_back({role, content});
LOG("formatted: %s\n", formatted.c_str());
fprintf(stdout, "formatted: %s\n", formatted.c_str());
return formatted;
}
@@ -201,7 +200,7 @@ int main(int argc, char ** argv) {
llama_model * model;
llama_context * ctx;
llama_context * ctx_guidance = NULL;
std::vector<llama_chat_msg> chat_msgs;
std::vector<common_chat_msg> chat_msgs;
g_model = &model;
g_ctx = &ctx;
@@ -220,7 +219,7 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: error: unable to load model\n", __func__);
return 1;
}
auto chat_templates = llama_chat_templates_from_model(model, params.chat_template);
auto chat_templates = common_chat_templates_init(model, params.chat_template);
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
@@ -233,7 +232,8 @@ int main(int argc, char ** argv) {
// print chat template example in conversation mode
if (params.conversation) {
if (params.enable_chat_template) {
LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, *chat_templates.template_default, params.use_jinja).c_str());
//LOG_TEE("%s: chat template example: %s\n", __func__, common_chat_format_example(model, *chat_templates.template_default, params.use_jinja).c_str());
LOG_TEE("%s: chat template example:\n%s\n", __func__, common_chat_format_example(chat_templates.get(), params.use_jinja).c_str());
} else {
LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
}
@@ -287,7 +287,7 @@ int main(int argc, char ** argv) {
prompt = params.system_prompt;
if (!prompt.empty()) {
prompt = chat_add_and_format(model, chat_templates,chat_msgs, "system", prompt);
prompt = chat_add_and_format(model, *chat_templates,chat_msgs, "system", prompt);
}
}
else {
@@ -867,7 +867,7 @@ int main(int argc, char ** argv) {
}
if (params.enable_chat_template) {
chat_add_and_format(model, chat_templates, chat_msgs, "assistant", assistant_ss.str());
chat_add_and_format(model, *chat_templates, chat_msgs, "assistant", assistant_ss.str());
}
is_interacting = true;
printf("\n");
@@ -932,7 +932,7 @@ int main(int argc, char ** argv) {
bool format_chat = params.conversation && params.enable_chat_template;
std::string user_inp = format_chat
? chat_add_and_format(model, chat_templates, chat_msgs, "user", std::move(buffer))
? chat_add_and_format(model, *chat_templates, chat_msgs, "user", std::move(buffer))
: std::move(buffer);
// TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
@@ -972,7 +972,8 @@ int main(int argc, char ** argv) {
if (n_past > 0 || waiting_for_first_input) {
if (is_interacting) {
llama_sampling_reset(ctx_sampling);
llama_sampling_reset(llama_get_model_vocab(model), ctx_sampling);
}
is_interacting = false;
waiting_for_first_input = false;

View File

@@ -253,7 +253,7 @@ int main(int argc, char ** argv) {
client.prompt = client.input + "\nAssistant:";
client.response = "";
llama_sampling_reset(client.ctx_sampling);
llama_sampling_reset(llama_get_model_vocab(model), client.ctx_sampling);
// do not prepend BOS because we have a system prompt!
std::vector<llama_token> tokens_prompt;

View File

@@ -12,6 +12,11 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
* Multimodal (wip)
* Monitoring endpoints
* Schema-constrained JSON response format
* Prefilling of assistant messages similar to the Claude API
* [Function calling](../../docs/function-calling.md) / tool use for ~any model
* Speculative decoding
* Easy-to-use web UI
The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216).
@@ -585,59 +590,76 @@ Takes a prefix and a suffix and returns the predicted completion as stream.
- `total_slots` - the total number of slots for processing requests (defined by `--parallel` option)
- `chat_template` - the model's original Jinja2 prompt template
### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API
Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming modes are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with the OpenAI API spec are being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming modes are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with the OpenAI API spec are being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
*Options:*
If the model supports multimodal input, you can pass media files via the `image_url` content part. Both base64 data and remote URLs are supported. See the OAI documentation for more.
See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.
*Options:*
The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}`), similar to other OpenAI-inspired API providers.
See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). llama.cpp `/completion`-specific features such as `mirostat` are also supported.
*Examples:*
The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}` or `{"type": "json_schema", "schema": {"properties": { "name": { "title": "Name", "type": "string" }, "date": { "title": "Date", "type": "string" }, "participants": { "items": {"type": "string" }, "title": "Participants", "type": "string" } } } }`), similar to other OpenAI-inspired API providers.
You can use either Python `openai` library with appropriate checkpoints:
`chat_template_kwargs`: Allows sending additional parameters to the Jinja templating system. For example: `{"enable_thinking": false}`
```python
import openai
`reasoning_format`: The reasoning format to be parsed. If set to `none`, it will output the raw generated text.
client = openai.OpenAI(
base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
api_key = "sk-no-key-required"
)
`thinking_forced_open`: Force a reasoning model to always output the reasoning. Only works on certain models.
completion = client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{"role": "system", "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."},
{"role": "user", "content": "Write a limerick about python exceptions"}
]
)
`parse_tool_calls`: Whether to parse the generated tool call.
print(completion.choices[0].message)
```
*Examples:*
... or raw HTTP requests:
You can use either Python `openai` library with appropriate checkpoints:
```shell
curl http://localhost:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer no-key" \
-d '{
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "system",
"content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."
},
{
"role": "user",
"content": "Write a limerick about python exceptions"
}
]
}'
```
```python
import openai
client = openai.OpenAI(
base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
api_key = "sk-no-key-required"
)
completion = client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{"role": "system", "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."},
{"role": "user", "content": "Write a limerick about python exceptions"}
]
)
print(completion.choices[0].message)
```
... or raw HTTP requests:
```shell
curl http://localhost:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer no-key" \
-d '{
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "system",
"content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."
},
{
"role": "user",
"content": "Write a limerick about python exceptions"
}
]
}'
```
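As a hedged sketch (not part of this commit's README text), the llama.cpp-specific fields documented above, such as `reasoning_format` and `chat_template_kwargs`, can be sent alongside a standard request. The example assumes a server on `http://localhost:8080` started with `--jinja`; the model name is a placeholder the server ignores.

```python
import requests

payload = {
    "model": "gpt-3.5-turbo",  # placeholder; the loaded model is used regardless
    "messages": [
        {"role": "user", "content": "Write a limerick about python exceptions"},
    ],
    "reasoning_format": "none",                          # return the raw generated text
    "chat_template_kwargs": {"enable_thinking": False},  # forwarded to the chat template
}

response = requests.post("http://localhost:8080/v1/chat/completions", json=payload)
print(response.json()["choices"][0]["message"]["content"])
```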
*Tool call support*
[OpenAI-style function calling](https://platform.openai.com/docs/guides/function-calling) is supported with the `--jinja` flag (and may require a `--chat-template-file` override to get the right tool-use compatible Jinja template; worst case, `--chat-template chatml` may also work).
**See our [Function calling](../../docs/function-calling.md) docs** for more details, the supported native tool call styles (a generic style is used as fallback), and examples of use.
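A minimal sketch of a tool call request, assuming the server was started with `--jinja` and a tool-use capable template; the `get_weather` function is a hypothetical example, not something defined by this commit.

```python
import openai

client = openai.OpenAI(base_url="http://localhost:8080/v1", api_key="sk-no-key-required")

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",  # hypothetical tool used only for this example
        "description": "Get the current weather for a city",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

completion = client.chat.completions.create(
    model="gpt-3.5-turbo",  # placeholder; the loaded model is used regardless
    messages=[{"role": "user", "content": "What is the weather in Berlin?"}],
    tools=tools,
    tool_choice="auto",
)
print(completion.choices[0].message.tool_calls)
```

If the model decides to call the tool, `tool_calls` carries the function name and JSON arguments for the client to execute.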
### POST `/v1/embeddings`: OpenAI-compatible embeddings API

Binary file not shown.

File diff suppressed because it is too large

View File

@@ -6,11 +6,7 @@
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"
#include "minja.hpp"
#include "chat-template.hpp"
#include "kimi_k2_tools.hpp"
#include "qwen3_tools.hpp"
#include "deepseek_r1_tools.hpp"
#include "chat.h"
#include <string>
#include <vector>
#include <sstream>
@@ -31,12 +27,6 @@ enum error_type {
ERROR_TYPE_NOT_SUPPORTED, // custom error
};
enum tool_choice_type {
TOOL_CHOICE_AUTO,
TOOL_CHOICE_REQUIRED,
TOOL_CHOICE_NONE,
};
extern bool server_verbose;
extern bool server_log_json;
@@ -80,6 +70,32 @@ static T json_value(const json & body, const std::string & key, const T & defaul
}
}
// thin wrapper around common_grammar_trigger with (de)serialization functions
struct server_grammar_trigger {
common_grammar_trigger value;
server_grammar_trigger() = default;
server_grammar_trigger(const common_grammar_trigger& value) : value(value) {}
server_grammar_trigger(const json& in) {
value.type = (common_grammar_trigger_type)in.at("type").get<int>();
value.value = in.at("value").get<std::string>();
if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
value.token = (llama_token)in.at("token").get<int>();
}
}
json to_json() const {
json out{
{"type", (int)value.type},
{"value", value.value},
};
if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
out["token"] = (int)value.token;
}
return out;
}
};
static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra) {
std::stringstream ss_tid;
ss_tid << std::this_thread::get_id();
@@ -126,85 +142,6 @@ static inline void server_log(const char * level, const char * function, int lin
// chat template utils
//
// Format given chat. If tmpl is empty, we take the template from model metadata
inline std::string format_chat(const struct llama_model * model, common_chat_template tmpl, const std::vector<json> & messages, const json & tools = json::array(), const std::string & model_name = "") {
std::vector<llama_chat_msg> chat;
// Inject tools into the first system message, or create one if none exists
bool tools_injected = false;
for (size_t i = 0; i < messages.size(); ++i) {
const auto & curr_msg = messages[i];
std::string role = json_value(curr_msg, "role", std::string(""));
std::string content;
if (curr_msg.contains("content")) {
if (curr_msg["content"].is_string()) {
content = curr_msg["content"].get<std::string>();
} else if (curr_msg["content"].is_array()) {
for (const auto & part : curr_msg["content"]) {
if (part.contains("text")) {
content += "\n" + part["text"].get<std::string>();
}
}
} else {
throw std::runtime_error("Invalid 'content' type (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
}
} else {
throw std::runtime_error("Missing 'content' (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
}
// Inject tools into the first system message, or create one if none exists
// Only applies to Kimi-K2 models (checked by kimi_k2_should_inject_tools)
if (kimi_k2_should_inject_tools(tools, model_name) && !tools_injected) {
if (role == "system") {
// Add tools to existing system message
content = kimi_k2_inject_tools_to_system(content, tools);
tools_injected = true;
} else if (i == 0) {
// Create system message with tools if no system message exists
std::string tools_prompt = kimi_k2_create_system_with_tools(tools);
chat.push_back({"system", tools_prompt});
tools_injected = true;
}
}
// Inject tools for Qwen3 models (XML Hermes format)
if (qwen3_should_inject_tools(tools, model_name) && !tools_injected) {
if (role == "system") {
// Add tools to existing system message
content = qwen3_inject_tools_to_system(content, tools);
tools_injected = true;
} else if (i == 0) {
// Create system message with tools if no system message exists
std::string tools_prompt = qwen3_create_system_with_tools(tools);
chat.push_back({"system", tools_prompt});
tools_injected = true;
}
}
// Inject tools for DeepSeek R1 models
if (deepseek_r1_should_inject_tools(tools, model_name) && !tools_injected) {
if (role == "system") {
// Add tools to existing system message
content = deepseek_r1_inject_tools_to_system(content, tools);
tools_injected = true;
} else if (i == 0) {
// Create system message with tools if no system message exists
std::string tools_prompt = deepseek_r1_create_system_with_tools(tools);
chat.push_back({"system", tools_prompt});
tools_injected = true;
}
}
chat.push_back({role, content});
}
auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true, /* use_jinja= */ false);
LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
return formatted_chat;
}
//
// base64 utils (TODO: move to common in the future)
//
@@ -296,6 +233,10 @@ static std::string gen_chatcmplid() {
return chatcmplid.str();
}
static std::string gen_tool_call_id() {
return random_string();
}
//
// other common utils
//
@@ -314,24 +255,36 @@ static size_t common_part(const std::string & a, const std::string & b) {
return i;
}
static bool ends_with(const std::string & str, const std::string & suffix) {
return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
}
// return the last index of character that can form a valid string
// if the last character is potentially cut in half, return the index before the cut
// if validate_utf8(text) == text.size(), then the whole text is valid utf8
static size_t validate_utf8(const std::string& text) {
size_t len = text.size();
if (len == 0) return 0;
static size_t find_partial_stop_string(const std::string &stop, const std::string &text) {
if (!text.empty() && !stop.empty()) {
const char text_last_char = text.back();
for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) {
if (stop[char_index] == text_last_char) {
const std::string current_partial = stop.substr(0, char_index + 1);
if (ends_with(text, current_partial)) {
return text.size() - char_index - 1;
// Check the last few bytes to see if a multi-byte character is cut off
for (size_t i = 1; i <= 4 && i <= len; ++i) {
unsigned char c = text[len - i];
// Check for start of a multi-byte sequence from the end
if ((c & 0xE0) == 0xC0) {
// 2-byte character start: 110xxxxx
// Needs at least 2 bytes
if (i < 2) return len - i;
}
else if ((c & 0xF0) == 0xE0) {
// 3-byte character start: 1110xxxx
// Needs at least 3 bytes
if (i < 3) return len - i;
}
else if ((c & 0xF8) == 0xF0) {
// 4-byte character start: 11110xxx
// Needs at least 4 bytes
if (i < 4) return len - i;
}
}
return std::string::npos;
// If no cut-off multi-byte character is found, return full length
return len;
}
// TODO: reuse llama_detokenize
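To make the intent of the new `validate_utf8()` helper easier to follow, here is a rough Python transcription (an illustrative assumption, not code from this commit) of the same cut-off check on streamed token bytes:

```python
def valid_utf8_prefix_len(data: bytes) -> int:
    """Longest prefix that does not end in a truncated multi-byte UTF-8 character."""
    n = len(data)
    for i in range(1, min(4, n) + 1):
        c = data[n - i]
        if (c & 0xE0) == 0xC0 and i < 2:    # 2-byte lead byte with its tail missing
            return n - i
        if (c & 0xF0) == 0xE0 and i < 3:    # 3-byte lead byte with its tail missing
            return n - i
        if (c & 0xF8) == 0xF0 and i < 4:    # 4-byte lead byte with its tail missing
            return n - i
    return n                                # nothing cut off: keep the full length

assert valid_utf8_prefix_len("héllo".encode("utf-8")[:2]) == 1  # truncated 'é' is dropped
```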
@@ -364,13 +317,68 @@ static std::string tokens_to_output_formatted_string(const llama_context * ctx,
struct completion_token_output {
llama_token tok;
std::string text_to_send;
float prob;
struct token_prob {
struct prob_info {
llama_token tok;
std::string txt;
float prob;
};
std::vector<prob_info> probs;
std::vector<token_prob> probs;
json to_json(bool post_sampling_probs) const {
json probs_for_token = json::array();
for (const auto& p : probs) {
std::string txt(p.txt);
txt.resize(validate_utf8(txt));
probs_for_token.push_back(json{
{"id", p.tok},
{"token", txt},
{"bytes", str_to_bytes(p.txt)},
{
post_sampling_probs ? "prob" : "logprob",
post_sampling_probs ? p.prob : logarithm(p.prob)
},
});
}
return probs_for_token;
}
static float logarithm(float x) {
// nlohmann::json converts -inf to null, so we need to prevent that
return x == 0.0f ? std::numeric_limits<float>::lowest() : std::log(x);
}
static std::vector<unsigned char> str_to_bytes(const std::string& str) {
std::vector<unsigned char> bytes;
for (unsigned char c : str) {
bytes.push_back(c);
}
return bytes;
}
static json probs_vector_to_json(const std::vector<completion_token_output>& probs, bool post_sampling_probs) {
json out = json::array();
for (const auto& p : probs) {
std::string txt(p.text_to_send);
txt.resize(validate_utf8(txt));
out.push_back(json{
{"id", p.tok},
{"token", txt},
{"bytes", str_to_bytes(p.text_to_send)},
{
post_sampling_probs ? "prob" : "logprob",
post_sampling_probs ? p.prob : logarithm(p.prob)
},
{
post_sampling_probs ? "top_probs" : "top_logprobs",
p.to_json(post_sampling_probs)
},
});
}
return out;
}
};
// convert a vector of completion_token_output to json
@@ -398,33 +406,12 @@ static json probs_vector_to_json(const llama_context * ctx, const std::vector<co
return out;
}
//
// Function calling support
//
#include "function_calls.hpp"
//
// tool_choice utils
//
static tool_choice_type tool_choice_parse_oaicompat(const std::string & tool_choice) {
if (tool_choice == "auto") {
return TOOL_CHOICE_AUTO;
}
if (tool_choice == "none") {
return TOOL_CHOICE_NONE;
}
if (tool_choice == "required") {
return TOOL_CHOICE_REQUIRED;
}
throw std::runtime_error("Invalid tool_choice: " + tool_choice);
}
//
// OAI utils
//
static json oaicompat_completion_params_parse(const json& body) {
// used by /completions endpoint
static json oaicompat_chat_params_parse(const json& body) {
json llama_params;
if (!body.contains("prompt")) {
@@ -445,8 +432,13 @@ static json oaicompat_completion_params_parse(const json& body) {
throw std::runtime_error("Only one completion choice is allowed");
}
// Handle "echo" field
if (json_value(body, "echo", false)) {
throw std::runtime_error("Only no echo is supported");
}
// Params supported by OAI but unsupported by llama.cpp
static const std::vector<std::string> unsupported_params{ "best_of", "echo", "suffix" };
static const std::vector<std::string> unsupported_params{ "best_of", "suffix" };
for (const auto& param : unsupported_params) {
if (body.contains(param)) {
throw std::runtime_error("Unsupported param: " + param);
@@ -464,59 +456,144 @@ static json oaicompat_completion_params_parse(const json& body) {
return llama_params;
}
static json oaicompat_chat_completion_params_parse(
const struct llama_model * model,
const json & body, /* openai api json semantics */
const common_chat_template& tmpl,
bool use_jinja) {
struct oaicompat_parser_options {
bool use_jinja;
bool prefill_assistant;
common_reasoning_format reasoning_format;
std::map<std::string, std::string> chat_template_kwargs;
common_chat_templates* tmpls;
bool allow_image;
bool allow_audio;
bool enable_thinking = true;
};
// used by /chat/completions endpoint
static json oaicompat_chat_params_parse(
const struct llama_model* model,
const json& body, /* openai api json semantics */
const oaicompat_parser_options& opt)
{
json llama_params;
llama_params["__oaicompat"] = true;
auto tools = json_value(body, "tools", json());
auto has_tools = tools.is_array() && !tools.empty();
auto stream = json_value(body, "stream", false);
auto tool_choice = json_value(body, "tool_choice", std::string("auto"));
if (has_tools) {
if (use_jinja) {
fprintf(stdout,"tools param is not fully supported yet\n");
// Extract tools from the request body
json tools = json_value(body, "tools", json::array());
/* if (tools.is_array() && !tools.empty()) {
if (stream) {
throw std::runtime_error("Cannot use tools with stream");
}
// Debug: Log system prompt when tools are detected
else {
if (!use_jinja) {
throw std::runtime_error("tools param requires --jinja flag");
}
}
if (!use_jinja) {
if (body.contains("tool_choice") && !body.at("tool_choice").is_null()) {
throw std::runtime_error("Unsupported param: tool_choice");
}
}*/
// Extract model name from the request body
std::string model_name = json_value(body, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
// Apply chat template to the list of messages with tools
llama_params["prompt"] = format_chat(model, tmpl, body.at("messages"), tools, model_name);
if (!opt.use_jinja) {
if (has_tools) {
throw std::runtime_error("tools param requires --jinja flag");
}
if (tool_choice != "auto") {
throw std::runtime_error("tool_choice param requires --jinja flag");
}
}
// Handle "stop" field
if (body.contains("stop") && body.at("stop").is_string()) {
llama_params["stop"] = json::array({body.at("stop").get<std::string>()});
} else {
llama_params["stop"] = json::array({ body.at("stop").get<std::string>() });
}
else {
llama_params["stop"] = json_value(body, "stop", json::array());
}
auto json_schema = json_value(body, "json_schema", json());
auto grammar = json_value(body, "grammar", std::string());
if (!json_schema.is_null() && !grammar.empty()) {
throw std::runtime_error("Cannot use both json_schema and grammar");
}
// Handle "response_format" field
if (body.contains("response_format")) {
json response_format = json_value(body, "response_format", json::object());
json response_format = json_value(body, "response_format", json::object());
std::string response_type = json_value(response_format, "type", std::string());
if (response_type == "json_object") {
llama_params["json_schema"] = json_value(response_format, "schema", json::object());
} else if (!response_type.empty() && response_type != "text") {
throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
json_schema = json_value(response_format, "schema", json::object());
}
else if (response_type == "json_schema") {
auto schema_wrapper = json_value(response_format, "json_schema", json::object());
json_schema = json_value(schema_wrapper, "schema", json::object());
}
else if (!response_type.empty() && response_type != "text") {
json_schema = json_value(json_schema, "schema", json::object());
}
}
// Apply chat template to the list of messages
if (use_jinja) {
llama_params["prompt"] = tmpl.apply(body.at("messages"), tools, /* add_generation_prompt= */ true);
common_chat_templates_inputs inputs;
inputs.messages = common_chat_msgs_parse_oaicompat(body.at("messages"));
inputs.tools = common_chat_tools_parse_oaicompat(tools);
inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(tool_choice);
inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump();
inputs.grammar = grammar;
inputs.use_jinja = opt.use_jinja;
inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
inputs.reasoning_format = opt.reasoning_format;
inputs.enable_thinking = opt.enable_thinking;
if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
if (body.contains("grammar")) {
throw std::runtime_error("Cannot use custom grammar constraints with tools.");
}
llama_params["parse_tool_calls"] = true;
}
else {
llama_params["prompt"] = format_chat(model, tmpl, body.at("messages"));
// merge the template args provided from command line with the args provided in the user request
auto chat_template_kwargs_object = json_value(body, "chat_template_kwargs", json::object());
inputs.chat_template_kwargs = opt.chat_template_kwargs;
for (const auto& item : chat_template_kwargs_object.items()) {
inputs.chat_template_kwargs[item.key()] = item.value().dump();
}
/*"whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
"when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"*/
bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" &&opt.prefill_assistant;
common_chat_msg last_message;
if (prefill_assistant_message) {
last_message = inputs.messages.back();
inputs.messages.pop_back();
/* sanity check, max one assistant message at the end of the list */
if (!inputs.messages.empty() && inputs.messages.back().role == "assistant") {
throw std::runtime_error("Cannot have 2 or more assistant messages at the end of the list.");
}
/* TODO: test this properly */
inputs.reasoning_format = COMMON_REASONING_FORMAT_NONE;
if ((!inputs.enable_thinking) || inputs.chat_template_kwargs.find("enable_thinking") != inputs.chat_template_kwargs.end()) {
throw std::runtime_error("Assistant response prefill is incompatible with enable_thinking.");
}
inputs.add_generation_prompt = true;
}
// Apply chat template to the list of messages
auto chat_params = common_chat_templates_apply(opt.tmpls, inputs);
llama_params["chat_format"] = static_cast<int>(chat_params.format);
llama_params["prompt"] = chat_params.prompt;
llama_params["grammar"] = chat_params.grammar;
llama_params["grammar_lazy"] = chat_params.grammar_lazy;
auto grammar_triggers = json::array();
for (const auto& trigger : chat_params.grammar_triggers) {
grammar_triggers.push_back(trigger.to_json<json>());
}
llama_params["grammar_triggers"] = grammar_triggers;
llama_params["preserved_tokens"] = chat_params.preserved_tokens;
llama_params["thinking_forced_open"] = chat_params.thinking_forced_open;
for (const auto& stop : chat_params.additional_stops) {
llama_params["stop"].push_back(stop);
}
// Handle "n" field
@@ -528,31 +605,20 @@ static json oaicompat_chat_completion_params_parse(
// Handle "logprobs" field
// TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future
if (body.contains("logprobs")) {
if (has_tools && stream) {
throw std::runtime_error("logprobs is not supported with tools + stream");
}
llama_params["n_probs"] = json_value(body, "top_logprobs", 20);
} else if (body.contains("top_logprobs")) {
}
else if (body.contains("top_logprobs")) {
throw std::runtime_error("top_logprobs requires logprobs to be set to true");
}
// Handle tool_choice parameter
if (body.contains("tool_choice")) {
auto tool_choice_str = json_value(body, "tool_choice", std::string("auto"));
auto tool_choice = tool_choice_parse_oaicompat(tool_choice_str);
llama_params["tool_choice"] = static_cast<int>(tool_choice);
}
// Accept tools and tool_choice parameters for function calling support
// Other unsupported params still rejected
static const std::vector<std::string> unsupported_params { };
for (auto & param : unsupported_params) {
if (body.contains(param)) {
throw std::runtime_error("Unsupported param: " + param);
}
}
// Copy remaining properties to llama_params
// This allows user to use llama.cpp-specific params like "mirostat", "tfs_z",... via OAI endpoint.
// See "launch_slot_with_task()" for a complete list of params supported by llama.cpp
for (const auto & item : body.items()) {
for (const auto& item : body.items()) {
// Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens"
if (!llama_params.contains(item.key()) || item.key() == "n_predict") {
llama_params[item.key()] = item.value();
@@ -562,6 +628,28 @@ static json oaicompat_chat_completion_params_parse(
return llama_params;
}
// get value by path(key1 / key2)
static json json_get_nested_values(const std::vector<std::string>& paths, const json& js) {
json result = json::object();
for (const std::string& path : paths) {
json current = js;
const auto keys = string_split<std::string>(path, /*separator*/ '/');
bool valid_path = true;
for (const std::string& k : keys) {
if (valid_path && current.is_object() && current.contains(k)) {
current = current[k];
}
else {
valid_path = false;
}
}
if (valid_path) {
result[path] = current;
}
}
return result;
}
static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
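The new `json_get_nested_values()` helper walks slash-separated key paths; a short Python sketch of the same behaviour (illustrative only, sample values assumed) may make it clearer:

```python
def json_get_nested_values(paths, js):
    """Collect js values addressed by 'key1/key2' style paths, skipping invalid ones."""
    result = {}
    for path in paths:
        current, valid = js, True
        for key in path.split("/"):
            if valid and isinstance(current, dict) and key in current:
                current = current[key]
            else:
                valid = False
        if valid:
            result[path] = current
    return result

print(json_get_nested_values(["model/name", "missing/key"],
                             {"model": {"name": "gpt-oss-20b"}}))
# {'model/name': 'gpt-oss-20b'}
```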

File diff suppressed because one or more lines are too long

View File

@@ -55,21 +55,24 @@ export default function ChatMessage({
const { content, thought, isThinking }: SplitMessage = useMemo(() => {
if (msg.content === null || msg.role !== 'assistant') {
return { content: msg.content };
}
}
const REGEX_THINK_OPEN = /<think>|<\|channel\|>analysis<\|message\|>/;
const REGEX_THINK_CLOSE = /<\/think>|<\|end\|>/;
let actualContent = '';
let thought = '';
let isThinking = false;
let thinkSplit = msg.content.split('<think>', 2);
let thinkSplit = msg.content.split(REGEX_THINK_OPEN, 2);
actualContent += thinkSplit[0];
while (thinkSplit[1] !== undefined) {
// <think> tag found
thinkSplit = thinkSplit[1].split('</think>', 2);
thinkSplit = thinkSplit[1].split(REGEX_THINK_CLOSE, 2);
thought += thinkSplit[0];
isThinking = true;
if (thinkSplit[1] !== undefined) {
// </think> closing tag found
isThinking = false;
thinkSplit = thinkSplit[1].split('<think>', 2);
thinkSplit = thinkSplit[1].split(REGEX_THINK_OPEN, 2);
actualContent += thinkSplit[0];
}
}

View File

@@ -215,6 +215,7 @@ export const AppContextProvider = ({
messages,
stream: true,
cache_prompt: true,
reasoning_format: 'none',
samplers: config.samplers,
temperature: config.temperature,
dynatemp_range: config.dynatemp_range,
@@ -261,7 +262,7 @@ export const AppContextProvider = ({
if (chunk.error) {
throw new Error(chunk.error?.message || 'Unknown error');
}
const addedContent = chunk.choices[0].delta.content;
const addedContent = chunk.choices?.[0]?.delta?.content;
const lastContent = pendingMsg.content || '';
if (addedContent) {
pendingMsg = {

View File

@@ -80,13 +80,22 @@ export function normalizeMsgsForAPI(messages: Readonly<Message[]>) {
* recommended for DeepSeek-R1, filter out content between <think> and </think> tags
*/
export function filterThoughtFromMsgs(messages: APIMessage[]) {
console.debug({ messages });
return messages.map((msg) => {
if (msg.role !== 'assistant') {
return msg;
}
// assistant message is always a string
const contentStr = msg.content as string;
return {
role: msg.role,
content:
msg.role === 'assistant'
? msg.content.split('</think>').at(-1)!.trim()
: msg.content,
? contentStr
.split(/<\/think>|<\|end\|>/)
.at(-1)!
.trim()
: contentStr,
} as APIMessage;
});
}