Tool calls support from mainline (#723)

* Tool calls support from mainline

* update cmake

* revert api for /completions

* Fix broken thinking process for gpt-oss

* add missing args and fix webui bugs

* add missing args and fix webui bugs2

* Fix reasoning format error

* add usage

* change default post_sampling_probs to true

* add back generated_text

* Remove server endpoints tests

* add log

* Chat fixes

* Remove logs

* webui: revert extra handling of thinking process

---------

Co-authored-by: firecoperana <firecoperana>
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Author: firecoperana
Date: 2025-09-01 00:38:49 -05:00 (committed via GitHub)
Commit: d7882c3cf8 (parent: 8de297b795)
87 changed files with 13581 additions and 2224 deletions
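The diff below ports the CLI chat path from the old llama_chat_* helpers to the common_chat_* API brought over from mainline. As a rough sketch of the new call pattern only (function names and argument order are taken from the diff; the two-field initialization of common_chat_msg mirrors the code below, while the helper name and usage lines are illustrative, not part of this commit):

// Sketch, not part of the commit: shows the common_chat_* flow that the
// updated chat_add_and_format() below relies on.
#include "chat.h"

#include <string>
#include <vector>

// Hypothetical helper: format one user turn against the running history.
static std::string add_user_turn(const common_chat_templates * tmpls,
                                 std::vector<common_chat_msg> & history,
                                 const std::string & text,
                                 bool use_jinja) {
    common_chat_msg msg{"user", text};
    // Only the delta introduced by msg is rendered; add_ass=true appends the
    // assistant prefix so generation can start right after the prompt.
    std::string prompt = common_chat_format_single(tmpls, history, msg,
                                                   /* add_ass */ true, use_jinja);
    history.push_back(msg);
    return prompt;
}

// Usage, assuming a loaded llama_model * model and the parsed params:
//   auto tmpls = common_chat_templates_init(model, params.chat_template);
//   std::vector<common_chat_msg> history;
//   std::string prompt = add_user_turn(tmpls.get(), history, "Hello", params.use_jinja);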


@@ -1,8 +1,8 @@
#include "common.h"
#include "chat.h"
#include "console.h"
#include "llama.h"
#include "chat-template.hpp"
#include "minja/chat-template.hpp"
#include <cassert>
#include <cinttypes>
#include <cmath>
@@ -119,12 +119,11 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v
LOG_TEE("%s", text);
}
static std::string chat_add_and_format(struct llama_model * model, common_chat_templates &chat_templates, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
llama_chat_msg new_msg{role, content};
auto formatted = llama_chat_format_single(model,
*chat_templates.template_default, chat_msgs, new_msg, role == "user", g_params->use_jinja);
static std::string chat_add_and_format(struct llama_model * model, common_chat_templates &chat_templates, std::vector<common_chat_msg> & chat_msgs, std::string role, std::string content) {
common_chat_msg new_msg{role, content};
auto formatted = common_chat_format_single(&chat_templates, chat_msgs, new_msg, role == "user", g_params->use_jinja);
chat_msgs.push_back({role, content});
LOG("formatted: %s\n", formatted.c_str());
fprintf(stdout, "formatted: %s\n", formatted.c_str());
return formatted;
}
@@ -201,7 +200,7 @@ int main(int argc, char ** argv) {
     llama_model * model;
     llama_context * ctx;
     llama_context * ctx_guidance = NULL;
-    std::vector<llama_chat_msg> chat_msgs;
+    std::vector<common_chat_msg> chat_msgs;
     g_model = &model;
     g_ctx = &ctx;
@@ -220,7 +219,7 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: error: unable to load model\n", __func__);
return 1;
}
auto chat_templates = llama_chat_templates_from_model(model, params.chat_template);
auto chat_templates = common_chat_templates_init(model, params.chat_template);
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
@@ -233,7 +232,8 @@ int main(int argc, char ** argv) {
     // print chat template example in conversation mode
     if (params.conversation) {
         if (params.enable_chat_template) {
-            LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, *chat_templates.template_default, params.use_jinja).c_str());
+            //LOG_TEE("%s: chat template example: %s\n", __func__, common_chat_format_example(model, *chat_templates.template_default, params.use_jinja).c_str());
+            LOG_TEE("%s: chat template example:\n%s\n", __func__, common_chat_format_example(chat_templates.get(), params.use_jinja).c_str());
         } else {
             LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
         }
@@ -287,7 +287,7 @@ int main(int argc, char ** argv) {
             prompt = params.system_prompt;
             if (!prompt.empty()) {
-                prompt = chat_add_and_format(model, chat_templates,chat_msgs, "system", prompt);
+                prompt = chat_add_and_format(model, *chat_templates,chat_msgs, "system", prompt);
             }
         }
         else {
@@ -867,7 +867,7 @@ int main(int argc, char ** argv) {
                 }
                 if (params.enable_chat_template) {
-                    chat_add_and_format(model, chat_templates, chat_msgs, "assistant", assistant_ss.str());
+                    chat_add_and_format(model, *chat_templates, chat_msgs, "assistant", assistant_ss.str());
                 }
                 is_interacting = true;
                 printf("\n");
@@ -932,7 +932,7 @@ int main(int argc, char ** argv) {
                 bool format_chat = params.conversation && params.enable_chat_template;
                 std::string user_inp = format_chat
-                    ? chat_add_and_format(model, chat_templates, chat_msgs, "user", std::move(buffer))
+                    ? chat_add_and_format(model, *chat_templates, chat_msgs, "user", std::move(buffer))
                     : std::move(buffer);
                 // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
                 const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
@@ -972,7 +972,8 @@ int main(int argc, char ** argv) {
         if (n_past > 0 || waiting_for_first_input) {
             if (is_interacting) {
-                llama_sampling_reset(ctx_sampling);
+                llama_sampling_reset(llama_get_model_vocab(model), ctx_sampling);
             }
             is_interacting = false;
             waiting_for_first_input = false;