add jinja template support (#677)

Co-authored-by: firecoperana <firecoperana>
2026-02-23 06:34:13 +00:00 · 2025-08-09 07:50:30 -05:00
parent e23b2a7cc9
commit ff024df079
14 changed files with 3872 additions and 129 deletions
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -2,7 +2,7 @@

 #include "console.h"
 #include "llama.h"
-
+#include "chat-template.hpp"
 #include <cassert>
 #include <cinttypes>
 #include <cmath>
@@ -119,10 +119,10 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v
    LOG_TEE("%s", text);
 }

-static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
+static std::string chat_add_and_format(struct llama_model * model, common_chat_templates &chat_templates, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
    llama_chat_msg new_msg{role, content};
-    auto formatted = llama_chat_format_single(
-        model, g_params->chat_template, chat_msgs, new_msg, role == "user");
+    auto formatted = llama_chat_format_single(model, 
+        *chat_templates.template_default, chat_msgs, new_msg, role == "user", g_params->use_jinja);
    chat_msgs.push_back({role, content});
    LOG("formatted: %s\n", formatted.c_str());
    return formatted;
@@ -220,6 +220,7 @@ int main(int argc, char ** argv) {
        LOG_TEE("%s: error: unable to load model\n", __func__);
        return 1;
    }
+    auto chat_templates = llama_chat_templates_from_model(model, params.chat_template);

    const int n_ctx_train = llama_n_ctx_train(model);
    const int n_ctx = llama_n_ctx(ctx);
@@ -229,11 +230,10 @@ int main(int argc, char ** argv) {
        LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
                __func__, n_ctx_train, n_ctx);
    }
-
    // print chat template example in conversation mode
    if (params.conversation) {
        if (params.enable_chat_template) {
-            LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
+            LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, *chat_templates.template_default, params.use_jinja).c_str());
        } else {
            LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
        }
@@ -274,11 +274,29 @@ int main(int argc, char ** argv) {
    LOG("add_bos: %d\n", add_bos);

    std::vector<llama_token> embd_inp;
+    bool waiting_for_first_input = params.conversation && params.enable_chat_template && params.system_prompt.empty();

    {
-        auto prompt = (params.conversation && params.enable_chat_template && !params.prompt.empty())
-            ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
-            : params.prompt;
+        //auto prompt = (params.conversation && params.enable_chat_template && !params.prompt.empty())
+        //    ? chat_add_and_format(model, chat_templates,chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
+        //    : params.prompt;
+        std::string prompt;
+
+        if (params.conversation && params.enable_chat_template) {
+            // format the system prompt in conversation mode (will use template default if empty)
+            prompt = params.system_prompt;
+
+            if (!prompt.empty()) {
+                prompt = chat_add_and_format(model, chat_templates,chat_msgs, "system", prompt);
+            }
+        }
+        else {
+            // otherwise use the prompt as is
+            prompt = params.prompt;
+        }
+
+
+
        if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
            LOG("tokenize the prompt\n");
            embd_inp = ::llama_tokenize(ctx, prompt, true, true);
@@ -292,7 +310,7 @@ int main(int argc, char ** argv) {
    }

    // Should not run without any tokens
-    if (embd_inp.empty()) {
+    if (!params.conversation && embd_inp.empty()) {
        if (add_bos) {
            embd_inp.push_back(llama_token_bos(model));
            LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
@@ -837,7 +855,7 @@ int main(int argc, char ** argv) {
            }

            // deal with end of generation tokens in interactive mode
-            if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
+            if (!waiting_for_first_input && llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
                LOG("found an EOG token\n");

                if (params.interactive) {
@@ -849,7 +867,7 @@ int main(int argc, char ** argv) {
                    }

                    if (params.enable_chat_template) {
-                        chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
+                        chat_add_and_format(model, chat_templates, chat_msgs, "assistant", assistant_ss.str());
                    }
                    is_interacting = true;
                    printf("\n");
@@ -857,12 +875,12 @@ int main(int argc, char ** argv) {
            }

            // if current token is not EOG, we add it to current assistant message
-            if (params.conversation) {
+            if (params.conversation && !waiting_for_first_input) {
                auto id = llama_sampling_last(ctx_sampling);
                assistant_ss << llama_token_to_piece(ctx, id, false);
            }

-            if (n_past > 0 && is_interacting) {
+            if ((n_past > 0 || waiting_for_first_input) && is_interacting) {
                LOG("waiting for user input\n");

                if (params.conversation) {
@@ -914,7 +932,7 @@ int main(int argc, char ** argv) {

                    bool format_chat = params.conversation && params.enable_chat_template;
                    std::string user_inp = format_chat
-                        ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
+                        ? chat_add_and_format(model, chat_templates, chat_msgs, "user", std::move(buffer))
                        : std::move(buffer);
                    // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
                    const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
@@ -952,11 +970,12 @@ int main(int argc, char ** argv) {
                input_echo = false; // do not echo this again
            }

-            if (n_past > 0) {
+            if (n_past > 0 || waiting_for_first_input) {
                if (is_interacting) {
                    llama_sampling_reset(ctx_sampling);
                }
                is_interacting = false;
+                waiting_for_first_input = false;
            }
        }

--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -814,6 +814,7 @@ struct server_context {

    server_metrics metrics;

+    common_chat_templates chat_templates;
    // Necessary similarity of prompt for slot selection
    float slot_prompt_similarity = 0.0f;

@@ -860,15 +861,47 @@ struct server_context {
        add_bos_token = llama_should_add_bos_token(model);
        GGML_ASSERT(llama_add_eos_token(model) != 1);

+        if (params.chat_template.empty() && !validate_model_chat_template(params.use_jinja)) {
+            LOG_WARNING("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
+            chat_templates = llama_chat_templates_from_model(model, "chatml");
+        }
+        else {
+            chat_templates = llama_chat_templates_from_model(model, params.chat_template);
+        }
+        GGML_ASSERT(chat_templates.template_default.get() != nullptr);
        return true;
    }

-    bool validate_model_chat_template() const {
+    bool validate_model_chat_template(bool use_jinja) const {
        llama_chat_message chat[] = {{"user", "test"}};

-        const int res = llama_chat_apply_template(model, nullptr, chat, 1, true, nullptr, 0);

-        return res > 0;
+        if (use_jinja) {
+            auto templates = llama_chat_templates_from_model(model, "");
+            GGML_ASSERT(templates.template_default);
+            try {
+                templates.template_default->apply({ {
+                    {"role", "user"},
+                    {"content", "test"},
+                } }, json(), true);
+                if (templates.template_tool_use) {
+                    templates.template_tool_use->apply({ {
+                        {"role", "user"},
+                        {"content", "test"},
+                    } }, json(), true);
+    }
+                return true;
+            }
+            catch (const std::exception& e) {
+                LOG_ERROR("failed to apply template: %s\n", e.what());
+                return false;
+            }
+        }
+        else {
+            const char* tmpl = llama_model_chat_template(model, /* name */ nullptr);
+            const int32_t chat_res = llama_chat_apply_template(model, tmpl, chat, 1, true, nullptr, 0);
+            return chat_res > 0;
+        }
    }

    void init() {
@@ -3182,22 +3215,16 @@ int main(int argc, char ** argv) {

    const auto model_meta = ctx_server.model_meta();

-    // if a custom chat template is not supplied, we will use the one that comes with the model (if any)
-    if (params.chat_template.empty()) {
-        if (!ctx_server.validate_model_chat_template()) {
-            LOG_WARNING("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
-            params.chat_template = "chatml";
-        }
-    }
-
    // print sample chat example to make it clear which template is used
-    {
+
        LOG_INFO("chat template", {
-            {"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)},
+        {"chat_template", ctx_server.chat_templates.template_default->source().c_str()},
+    });
+
+    LOG_INFO("chat template", {
+        {"chat_example",  llama_chat_format_example(ctx_server.model, *ctx_server.chat_templates.template_default, ctx_server.params.use_jinja).c_str()},
            {"built_in",     params.chat_template.empty()},
        });
-    }
-
    //
    // Middlewares
    //
@@ -3560,9 +3587,11 @@ int main(int argc, char ** argv) {
            { "system_prompt",               ctx_server.system_prompt.c_str() },
            { "default_generation_settings", ctx_server.default_generation_settings_for_props },
            { "total_slots",                 ctx_server.params.n_parallel },
-            { "chat_template",               curr_tmpl.c_str() }
+            { "chat_template",               ctx_server.chat_templates.template_default->source() },
        };
-
+        if (ctx_server.params.use_jinja && ctx_server.chat_templates.template_tool_use) {
+            data["chat_template_tool_use"] = ctx_server.chat_templates.template_tool_use->source();
+        }
        res.set_content(data.dump(), "application/json; charset=utf-8");
    };

@@ -3573,8 +3602,9 @@ int main(int argc, char ** argv) {
        }

        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-
-        json data = json::parse(req.body);
+        auto body = json::parse(req.body);
+        const auto& chat_template = body.contains("tools") && ctx_server.chat_templates.template_tool_use ? *ctx_server.chat_templates.template_tool_use : *ctx_server.chat_templates.template_default;
+        json data = oaicompat_completion_params_parse(ctx_server.model, body, chat_template, ctx_server.params.use_jinja);

        const int id_task = ctx_server.queue_tasks.get_new_id();

@@ -3674,7 +3704,11 @@ int main(int argc, char ** argv) {
        }

        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-        json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
+
+        auto body = json::parse(req.body);
+        const auto& chat_template = body.contains("tools") && ctx_server.chat_templates.template_tool_use ? *ctx_server.chat_templates.template_tool_use : *ctx_server.chat_templates.template_default;
+        json data = oaicompat_completion_params_parse(ctx_server.model,body, chat_template, params.use_jinja);
+

        const int id_task = ctx_server.queue_tasks.get_new_id();

--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -6,6 +6,8 @@
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
+#include "minja.hpp"
+#include "chat-template.hpp"
 #include "kimi_k2_tools.hpp"
 #include "qwen3_tools.hpp"
 #include "deepseek_r1_tools.hpp"
@@ -125,7 +127,7 @@ static inline void server_log(const char * level, const char * function, int lin
 //

 // Format given chat. If tmpl is empty, we take the template from model metadata
-inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages, const json & tools = json::array(), const std::string & model_name = "") {
+inline std::string format_chat(const struct llama_model * model, common_chat_template tmpl, const std::vector<json> & messages, const json & tools = json::array(), const std::string & model_name = "") {
    std::vector<llama_chat_msg> chat;

    // Inject tools into the first system message, or create one if none exists
@@ -197,8 +199,8 @@ inline std::string format_chat(const struct llama_model * model, const std::stri

        chat.push_back({role, content});
    }
+    auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true,  /* use_jinja= */ false);

-    auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
    LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
    return formatted_chat;
 }
@@ -425,46 +427,24 @@ static tool_choice_type tool_choice_parse_oaicompat(const std::string & tool_cho
 static json oaicompat_completion_params_parse(
    const struct llama_model * model,
    const json & body, /* openai api json semantics */
-    const std::string & chat_template) {
+    const common_chat_template& tmpl,
+    bool use_jinja) {
    json llama_params;

    llama_params["__oaicompat"] = true;
+    auto tools = json_value(body, "tools", json());
+    auto has_tools = tools.is_array() && !tools.empty();

+    if (has_tools) {
+        if (use_jinja) {
+            fprintf(stdout,"tools param is not fully supported yet\n");
    // Extract tools from the request body
    json tools = json_value(body, "tools", json::array());
    
+        }
    // Debug: Log system prompt when tools are detected
-    if (!tools.empty() && server_verbose) {
-        LOG_VERBOSE("Tool calls detected in request", {
-            {"tool_count", tools.size()},
-            {"model", json_value(body, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}
-        });
-        
-        // Extract and log system prompt from messages
-        if (body.contains("messages") && body["messages"].is_array()) {
-            for (const auto& msg : body["messages"]) {
-                if (msg.contains("role") && msg["role"] == "system" && msg.contains("content")) {
-                    std::string content_str;
-                    if (msg["content"].is_string()) {
-                        content_str = msg["content"];
-                    } else if (msg["content"].is_array()) {
-                        // Handle content blocks format
-                        for (const auto& block : msg["content"]) {
-                            if (block.contains("type") && block["type"] == "text" && block.contains("text")) {
-                                if (!content_str.empty()) content_str += " ";
-                                content_str += block["text"];
-                            }
-                        }
-                    }
-                    
-                    if (!content_str.empty()) {
-                        LOG_VERBOSE("System prompt with tools", {
-                            {"system_prompt", content_str.substr(0, 500) + (content_str.length() > 500 ? "..." : "")}
-                        });
-                    }
-                    break; // Only log first system message
-                }
-            }
+        else {
+            throw std::runtime_error("tools param requires --jinja flag");
        }
    }

@@ -472,7 +452,7 @@ static json oaicompat_completion_params_parse(
    std::string model_name = json_value(body, "model", std::string(DEFAULT_OAICOMPAT_MODEL));

    // Apply chat template to the list of messages with tools
-    llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"), tools, model_name);
+    llama_params["prompt"] = format_chat(model, tmpl, body.at("messages"), tools, model_name);

    // Handle "stop" field
    if (body.contains("stop") && body.at("stop").is_string()) {
@@ -491,6 +471,13 @@ static json oaicompat_completion_params_parse(
            throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
        }
    }
+    // Apply chat template to the list of messages
+    if (use_jinja) {
+        llama_params["prompt"] = tmpl.apply(body.at("messages"), tools, /* add_generation_prompt= */ true);
+    }
+    else {
+        llama_params["prompt"] = format_chat(model, tmpl, body.at("messages"));
+    }

    // Handle "n" field
    int n_choices = json_value(body, "n", 1);