add jinja template support (#677)

Co-authored-by: firecoperana <firecoperana>
Author: firecoperana
Date: 2025-08-09 07:50:30 -05:00
Committed by: GitHub
Parent: e23b2a7cc9
Commit: ff024df079
14 changed files with 3872 additions and 129 deletions

View File

@@ -814,6 +814,7 @@ struct server_context {
server_metrics metrics;
common_chat_templates chat_templates;
// Necessary similarity of prompt for slot selection
float slot_prompt_similarity = 0.0f;
@@ -860,15 +861,47 @@ struct server_context {
add_bos_token = llama_should_add_bos_token(model);
GGML_ASSERT(llama_add_eos_token(model) != 1);
if (params.chat_template.empty() && !validate_model_chat_template(params.use_jinja)) {
LOG_WARNING("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
chat_templates = llama_chat_templates_from_model(model, "chatml");
}
else {
chat_templates = llama_chat_templates_from_model(model, params.chat_template);
}
GGML_ASSERT(chat_templates.template_default.get() != nullptr);
return true;
}
bool validate_model_chat_template() const {
bool validate_model_chat_template(bool use_jinja) const {
llama_chat_message chat[] = {{"user", "test"}};
const int res = llama_chat_apply_template(model, nullptr, chat, 1, true, nullptr, 0);
return res > 0;
if (use_jinja) {
auto templates = llama_chat_templates_from_model(model, "");
GGML_ASSERT(templates.template_default);
try {
templates.template_default->apply({ {
{"role", "user"},
{"content", "test"},
} }, json(), true);
if (templates.template_tool_use) {
templates.template_tool_use->apply({ {
{"role", "user"},
{"content", "test"},
} }, json(), true);
}
return true;
}
catch (const std::exception& e) {
LOG_ERROR("failed to apply template: %s\n", e.what());
return false;
}
}
else {
const char* tmpl = llama_model_chat_template(model, /* name */ nullptr);
const int32_t chat_res = llama_chat_apply_template(model, tmpl, chat, 1, true, nullptr, 0);
return chat_res > 0;
}
}
void init() {
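
For orientation, here is a minimal sketch (not part of the diff) of how the new template objects fit together, using only the names this commit introduces: common_chat_templates, llama_chat_templates_from_model, and the minja-backed apply() from chat-template.hpp. Treat it as illustrative rather than verbatim server code.

    // Sketch only: load the model's templates and render a single user turn.
    common_chat_templates templates = llama_chat_templates_from_model(model, /* template override */ "");
    GGML_ASSERT(templates.template_default);  // the server asserts this is non-null
    std::string prompt = templates.template_default->apply({ {
        {"role", "user"},
        {"content", "Hello"},
    } }, /* tools */ json(), /* add_generation_prompt */ true);
    // prompt now holds the fully rendered chat text, ready to be tokenized.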
@@ -3182,22 +3215,16 @@ int main(int argc, char ** argv) {
const auto model_meta = ctx_server.model_meta();
// if a custom chat template is not supplied, we will use the one that comes with the model (if any)
if (params.chat_template.empty()) {
if (!ctx_server.validate_model_chat_template()) {
LOG_WARNING("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
params.chat_template = "chatml";
}
}
// print sample chat example to make it clear which template is used
{
LOG_INFO("chat template", {
{"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)},
{"chat_template", ctx_server.chat_templates.template_default->source().c_str()},
});
LOG_INFO("chat template", {
{"chat_example", llama_chat_format_example(ctx_server.model, *ctx_server.chat_templates.template_default, ctx_server.params.use_jinja).c_str()},
{"built_in", params.chat_template.empty()},
});
}
//
// Middlewares
//
@@ -3560,9 +3587,11 @@ int main(int argc, char ** argv) {
{ "system_prompt", ctx_server.system_prompt.c_str() },
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
{ "total_slots", ctx_server.params.n_parallel },
{ "chat_template", curr_tmpl.c_str() }
{ "chat_template", ctx_server.chat_templates.template_default->source() },
};
if (ctx_server.params.use_jinja && ctx_server.chat_templates.template_tool_use) {
data["chat_template_tool_use"] = ctx_server.chat_templates.template_tool_use->source();
}
res.set_content(data.dump(), "application/json; charset=utf-8");
};
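
With this change /props reports the Jinja source of the active template, plus the tool-use variant when --jinja is enabled and the model ships one. An illustrative, truncated response; every value below is a placeholder, not real model output:

    {
      "system_prompt": "",
      "default_generation_settings": { "...": "..." },
      "total_slots": 1,
      "chat_template": "{%- for message in messages %} ... {%- endfor %}",
      "chat_template_tool_use": "{%- if tools %} ... {%- endif %}"
    }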
@@ -3573,8 +3602,9 @@ int main(int argc, char ** argv) {
}
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
json data = json::parse(req.body);
auto body = json::parse(req.body);
const auto& chat_template = body.contains("tools") && ctx_server.chat_templates.template_tool_use ? *ctx_server.chat_templates.template_tool_use : *ctx_server.chat_templates.template_default;
json data = oaicompat_completion_params_parse(ctx_server.model, body, chat_template, ctx_server.params.use_jinja);
const int id_task = ctx_server.queue_tasks.get_new_id();
@@ -3674,7 +3704,11 @@ int main(int argc, char ** argv) {
}
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
auto body = json::parse(req.body);
const auto& chat_template = body.contains("tools") && ctx_server.chat_templates.template_tool_use ? *ctx_server.chat_templates.template_tool_use : *ctx_server.chat_templates.template_default;
json data = oaicompat_completion_params_parse(ctx_server.model, body, chat_template, params.use_jinja);
const int id_task = ctx_server.queue_tasks.get_new_id();
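
Both chat endpoints now pick the template the same way: if the request body contains "tools" and the model provides a tool-use template, template_tool_use is used, otherwise template_default; and, per the parser change in the next file, tools are only accepted when the server runs with --jinja. A minimal request of that shape, with a hypothetical get_weather function purely for illustration:

    {
      "messages": [ { "role": "user", "content": "What is the weather in Tokyo?" } ],
      "tools": [ {
        "type": "function",
        "function": {
          "name": "get_weather",
          "description": "Look up the current weather for a city",
          "parameters": { "type": "object", "properties": { "city": { "type": "string" } } }
        }
      } ]
    }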

View File

@@ -6,6 +6,8 @@
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"
#include "minja.hpp"
#include "chat-template.hpp"
#include "kimi_k2_tools.hpp"
#include "qwen3_tools.hpp"
#include "deepseek_r1_tools.hpp"
@@ -125,7 +127,7 @@ static inline void server_log(const char * level, const char * function, int lin
//
// Format given chat. If tmpl is empty, we take the template from model metadata
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages, const json & tools = json::array(), const std::string & model_name = "") {
inline std::string format_chat(const struct llama_model * model, common_chat_template tmpl, const std::vector<json> & messages, const json & tools = json::array(), const std::string & model_name = "") {
std::vector<llama_chat_msg> chat;
// Inject tools into the first system message, or create one if none exists
@@ -197,8 +199,8 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
chat.push_back({role, content});
}
auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true, /* use_jinja= */ false);
auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
return formatted_chat;
}
@@ -425,46 +427,24 @@ static tool_choice_type tool_choice_parse_oaicompat(const std::string & tool_cho
static json oaicompat_completion_params_parse(
const struct llama_model * model,
const json & body, /* openai api json semantics */
const std::string & chat_template) {
const common_chat_template& tmpl,
bool use_jinja) {
json llama_params;
llama_params["__oaicompat"] = true;
auto tools = json_value(body, "tools", json());
auto has_tools = tools.is_array() && !tools.empty();
if (has_tools) {
if (use_jinja) {
fprintf(stdout, "tools param is not fully supported yet\n");
// Extract tools from the request body
json tools = json_value(body, "tools", json::array());
}
// Debug: Log system prompt when tools are detected
if (!tools.empty() && server_verbose) {
LOG_VERBOSE("Tool calls detected in request", {
{"tool_count", tools.size()},
{"model", json_value(body, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}
});
// Extract and log system prompt from messages
if (body.contains("messages") && body["messages"].is_array()) {
for (const auto& msg : body["messages"]) {
if (msg.contains("role") && msg["role"] == "system" && msg.contains("content")) {
std::string content_str;
if (msg["content"].is_string()) {
content_str = msg["content"];
} else if (msg["content"].is_array()) {
// Handle content blocks format
for (const auto& block : msg["content"]) {
if (block.contains("type") && block["type"] == "text" && block.contains("text")) {
if (!content_str.empty()) content_str += " ";
content_str += block["text"];
}
}
}
if (!content_str.empty()) {
LOG_VERBOSE("System prompt with tools", {
{"system_prompt", content_str.substr(0, 500) + (content_str.length() > 500 ? "..." : "")}
});
}
break; // Only log first system message
}
}
else {
throw std::runtime_error("tools param requires --jinja flag");
}
}
@@ -472,7 +452,7 @@ static json oaicompat_completion_params_parse(
std::string model_name = json_value(body, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
// Apply chat template to the list of messages with tools
llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"), tools, model_name);
llama_params["prompt"] = format_chat(model, tmpl, body.at("messages"), tools, model_name);
// Handle "stop" field
if (body.contains("stop") && body.at("stop").is_string()) {
@@ -491,6 +471,13 @@ static json oaicompat_completion_params_parse(
throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
}
}
// Apply chat template to the list of messages
if (use_jinja) {
llama_params["prompt"] = tmpl.apply(body.at("messages"), tools, /* add_generation_prompt= */ true);
}
else {
llama_params["prompt"] = format_chat(model, tmpl, body.at("messages"));
}
// Handle "n" field
int n_choices = json_value(body, "n", 1);
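
To make the use_jinja branch concrete, a small rendering sketch through the bundled minja engine. The minja::chat_template constructor shown here is an assumption based on chat-template.hpp, and the chatml-style template string is invented for the example:

    // Illustrative only: render one user turn through a chatml-style jinja template.
    minja::chat_template tmpl(
        "{% for m in messages %}<|im_start|>{{ m.role }}\n{{ m.content }}<|im_end|>\n{% endfor %}"
        "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
        /* bos_token */ "", /* eos_token */ "");
    std::string prompt = tmpl.apply({ {
        {"role", "user"},
        {"content", "Hello"},
    } }, /* tools */ json(), /* add_generation_prompt */ true);
    // prompt is roughly: "<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\n"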