From 102f77b7d34c7d14f58095985f2268ea5892e482 Mon Sep 17 00:00:00 2001
From: RodriMora
Date: Sat, 14 Feb 2026 08:30:18 +0100
Subject: [PATCH] server: add /v1/responses support (#1184)

* server: add /v1/responses support

* server: fix Responses API model fallback and SSE branching
---
 examples/server/README.md                     |  44 ++-
 examples/server/server-common.cpp             | 270 +++++++++++++++
 examples/server/server-common.h               |   6 +-
 examples/server/server-context.cpp            |  35 +-
 examples/server/server-context.h              |   8 +
 examples/server/server-task.cpp               | 309 ++++++++++++++++++
 examples/server/server-task.h                 |  23 ++
 examples/server/server.cpp                    |  31 +-
 examples/server/tests/features/server.feature |  16 +
 .../tests/features/steps/responses_steps.py   | 191 +++++++++++
 10 files changed, 926 insertions(+), 7 deletions(-)
 create mode 100644 examples/server/tests/features/steps/responses_steps.py

diff --git a/examples/server/README.md b/examples/server/README.md
index cec5dc22..05d48e1f 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -6,7 +6,7 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
 
 **Features:**
  * LLM inference of F16 and quantized models on GPU and CPU
- * [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
+ * [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions, responses, and embeddings routes
  * Parallel decoding with multi-user support
  * Continuous batching
  * Multimodal (wip)
@@ -706,6 +706,48 @@ curl http://localhost:8080/v1/chat/completions \
 
 **See our [Function calling](../../docs/function-calling.md) docs** for more details, supported native tool call styles (generic tool call style is used as fallback) / examples of use.
 
+### POST `/v1/responses`: OpenAI-compatible Responses API
+
+*Options:*
+
+See [OpenAI Responses API documentation](https://platform.openai.com/docs/api-reference/responses).
+
+*Examples:*
+
+You can use either the Python `openai` library:
+
+```python
+import openai
+
+client = openai.OpenAI(
+    base_url="http://localhost:8080/v1", # "http://<Your api-like endpoint>:port"
+    api_key = "sk-no-key-required"
+)
+
+response = client.responses.create(
+    model="gpt-4.1",
+    instructions="You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests.",
+    input="Write a limerick about python exceptions"
+)
+
+print(response.output_text)
+```
+
+... or raw HTTP requests:
+
+```shell
+curl http://localhost:8080/v1/responses \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer no-key" \
+  -d '{
+    "model": "gpt-4.1",
+    "instructions": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests.",
+    "input": "Write a limerick about python exceptions"
+  }'
+```
+
+This endpoint works by converting Responses API requests into Chat Completions requests.
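+
+The endpoint also supports streaming: pass `"stream": true` and the server emits server-sent events such as `response.created`, `response.output_item.added`, `response.output_text.delta` and `response.completed`. Below is a minimal streaming sketch using the Python `openai` client; it only handles text deltas, while reasoning deltas (`response.reasoning_text.delta`) and tool-call events (`response.function_call_arguments.delta`) are ignored for brevity:
+
+```python
+import openai
+
+client = openai.OpenAI(
+    base_url="http://localhost:8080/v1",
+    api_key="sk-no-key-required"
+)
+
+stream = client.responses.create(
+    model="gpt-4.1",
+    input="Write a limerick about python exceptions",
+    stream=True,
+)
+
+for event in stream:
+    # each server-sent event carries its type; incremental text arrives as response.output_text.delta
+    if event.type == "response.output_text.delta":
+        print(event.delta, end="", flush=True)
+```
+
+Note that `previous_response_id` and `input_file` content parts are not supported; conversation state has to be resent in `input` with every request.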
+ ### POST `/v1/embeddings`: OpenAI-compatible embeddings API *Options:* diff --git a/examples/server/server-common.cpp b/examples/server/server-common.cpp index 26a6d8bd..81306e81 100644 --- a/examples/server/server-common.cpp +++ b/examples/server/server-common.cpp @@ -1,5 +1,7 @@ #include "server-common.h" +#include + using raw_buffer = std::vector; @@ -505,6 +507,30 @@ bool server_sent_event(httplib::DataSink& sink, const json& data) { return true; } +bool server_sent_oai_resp_event(httplib::DataSink& sink, const json& data) { + static auto send_single = [](httplib::DataSink& sink, const json& data) -> bool { + const std::string str = + "event: " + data.at("event").get() + "\n" + + "data: " + data.at("data").dump(-1, ' ', false, json::error_handler_t::replace) + "\n\n"; + + LOG_DBG("data stream, to_send: %s", str.c_str()); + return sink.write(str.c_str(), str.size()); + }; + + if (data.is_array()) { + for (const auto& item : data) { + if (!send_single(sink, item)) { + return false; + } + } + } + else { + return send_single(sink, data); + } + + return true; +} + bool server_sent_anthropic_event(httplib::DataSink& sink, const json& data) { static auto send_single = [](httplib::DataSink& sink, const json& data) -> bool { const std::string str = @@ -874,6 +900,250 @@ json oaicompat_chat_params_parse( return llama_params; } +json convert_responses_to_chatcmpl(const json& response_body) { + if (!response_body.contains("input")) { + throw std::runtime_error("'input' is required"); + } + if (!json_value(response_body, "previous_response_id", std::string{}).empty()) { + throw std::runtime_error("ik_llama.cpp does not support 'previous_response_id'."); + } + + const json input_value = response_body.at("input"); + json chatcmpl_body = response_body; + chatcmpl_body.erase("input"); + std::vector chatcmpl_messages; + + if (response_body.contains("instructions")) { + chatcmpl_messages.push_back({ + {"role", "system"}, + {"content", json_value(response_body, "instructions", std::string())}, + }); + chatcmpl_body.erase("instructions"); + } + + if (input_value.is_string()) { + chatcmpl_messages.push_back({ + {"role", "user"}, + {"content", input_value}, + }); + } + else if (input_value.is_array()) { + static auto exists_and_is_array = [](const json& j, const char* key) -> bool { + return j.contains(key) && j.at(key).is_array(); + }; + static auto exists_and_is_string = [](const json& j, const char* key) -> bool { + return j.contains(key) && j.at(key).is_string(); + }; + + for (json item : input_value) { + if (exists_and_is_string(item, "content")) { + item["content"] = json::array({ + json{ + {"text", item.at("content")}, + {"type", "input_text"}, + } + }); + } + + if (exists_and_is_array(item, "content") && + exists_and_is_string(item, "role") && + (item.at("role") == "user" || item.at("role") == "system" || item.at("role") == "developer") + ) { + std::vector chatcmpl_content; + + for (const json& input_item : item.at("content")) { + const std::string type = json_value(input_item, "type", std::string()); + + if (type == "input_text") { + if (!input_item.contains("text")) { + throw std::runtime_error("'Input text' requires 'text'"); + } + chatcmpl_content.push_back({ + {"text", input_item.at("text")}, + {"type", "text"}, + }); + } + else if (type == "input_image") { + if (!input_item.contains("image_url")) { + throw std::runtime_error("'image_url' is required"); + } + chatcmpl_content.push_back({ + {"image_url", json{ + {"url", input_item.at("image_url")}, + }}, + {"type", "image_url"}, + }); + } + 
else if (type == "input_file") { + throw std::runtime_error("'input_file' is not supported by ik_llama.cpp at this moment"); + } + else { + throw std::runtime_error("'type' must be one of 'input_text', 'input_image', or 'input_file'"); + } + } + + if (item.contains("type")) { + item.erase("type"); + } + if (item.contains("status")) { + item.erase("status"); + } + item["content"] = chatcmpl_content; + + chatcmpl_messages.push_back(item); + } + else if (exists_and_is_array(item, "content") && + exists_and_is_string(item, "role") && + item.at("role") == "assistant" && + exists_and_is_string(item, "type") && + item.at("type") == "message" + ) { + std::vector chatcmpl_content; + + for (const auto& output_text : item.at("content")) { + const std::string type = json_value(output_text, "type", std::string()); + if (type != "output_text") { + throw std::runtime_error("'type' must be 'output_text'"); + } + if (!exists_and_is_string(output_text, "text")) { + throw std::runtime_error("'Output text' requires 'text'"); + } + chatcmpl_content.push_back({ + {"text", output_text.at("text")}, + {"type", "text"}, + }); + } + + item.erase("status"); + item.erase("type"); + item["content"] = chatcmpl_content; + chatcmpl_messages.push_back(item); + } + else if (exists_and_is_string(item, "arguments") && + exists_and_is_string(item, "call_id") && + exists_and_is_string(item, "name") && + exists_and_is_string(item, "type") && + item.at("type") == "function_call" + ) { + json msg = json{ + {"role", "assistant"}, + {"tool_calls", json::array({json{ + {"function", json{ + {"arguments", item.at("arguments")}, + {"name", item.at("name")}, + }}, + {"id", item.at("call_id")}, + {"type", "function"}, + }})}, + }; + + if (!chatcmpl_messages.empty() && chatcmpl_messages.back().contains("reasoning_content")) { + msg["reasoning_content"] = chatcmpl_messages.back().at("reasoning_content"); + chatcmpl_messages.pop_back(); + } + chatcmpl_messages.push_back(msg); + } + else if (exists_and_is_string(item, "call_id") && + (exists_and_is_string(item, "output") || exists_and_is_array(item, "output")) && + exists_and_is_string(item, "type") && + item.at("type") == "function_call_output" + ) { + if (item.at("output").is_string()) { + chatcmpl_messages.push_back(json{ + {"content", item.at("output")}, + {"role", "tool"}, + {"tool_call_id", item.at("call_id")}, + }); + } + else { + json chatcmpl_outputs = item.at("output"); + for (json& chatcmpl_output : chatcmpl_outputs) { + if (!chatcmpl_output.contains("type") || chatcmpl_output.at("type") != "input_text") { + throw std::runtime_error("Output of tool call should be 'Input text'"); + } + chatcmpl_output["type"] = "text"; + } + chatcmpl_messages.push_back(json{ + {"content", chatcmpl_outputs}, + {"role", "tool"}, + {"tool_call_id", item.at("call_id")}, + }); + } + } + else if (exists_and_is_array(item, "summary") && + exists_and_is_string(item, "type") && + item.at("type") == "reasoning") { + if (!exists_and_is_array(item, "content")) { + throw std::runtime_error("item['content'] is not an array"); + } + if (item.at("content").empty()) { + throw std::runtime_error("item['content'] is empty"); + } + if (!exists_and_is_string(item.at("content")[0], "text")) { + throw std::runtime_error("item['content']['text'] is not a string"); + } + + chatcmpl_messages.push_back(json{ + {"role", "assistant"}, + {"content", json::array()}, + {"reasoning_content", item.at("content")[0].at("text")}, + }); + } + else { + throw std::runtime_error("Cannot determine type of 'item'"); + } + } + } + else { + 
throw std::runtime_error("'input' must be a string or array of objects"); + } + + chatcmpl_messages.erase(std::remove_if( + chatcmpl_messages.begin(), + chatcmpl_messages.end(), + [](const json& x) { + return x.contains("role") && + x.at("role") == "assistant" && + x.contains("content") && + x.at("content") == json::array() && + x.contains("reasoning_content"); + }), + chatcmpl_messages.end()); + + chatcmpl_body["messages"] = chatcmpl_messages; + + if (response_body.contains("tools")) { + if (!response_body.at("tools").is_array()) { + throw std::runtime_error("'tools' must be an array of objects"); + } + std::vector chatcmpl_tools; + for (json resp_tool : response_body.at("tools")) { + json chatcmpl_tool; + + if (json_value(resp_tool, "type", std::string()) != "function") { + throw std::runtime_error("'type' of tool must be 'function'"); + } + resp_tool.erase("type"); + chatcmpl_tool["type"] = "function"; + + if (!resp_tool.contains("strict")) { + resp_tool["strict"] = true; + } + chatcmpl_tool["function"] = resp_tool; + chatcmpl_tools.push_back(chatcmpl_tool); + } + chatcmpl_body.erase("tools"); + chatcmpl_body["tools"] = chatcmpl_tools; + } + + if (response_body.contains("max_output_tokens")) { + chatcmpl_body.erase("max_output_tokens"); + chatcmpl_body["max_tokens"] = response_body["max_output_tokens"]; + } + + return chatcmpl_body; +} + json anthropic_params_from_json( const struct llama_model* model, const json& body_in, /* anthropic messages api json semantics */ diff --git a/examples/server/server-common.h b/examples/server/server-common.h index c8f4b64e..de2217f3 100644 --- a/examples/server/server-common.h +++ b/examples/server/server-common.h @@ -233,6 +233,8 @@ json probs_vector_to_json(const llama_context* ctx, const std::vector& out_files); +// convert OpenAI Responses API format to OpenAI Chat Completions API format +json convert_responses_to_chatcmpl(const json& body); + json anthropic_params_from_json( const struct llama_model* model, const json& body_in, /* anthropic messages api json semantics */ @@ -474,4 +479,3 @@ bool prompt_cache_equal(llama_context* ctx, const server_tokens& cache_tokens, const server_tokens& prompt_tokens, size_t start, const common_prefix& prefix); std::string safe_json_to_str(const json& data); - diff --git a/examples/server/server-context.cpp b/examples/server/server-context.cpp index 00b72fef..43d9c0a9 100644 --- a/examples/server/server-context.cpp +++ b/examples/server/server-context.cpp @@ -337,6 +337,13 @@ void server_slot::reset() { json_schema = json(); generated_tool_call_ids.clear(); + oai_resp_thinking_block_started = false; + oai_resp_text_block_started = false; + oai_resp_id.clear(); + oai_resp_reasoning_id.clear(); + oai_resp_message_id.clear(); + oai_resp_fc_id.clear(); + task.reset(); } @@ -791,7 +798,7 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task) const llama_vocab* vocab = llama_model_get_vocab(model); if (data.count("__oaicompat") != 0) { slot.oaicompat = true; - slot.oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); + slot.oaicompat_model = task.params.oaicompat_model; } else { slot.oaicompat = false; @@ -799,6 +806,13 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task) } slot.params.oaicompat = task.params.oaicompat; slot.params.oaicompat_cmpl_id =task.params.oaicompat_cmpl_id; + + slot.oai_resp_thinking_block_started = false; + slot.oai_resp_text_block_started = false; + slot.oai_resp_id = "resp_" + random_string(); + 
slot.oai_resp_reasoning_id = "rs_" + random_string(); + slot.oai_resp_message_id = "msg_" + random_string(); + slot.oai_resp_fc_id.clear(); slot.params.timings_per_token = json_value(data, "timings_per_token", false); slot.params.stream = json_value(data, "stream", false); auto stream_opt = json_value(data, "stream_options", json::object()); @@ -1593,6 +1607,10 @@ void server_context::send_partial_response(server_slot& slot, completion_token_o res->oaicompat = slot.params.oaicompat; res->oaicompat_model = slot.task->params.oaicompat_model; res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id; + res->oai_resp_id = slot.oai_resp_id; + res->oai_resp_reasoning_id = slot.oai_resp_reasoning_id; + res->oai_resp_message_id = slot.oai_resp_message_id; + res->oai_resp_fc_id = slot.oai_resp_fc_id; res->n_decoded = slot.n_decoded; res->n_prompt_tokens = slot.n_prompt_tokens; res->data = json{ @@ -1608,6 +1626,9 @@ void server_context::send_partial_response(server_slot& slot, completion_token_o res->anthropic_thinking_block_started = slot.anthropic_thinking_block_started; res->anthropic_text_block_started = slot.anthropic_text_block_started; + res->oai_resp_thinking_block_started = slot.oai_resp_thinking_block_started; + res->oai_resp_text_block_started = slot.oai_resp_text_block_started; + for (const auto& diff : res->oaicompat_msg_diffs) { if (!diff.reasoning_content_delta.empty() && !slot.anthropic_thinking_block_started) { slot.anthropic_thinking_block_started = true; @@ -1615,6 +1636,15 @@ void server_context::send_partial_response(server_slot& slot, completion_token_o if (!diff.content_delta.empty() && !slot.anthropic_text_block_started) { slot.anthropic_text_block_started = true; } + if (!diff.reasoning_content_delta.empty() && !slot.oai_resp_thinking_block_started) { + slot.oai_resp_thinking_block_started = true; + } + if (!diff.content_delta.empty() && !slot.oai_resp_text_block_started) { + slot.oai_resp_text_block_started = true; + } + if (!diff.tool_call_delta.name.empty()) { + slot.oai_resp_fc_id = diff.tool_call_delta.id; + } } // populate res->probs_output @@ -1650,6 +1680,9 @@ void server_context::send_final_response(server_slot& slot) { res->oaicompat = slot.params.oaicompat; res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id; res->oaicompat_msg = slot.update_chat_msg(res->oaicompat_msg_diffs); + res->oai_resp_id = slot.oai_resp_id; + res->oai_resp_reasoning_id = slot.oai_resp_reasoning_id; + res->oai_resp_message_id = slot.oai_resp_message_id; res->n_decoded = slot.n_decoded; res->n_prompt_tokens = slot.n_prompt_tokens; res->oaicompat_model = slot.task->params.oaicompat_model; diff --git a/examples/server/server-context.h b/examples/server/server-context.h index 47a065db..2153b44b 100644 --- a/examples/server/server-context.h +++ b/examples/server/server-context.h @@ -114,6 +114,14 @@ struct server_slot { bool anthropic_thinking_block_started = false; bool anthropic_text_block_started = false; + bool oai_resp_thinking_block_started = false; + bool oai_resp_text_block_started = false; + + std::string oai_resp_id; + std::string oai_resp_reasoning_id; + std::string oai_resp_message_id; + std::string oai_resp_fc_id; + int32_t ga_i = 0; // group-attention state int32_t ga_n = 1; // group-attention factor int32_t ga_w = 512; // group-attention width diff --git a/examples/server/server-task.cpp b/examples/server/server-task.cpp index cbb52077..873f214c 100644 --- a/examples/server/server-task.cpp +++ b/examples/server/server-task.cpp @@ -218,6 +218,133 @@ json 
server_task_result_cmpl_partial::to_json_oaicompat_chat_partial() { return deltas; } +json server_task_result_cmpl_partial::to_json_oaicompat_resp_partial() { + std::vector events; + + if (n_decoded == 1) { + events.push_back(json{ + {"event", "response.created"}, + {"data", json{ + {"type", "response.created"}, + {"response", json{ + {"id", oai_resp_id}, + {"object", "response"}, + {"status", "in_progress"}, + }}, + }}, + }); + events.push_back(json{ + {"event", "response.in_progress"}, + {"data", json{ + {"type", "response.in_progress"}, + {"response", json{ + {"id", oai_resp_id}, + {"object", "response"}, + {"status", "in_progress"}, + }}, + }}, + }); + } + + for (const auto& diff : oaicompat_msg_diffs) { + if (!diff.reasoning_content_delta.empty()) { + if (!oai_resp_thinking_block_started) { + events.push_back(json{ + {"event", "response.output_item.added"}, + {"data", json{ + {"type", "response.output_item.added"}, + {"item", json{ + {"id", oai_resp_reasoning_id}, + {"summary", json::array()}, + {"type", "reasoning"}, + {"content", json::array()}, + {"encrypted_content", ""}, + {"status", "in_progress"}, + }}, + }}, + }); + oai_resp_thinking_block_started = true; + } + events.push_back(json{ + {"event", "response.reasoning_text.delta"}, + {"data", json{ + {"type", "response.reasoning_text.delta"}, + {"delta", diff.reasoning_content_delta}, + {"item_id", oai_resp_reasoning_id}, + }}, + }); + } + + if (!diff.content_delta.empty()) { + if (!oai_resp_text_block_started) { + events.push_back(json{ + {"event", "response.output_item.added"}, + {"data", json{ + {"type", "response.output_item.added"}, + {"item", json{ + {"content", json::array()}, + {"id", oai_resp_message_id}, + {"role", "assistant"}, + {"status", "in_progress"}, + {"type", "message"}, + }}, + }}, + }); + events.push_back(json{ + {"event", "response.content_part.added"}, + {"data", json{ + {"type", "response.content_part.added"}, + {"item_id", oai_resp_message_id}, + {"part", json{ + {"type", "output_text"}, + {"text", ""}, + }}, + }}, + }); + oai_resp_text_block_started = true; + } + events.push_back(json{ + {"event", "response.output_text.delta"}, + {"data", json{ + {"type", "response.output_text.delta"}, + {"item_id", oai_resp_message_id}, + {"delta", diff.content_delta}, + }}, + }); + } + + if (!diff.tool_call_delta.name.empty()) { + events.push_back(json{ + {"event", "response.output_item.added"}, + {"data", json{ + {"type", "response.output_item.added"}, + {"item", json{ + {"arguments", ""}, + {"call_id", "fc_" + diff.tool_call_delta.id}, + {"name", diff.tool_call_delta.name}, + {"type", "function_call"}, + {"status", "in_progress"}, + }}, + }}, + }); + oai_resp_fc_id = diff.tool_call_delta.id; + } + + if (!diff.tool_call_delta.arguments.empty()) { + events.push_back(json{ + {"event", "response.function_call_arguments.delta"}, + {"data", json{ + {"type", "response.function_call_arguments.delta"}, + {"delta", diff.tool_call_delta.arguments}, + {"item_id", "fc_" + oai_resp_fc_id}, + }}, + }); + } + } + + return events; +} + json server_task_result_cmpl_final::to_json_oaicompat_chat_final() { std::string finish_reason = "length"; common_chat_msg msg; @@ -336,6 +463,188 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() { return deltas; } +json server_task_result_cmpl_final::to_json_oaicompat_resp_final() { + common_chat_msg msg; + if (!oaicompat_msg.empty()) { + msg = oaicompat_msg; + } + else { + msg.role = "assistant"; + msg.content = content; + } + + std::vector output; + + if 
(!msg.reasoning_content.empty()) { + output.push_back(json{ + {"id", oai_resp_reasoning_id}, + {"summary", json::array()}, + {"type", "reasoning"}, + {"content", json::array({json{ + {"text", msg.reasoning_content}, + {"type", "reasoning_text"}, + }})}, + {"encrypted_content", ""}, + {"status", "completed"}, + }); + } + + if (!msg.content.empty()) { + output.push_back(json{ + {"content", json::array({json{ + {"type", "output_text"}, + {"annotations", json::array()}, + {"logprobs", json::array()}, + {"text", msg.content}, + }})}, + {"id", oai_resp_message_id}, + {"role", msg.role}, + {"status", "completed"}, + {"type", "message"}, + }); + } + + for (const auto& tool_call : oaicompat_msg.tool_calls) { + output.push_back(json{ + {"type", "function_call"}, + {"status", "completed"}, + {"arguments", tool_call.arguments}, + {"call_id", "fc_" + tool_call.id}, + {"name", tool_call.name}, + }); + } + + std::time_t t = std::time(0); + json res = { + {"completed_at", t}, + {"created_at", t}, + {"id", oai_resp_id}, + {"model", oaicompat_model}, + {"object", "response"}, + {"output", output}, + {"status", "completed"}, + {"usage", json{ + {"input_tokens", n_prompt_tokens}, + {"output_tokens", n_decoded}, + {"total_tokens", n_decoded + n_prompt_tokens}, + }}, + }; + + return res; +} + +json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() { + std::vector events; + std::vector output; + + if (!oaicompat_msg.reasoning_content.empty()) { + const json output_item = json{ + {"id", oai_resp_reasoning_id}, + {"summary", json::array()}, + {"type", "reasoning"}, + {"content", json::array({json{ + {"text", oaicompat_msg.reasoning_content}, + {"type", "reasoning_text"}, + }})}, + {"encrypted_content", ""}, + }; + + events.push_back(json{ + {"event", "response.output_item.done"}, + {"data", json{ + {"type", "response.output_item.done"}, + {"item", output_item}, + }}, + }); + output.push_back(output_item); + } + + if (!oaicompat_msg.content.empty()) { + events.push_back(json{ + {"event", "response.output_text.done"}, + {"data", json{ + {"type", "response.output_text.done"}, + {"item_id", oai_resp_message_id}, + {"text", oaicompat_msg.content}, + }}, + }); + + const json content_part = { + {"type", "output_text"}, + {"annotations", json::array()}, + {"logprobs", json::array()}, + {"text", oaicompat_msg.content}, + }; + + events.push_back(json{ + {"event", "response.content_part.done"}, + {"data", json{ + {"type", "response.content_part.done"}, + {"item_id", oai_resp_message_id}, + {"part", content_part}, + }}, + }); + + const json output_item = { + {"type", "message"}, + {"status", "completed"}, + {"id", oai_resp_message_id}, + {"content", json::array({content_part})}, + {"role", "assistant"}, + }; + + events.push_back(json{ + {"event", "response.output_item.done"}, + {"data", json{ + {"type", "response.output_item.done"}, + {"item", output_item}, + }}, + }); + output.push_back(output_item); + } + + for (const auto& tool_call : oaicompat_msg.tool_calls) { + const json output_item = { + {"type", "function_call"}, + {"status", "completed"}, + {"arguments", tool_call.arguments}, + {"call_id", "fc_" + tool_call.id}, + {"name", tool_call.name}, + }; + events.push_back(json{ + {"event", "response.output_item.done"}, + {"data", json{ + {"type", "response.output_item.done"}, + {"item", output_item}, + }}, + }); + output.push_back(output_item); + } + + std::time_t t = std::time(0); + events.push_back(json{ + {"event", "response.completed"}, + {"data", json{ + {"type", "response.completed"}, + {"response", json{ 
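+                // final response object: same fields as the non-streaming to_json_oaicompat_resp_final() payload, minus "completed_at"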
+ {"id", oai_resp_id}, + {"object", "response"}, + {"created_at", t}, + {"status", "completed"}, + {"model", oaicompat_model}, + {"output", output}, + {"usage", json{ + {"input_tokens", n_prompt_tokens}, + {"output_tokens", n_decoded}, + {"total_tokens", n_decoded + n_prompt_tokens}, + }}, + }}, + }}, + }); + + return events; +} + json server_task_result_cmpl_final::to_json_anthropic_final() { std::string stop_reason = "max_tokens"; if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { diff --git a/examples/server/server-task.h b/examples/server/server-task.h index 5746e339..b62f5a7d 100644 --- a/examples/server/server-task.h +++ b/examples/server/server-task.h @@ -42,6 +42,7 @@ enum oaicompat_type { OAICOMPAT_TYPE_COMPLETION, OAICOMPAT_TYPE_EMBEDDING, OAICOMPAT_TYPE_ANTHROPIC, + OAICOMPAT_TYPE_RESP, }; @@ -203,6 +204,14 @@ struct server_task_result_cmpl_partial : server_task_result { bool anthropic_thinking_block_started = false; bool anthropic_text_block_started = false; + bool oai_resp_thinking_block_started = false; + bool oai_resp_text_block_started = false; + + std::string oai_resp_id; + std::string oai_resp_reasoning_id; + std::string oai_resp_message_id; + std::string oai_resp_fc_id; + virtual bool is_stop() override { return false; // in stream mode, partial responses are not considered stop } @@ -215,6 +224,8 @@ struct server_task_result_cmpl_partial : server_task_result { json to_json_oaicompat_chat_partial(); + json to_json_oaicompat_resp_partial(); + virtual json to_json() override { switch (oaicompat) { case OAICOMPAT_TYPE_NONE: @@ -225,6 +236,8 @@ struct server_task_result_cmpl_partial : server_task_result { return to_json_oaicompat_chat_partial(); case OAICOMPAT_TYPE_ANTHROPIC: return to_json_anthropic_partial(); + case OAICOMPAT_TYPE_RESP: + return to_json_oaicompat_resp_partial(); default: GGML_ASSERT(false && "Invalid oaicompat_type"); }; @@ -232,6 +245,10 @@ struct server_task_result_cmpl_partial : server_task_result { }; struct server_task_result_cmpl_final : server_task_result { + std::string oai_resp_id; + std::string oai_resp_reasoning_id; + std::string oai_resp_message_id; + virtual bool is_stop() override { return true; } @@ -248,6 +265,10 @@ struct server_task_result_cmpl_final : server_task_result { json to_json_oaicompat_chat_stream(); + json to_json_oaicompat_resp_final(); + + json to_json_oaicompat_resp_stream(); + virtual json to_json() override { switch (oaicompat) { case OAICOMPAT_TYPE_NONE: @@ -258,6 +279,8 @@ struct server_task_result_cmpl_final : server_task_result { return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat_final(); case OAICOMPAT_TYPE_ANTHROPIC: return stream ? to_json_anthropic_stream() : to_json_anthropic_final(); + case OAICOMPAT_TYPE_RESP: + return stream ? to_json_oaicompat_resp_stream() : to_json_oaicompat_resp_final(); default: GGML_ASSERT(false && "Invalid oaicompat_type"); } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index ee8edd7b..411cfdb6 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1070,7 +1070,12 @@ int main(int argc, char ** argv) { // Everything else, including multimodal completions. 
inputs = tokenize_input_prompts(llama_get_vocab(ctx_server.ctx), ctx_server.mctx, prompt, true, true); } - tasks.reserve(inputs.size()); + tasks.reserve(inputs.size()); + const std::string requested_model_name = json_value(data, "model", std::string()); + const std::string fallback_model_name = get_model_name(ctx_server.params_base.model); + const std::string oaicompat_model_name = requested_model_name.empty() + ? fallback_model_name + : requested_model_name; for (size_t i = 0; i < inputs.size(); i++) { server_task task = server_task(type); @@ -1088,7 +1093,7 @@ int main(int argc, char ** argv) { // OAI-compat task.params.oaicompat = oaicompat; task.params.oaicompat_cmpl_id = completion_id; - task.params.oaicompat_model = get_model_name(ctx_server.params_base.model); + task.params.oaicompat_model = oaicompat_model_name; tasks.push_back(std::move(task)); } @@ -1146,6 +1151,9 @@ int main(int argc, char ** argv) { if (oaicompat == OAICOMPAT_TYPE_ANTHROPIC) { return server_sent_anthropic_event(sink, res); } + else if (oaicompat == OAICOMPAT_TYPE_RESP) { + return server_sent_oai_resp_event(sink, res); + } else { return server_sent_event(sink, res); } @@ -1170,7 +1178,7 @@ int main(int argc, char ** argv) { json res_json = result->to_json(); bool ok = false; if (result->is_error()) { - ok = sse(json{ { "error", result->to_json() } }); + ok = server_sent_event(sink, json{ { "error", result->to_json() } }); sink.done(); return false; // go to on_complete() } @@ -1189,7 +1197,7 @@ int main(int argc, char ** argv) { // check if there is more data if (!rd->has_next()) { - if (oaicompat != OAICOMPAT_TYPE_ANTHROPIC && oaicompat != OAICOMPAT_TYPE_NONE) { + if (oaicompat != OAICOMPAT_TYPE_ANTHROPIC && oaicompat != OAICOMPAT_TYPE_NONE && oaicompat != OAICOMPAT_TYPE_RESP) { static const std::string ev_done = "data: [DONE]\n\n"; sink.write(ev_done.data(), ev_done.size()); } @@ -1265,6 +1273,20 @@ int main(int argc, char ** argv) { OAICOMPAT_TYPE_CHAT); }; + const auto handle_responses = [&ctx_server, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) { + auto body = json::parse(req.body); + std::vector files; + json body_parsed = convert_responses_to_chatcmpl(body); + json data = oaicompat_chat_params_parse(ctx_server.model, body_parsed, ctx_server.oai_parser_opt, files); + handle_completions_impl( + SERVER_TASK_TYPE_COMPLETION, + data, + files, + req.is_connection_closed, + res, + OAICOMPAT_TYPE_RESP); + }; + const auto handle_anthropic_messages = [&ctx_server, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) { std::vector files; json body = json::parse(req.body); @@ -2008,6 +2030,7 @@ int main(int argc, char ** argv) { svr->Post("/v1/completions", handle_completions_oai); svr->Post("/chat/completions", handle_chat_completions); svr->Post("/v1/chat/completions", handle_chat_completions); + svr->Post("/v1/responses", handle_responses); svr->Post("/v1/messages", handle_anthropic_messages); svr->Post("/v1/messages/count_tokens", handle_anthropic_count_tokens); svr->Post("/infill", handle_infill); diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature index b5597145..b88d9375 100644 --- a/examples/server/tests/features/server.feature +++ b/examples/server/tests/features/server.feature @@ -71,6 +71,22 @@ Feature: llama.cpp server | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. 
| 128 | (thanks\|happy\|bird\|Annabyear)+ | -1 | 64 | enabled | | + Scenario Outline: OAI Responses Compatibility + Given a model + And a system prompt + And a user prompt + And max tokens to predict + And streaming is + Given an OAI compatible responses request with no api error + Then tokens are predicted matching + And prompt tokens are processed + + Examples: Prompts + | model | system_prompt | user_prompt | max_tokens | re_content | n_prompt | n_predicted | enable_streaming | + | llama-2 | Book | What is the best book | 8 | (Here\|what)+ | 77 | 8 | disabled | + | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128 | (thanks\|happy\|bird\|Annabyear)+ | -1 | 64 | enabled | + + Scenario Outline: OAI Compatibility w/ response format Given a model test And a system prompt test diff --git a/examples/server/tests/features/steps/responses_steps.py b/examples/server/tests/features/steps/responses_steps.py new file mode 100644 index 00000000..6119bb62 --- /dev/null +++ b/examples/server/tests/features/steps/responses_steps.py @@ -0,0 +1,191 @@ +import json +from typing import Any + +import aiohttp +from behave import step # pyright: ignore[reportAttributeAccessIssue] +from behave.api.async_step import async_run_until_complete + +import steps + + +@step("an OAI compatible responses request with {api_error} api error") +@async_run_until_complete +async def step_oai_responses(context, api_error): + if context.debug: + print("Submitting OAI compatible responses request...") + expect_api_error = api_error == "raised" + seeds = await steps.completions_seed(context, num_seeds=1) + completion = await oai_responses( + context.prompts.pop(), + seeds[0] if seeds is not None else seeds, + context.system_prompt, + context.base_url, + debug=context.debug, + model=context.model if hasattr(context, "model") else None, + n_predict=context.n_predict if hasattr(context, "n_predict") else None, + enable_streaming=context.enable_streaming + if hasattr(context, "enable_streaming") + else None, + user_api_key=context.user_api_key if hasattr(context, "user_api_key") else None, + temperature=context.temperature, + expect_api_error=expect_api_error, + ) + context.tasks_result.append(completion) + if context.debug: + print(f"Responses completion response: {completion}") + if expect_api_error: + assert completion == 401, f"completion must be an 401 status code: {completion}" + + +def extract_responses_output_text( + response_json: dict[str, Any], +) -> tuple[str, str | None]: + output_text = "" + message_id = None + for item in response_json.get("output", []): + if item.get("type") != "message": + continue + message_id = item.get("id") + for part in item.get("content", []): + if part.get("type") == "output_text": + output_text += part.get("text", "") + return output_text, message_id + + +async def oai_responses( + user_prompt, + seed, + system_prompt, + base_url: str, + debug=False, + temperature=None, + model=None, + n_predict=None, + enable_streaming=None, + user_api_key=None, + expect_api_error=None, +) -> int | dict[str, Any]: + if debug: + print(f"Sending OAI responses request: {user_prompt}") + user_api_key = user_api_key if user_api_key is not None else "nope" + seed = seed if seed is not None else 42 + enable_streaming = enable_streaming if enable_streaming is not None else False + payload = { + "input": [ + { + "role": "system", + "content": system_prompt, + }, + { + "role": "user", + "content": user_prompt, + }, + ], + "model": model, + "stream": enable_streaming, + 
"temperature": temperature if temperature is not None else 0.0, + "seed": seed, + } + if n_predict is not None: + payload["max_output_tokens"] = n_predict + completion_response = { + "content": "", + "timings": { + "predicted_n": 0, + "prompt_n": 0, + }, + } + origin = "llama.cpp" + headers = {"Authorization": f"Bearer {user_api_key}", "Origin": origin} + async with aiohttp.ClientSession() as session: + async with session.post( + f"{base_url}/v1/responses", json=payload, headers=headers + ) as response: + if expect_api_error is not None and expect_api_error: + if response.status == 401: + return 401 + assert False, f"unexpected status code: {response.status}" + + assert response.status == 200 + assert response.headers["Access-Control-Allow-Origin"] == origin + if enable_streaming: + assert response.headers["Content-Type"] == "text/event-stream" + resp_id = "" + msg_id = "" + gathered_text = "" + event_name = None + completed_response = None + async for line_in_bytes in response.content: + line = line_in_bytes.decode("utf-8").strip() + if not line: + continue + if line.startswith("event: "): + event_name = line.split(": ", 1)[1] + continue + if not line.startswith("data: "): + continue + if event_name is None: + continue + chunk_raw = line.split(": ", 1)[1] + data = json.loads(chunk_raw) + + if event_name == "response.created": + resp_id = data["response"]["id"] + assert resp_id.startswith("resp_") + elif event_name == "response.in_progress": + assert data["response"]["id"] == resp_id + elif event_name == "response.output_item.added": + item = data["item"] + if item.get("type") == "message": + msg_id = item["id"] + assert msg_id.startswith("msg_") + elif event_name in ( + "response.content_part.added", + "response.output_text.delta", + "response.output_text.done", + "response.content_part.done", + ): + assert data["item_id"] == msg_id + elif event_name == "response.output_item.done": + item = data["item"] + if item.get("type") == "message": + assert item["id"] == msg_id + if event_name == "response.output_text.delta": + gathered_text += data["delta"] + if event_name == "response.completed": + completed_response = data["response"] + + assert completed_response is not None + output_text, completed_msg_id = extract_responses_output_text( + completed_response + ) + assert completed_msg_id is not None + assert completed_msg_id.startswith("msg_") + assert output_text == gathered_text + completion_response = { + "content": output_text, + "timings": { + "predicted_n": completed_response["usage"]["output_tokens"], + "prompt_n": completed_response["usage"]["input_tokens"], + }, + } + else: + assert ( + response.headers["Content-Type"] + == "application/json; charset=utf-8" + ) + response_json = await response.json() + assert response_json["id"].startswith("resp_") + output_text, message_id = extract_responses_output_text(response_json) + assert message_id is not None + assert message_id.startswith("msg_") + completion_response = { + "content": output_text, + "timings": { + "predicted_n": response_json["usage"]["output_tokens"], + "prompt_n": response_json["usage"]["input_tokens"], + }, + } + if debug: + print("OAI response formatted to llama.cpp:", completion_response) + return completion_response