server: add /v1/responses support (#1184)

* server: add /v1/responses support

* server: fix Responses API model fallback and SSE branching
RodriMora
2026-02-14 08:30:18 +01:00
committed by GitHub
parent 1cb7e1bf39
commit 102f77b7d3
10 changed files with 926 additions and 7 deletions

View File

@@ -6,7 +6,7 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
**Features:**
* LLM inference of F16 and quantized models on GPU and CPU
* [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
* [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions, responses, and embeddings routes
* Parallel decoding with multi-user support
* Continuous batching
* Multimodal (wip)
@@ -706,6 +706,48 @@ curl http://localhost:8080/v1/chat/completions \
**See our [Function calling](../../docs/function-calling.md) docs** for more details on supported native tool-call styles (the generic tool call style is used as a fallback) and examples of use.
### POST `/v1/responses`: OpenAI-compatible Responses API
*Options:*
See [OpenAI Responses API documentation](https://platform.openai.com/docs/api-reference/responses).
*Examples:*
You can use either the Python `openai` library:
```python
import openai
client = openai.OpenAI(
base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
api_key="sk-no-key-required"
)
response = client.responses.create(
model="gpt-4.1",
instructions="You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests.",
input="Write a limerick about python exceptions"
)
print(response.output_text)
```
... or raw HTTP requests:
```shell
curl http://localhost:8080/v1/responses \
-H "Content-Type: application/json" \
-H "Authorization: Bearer no-key" \
-d '{
"model": "gpt-4.1",
"instructions": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests.",
"input": "Write a limerick about python exceptions"
}'
```
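Streaming is also supported: pass `"stream": true` (or `stream=True` with the Python client) and the server emits Responses-style SSE events such as `response.created`, `response.output_text.delta`, and `response.completed`. A minimal sketch with the Python client (the event handling below is illustrative and assumes an `openai` SDK version with Responses streaming support):
```python
import openai

client = openai.OpenAI(
    base_url="http://localhost:8080/v1",
    api_key="sk-no-key-required",
)

stream = client.responses.create(
    model="gpt-4.1",
    input="Write a limerick about python exceptions",
    stream=True,
)

for event in stream:
    # Event types mirror the SSE events emitted by the server.
    if event.type == "response.output_text.delta":
        print(event.delta, end="", flush=True)
    elif event.type == "response.completed":
        print()
```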
This endpoint works by converting Responses requests into Chat Completions requests.
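The mapping is straightforward: `instructions` becomes a `system` message, a string `input` becomes a `user` message, structured `input` items are translated to their Chat Completions equivalents, and `max_output_tokens` is renamed to `max_tokens`. A simplified Python sketch of the equivalent payload (illustrative only; the actual conversion is done server-side in `convert_responses_to_chatcmpl`):
```python
# Rough sketch of the Responses -> Chat Completions conversion, not the exact C++ logic.
responses_request = {
    "instructions": "You are ChatGPT, an AI assistant.",
    "input": "Write a limerick about python exceptions",
    "max_output_tokens": 256,
}

chat_completions_request = {
    "messages": [
        {"role": "system", "content": responses_request["instructions"]},
        {"role": "user", "content": responses_request["input"]},
    ],
    # "max_output_tokens" is renamed; most other fields pass through unchanged.
    "max_tokens": responses_request["max_output_tokens"],
}
```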
### POST `/v1/embeddings`: OpenAI-compatible embeddings API
*Options:*

View File

@@ -1,5 +1,7 @@
#include "server-common.h"
#include <algorithm>
using raw_buffer = std::vector<uint8_t>;
@@ -505,6 +507,30 @@ bool server_sent_event(httplib::DataSink& sink, const json& data) {
return true;
}
bool server_sent_oai_resp_event(httplib::DataSink& sink, const json& data) {
static auto send_single = [](httplib::DataSink& sink, const json& data) -> bool {
const std::string str =
"event: " + data.at("event").get<std::string>() + "\n" +
"data: " + data.at("data").dump(-1, ' ', false, json::error_handler_t::replace) + "\n\n";
LOG_DBG("data stream, to_send: %s", str.c_str());
return sink.write(str.c_str(), str.size());
};
if (data.is_array()) {
for (const auto& item : data) {
if (!send_single(sink, item)) {
return false;
}
}
}
else {
return send_single(sink, data);
}
return true;
}
bool server_sent_anthropic_event(httplib::DataSink& sink, const json& data) {
static auto send_single = [](httplib::DataSink& sink, const json& data) -> bool {
const std::string str =
@@ -874,6 +900,250 @@ json oaicompat_chat_params_parse(
return llama_params;
}
json convert_responses_to_chatcmpl(const json& response_body) {
if (!response_body.contains("input")) {
throw std::runtime_error("'input' is required");
}
if (!json_value(response_body, "previous_response_id", std::string{}).empty()) {
throw std::runtime_error("ik_llama.cpp does not support 'previous_response_id'.");
}
const json input_value = response_body.at("input");
json chatcmpl_body = response_body;
chatcmpl_body.erase("input");
std::vector<json> chatcmpl_messages;
if (response_body.contains("instructions")) {
chatcmpl_messages.push_back({
{"role", "system"},
{"content", json_value(response_body, "instructions", std::string())},
});
chatcmpl_body.erase("instructions");
}
if (input_value.is_string()) {
chatcmpl_messages.push_back({
{"role", "user"},
{"content", input_value},
});
}
else if (input_value.is_array()) {
static auto exists_and_is_array = [](const json& j, const char* key) -> bool {
return j.contains(key) && j.at(key).is_array();
};
static auto exists_and_is_string = [](const json& j, const char* key) -> bool {
return j.contains(key) && j.at(key).is_string();
};
for (json item : input_value) {
if (exists_and_is_string(item, "content")) {
item["content"] = json::array({
json{
{"text", item.at("content")},
{"type", "input_text"},
}
});
}
if (exists_and_is_array(item, "content") &&
exists_and_is_string(item, "role") &&
(item.at("role") == "user" || item.at("role") == "system" || item.at("role") == "developer")
) {
std::vector<json> chatcmpl_content;
for (const json& input_item : item.at("content")) {
const std::string type = json_value(input_item, "type", std::string());
if (type == "input_text") {
if (!input_item.contains("text")) {
throw std::runtime_error("'Input text' requires 'text'");
}
chatcmpl_content.push_back({
{"text", input_item.at("text")},
{"type", "text"},
});
}
else if (type == "input_image") {
if (!input_item.contains("image_url")) {
throw std::runtime_error("'image_url' is required");
}
chatcmpl_content.push_back({
{"image_url", json{
{"url", input_item.at("image_url")},
}},
{"type", "image_url"},
});
}
else if (type == "input_file") {
throw std::runtime_error("'input_file' is not supported by ik_llama.cpp at this moment");
}
else {
throw std::runtime_error("'type' must be one of 'input_text', 'input_image', or 'input_file'");
}
}
if (item.contains("type")) {
item.erase("type");
}
if (item.contains("status")) {
item.erase("status");
}
item["content"] = chatcmpl_content;
chatcmpl_messages.push_back(item);
}
else if (exists_and_is_array(item, "content") &&
exists_and_is_string(item, "role") &&
item.at("role") == "assistant" &&
exists_and_is_string(item, "type") &&
item.at("type") == "message"
) {
std::vector<json> chatcmpl_content;
for (const auto& output_text : item.at("content")) {
const std::string type = json_value(output_text, "type", std::string());
if (type != "output_text") {
throw std::runtime_error("'type' must be 'output_text'");
}
if (!exists_and_is_string(output_text, "text")) {
throw std::runtime_error("'Output text' requires 'text'");
}
chatcmpl_content.push_back({
{"text", output_text.at("text")},
{"type", "text"},
});
}
item.erase("status");
item.erase("type");
item["content"] = chatcmpl_content;
chatcmpl_messages.push_back(item);
}
else if (exists_and_is_string(item, "arguments") &&
exists_and_is_string(item, "call_id") &&
exists_and_is_string(item, "name") &&
exists_and_is_string(item, "type") &&
item.at("type") == "function_call"
) {
json msg = json{
{"role", "assistant"},
{"tool_calls", json::array({json{
{"function", json{
{"arguments", item.at("arguments")},
{"name", item.at("name")},
}},
{"id", item.at("call_id")},
{"type", "function"},
}})},
};
if (!chatcmpl_messages.empty() && chatcmpl_messages.back().contains("reasoning_content")) {
msg["reasoning_content"] = chatcmpl_messages.back().at("reasoning_content");
chatcmpl_messages.pop_back();
}
chatcmpl_messages.push_back(msg);
}
else if (exists_and_is_string(item, "call_id") &&
(exists_and_is_string(item, "output") || exists_and_is_array(item, "output")) &&
exists_and_is_string(item, "type") &&
item.at("type") == "function_call_output"
) {
if (item.at("output").is_string()) {
chatcmpl_messages.push_back(json{
{"content", item.at("output")},
{"role", "tool"},
{"tool_call_id", item.at("call_id")},
});
}
else {
json chatcmpl_outputs = item.at("output");
for (json& chatcmpl_output : chatcmpl_outputs) {
if (!chatcmpl_output.contains("type") || chatcmpl_output.at("type") != "input_text") {
throw std::runtime_error("Output of tool call should be 'Input text'");
}
chatcmpl_output["type"] = "text";
}
chatcmpl_messages.push_back(json{
{"content", chatcmpl_outputs},
{"role", "tool"},
{"tool_call_id", item.at("call_id")},
});
}
}
else if (exists_and_is_array(item, "summary") &&
exists_and_is_string(item, "type") &&
item.at("type") == "reasoning") {
if (!exists_and_is_array(item, "content")) {
throw std::runtime_error("item['content'] is not an array");
}
if (item.at("content").empty()) {
throw std::runtime_error("item['content'] is empty");
}
if (!exists_and_is_string(item.at("content")[0], "text")) {
throw std::runtime_error("item['content']['text'] is not a string");
}
chatcmpl_messages.push_back(json{
{"role", "assistant"},
{"content", json::array()},
{"reasoning_content", item.at("content")[0].at("text")},
});
}
else {
throw std::runtime_error("Cannot determine type of 'item'");
}
}
}
else {
throw std::runtime_error("'input' must be a string or array of objects");
}
chatcmpl_messages.erase(std::remove_if(
chatcmpl_messages.begin(),
chatcmpl_messages.end(),
[](const json& x) {
return x.contains("role") &&
x.at("role") == "assistant" &&
x.contains("content") &&
x.at("content") == json::array() &&
x.contains("reasoning_content");
}),
chatcmpl_messages.end());
chatcmpl_body["messages"] = chatcmpl_messages;
if (response_body.contains("tools")) {
if (!response_body.at("tools").is_array()) {
throw std::runtime_error("'tools' must be an array of objects");
}
std::vector<json> chatcmpl_tools;
for (json resp_tool : response_body.at("tools")) {
json chatcmpl_tool;
if (json_value(resp_tool, "type", std::string()) != "function") {
throw std::runtime_error("'type' of tool must be 'function'");
}
resp_tool.erase("type");
chatcmpl_tool["type"] = "function";
if (!resp_tool.contains("strict")) {
resp_tool["strict"] = true;
}
chatcmpl_tool["function"] = resp_tool;
chatcmpl_tools.push_back(chatcmpl_tool);
}
chatcmpl_body.erase("tools");
chatcmpl_body["tools"] = chatcmpl_tools;
}
if (response_body.contains("max_output_tokens")) {
chatcmpl_body.erase("max_output_tokens");
chatcmpl_body["max_tokens"] = response_body["max_output_tokens"];
}
return chatcmpl_body;
}
json anthropic_params_from_json(
const struct llama_model* model,
const json& body_in, /* anthropic messages api json semantics */

View File

@@ -233,6 +233,8 @@ json probs_vector_to_json(const llama_context* ctx, const std::vector<completion
bool server_sent_event(httplib::DataSink& sink, const json& data);
bool server_sent_oai_resp_event(httplib::DataSink& sink, const json& data);
bool server_sent_anthropic_event(httplib::DataSink& sink, const json& data);
//
@@ -259,6 +261,9 @@ json oaicompat_chat_params_parse(
const oaicompat_parser_options& opt,
std::vector<raw_buffer>& out_files);
// convert OpenAI Responses API format to OpenAI Chat Completions API format
json convert_responses_to_chatcmpl(const json& body);
json anthropic_params_from_json(
const struct llama_model* model,
const json& body_in, /* anthropic messages api json semantics */
@@ -474,4 +479,3 @@ bool prompt_cache_equal(llama_context* ctx, const server_tokens& cache_tokens,
const server_tokens& prompt_tokens, size_t start, const common_prefix& prefix);
std::string safe_json_to_str(const json& data);

View File

@@ -337,6 +337,13 @@ void server_slot::reset() {
json_schema = json();
generated_tool_call_ids.clear();
oai_resp_thinking_block_started = false;
oai_resp_text_block_started = false;
oai_resp_id.clear();
oai_resp_reasoning_id.clear();
oai_resp_message_id.clear();
oai_resp_fc_id.clear();
task.reset();
}
@@ -791,7 +798,7 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task)
const llama_vocab* vocab = llama_model_get_vocab(model);
if (data.count("__oaicompat") != 0) {
slot.oaicompat = true;
slot.oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
slot.oaicompat_model = task.params.oaicompat_model;
}
else {
slot.oaicompat = false;
@@ -799,6 +806,13 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task)
}
slot.params.oaicompat = task.params.oaicompat;
slot.params.oaicompat_cmpl_id = task.params.oaicompat_cmpl_id;
slot.oai_resp_thinking_block_started = false;
slot.oai_resp_text_block_started = false;
slot.oai_resp_id = "resp_" + random_string();
slot.oai_resp_reasoning_id = "rs_" + random_string();
slot.oai_resp_message_id = "msg_" + random_string();
slot.oai_resp_fc_id.clear();
slot.params.timings_per_token = json_value(data, "timings_per_token", false);
slot.params.stream = json_value(data, "stream", false);
auto stream_opt = json_value(data, "stream_options", json::object());
@@ -1593,6 +1607,10 @@ void server_context::send_partial_response(server_slot& slot, completion_token_o
res->oaicompat = slot.params.oaicompat;
res->oaicompat_model = slot.task->params.oaicompat_model;
res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;
res->oai_resp_id = slot.oai_resp_id;
res->oai_resp_reasoning_id = slot.oai_resp_reasoning_id;
res->oai_resp_message_id = slot.oai_resp_message_id;
res->oai_resp_fc_id = slot.oai_resp_fc_id;
res->n_decoded = slot.n_decoded;
res->n_prompt_tokens = slot.n_prompt_tokens;
res->data = json{
@@ -1608,6 +1626,9 @@ void server_context::send_partial_response(server_slot& slot, completion_token_o
res->anthropic_thinking_block_started = slot.anthropic_thinking_block_started;
res->anthropic_text_block_started = slot.anthropic_text_block_started;
res->oai_resp_thinking_block_started = slot.oai_resp_thinking_block_started;
res->oai_resp_text_block_started = slot.oai_resp_text_block_started;
for (const auto& diff : res->oaicompat_msg_diffs) {
if (!diff.reasoning_content_delta.empty() && !slot.anthropic_thinking_block_started) {
slot.anthropic_thinking_block_started = true;
@@ -1615,6 +1636,15 @@ void server_context::send_partial_response(server_slot& slot, completion_token_o
if (!diff.content_delta.empty() && !slot.anthropic_text_block_started) {
slot.anthropic_text_block_started = true;
}
if (!diff.reasoning_content_delta.empty() && !slot.oai_resp_thinking_block_started) {
slot.oai_resp_thinking_block_started = true;
}
if (!diff.content_delta.empty() && !slot.oai_resp_text_block_started) {
slot.oai_resp_text_block_started = true;
}
if (!diff.tool_call_delta.name.empty()) {
slot.oai_resp_fc_id = diff.tool_call_delta.id;
}
}
// populate res->probs_output
@@ -1650,6 +1680,9 @@ void server_context::send_final_response(server_slot& slot) {
res->oaicompat = slot.params.oaicompat;
res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;
res->oaicompat_msg = slot.update_chat_msg(res->oaicompat_msg_diffs);
res->oai_resp_id = slot.oai_resp_id;
res->oai_resp_reasoning_id = slot.oai_resp_reasoning_id;
res->oai_resp_message_id = slot.oai_resp_message_id;
res->n_decoded = slot.n_decoded;
res->n_prompt_tokens = slot.n_prompt_tokens;
res->oaicompat_model = slot.task->params.oaicompat_model;

View File

@@ -114,6 +114,14 @@ struct server_slot {
bool anthropic_thinking_block_started = false;
bool anthropic_text_block_started = false;
bool oai_resp_thinking_block_started = false;
bool oai_resp_text_block_started = false;
std::string oai_resp_id;
std::string oai_resp_reasoning_id;
std::string oai_resp_message_id;
std::string oai_resp_fc_id;
int32_t ga_i = 0; // group-attention state
int32_t ga_n = 1; // group-attention factor
int32_t ga_w = 512; // group-attention width

View File

@@ -218,6 +218,133 @@ json server_task_result_cmpl_partial::to_json_oaicompat_chat_partial() {
return deltas;
}
json server_task_result_cmpl_partial::to_json_oaicompat_resp_partial() {
std::vector<json> events;
if (n_decoded == 1) {
events.push_back(json{
{"event", "response.created"},
{"data", json{
{"type", "response.created"},
{"response", json{
{"id", oai_resp_id},
{"object", "response"},
{"status", "in_progress"},
}},
}},
});
events.push_back(json{
{"event", "response.in_progress"},
{"data", json{
{"type", "response.in_progress"},
{"response", json{
{"id", oai_resp_id},
{"object", "response"},
{"status", "in_progress"},
}},
}},
});
}
for (const auto& diff : oaicompat_msg_diffs) {
if (!diff.reasoning_content_delta.empty()) {
if (!oai_resp_thinking_block_started) {
events.push_back(json{
{"event", "response.output_item.added"},
{"data", json{
{"type", "response.output_item.added"},
{"item", json{
{"id", oai_resp_reasoning_id},
{"summary", json::array()},
{"type", "reasoning"},
{"content", json::array()},
{"encrypted_content", ""},
{"status", "in_progress"},
}},
}},
});
oai_resp_thinking_block_started = true;
}
events.push_back(json{
{"event", "response.reasoning_text.delta"},
{"data", json{
{"type", "response.reasoning_text.delta"},
{"delta", diff.reasoning_content_delta},
{"item_id", oai_resp_reasoning_id},
}},
});
}
if (!diff.content_delta.empty()) {
if (!oai_resp_text_block_started) {
events.push_back(json{
{"event", "response.output_item.added"},
{"data", json{
{"type", "response.output_item.added"},
{"item", json{
{"content", json::array()},
{"id", oai_resp_message_id},
{"role", "assistant"},
{"status", "in_progress"},
{"type", "message"},
}},
}},
});
events.push_back(json{
{"event", "response.content_part.added"},
{"data", json{
{"type", "response.content_part.added"},
{"item_id", oai_resp_message_id},
{"part", json{
{"type", "output_text"},
{"text", ""},
}},
}},
});
oai_resp_text_block_started = true;
}
events.push_back(json{
{"event", "response.output_text.delta"},
{"data", json{
{"type", "response.output_text.delta"},
{"item_id", oai_resp_message_id},
{"delta", diff.content_delta},
}},
});
}
if (!diff.tool_call_delta.name.empty()) {
events.push_back(json{
{"event", "response.output_item.added"},
{"data", json{
{"type", "response.output_item.added"},
{"item", json{
{"arguments", ""},
{"call_id", "fc_" + diff.tool_call_delta.id},
{"name", diff.tool_call_delta.name},
{"type", "function_call"},
{"status", "in_progress"},
}},
}},
});
oai_resp_fc_id = diff.tool_call_delta.id;
}
if (!diff.tool_call_delta.arguments.empty()) {
events.push_back(json{
{"event", "response.function_call_arguments.delta"},
{"data", json{
{"type", "response.function_call_arguments.delta"},
{"delta", diff.tool_call_delta.arguments},
{"item_id", "fc_" + oai_resp_fc_id},
}},
});
}
}
return events;
}
json server_task_result_cmpl_final::to_json_oaicompat_chat_final() {
std::string finish_reason = "length";
common_chat_msg msg;
@@ -336,6 +463,188 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() {
return deltas;
}
json server_task_result_cmpl_final::to_json_oaicompat_resp_final() {
common_chat_msg msg;
if (!oaicompat_msg.empty()) {
msg = oaicompat_msg;
}
else {
msg.role = "assistant";
msg.content = content;
}
std::vector<json> output;
if (!msg.reasoning_content.empty()) {
output.push_back(json{
{"id", oai_resp_reasoning_id},
{"summary", json::array()},
{"type", "reasoning"},
{"content", json::array({json{
{"text", msg.reasoning_content},
{"type", "reasoning_text"},
}})},
{"encrypted_content", ""},
{"status", "completed"},
});
}
if (!msg.content.empty()) {
output.push_back(json{
{"content", json::array({json{
{"type", "output_text"},
{"annotations", json::array()},
{"logprobs", json::array()},
{"text", msg.content},
}})},
{"id", oai_resp_message_id},
{"role", msg.role},
{"status", "completed"},
{"type", "message"},
});
}
for (const auto& tool_call : oaicompat_msg.tool_calls) {
output.push_back(json{
{"type", "function_call"},
{"status", "completed"},
{"arguments", tool_call.arguments},
{"call_id", "fc_" + tool_call.id},
{"name", tool_call.name},
});
}
std::time_t t = std::time(0);
json res = {
{"completed_at", t},
{"created_at", t},
{"id", oai_resp_id},
{"model", oaicompat_model},
{"object", "response"},
{"output", output},
{"status", "completed"},
{"usage", json{
{"input_tokens", n_prompt_tokens},
{"output_tokens", n_decoded},
{"total_tokens", n_decoded + n_prompt_tokens},
}},
};
return res;
}
json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
std::vector<json> events;
std::vector<json> output;
if (!oaicompat_msg.reasoning_content.empty()) {
const json output_item = json{
{"id", oai_resp_reasoning_id},
{"summary", json::array()},
{"type", "reasoning"},
{"content", json::array({json{
{"text", oaicompat_msg.reasoning_content},
{"type", "reasoning_text"},
}})},
{"encrypted_content", ""},
};
events.push_back(json{
{"event", "response.output_item.done"},
{"data", json{
{"type", "response.output_item.done"},
{"item", output_item},
}},
});
output.push_back(output_item);
}
if (!oaicompat_msg.content.empty()) {
events.push_back(json{
{"event", "response.output_text.done"},
{"data", json{
{"type", "response.output_text.done"},
{"item_id", oai_resp_message_id},
{"text", oaicompat_msg.content},
}},
});
const json content_part = {
{"type", "output_text"},
{"annotations", json::array()},
{"logprobs", json::array()},
{"text", oaicompat_msg.content},
};
events.push_back(json{
{"event", "response.content_part.done"},
{"data", json{
{"type", "response.content_part.done"},
{"item_id", oai_resp_message_id},
{"part", content_part},
}},
});
const json output_item = {
{"type", "message"},
{"status", "completed"},
{"id", oai_resp_message_id},
{"content", json::array({content_part})},
{"role", "assistant"},
};
events.push_back(json{
{"event", "response.output_item.done"},
{"data", json{
{"type", "response.output_item.done"},
{"item", output_item},
}},
});
output.push_back(output_item);
}
for (const auto& tool_call : oaicompat_msg.tool_calls) {
const json output_item = {
{"type", "function_call"},
{"status", "completed"},
{"arguments", tool_call.arguments},
{"call_id", "fc_" + tool_call.id},
{"name", tool_call.name},
};
events.push_back(json{
{"event", "response.output_item.done"},
{"data", json{
{"type", "response.output_item.done"},
{"item", output_item},
}},
});
output.push_back(output_item);
}
std::time_t t = std::time(0);
events.push_back(json{
{"event", "response.completed"},
{"data", json{
{"type", "response.completed"},
{"response", json{
{"id", oai_resp_id},
{"object", "response"},
{"created_at", t},
{"status", "completed"},
{"model", oaicompat_model},
{"output", output},
{"usage", json{
{"input_tokens", n_prompt_tokens},
{"output_tokens", n_decoded},
{"total_tokens", n_decoded + n_prompt_tokens},
}},
}},
}},
});
return events;
}
json server_task_result_cmpl_final::to_json_anthropic_final() {
std::string stop_reason = "max_tokens";
if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {

View File

@@ -42,6 +42,7 @@ enum oaicompat_type {
OAICOMPAT_TYPE_COMPLETION,
OAICOMPAT_TYPE_EMBEDDING,
OAICOMPAT_TYPE_ANTHROPIC,
OAICOMPAT_TYPE_RESP,
};
@@ -203,6 +204,14 @@ struct server_task_result_cmpl_partial : server_task_result {
bool anthropic_thinking_block_started = false;
bool anthropic_text_block_started = false;
bool oai_resp_thinking_block_started = false;
bool oai_resp_text_block_started = false;
std::string oai_resp_id;
std::string oai_resp_reasoning_id;
std::string oai_resp_message_id;
std::string oai_resp_fc_id;
virtual bool is_stop() override {
return false; // in stream mode, partial responses are not considered stop
}
@@ -215,6 +224,8 @@ struct server_task_result_cmpl_partial : server_task_result {
json to_json_oaicompat_chat_partial();
json to_json_oaicompat_resp_partial();
virtual json to_json() override {
switch (oaicompat) {
case OAICOMPAT_TYPE_NONE:
@@ -225,6 +236,8 @@ struct server_task_result_cmpl_partial : server_task_result {
return to_json_oaicompat_chat_partial();
case OAICOMPAT_TYPE_ANTHROPIC:
return to_json_anthropic_partial();
case OAICOMPAT_TYPE_RESP:
return to_json_oaicompat_resp_partial();
default:
GGML_ASSERT(false && "Invalid oaicompat_type");
};
@@ -232,6 +245,10 @@ struct server_task_result_cmpl_partial : server_task_result {
};
struct server_task_result_cmpl_final : server_task_result {
std::string oai_resp_id;
std::string oai_resp_reasoning_id;
std::string oai_resp_message_id;
virtual bool is_stop() override {
return true;
}
@@ -248,6 +265,10 @@ struct server_task_result_cmpl_final : server_task_result {
json to_json_oaicompat_chat_stream();
json to_json_oaicompat_resp_final();
json to_json_oaicompat_resp_stream();
virtual json to_json() override {
switch (oaicompat) {
case OAICOMPAT_TYPE_NONE:
@@ -258,6 +279,8 @@ struct server_task_result_cmpl_final : server_task_result {
return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat_final();
case OAICOMPAT_TYPE_ANTHROPIC:
return stream ? to_json_anthropic_stream() : to_json_anthropic_final();
case OAICOMPAT_TYPE_RESP:
return stream ? to_json_oaicompat_resp_stream() : to_json_oaicompat_resp_final();
default:
GGML_ASSERT(false && "Invalid oaicompat_type");
}

View File

@@ -1070,7 +1070,12 @@ int main(int argc, char ** argv) {
// Everything else, including multimodal completions.
inputs = tokenize_input_prompts(llama_get_vocab(ctx_server.ctx), ctx_server.mctx, prompt, true, true);
}
tasks.reserve(inputs.size());
const std::string requested_model_name = json_value(data, "model", std::string());
const std::string fallback_model_name = get_model_name(ctx_server.params_base.model);
const std::string oaicompat_model_name = requested_model_name.empty()
? fallback_model_name
: requested_model_name;
for (size_t i = 0; i < inputs.size(); i++) {
server_task task = server_task(type);
@@ -1088,7 +1093,7 @@ int main(int argc, char ** argv) {
// OAI-compat
task.params.oaicompat = oaicompat;
task.params.oaicompat_cmpl_id = completion_id;
task.params.oaicompat_model = get_model_name(ctx_server.params_base.model);
task.params.oaicompat_model = oaicompat_model_name;
tasks.push_back(std::move(task));
}
@@ -1146,6 +1151,9 @@ int main(int argc, char ** argv) {
if (oaicompat == OAICOMPAT_TYPE_ANTHROPIC) {
return server_sent_anthropic_event(sink, res);
}
else if (oaicompat == OAICOMPAT_TYPE_RESP) {
return server_sent_oai_resp_event(sink, res);
}
else {
return server_sent_event(sink, res);
}
@@ -1170,7 +1178,7 @@ int main(int argc, char ** argv) {
json res_json = result->to_json();
bool ok = false;
if (result->is_error()) {
ok = sse(json{ { "error", result->to_json() } });
ok = server_sent_event(sink, json{ { "error", result->to_json() } });
sink.done();
return false; // go to on_complete()
}
@@ -1189,7 +1197,7 @@ int main(int argc, char ** argv) {
// check if there is more data
if (!rd->has_next()) {
if (oaicompat != OAICOMPAT_TYPE_ANTHROPIC && oaicompat != OAICOMPAT_TYPE_NONE) {
if (oaicompat != OAICOMPAT_TYPE_ANTHROPIC && oaicompat != OAICOMPAT_TYPE_NONE && oaicompat != OAICOMPAT_TYPE_RESP) {
static const std::string ev_done = "data: [DONE]\n\n";
sink.write(ev_done.data(), ev_done.size());
}
@@ -1265,6 +1273,20 @@ int main(int argc, char ** argv) {
OAICOMPAT_TYPE_CHAT);
};
const auto handle_responses = [&ctx_server, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
auto body = json::parse(req.body);
std::vector<raw_buffer> files;
json body_parsed = convert_responses_to_chatcmpl(body);
json data = oaicompat_chat_params_parse(ctx_server.model, body_parsed, ctx_server.oai_parser_opt, files);
handle_completions_impl(
SERVER_TASK_TYPE_COMPLETION,
data,
files,
req.is_connection_closed,
res,
OAICOMPAT_TYPE_RESP);
};
const auto handle_anthropic_messages = [&ctx_server, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
std::vector<raw_buffer> files;
json body = json::parse(req.body);
@@ -2008,6 +2030,7 @@ int main(int argc, char ** argv) {
svr->Post("/v1/completions", handle_completions_oai);
svr->Post("/chat/completions", handle_chat_completions);
svr->Post("/v1/chat/completions", handle_chat_completions);
svr->Post("/v1/responses", handle_responses);
svr->Post("/v1/messages", handle_anthropic_messages);
svr->Post("/v1/messages/count_tokens", handle_anthropic_count_tokens);
svr->Post("/infill", handle_infill);

View File

@@ -71,6 +71,22 @@ Feature: llama.cpp server
| codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128 | (thanks\|happy\|bird\|Annabyear)+ | -1 | 64 | enabled | |
Scenario Outline: OAI Responses Compatibility
Given a model <model>
And a system prompt <system_prompt>
And a user prompt <user_prompt>
And <max_tokens> max tokens to predict
And streaming is <enable_streaming>
Given an OAI compatible responses request with no api error
Then <n_predicted> tokens are predicted matching <re_content>
And <n_prompt> prompt tokens are processed
Examples: Prompts
| model | system_prompt | user_prompt | max_tokens | re_content | n_prompt | n_predicted | enable_streaming |
| llama-2 | Book | What is the best book | 8 | (Here\|what)+ | 77 | 8 | disabled |
| codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128 | (thanks\|happy\|bird\|Annabyear)+ | -1 | 64 | enabled |
Scenario Outline: OAI Compatibility w/ response format
Given a model test
And a system prompt test

View File

@@ -0,0 +1,191 @@
import json
from typing import Any
import aiohttp
from behave import step # pyright: ignore[reportAttributeAccessIssue]
from behave.api.async_step import async_run_until_complete
import steps
@step("an OAI compatible responses request with {api_error} api error")
@async_run_until_complete
async def step_oai_responses(context, api_error):
if context.debug:
print("Submitting OAI compatible responses request...")
expect_api_error = api_error == "raised"
seeds = await steps.completions_seed(context, num_seeds=1)
completion = await oai_responses(
context.prompts.pop(),
seeds[0] if seeds is not None else seeds,
context.system_prompt,
context.base_url,
debug=context.debug,
model=context.model if hasattr(context, "model") else None,
n_predict=context.n_predict if hasattr(context, "n_predict") else None,
enable_streaming=context.enable_streaming
if hasattr(context, "enable_streaming")
else None,
user_api_key=context.user_api_key if hasattr(context, "user_api_key") else None,
temperature=context.temperature,
expect_api_error=expect_api_error,
)
context.tasks_result.append(completion)
if context.debug:
print(f"Responses completion response: {completion}")
if expect_api_error:
assert completion == 401, f"completion must be an 401 status code: {completion}"
def extract_responses_output_text(
response_json: dict[str, Any],
) -> tuple[str, str | None]:
output_text = ""
message_id = None
for item in response_json.get("output", []):
if item.get("type") != "message":
continue
message_id = item.get("id")
for part in item.get("content", []):
if part.get("type") == "output_text":
output_text += part.get("text", "")
return output_text, message_id
async def oai_responses(
user_prompt,
seed,
system_prompt,
base_url: str,
debug=False,
temperature=None,
model=None,
n_predict=None,
enable_streaming=None,
user_api_key=None,
expect_api_error=None,
) -> int | dict[str, Any]:
if debug:
print(f"Sending OAI responses request: {user_prompt}")
user_api_key = user_api_key if user_api_key is not None else "nope"
seed = seed if seed is not None else 42
enable_streaming = enable_streaming if enable_streaming is not None else False
payload = {
"input": [
{
"role": "system",
"content": system_prompt,
},
{
"role": "user",
"content": user_prompt,
},
],
"model": model,
"stream": enable_streaming,
"temperature": temperature if temperature is not None else 0.0,
"seed": seed,
}
if n_predict is not None:
payload["max_output_tokens"] = n_predict
completion_response = {
"content": "",
"timings": {
"predicted_n": 0,
"prompt_n": 0,
},
}
origin = "llama.cpp"
headers = {"Authorization": f"Bearer {user_api_key}", "Origin": origin}
async with aiohttp.ClientSession() as session:
async with session.post(
f"{base_url}/v1/responses", json=payload, headers=headers
) as response:
if expect_api_error is not None and expect_api_error:
if response.status == 401:
return 401
assert False, f"unexpected status code: {response.status}"
assert response.status == 200
assert response.headers["Access-Control-Allow-Origin"] == origin
if enable_streaming:
assert response.headers["Content-Type"] == "text/event-stream"
resp_id = ""
msg_id = ""
gathered_text = ""
event_name = None
completed_response = None
async for line_in_bytes in response.content:
line = line_in_bytes.decode("utf-8").strip()
if not line:
continue
if line.startswith("event: "):
event_name = line.split(": ", 1)[1]
continue
if not line.startswith("data: "):
continue
if event_name is None:
continue
chunk_raw = line.split(": ", 1)[1]
data = json.loads(chunk_raw)
if event_name == "response.created":
resp_id = data["response"]["id"]
assert resp_id.startswith("resp_")
elif event_name == "response.in_progress":
assert data["response"]["id"] == resp_id
elif event_name == "response.output_item.added":
item = data["item"]
if item.get("type") == "message":
msg_id = item["id"]
assert msg_id.startswith("msg_")
elif event_name in (
"response.content_part.added",
"response.output_text.delta",
"response.output_text.done",
"response.content_part.done",
):
assert data["item_id"] == msg_id
elif event_name == "response.output_item.done":
item = data["item"]
if item.get("type") == "message":
assert item["id"] == msg_id
if event_name == "response.output_text.delta":
gathered_text += data["delta"]
if event_name == "response.completed":
completed_response = data["response"]
assert completed_response is not None
output_text, completed_msg_id = extract_responses_output_text(
completed_response
)
assert completed_msg_id is not None
assert completed_msg_id.startswith("msg_")
assert output_text == gathered_text
completion_response = {
"content": output_text,
"timings": {
"predicted_n": completed_response["usage"]["output_tokens"],
"prompt_n": completed_response["usage"]["input_tokens"],
},
}
else:
assert (
response.headers["Content-Type"]
== "application/json; charset=utf-8"
)
response_json = await response.json()
assert response_json["id"].startswith("resp_")
output_text, message_id = extract_responses_output_text(response_json)
assert message_id is not None
assert message_id.startswith("msg_")
completion_response = {
"content": output_text,
"timings": {
"predicted_n": response_json["usage"]["output_tokens"],
"prompt_n": response_json["usage"]["input_tokens"],
},
}
if debug:
print("OAI response formatted to llama.cpp:", completion_response)
return completion_response