server: add /v1/responses support (#1184)

* server: add /v1/responses support

* server: fix Responses API model fallback and SSE branching
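For context, a minimal sketch of the request shape the new endpoint accepts, assuming /v1/responses mirrors OpenAI's Responses API wire format ("input" instead of "messages"); the exact set of supported fields is not visible in this excerpt:

#include <nlohmann/json.hpp>

// Hypothetical request body for POST /v1/responses. Field names follow
// the upstream Responses API spec, not code shown in this diff.
static nlohmann::json make_responses_request() {
    return {
        {"model", "my-model"},   // optional; see the model-fallback fix below
        {"input", "Say hello"},  // the Responses API takes "input", not "messages"
        {"stream", true},        // true -> SSE event stream, false -> one JSON body
    };
}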
RodriMora authored on 2026-02-14 08:30:18 +01:00, committed by GitHub
parent 1cb7e1bf39
commit 102f77b7d3
10 changed files with 926 additions and 7 deletions

@@ -337,6 +337,13 @@ void server_slot::reset() {
     json_schema = json();
     generated_tool_call_ids.clear();
+    oai_resp_thinking_block_started = false;
+    oai_resp_text_block_started = false;
+    oai_resp_id.clear();
+    oai_resp_reasoning_id.clear();
+    oai_resp_message_id.clear();
+    oai_resp_fc_id.clear();
     task.reset();
 }
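The resets above cover the new per-slot Responses state. For reference, the matching server_slot member declarations would look roughly like this (names taken from this diff; the actual declarations live in a header not shown here):

// Hypothetical server_slot members matching the resets above; the
// defaults are an assumption, the real declarations are not in this excerpt.
bool        oai_resp_thinking_block_started = false; // reasoning block opened yet?
bool        oai_resp_text_block_started     = false; // text block opened yet?
std::string oai_resp_id;           // "resp_..." id for the response as a whole
std::string oai_resp_reasoning_id; // "rs_..."  id for the reasoning output item
std::string oai_resp_message_id;   // "msg_..." id for the message output item
std::string oai_resp_fc_id;        // id of the in-flight tool call, if any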
@@ -791,7 +798,7 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task)
     const llama_vocab* vocab = llama_model_get_vocab(model);
     if (data.count("__oaicompat") != 0) {
         slot.oaicompat = true;
-        slot.oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
+        slot.oaicompat_model = task.params.oaicompat_model;
     }
     else {
         slot.oaicompat = false;
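The one-line swap above is the model-fallback fix from the commit message: rather than re-deriving the name from the raw request JSON on every launch, the slot reuses the name already resolved into task.params, so a /v1/responses request that omits "model" reports one consistent fallback name. Roughly, assuming llama.cpp's usual json_value helper semantics:

#include <nlohmann/json.hpp>

// Rough semantics of the replaced json_value call (an assumption based on
// llama.cpp's helper): fall back to the default when the key is absent.
template <typename T>
static T json_value_sketch(const nlohmann::json& body, const char* key, const T& def) {
    return body.contains(key) && !body.at(key).is_null() ? body.at(key).get<T>() : def;
}
// Before: slot.oaicompat_model = json_value_sketch(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
// After:  the name resolved once into task.params is reused verbatim.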
@@ -799,6 +806,13 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task)
     }
     slot.params.oaicompat = task.params.oaicompat;
     slot.params.oaicompat_cmpl_id = task.params.oaicompat_cmpl_id;
+    slot.oai_resp_thinking_block_started = false;
+    slot.oai_resp_text_block_started = false;
+    slot.oai_resp_id = "resp_" + random_string();
+    slot.oai_resp_reasoning_id = "rs_" + random_string();
+    slot.oai_resp_message_id = "msg_" + random_string();
+    slot.oai_resp_fc_id.clear();
     slot.params.timings_per_token = json_value(data, "timings_per_token", false);
     slot.params.stream = json_value(data, "stream", false);
     auto stream_opt = json_value(data, "stream_options", json::object());
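The id prefixes above ("resp_", "rs_", "msg_") match the Responses API conventions for the response, reasoning item, and message item. random_string() is the server's existing helper; a self-contained stand-in with the same intent (an opaque alphanumeric suffix) might look like:

#include <random>
#include <string>

// Stand-in for the server's random_string() helper; the length and
// charset used here are an assumption, not taken from this diff.
static std::string random_string() {
    static const char alphabet[] =
        "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<int> pick(0, sizeof(alphabet) - 2);
    std::string out(32, '\0');
    for (auto& c : out) {
        c = alphabet[pick(gen)];
    }
    return out;
}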
@@ -1593,6 +1607,10 @@ void server_context::send_partial_response(server_slot& slot, completion_token_o
     res->oaicompat = slot.params.oaicompat;
     res->oaicompat_model = slot.task->params.oaicompat_model;
     res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;
+    res->oai_resp_id = slot.oai_resp_id;
+    res->oai_resp_reasoning_id = slot.oai_resp_reasoning_id;
+    res->oai_resp_message_id = slot.oai_resp_message_id;
+    res->oai_resp_fc_id = slot.oai_resp_fc_id;
     res->n_decoded = slot.n_decoded;
     res->n_prompt_tokens = slot.n_prompt_tokens;
     res->data = json{
@@ -1608,6 +1626,9 @@ void server_context::send_partial_response(server_slot& slot, completion_token_o
     res->anthropic_thinking_block_started = slot.anthropic_thinking_block_started;
     res->anthropic_text_block_started = slot.anthropic_text_block_started;
+    res->oai_resp_thinking_block_started = slot.oai_resp_thinking_block_started;
+    res->oai_resp_text_block_started = slot.oai_resp_text_block_started;
     for (const auto& diff : res->oaicompat_msg_diffs) {
         if (!diff.reasoning_content_delta.empty() && !slot.anthropic_thinking_block_started) {
             slot.anthropic_thinking_block_started = true;
@@ -1615,6 +1636,15 @@ void server_context::send_partial_response(server_slot& slot, completion_token_o
         if (!diff.content_delta.empty() && !slot.anthropic_text_block_started) {
             slot.anthropic_text_block_started = true;
         }
+        if (!diff.reasoning_content_delta.empty() && !slot.oai_resp_thinking_block_started) {
+            slot.oai_resp_thinking_block_started = true;
+        }
+        if (!diff.content_delta.empty() && !slot.oai_resp_text_block_started) {
+            slot.oai_resp_text_block_started = true;
+        }
+        if (!diff.tool_call_delta.name.empty()) {
+            slot.oai_resp_fc_id = diff.tool_call_delta.id;
+        }
     }
     // populate res->probs_output
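The three new branches track, per diff, whether a reasoning block, a text block, or a function call is in flight; this is the "SSE branching" half of the commit message: the first reasoning or text delta must be preceded by a one-time "item opened" event, and tool-call deltas must carry the call id. A sketch of how a writer could consume these flags (event type names follow OpenAI's published Responses SSE types; the server's actual emitter is in a file not shown here):

#include <iostream>
#include <string>

// Hypothetical helper: one SSE frame per event.
static void send_sse(const std::string& type, const std::string& payload) {
    std::cout << "event: " << type << "\n"
              << "data: "  << payload << "\n\n";
}

// Sketch of a consumer of the flags tracked above: the "started" flag
// gates a one-time "output_item.added" event before the first delta.
static void on_text_delta(bool& text_block_started,
                          const std::string& message_id,
                          const std::string& delta) {
    if (!text_block_started) {
        text_block_started = true;
        send_sse("response.output_item.added", message_id);
    }
    send_sse("response.output_text.delta", delta);
}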
@@ -1650,6 +1680,9 @@ void server_context::send_final_response(server_slot& slot) {
     res->oaicompat = slot.params.oaicompat;
     res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;
     res->oaicompat_msg = slot.update_chat_msg(res->oaicompat_msg_diffs);
+    res->oai_resp_id = slot.oai_resp_id;
+    res->oai_resp_reasoning_id = slot.oai_resp_reasoning_id;
+    res->oai_resp_message_id = slot.oai_resp_message_id;
     res->n_decoded = slot.n_decoded;
     res->n_prompt_tokens = slot.n_prompt_tokens;
     res->oaicompat_model = slot.task->params.oaicompat_model;
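send_final_response() carries the same ids so the terminal event can reference the items opened during streaming. A sketch of the closing payload a /v1/responses stream would end with, assuming the upstream "response.completed" event shape:

#include <nlohmann/json.hpp>

// Hypothetical final payload; field names follow the upstream Responses
// API "response.completed" event, not code shown in this diff.
static nlohmann::json make_completed_event(const std::string& resp_id,
                                           int n_prompt_tokens, int n_decoded) {
    return {
        {"type", "response.completed"},
        {"response", {
            {"id", resp_id},  // the slot's "resp_..." id
            {"status", "completed"},
            {"usage", {
                {"input_tokens",  n_prompt_tokens},
                {"output_tokens", n_decoded},
            }},
        }},
    };
}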