server: add /v1/responses support (#1184)
* server: add /v1/responses support
* server: fix Responses API model fallback and SSE branching
@@ -6,7 +6,7 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.

**Features:**

* LLM inference of F16 and quantized models on GPU and CPU
-* [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
+* [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions, responses, and embeddings routes
* Parallel decoding with multi-user support
* Continuous batching
* Multimodal (wip)
@@ -706,6 +706,48 @@ curl http://localhost:8080/v1/chat/completions \

**See our [Function calling](../../docs/function-calling.md) docs** for more details, supported native tool call styles (generic tool call style is used as fallback) / examples of use.

### POST `/v1/responses`: OpenAI-compatible Responses API

*Options:*

See [OpenAI Responses API documentation](https://platform.openai.com/docs/api-reference/responses).

*Examples:*

You can use either the Python `openai` library with appropriate checkpoints:

```python
import openai

client = openai.OpenAI(
    base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
    api_key = "sk-no-key-required"
)

response = client.responses.create(
    model="gpt-4.1",
    instructions="You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests.",
    input="Write a limerick about python exceptions"
)

print(response.output_text)
```

... or raw HTTP requests:

```shell
curl http://localhost:8080/v1/responses \
-H "Content-Type: application/json" \
-H "Authorization: Bearer no-key" \
-d '{
  "model": "gpt-4.1",
  "instructions": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests.",
  "input": "Write a limerick about python exceptions"
}'
```

This endpoint works by converting Responses requests into Chat Completions requests.
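For illustration, the request above is mapped to roughly the following Chat Completions payload before it reaches the regular completion path. This is a minimal Python sketch of the mapping, not the server's C++ implementation; handling of structured `input` items, tools, and other fields is simplified here.

```python
# Sketch of the Responses -> Chat Completions mapping (illustrative only).
def responses_to_chatcmpl(body: dict) -> dict:
    out = {k: v for k, v in body.items()
           if k not in ("input", "instructions", "max_output_tokens")}
    messages = []
    if "instructions" in body:
        # "instructions" becomes a system message
        messages.append({"role": "system", "content": body["instructions"]})
    if isinstance(body["input"], str):
        # a plain string input becomes a single user message
        messages.append({"role": "user", "content": body["input"]})
    else:
        # structured input items are converted item by item (simplified here)
        messages.extend(body["input"])
    out["messages"] = messages
    if "max_output_tokens" in body:
        out["max_tokens"] = body["max_output_tokens"]
    return out

print(responses_to_chatcmpl({
    "model": "gpt-4.1",
    "instructions": "You are ChatGPT, an AI assistant.",
    "input": "Write a limerick about python exceptions",
}))
```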
### POST `/v1/embeddings`: OpenAI-compatible embeddings API

*Options:*
@@ -1,5 +1,7 @@
#include "server-common.h"

#include <algorithm>

using raw_buffer = std::vector<uint8_t>;
@@ -505,6 +507,30 @@ bool server_sent_event(httplib::DataSink& sink, const json& data) {
    return true;
}

bool server_sent_oai_resp_event(httplib::DataSink& sink, const json& data) {
    static auto send_single = [](httplib::DataSink& sink, const json& data) -> bool {
        const std::string str =
            "event: " + data.at("event").get<std::string>() + "\n" +
            "data: " + data.at("data").dump(-1, ' ', false, json::error_handler_t::replace) + "\n\n";

        LOG_DBG("data stream, to_send: %s", str.c_str());
        return sink.write(str.c_str(), str.size());
    };

    if (data.is_array()) {
        for (const auto& item : data) {
            if (!send_single(sink, item)) {
                return false;
            }
        }
    }
    else {
        return send_single(sink, data);
    }

    return true;
}

bool server_sent_anthropic_event(httplib::DataSink& sink, const json& data) {
    static auto send_single = [](httplib::DataSink& sink, const json& data) -> bool {
        const std::string str =
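For reference, each frame written by `server_sent_oai_resp_event` is a named SSE event: an `event:` line followed by a `data:` line and a blank line. A client can therefore be consumed as sketched below; this is a minimal illustration, and the URL, payload, and use of the `requests` library are assumptions rather than part of this change.

```python
# Minimal sketch of consuming the named SSE events emitted above (illustrative).
import json
import requests  # assumption: any HTTP client with streaming support works

with requests.post(
    "http://localhost:8080/v1/responses",
    json={"model": "gpt-4.1", "input": "Say hello", "stream": True},
    stream=True,
) as r:
    event_name = None
    for raw in r.iter_lines():
        line = raw.decode("utf-8").strip()
        if line.startswith("event: "):
            event_name = line[len("event: "):]
        elif line.startswith("data: ") and event_name is not None:
            data = json.loads(line[len("data: "):])
            if event_name == "response.output_text.delta":
                print(data["delta"], end="", flush=True)
```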
@@ -874,6 +900,250 @@ json oaicompat_chat_params_parse(
    return llama_params;
}

json convert_responses_to_chatcmpl(const json& response_body) {
    if (!response_body.contains("input")) {
        throw std::runtime_error("'input' is required");
    }
    if (!json_value(response_body, "previous_response_id", std::string{}).empty()) {
        throw std::runtime_error("ik_llama.cpp does not support 'previous_response_id'.");
    }

    const json input_value = response_body.at("input");
    json chatcmpl_body = response_body;
    chatcmpl_body.erase("input");
    std::vector<json> chatcmpl_messages;

    if (response_body.contains("instructions")) {
        chatcmpl_messages.push_back({
            {"role", "system"},
            {"content", json_value(response_body, "instructions", std::string())},
        });
        chatcmpl_body.erase("instructions");
    }

    if (input_value.is_string()) {
        chatcmpl_messages.push_back({
            {"role", "user"},
            {"content", input_value},
        });
    }
    else if (input_value.is_array()) {
        static auto exists_and_is_array = [](const json& j, const char* key) -> bool {
            return j.contains(key) && j.at(key).is_array();
        };
        static auto exists_and_is_string = [](const json& j, const char* key) -> bool {
            return j.contains(key) && j.at(key).is_string();
        };

        for (json item : input_value) {
            if (exists_and_is_string(item, "content")) {
                item["content"] = json::array({
                    json{
                        {"text", item.at("content")},
                        {"type", "input_text"},
                    }
                });
            }

            if (exists_and_is_array(item, "content") &&
                exists_and_is_string(item, "role") &&
                (item.at("role") == "user" || item.at("role") == "system" || item.at("role") == "developer")
            ) {
                std::vector<json> chatcmpl_content;

                for (const json& input_item : item.at("content")) {
                    const std::string type = json_value(input_item, "type", std::string());

                    if (type == "input_text") {
                        if (!input_item.contains("text")) {
                            throw std::runtime_error("'Input text' requires 'text'");
                        }
                        chatcmpl_content.push_back({
                            {"text", input_item.at("text")},
                            {"type", "text"},
                        });
                    }
                    else if (type == "input_image") {
                        if (!input_item.contains("image_url")) {
                            throw std::runtime_error("'image_url' is required");
                        }
                        chatcmpl_content.push_back({
                            {"image_url", json{
                                {"url", input_item.at("image_url")},
                            }},
                            {"type", "image_url"},
                        });
                    }
                    else if (type == "input_file") {
                        throw std::runtime_error("'input_file' is not supported by ik_llama.cpp at this moment");
                    }
                    else {
                        throw std::runtime_error("'type' must be one of 'input_text', 'input_image', or 'input_file'");
                    }
                }

                if (item.contains("type")) {
                    item.erase("type");
                }
                if (item.contains("status")) {
                    item.erase("status");
                }
                item["content"] = chatcmpl_content;

                chatcmpl_messages.push_back(item);
            }
            else if (exists_and_is_array(item, "content") &&
                exists_and_is_string(item, "role") &&
                item.at("role") == "assistant" &&
                exists_and_is_string(item, "type") &&
                item.at("type") == "message"
            ) {
                std::vector<json> chatcmpl_content;

                for (const auto& output_text : item.at("content")) {
                    const std::string type = json_value(output_text, "type", std::string());
                    if (type != "output_text") {
                        throw std::runtime_error("'type' must be 'output_text'");
                    }
                    if (!exists_and_is_string(output_text, "text")) {
                        throw std::runtime_error("'Output text' requires 'text'");
                    }
                    chatcmpl_content.push_back({
                        {"text", output_text.at("text")},
                        {"type", "text"},
                    });
                }

                item.erase("status");
                item.erase("type");
                item["content"] = chatcmpl_content;
                chatcmpl_messages.push_back(item);
            }
            else if (exists_and_is_string(item, "arguments") &&
                exists_and_is_string(item, "call_id") &&
                exists_and_is_string(item, "name") &&
                exists_and_is_string(item, "type") &&
                item.at("type") == "function_call"
            ) {
                json msg = json{
                    {"role", "assistant"},
                    {"tool_calls", json::array({json{
                        {"function", json{
                            {"arguments", item.at("arguments")},
                            {"name", item.at("name")},
                        }},
                        {"id", item.at("call_id")},
                        {"type", "function"},
                    }})},
                };

                if (!chatcmpl_messages.empty() && chatcmpl_messages.back().contains("reasoning_content")) {
                    msg["reasoning_content"] = chatcmpl_messages.back().at("reasoning_content");
                    chatcmpl_messages.pop_back();
                }
                chatcmpl_messages.push_back(msg);
            }
            else if (exists_and_is_string(item, "call_id") &&
                (exists_and_is_string(item, "output") || exists_and_is_array(item, "output")) &&
                exists_and_is_string(item, "type") &&
                item.at("type") == "function_call_output"
            ) {
                if (item.at("output").is_string()) {
                    chatcmpl_messages.push_back(json{
                        {"content", item.at("output")},
                        {"role", "tool"},
                        {"tool_call_id", item.at("call_id")},
                    });
                }
                else {
                    json chatcmpl_outputs = item.at("output");
                    for (json& chatcmpl_output : chatcmpl_outputs) {
                        if (!chatcmpl_output.contains("type") || chatcmpl_output.at("type") != "input_text") {
                            throw std::runtime_error("Output of tool call should be 'Input text'");
                        }
                        chatcmpl_output["type"] = "text";
                    }
                    chatcmpl_messages.push_back(json{
                        {"content", chatcmpl_outputs},
                        {"role", "tool"},
                        {"tool_call_id", item.at("call_id")},
                    });
                }
            }
            else if (exists_and_is_array(item, "summary") &&
                exists_and_is_string(item, "type") &&
                item.at("type") == "reasoning") {
                if (!exists_and_is_array(item, "content")) {
                    throw std::runtime_error("item['content'] is not an array");
                }
                if (item.at("content").empty()) {
                    throw std::runtime_error("item['content'] is empty");
                }
                if (!exists_and_is_string(item.at("content")[0], "text")) {
                    throw std::runtime_error("item['content']['text'] is not a string");
                }

                chatcmpl_messages.push_back(json{
                    {"role", "assistant"},
                    {"content", json::array()},
                    {"reasoning_content", item.at("content")[0].at("text")},
                });
            }
            else {
                throw std::runtime_error("Cannot determine type of 'item'");
            }
        }
    }
    else {
        throw std::runtime_error("'input' must be a string or array of objects");
    }

    chatcmpl_messages.erase(std::remove_if(
        chatcmpl_messages.begin(),
        chatcmpl_messages.end(),
        [](const json& x) {
            return x.contains("role") &&
                x.at("role") == "assistant" &&
                x.contains("content") &&
                x.at("content") == json::array() &&
                x.contains("reasoning_content");
        }),
        chatcmpl_messages.end());

    chatcmpl_body["messages"] = chatcmpl_messages;

    if (response_body.contains("tools")) {
        if (!response_body.at("tools").is_array()) {
            throw std::runtime_error("'tools' must be an array of objects");
        }
        std::vector<json> chatcmpl_tools;
        for (json resp_tool : response_body.at("tools")) {
            json chatcmpl_tool;

            if (json_value(resp_tool, "type", std::string()) != "function") {
                throw std::runtime_error("'type' of tool must be 'function'");
            }
            resp_tool.erase("type");
            chatcmpl_tool["type"] = "function";

            if (!resp_tool.contains("strict")) {
                resp_tool["strict"] = true;
            }
            chatcmpl_tool["function"] = resp_tool;
            chatcmpl_tools.push_back(chatcmpl_tool);
        }
        chatcmpl_body.erase("tools");
        chatcmpl_body["tools"] = chatcmpl_tools;
    }

    if (response_body.contains("max_output_tokens")) {
        chatcmpl_body.erase("max_output_tokens");
        chatcmpl_body["max_tokens"] = response_body["max_output_tokens"];
    }

    return chatcmpl_body;
}

json anthropic_params_from_json(
    const struct llama_model* model,
    const json& body_in, /* anthropic messages api json semantics */
@@ -233,6 +233,8 @@ json probs_vector_to_json(const llama_context* ctx, const std::vector<completion

bool server_sent_event(httplib::DataSink& sink, const json& data);

bool server_sent_oai_resp_event(httplib::DataSink& sink, const json& data);

bool server_sent_anthropic_event(httplib::DataSink& sink, const json& data);

//

@@ -259,6 +261,9 @@ json oaicompat_chat_params_parse(
    const oaicompat_parser_options& opt,
    std::vector<raw_buffer>& out_files);

// convert OpenAI Responses API format to OpenAI Chat Completions API format
json convert_responses_to_chatcmpl(const json& body);

json anthropic_params_from_json(
    const struct llama_model* model,
    const json& body_in, /* anthropic messages api json semantics */

@@ -474,4 +479,3 @@ bool prompt_cache_equal(llama_context* ctx, const server_tokens& cache_tokens,
    const server_tokens& prompt_tokens, size_t start, const common_prefix& prefix);

std::string safe_json_to_str(const json& data);
@@ -337,6 +337,13 @@ void server_slot::reset() {
    json_schema = json();
    generated_tool_call_ids.clear();

    oai_resp_thinking_block_started = false;
    oai_resp_text_block_started = false;
    oai_resp_id.clear();
    oai_resp_reasoning_id.clear();
    oai_resp_message_id.clear();
    oai_resp_fc_id.clear();

    task.reset();
}
@@ -791,7 +798,7 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task)
    const llama_vocab* vocab = llama_model_get_vocab(model);
    if (data.count("__oaicompat") != 0) {
        slot.oaicompat = true;
-       slot.oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
+       slot.oaicompat_model = task.params.oaicompat_model;
    }
    else {
        slot.oaicompat = false;

@@ -799,6 +806,13 @@ bool server_context::launch_slot_with_task(server_slot& slot, server_task& task)
    }
    slot.params.oaicompat = task.params.oaicompat;
    slot.params.oaicompat_cmpl_id = task.params.oaicompat_cmpl_id;

    slot.oai_resp_thinking_block_started = false;
    slot.oai_resp_text_block_started = false;
    slot.oai_resp_id = "resp_" + random_string();
    slot.oai_resp_reasoning_id = "rs_" + random_string();
    slot.oai_resp_message_id = "msg_" + random_string();
    slot.oai_resp_fc_id.clear();
    slot.params.timings_per_token = json_value(data, "timings_per_token", false);
    slot.params.stream = json_value(data, "stream", false);
    auto stream_opt = json_value(data, "stream_options", json::object());
@@ -1593,6 +1607,10 @@ void server_context::send_partial_response(server_slot& slot, completion_token_o
    res->oaicompat = slot.params.oaicompat;
    res->oaicompat_model = slot.task->params.oaicompat_model;
    res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;
    res->oai_resp_id = slot.oai_resp_id;
    res->oai_resp_reasoning_id = slot.oai_resp_reasoning_id;
    res->oai_resp_message_id = slot.oai_resp_message_id;
    res->oai_resp_fc_id = slot.oai_resp_fc_id;
    res->n_decoded = slot.n_decoded;
    res->n_prompt_tokens = slot.n_prompt_tokens;
    res->data = json{

@@ -1608,6 +1626,9 @@ void server_context::send_partial_response(server_slot& slot, completion_token_o
    res->anthropic_thinking_block_started = slot.anthropic_thinking_block_started;
    res->anthropic_text_block_started = slot.anthropic_text_block_started;

    res->oai_resp_thinking_block_started = slot.oai_resp_thinking_block_started;
    res->oai_resp_text_block_started = slot.oai_resp_text_block_started;

    for (const auto& diff : res->oaicompat_msg_diffs) {
        if (!diff.reasoning_content_delta.empty() && !slot.anthropic_thinking_block_started) {
            slot.anthropic_thinking_block_started = true;

@@ -1615,6 +1636,15 @@ void server_context::send_partial_response(server_slot& slot, completion_token_o
        if (!diff.content_delta.empty() && !slot.anthropic_text_block_started) {
            slot.anthropic_text_block_started = true;
        }
        if (!diff.reasoning_content_delta.empty() && !slot.oai_resp_thinking_block_started) {
            slot.oai_resp_thinking_block_started = true;
        }
        if (!diff.content_delta.empty() && !slot.oai_resp_text_block_started) {
            slot.oai_resp_text_block_started = true;
        }
        if (!diff.tool_call_delta.name.empty()) {
            slot.oai_resp_fc_id = diff.tool_call_delta.id;
        }
    }

    // populate res->probs_output

@@ -1650,6 +1680,9 @@ void server_context::send_final_response(server_slot& slot) {
    res->oaicompat = slot.params.oaicompat;
    res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;
    res->oaicompat_msg = slot.update_chat_msg(res->oaicompat_msg_diffs);
    res->oai_resp_id = slot.oai_resp_id;
    res->oai_resp_reasoning_id = slot.oai_resp_reasoning_id;
    res->oai_resp_message_id = slot.oai_resp_message_id;
    res->n_decoded = slot.n_decoded;
    res->n_prompt_tokens = slot.n_prompt_tokens;
    res->oaicompat_model = slot.task->params.oaicompat_model;
@@ -114,6 +114,14 @@ struct server_slot {
    bool anthropic_thinking_block_started = false;
    bool anthropic_text_block_started = false;

    bool oai_resp_thinking_block_started = false;
    bool oai_resp_text_block_started = false;

    std::string oai_resp_id;
    std::string oai_resp_reasoning_id;
    std::string oai_resp_message_id;
    std::string oai_resp_fc_id;

    int32_t ga_i = 0; // group-attention state
    int32_t ga_n = 1; // group-attention factor
    int32_t ga_w = 512; // group-attention width
@@ -218,6 +218,133 @@ json server_task_result_cmpl_partial::to_json_oaicompat_chat_partial() {
    return deltas;
}

json server_task_result_cmpl_partial::to_json_oaicompat_resp_partial() {
    std::vector<json> events;

    if (n_decoded == 1) {
        events.push_back(json{
            {"event", "response.created"},
            {"data", json{
                {"type", "response.created"},
                {"response", json{
                    {"id", oai_resp_id},
                    {"object", "response"},
                    {"status", "in_progress"},
                }},
            }},
        });
        events.push_back(json{
            {"event", "response.in_progress"},
            {"data", json{
                {"type", "response.in_progress"},
                {"response", json{
                    {"id", oai_resp_id},
                    {"object", "response"},
                    {"status", "in_progress"},
                }},
            }},
        });
    }

    for (const auto& diff : oaicompat_msg_diffs) {
        if (!diff.reasoning_content_delta.empty()) {
            if (!oai_resp_thinking_block_started) {
                events.push_back(json{
                    {"event", "response.output_item.added"},
                    {"data", json{
                        {"type", "response.output_item.added"},
                        {"item", json{
                            {"id", oai_resp_reasoning_id},
                            {"summary", json::array()},
                            {"type", "reasoning"},
                            {"content", json::array()},
                            {"encrypted_content", ""},
                            {"status", "in_progress"},
                        }},
                    }},
                });
                oai_resp_thinking_block_started = true;
            }
            events.push_back(json{
                {"event", "response.reasoning_text.delta"},
                {"data", json{
                    {"type", "response.reasoning_text.delta"},
                    {"delta", diff.reasoning_content_delta},
                    {"item_id", oai_resp_reasoning_id},
                }},
            });
        }

        if (!diff.content_delta.empty()) {
            if (!oai_resp_text_block_started) {
                events.push_back(json{
                    {"event", "response.output_item.added"},
                    {"data", json{
                        {"type", "response.output_item.added"},
                        {"item", json{
                            {"content", json::array()},
                            {"id", oai_resp_message_id},
                            {"role", "assistant"},
                            {"status", "in_progress"},
                            {"type", "message"},
                        }},
                    }},
                });
                events.push_back(json{
                    {"event", "response.content_part.added"},
                    {"data", json{
                        {"type", "response.content_part.added"},
                        {"item_id", oai_resp_message_id},
                        {"part", json{
                            {"type", "output_text"},
                            {"text", ""},
                        }},
                    }},
                });
                oai_resp_text_block_started = true;
            }
            events.push_back(json{
                {"event", "response.output_text.delta"},
                {"data", json{
                    {"type", "response.output_text.delta"},
                    {"item_id", oai_resp_message_id},
                    {"delta", diff.content_delta},
                }},
            });
        }

        if (!diff.tool_call_delta.name.empty()) {
            events.push_back(json{
                {"event", "response.output_item.added"},
                {"data", json{
                    {"type", "response.output_item.added"},
                    {"item", json{
                        {"arguments", ""},
                        {"call_id", "fc_" + diff.tool_call_delta.id},
                        {"name", diff.tool_call_delta.name},
                        {"type", "function_call"},
                        {"status", "in_progress"},
                    }},
                }},
            });
            oai_resp_fc_id = diff.tool_call_delta.id;
        }

        if (!diff.tool_call_delta.arguments.empty()) {
            events.push_back(json{
                {"event", "response.function_call_arguments.delta"},
                {"data", json{
                    {"type", "response.function_call_arguments.delta"},
                    {"delta", diff.tool_call_delta.arguments},
                    {"item_id", "fc_" + oai_resp_fc_id},
                }},
            });
        }
    }

    return events;
}

json server_task_result_cmpl_final::to_json_oaicompat_chat_final() {
    std::string finish_reason = "length";
    common_chat_msg msg;
@@ -336,6 +463,188 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() {
    return deltas;
}

json server_task_result_cmpl_final::to_json_oaicompat_resp_final() {
    common_chat_msg msg;
    if (!oaicompat_msg.empty()) {
        msg = oaicompat_msg;
    }
    else {
        msg.role = "assistant";
        msg.content = content;
    }

    std::vector<json> output;

    if (!msg.reasoning_content.empty()) {
        output.push_back(json{
            {"id", oai_resp_reasoning_id},
            {"summary", json::array()},
            {"type", "reasoning"},
            {"content", json::array({json{
                {"text", msg.reasoning_content},
                {"type", "reasoning_text"},
            }})},
            {"encrypted_content", ""},
            {"status", "completed"},
        });
    }

    if (!msg.content.empty()) {
        output.push_back(json{
            {"content", json::array({json{
                {"type", "output_text"},
                {"annotations", json::array()},
                {"logprobs", json::array()},
                {"text", msg.content},
            }})},
            {"id", oai_resp_message_id},
            {"role", msg.role},
            {"status", "completed"},
            {"type", "message"},
        });
    }

    for (const auto& tool_call : oaicompat_msg.tool_calls) {
        output.push_back(json{
            {"type", "function_call"},
            {"status", "completed"},
            {"arguments", tool_call.arguments},
            {"call_id", "fc_" + tool_call.id},
            {"name", tool_call.name},
        });
    }

    std::time_t t = std::time(0);
    json res = {
        {"completed_at", t},
        {"created_at", t},
        {"id", oai_resp_id},
        {"model", oaicompat_model},
        {"object", "response"},
        {"output", output},
        {"status", "completed"},
        {"usage", json{
            {"input_tokens", n_prompt_tokens},
            {"output_tokens", n_decoded},
            {"total_tokens", n_decoded + n_prompt_tokens},
        }},
    };

    return res;
}

json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
    std::vector<json> events;
    std::vector<json> output;

    if (!oaicompat_msg.reasoning_content.empty()) {
        const json output_item = json{
            {"id", oai_resp_reasoning_id},
            {"summary", json::array()},
            {"type", "reasoning"},
            {"content", json::array({json{
                {"text", oaicompat_msg.reasoning_content},
                {"type", "reasoning_text"},
            }})},
            {"encrypted_content", ""},
        };

        events.push_back(json{
            {"event", "response.output_item.done"},
            {"data", json{
                {"type", "response.output_item.done"},
                {"item", output_item},
            }},
        });
        output.push_back(output_item);
    }

    if (!oaicompat_msg.content.empty()) {
        events.push_back(json{
            {"event", "response.output_text.done"},
            {"data", json{
                {"type", "response.output_text.done"},
                {"item_id", oai_resp_message_id},
                {"text", oaicompat_msg.content},
            }},
        });

        const json content_part = {
            {"type", "output_text"},
            {"annotations", json::array()},
            {"logprobs", json::array()},
            {"text", oaicompat_msg.content},
        };

        events.push_back(json{
            {"event", "response.content_part.done"},
            {"data", json{
                {"type", "response.content_part.done"},
                {"item_id", oai_resp_message_id},
                {"part", content_part},
            }},
        });

        const json output_item = {
            {"type", "message"},
            {"status", "completed"},
            {"id", oai_resp_message_id},
            {"content", json::array({content_part})},
            {"role", "assistant"},
        };

        events.push_back(json{
            {"event", "response.output_item.done"},
            {"data", json{
                {"type", "response.output_item.done"},
                {"item", output_item},
            }},
        });
        output.push_back(output_item);
    }

    for (const auto& tool_call : oaicompat_msg.tool_calls) {
        const json output_item = {
            {"type", "function_call"},
            {"status", "completed"},
            {"arguments", tool_call.arguments},
            {"call_id", "fc_" + tool_call.id},
            {"name", tool_call.name},
        };
        events.push_back(json{
            {"event", "response.output_item.done"},
            {"data", json{
                {"type", "response.output_item.done"},
                {"item", output_item},
            }},
        });
        output.push_back(output_item);
    }

    std::time_t t = std::time(0);
    events.push_back(json{
        {"event", "response.completed"},
        {"data", json{
            {"type", "response.completed"},
            {"response", json{
                {"id", oai_resp_id},
                {"object", "response"},
                {"created_at", t},
                {"status", "completed"},
                {"model", oaicompat_model},
                {"output", output},
                {"usage", json{
                    {"input_tokens", n_prompt_tokens},
                    {"output_tokens", n_decoded},
                    {"total_tokens", n_decoded + n_prompt_tokens},
                }},
            }},
        }},
    });

    return events;
}

json server_task_result_cmpl_final::to_json_anthropic_final() {
    std::string stop_reason = "max_tokens";
    if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
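Taken together with `to_json_oaicompat_resp_partial()` above, a streamed `/v1/responses` request emits events in roughly the following order. This list is an illustrative summary derived from the code above, not an exhaustive protocol specification.

```python
# Approximate order of SSE events for a streamed /v1/responses request.
STREAMED_EVENT_ORDER = [
    "response.created",                        # first decoded token only
    "response.in_progress",                    # first decoded token only
    "response.output_item.added",              # once per reasoning/message/function_call item
    "response.reasoning_text.delta",           # per reasoning delta
    "response.content_part.added",             # once, before the first text delta
    "response.output_text.delta",              # per text delta
    "response.function_call_arguments.delta",  # per tool-call arguments delta
    "response.output_text.done",               # from the final result
    "response.content_part.done",              # from the final result
    "response.output_item.done",               # from the final result, per output item
    "response.completed",                      # from the final result
]
```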
@@ -42,6 +42,7 @@ enum oaicompat_type {
    OAICOMPAT_TYPE_COMPLETION,
    OAICOMPAT_TYPE_EMBEDDING,
    OAICOMPAT_TYPE_ANTHROPIC,
    OAICOMPAT_TYPE_RESP,
};

@@ -203,6 +204,14 @@ struct server_task_result_cmpl_partial : server_task_result {
    bool anthropic_thinking_block_started = false;
    bool anthropic_text_block_started = false;

    bool oai_resp_thinking_block_started = false;
    bool oai_resp_text_block_started = false;

    std::string oai_resp_id;
    std::string oai_resp_reasoning_id;
    std::string oai_resp_message_id;
    std::string oai_resp_fc_id;

    virtual bool is_stop() override {
        return false; // in stream mode, partial responses are not considered stop
    }

@@ -215,6 +224,8 @@ struct server_task_result_cmpl_partial : server_task_result {

    json to_json_oaicompat_chat_partial();

    json to_json_oaicompat_resp_partial();

    virtual json to_json() override {
        switch (oaicompat) {
            case OAICOMPAT_TYPE_NONE:

@@ -225,6 +236,8 @@ struct server_task_result_cmpl_partial : server_task_result {
                return to_json_oaicompat_chat_partial();
            case OAICOMPAT_TYPE_ANTHROPIC:
                return to_json_anthropic_partial();
            case OAICOMPAT_TYPE_RESP:
                return to_json_oaicompat_resp_partial();
            default:
                GGML_ASSERT(false && "Invalid oaicompat_type");
        };

@@ -232,6 +245,10 @@ struct server_task_result_cmpl_partial : server_task_result {
};

struct server_task_result_cmpl_final : server_task_result {
    std::string oai_resp_id;
    std::string oai_resp_reasoning_id;
    std::string oai_resp_message_id;

    virtual bool is_stop() override {
        return true;
    }

@@ -248,6 +265,10 @@ struct server_task_result_cmpl_final : server_task_result {

    json to_json_oaicompat_chat_stream();

    json to_json_oaicompat_resp_final();

    json to_json_oaicompat_resp_stream();

    virtual json to_json() override {
        switch (oaicompat) {
            case OAICOMPAT_TYPE_NONE:

@@ -258,6 +279,8 @@ struct server_task_result_cmpl_final : server_task_result {
                return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat_final();
            case OAICOMPAT_TYPE_ANTHROPIC:
                return stream ? to_json_anthropic_stream() : to_json_anthropic_final();
            case OAICOMPAT_TYPE_RESP:
                return stream ? to_json_oaicompat_resp_stream() : to_json_oaicompat_resp_final();
            default:
                GGML_ASSERT(false && "Invalid oaicompat_type");
        }
@@ -1070,7 +1070,12 @@ int main(int argc, char ** argv) {
            // Everything else, including multimodal completions.
            inputs = tokenize_input_prompts(llama_get_vocab(ctx_server.ctx), ctx_server.mctx, prompt, true, true);
        }
        tasks.reserve(inputs.size());
        const std::string requested_model_name = json_value(data, "model", std::string());
        const std::string fallback_model_name = get_model_name(ctx_server.params_base.model);
        const std::string oaicompat_model_name = requested_model_name.empty()
            ? fallback_model_name
            : requested_model_name;
        for (size_t i = 0; i < inputs.size(); i++) {
            server_task task = server_task(type);

@@ -1088,7 +1093,7 @@ int main(int argc, char ** argv) {
            // OAI-compat
            task.params.oaicompat = oaicompat;
            task.params.oaicompat_cmpl_id = completion_id;
-           task.params.oaicompat_model = get_model_name(ctx_server.params_base.model);
+           task.params.oaicompat_model = oaicompat_model_name;
            tasks.push_back(std::move(task));
        }

@@ -1146,6 +1151,9 @@ int main(int argc, char ** argv) {
            if (oaicompat == OAICOMPAT_TYPE_ANTHROPIC) {
                return server_sent_anthropic_event(sink, res);
            }
            else if (oaicompat == OAICOMPAT_TYPE_RESP) {
                return server_sent_oai_resp_event(sink, res);
            }
            else {
                return server_sent_event(sink, res);
            }

@@ -1170,7 +1178,7 @@ int main(int argc, char ** argv) {
            json res_json = result->to_json();
            bool ok = false;
            if (result->is_error()) {
-               ok = sse(json{ { "error", result->to_json() } });
+               ok = server_sent_event(sink, json{ { "error", result->to_json() } });
                sink.done();
                return false; // go to on_complete()
            }

@@ -1189,7 +1197,7 @@ int main(int argc, char ** argv) {

            // check if there is more data
            if (!rd->has_next()) {
-               if (oaicompat != OAICOMPAT_TYPE_ANTHROPIC && oaicompat != OAICOMPAT_TYPE_NONE) {
+               if (oaicompat != OAICOMPAT_TYPE_ANTHROPIC && oaicompat != OAICOMPAT_TYPE_NONE && oaicompat != OAICOMPAT_TYPE_RESP) {
                    static const std::string ev_done = "data: [DONE]\n\n";
                    sink.write(ev_done.data(), ev_done.size());
                }

@@ -1265,6 +1273,20 @@ int main(int argc, char ** argv) {
            OAICOMPAT_TYPE_CHAT);
    };

    const auto handle_responses = [&ctx_server, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
        auto body = json::parse(req.body);
        std::vector<raw_buffer> files;
        json body_parsed = convert_responses_to_chatcmpl(body);
        json data = oaicompat_chat_params_parse(ctx_server.model, body_parsed, ctx_server.oai_parser_opt, files);
        handle_completions_impl(
            SERVER_TASK_TYPE_COMPLETION,
            data,
            files,
            req.is_connection_closed,
            res,
            OAICOMPAT_TYPE_RESP);
    };

    const auto handle_anthropic_messages = [&ctx_server, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
        std::vector<raw_buffer> files;
        json body = json::parse(req.body);

@@ -2008,6 +2030,7 @@ int main(int argc, char ** argv) {
    svr->Post("/v1/completions", handle_completions_oai);
    svr->Post("/chat/completions", handle_chat_completions);
    svr->Post("/v1/chat/completions", handle_chat_completions);
    svr->Post("/v1/responses", handle_responses);
    svr->Post("/v1/messages", handle_anthropic_messages);
    svr->Post("/v1/messages/count_tokens", handle_anthropic_count_tokens);
    svr->Post("/infill", handle_infill);
@@ -71,6 +71,22 @@ Feature: llama.cpp server
      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128 | (thanks\|happy\|bird\|Annabyear)+ | -1 | 64 | enabled |  |


  Scenario Outline: OAI Responses Compatibility
    Given a model <model>
    And a system prompt <system_prompt>
    And a user prompt <user_prompt>
    And <max_tokens> max tokens to predict
    And streaming is <enable_streaming>
    Given an OAI compatible responses request with no api error
    Then <n_predicted> tokens are predicted matching <re_content>
    And <n_prompt> prompt tokens are processed

    Examples: Prompts
      | model        | system_prompt               | user_prompt                          | max_tokens | re_content                        | n_prompt | n_predicted | enable_streaming |
      | llama-2      | Book                        | What is the best book                | 8          | (Here\|what)+                     | 77       | 8           | disabled         |
      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128        | (thanks\|happy\|bird\|Annabyear)+ | -1       | 64          | enabled          |


  Scenario Outline: OAI Compatibility w/ response format
    Given a model test
    And a system prompt test
examples/server/tests/features/steps/responses_steps.py (new file, 191 lines)
@@ -0,0 +1,191 @@
import json
from typing import Any

import aiohttp
from behave import step  # pyright: ignore[reportAttributeAccessIssue]
from behave.api.async_step import async_run_until_complete

import steps


@step("an OAI compatible responses request with {api_error} api error")
@async_run_until_complete
async def step_oai_responses(context, api_error):
    if context.debug:
        print("Submitting OAI compatible responses request...")
    expect_api_error = api_error == "raised"
    seeds = await steps.completions_seed(context, num_seeds=1)
    completion = await oai_responses(
        context.prompts.pop(),
        seeds[0] if seeds is not None else seeds,
        context.system_prompt,
        context.base_url,
        debug=context.debug,
        model=context.model if hasattr(context, "model") else None,
        n_predict=context.n_predict if hasattr(context, "n_predict") else None,
        enable_streaming=context.enable_streaming
        if hasattr(context, "enable_streaming")
        else None,
        user_api_key=context.user_api_key if hasattr(context, "user_api_key") else None,
        temperature=context.temperature,
        expect_api_error=expect_api_error,
    )
    context.tasks_result.append(completion)
    if context.debug:
        print(f"Responses completion response: {completion}")
    if expect_api_error:
        assert completion == 401, f"completion must be a 401 status code: {completion}"


def extract_responses_output_text(
    response_json: dict[str, Any],
) -> tuple[str, str | None]:
    output_text = ""
    message_id = None
    for item in response_json.get("output", []):
        if item.get("type") != "message":
            continue
        message_id = item.get("id")
        for part in item.get("content", []):
            if part.get("type") == "output_text":
                output_text += part.get("text", "")
    return output_text, message_id


async def oai_responses(
    user_prompt,
    seed,
    system_prompt,
    base_url: str,
    debug=False,
    temperature=None,
    model=None,
    n_predict=None,
    enable_streaming=None,
    user_api_key=None,
    expect_api_error=None,
) -> int | dict[str, Any]:
    if debug:
        print(f"Sending OAI responses request: {user_prompt}")
    user_api_key = user_api_key if user_api_key is not None else "nope"
    seed = seed if seed is not None else 42
    enable_streaming = enable_streaming if enable_streaming is not None else False
    payload = {
        "input": [
            {
                "role": "system",
                "content": system_prompt,
            },
            {
                "role": "user",
                "content": user_prompt,
            },
        ],
        "model": model,
        "stream": enable_streaming,
        "temperature": temperature if temperature is not None else 0.0,
        "seed": seed,
    }
    if n_predict is not None:
        payload["max_output_tokens"] = n_predict
    completion_response = {
        "content": "",
        "timings": {
            "predicted_n": 0,
            "prompt_n": 0,
        },
    }
    origin = "llama.cpp"
    headers = {"Authorization": f"Bearer {user_api_key}", "Origin": origin}
    async with aiohttp.ClientSession() as session:
        async with session.post(
            f"{base_url}/v1/responses", json=payload, headers=headers
        ) as response:
            if expect_api_error is not None and expect_api_error:
                if response.status == 401:
                    return 401
                assert False, f"unexpected status code: {response.status}"

            assert response.status == 200
            assert response.headers["Access-Control-Allow-Origin"] == origin
            if enable_streaming:
                assert response.headers["Content-Type"] == "text/event-stream"
                resp_id = ""
                msg_id = ""
                gathered_text = ""
                event_name = None
                completed_response = None
                async for line_in_bytes in response.content:
                    line = line_in_bytes.decode("utf-8").strip()
                    if not line:
                        continue
                    if line.startswith("event: "):
                        event_name = line.split(": ", 1)[1]
                        continue
                    if not line.startswith("data: "):
                        continue
                    if event_name is None:
                        continue
                    chunk_raw = line.split(": ", 1)[1]
                    data = json.loads(chunk_raw)

                    if event_name == "response.created":
                        resp_id = data["response"]["id"]
                        assert resp_id.startswith("resp_")
                    elif event_name == "response.in_progress":
                        assert data["response"]["id"] == resp_id
                    elif event_name == "response.output_item.added":
                        item = data["item"]
                        if item.get("type") == "message":
                            msg_id = item["id"]
                            assert msg_id.startswith("msg_")
                    elif event_name in (
                        "response.content_part.added",
                        "response.output_text.delta",
                        "response.output_text.done",
                        "response.content_part.done",
                    ):
                        assert data["item_id"] == msg_id
                    elif event_name == "response.output_item.done":
                        item = data["item"]
                        if item.get("type") == "message":
                            assert item["id"] == msg_id
                    if event_name == "response.output_text.delta":
                        gathered_text += data["delta"]
                    if event_name == "response.completed":
                        completed_response = data["response"]

                assert completed_response is not None
                output_text, completed_msg_id = extract_responses_output_text(
                    completed_response
                )
                assert completed_msg_id is not None
                assert completed_msg_id.startswith("msg_")
                assert output_text == gathered_text
                completion_response = {
                    "content": output_text,
                    "timings": {
                        "predicted_n": completed_response["usage"]["output_tokens"],
                        "prompt_n": completed_response["usage"]["input_tokens"],
                    },
                }
            else:
                assert (
                    response.headers["Content-Type"]
                    == "application/json; charset=utf-8"
                )
                response_json = await response.json()
                assert response_json["id"].startswith("resp_")
                output_text, message_id = extract_responses_output_text(response_json)
                assert message_id is not None
                assert message_id.startswith("msg_")
                completion_response = {
                    "content": output_text,
                    "timings": {
                        "predicted_n": response_json["usage"]["output_tokens"],
                        "prompt_n": response_json["usage"]["input_tokens"],
                    },
                }
    if debug:
        print("OAI response formatted to llama.cpp:", completion_response)
    return completion_response