server: propagate task index to response objects for batch requests (#1303)

When multiple prompts are sent in a single /v1/completions request,
each response needs to carry the correct index so the client can
match results to their corresponding prompts. The index field was
not being set on partial responses, final responses, or embedding
responses, so every result in a batch reported index 0.

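Set res->index = slot.task->index in send_partial_response,
send_final_response, and send_embedding. In send_embedding the same
value is also assigned through the qualified name
res->server_task_result::index.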

Generated with [Devin](https://cli.devin.ai/docs)

Co-authored-by: Joshua Jolley <jjolley@clearwateranalytics.com>
Co-authored-by: Devin <noreply@cognition.ai>
This commit is contained in:
Joshua Jolley
2026-02-24 07:39:38 -07:00
committed by GitHub
parent aaa545c3dc
commit 68431b049a

View File

@@ -1644,6 +1644,7 @@ void server_context::send_partial_response(server_slot& slot, completion_token_o
res->final_result = false;
res->id = slot.id_task;
res->id_multi = slot.id_multi;
res->index = slot.task->index;
res->error = false;
res->stop = false;
res->stream = slot.params.stream;
@@ -1715,6 +1716,7 @@ void server_context::send_final_response(server_slot& slot) {
res->final_result = true;
res->id = slot.id_task;
res->id_multi = slot.id_multi;
res->index = slot.task->index;
res->error = false;
res->stop = true; // to do: set value
res->stream = slot.params.stream;
@@ -1770,6 +1772,8 @@ void server_context::send_final_response(server_slot& slot) {
void server_context::send_embedding(const server_slot& slot, const llama_batch& batch) {
auto res = std::make_unique<server_task_result_embd>();
res->id = slot.task->id;
res->index = slot.task->index;
res->server_task_result::index = slot.task->index;
res->n_tokens = slot.prompt_tokens.size();
res->oaicompat = slot.task->params.oaicompat;