From f1ccf340ddedf57399c707d49be0bd37cbbc0c01 Mon Sep 17 00:00:00 2001
From: firecoperana <18252262+firecoperana@users.noreply.github.com>
Date: Sat, 7 Feb 2026 10:31:39 -0600
Subject: [PATCH] fix model name missing in final response (#1250)

Co-authored-by: firecoperana
---
 examples/server/server-context.cpp | 11 +++++------
 examples/server/server-context.h   |  1 +
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/server/server-context.cpp b/examples/server/server-context.cpp
index 8e71b2db..a7d87257 100644
--- a/examples/server/server-context.cpp
+++ b/examples/server/server-context.cpp
@@ -1631,12 +1631,11 @@ void server_context::send_final_response(server_slot& slot) {
     res->timings = slot.get_timings();
     res->post_sampling_probs = slot.params.post_sampling_probs;
     res->oaicompat = slot.params.oaicompat;
-    res->oaicompat_model = slot.params.oaicompat_model;
     res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;
     res->oaicompat_msg = slot.update_chat_msg(res->oaicompat_msg_diffs);
     res->n_decoded = slot.n_decoded;
     res->n_prompt_tokens = slot.n_prompt_tokens;
-    res->oaicompat_model = slot.oaicompat_model;
+    res->oaicompat_model = slot.task->params.oaicompat_model;
     res->data = json{
         {"content", !slot.params.stream ? slot.generated_text : ""},
         {"generated_text", slot.generated_text}, // Always include full text for finish_reason logic
@@ -2590,9 +2589,9 @@ void server_context::batch_pending_prompt(const int32_t n_ubatch, const int32_t
 
             slot.state = SLOT_STATE_PROCESSING;
             slot.command = SLOT_COMMAND_NONE;
+            send_final_response(slot);
             slot.release();
             slot.print_timings();
-            send_final_response(slot);
             continue;
         }
 
@@ -2933,9 +2932,9 @@ void server_context::speculative_decoding_accept() {
 
         if (!process_token(result, slot)) {
            // release slot because of stop condition
+           send_final_response(slot);
            slot.release();
            slot.print_timings();
-           send_final_response(slot);
            metrics.on_prediction(slot);
            break;
        }
@@ -2953,7 +2952,7 @@ void server_context::speculative_decoding_accept() {
 
 bool server_context::accept_special_token(const server_slot& slot, const llama_token token) {
     return params_base.special || slot.sparams.preserved_tokens.find(token) != slot.sparams.preserved_tokens.end();
-};
+}
 
 void server_context::send_token_results(completion_token_outputs& results, server_slot& slot, int32_t n) {
 
@@ -2962,9 +2961,9 @@ void server_context::send_token_results(completion_token_outputs& results, serve
        bool has_next = process_token(it, slot);
        count++;
        if (!has_next) {
+           send_final_response(slot);
           slot.release();
           slot.print_timings();
-          send_final_response(slot);
           metrics.on_prediction(slot);
           break;
       }
diff --git a/examples/server/server-context.h b/examples/server/server-context.h
index a5676e04..7c5f4267 100644
--- a/examples/server/server-context.h
+++ b/examples/server/server-context.h
@@ -336,4 +336,5 @@ struct server_context {
 
     // Re-aggregates all active vectors and updates the model state
     bool apply_control_vectors_internal();
+
 };
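
Note (not part of the patch above): the reordering matters because slot.release() tears down the slot's task state, so anything the final response reads through slot.task, including the OpenAI-compatible model name, must be read before the slot is released. A minimal standalone sketch of that pattern follows; the types and fields (server_slot, server_task, task_params, oaicompat_model) are simplified stand-ins, not the real server definitions.

```cpp
// Sketch only: simplified stand-ins showing why the final response must be
// built before the slot is released. Names are assumptions, not the real API.
#include <cassert>
#include <iostream>
#include <memory>
#include <string>

struct task_params {
    std::string oaicompat_model;   // model name the client asked for
};

struct server_task {
    task_params params;
};

struct server_slot {
    std::shared_ptr<server_task> task;
    std::string generated_text;

    // Mirrors the slot reset: the task (and its params) goes away here.
    void release() { task.reset(); }
};

// Builds the final response; must run while slot.task is still attached.
std::string send_final_response(const server_slot& slot) {
    assert(slot.task && "call before slot.release()");
    return "model=" + slot.task->params.oaicompat_model +
           " content=" + slot.generated_text;
}

int main() {
    server_slot slot;
    slot.task = std::make_shared<server_task>();
    slot.task->params.oaicompat_model = "my-model";
    slot.generated_text = "hello";

    // Order used by the patch: finalize first, then release the slot.
    std::cout << send_final_response(slot) << "\n";
    slot.release();
    return 0;
}
```

With the old order, the task would already be reset by the time the model name is copied into the response, which matches the missing-model-name symptom named in the subject line.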