From de3f33027313fe5c3c92db76152f8bea5836597d Mon Sep 17 00:00:00 2001
From: Yap Sok Ann
Date: Mon, 24 Nov 2025 12:52:15 +0700
Subject: [PATCH] Fix truncated logprobs when streaming is off (#998)

The logic to skip the logprobs of the stop token was originally from
ggml-org/llama.cpp#2849, and was later modified as part of
ggml-org/llama.cpp#10643 to be applied only to STOP_TYPE_WORD. The
latter change wasn't included in #723. Then, after #958 got merged, the
logic was inadvertently applied to GLM-4.5/4.6 and Kimi K2, resulting
in truncated logprobs when streaming is off.

This commit reverts the logic from ggml-org/llama.cpp#2849, so that the
logprobs of the stop token are always included in the response when
logprobs is enabled. From testing, this matches the behavior of the
Fireworks inference server, for both the chat completions and text
completions endpoints.

Also fix logprobs param handling for the text completion endpoint.
---
 examples/server/server.cpp | 15 +++------------
 examples/server/utils.hpp  |  6 ++++++
 2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index a77b3b7a..18d2af8a 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2646,18 +2646,9 @@ struct server_context {
 
         // populate res.probs_output
         if (slot.sparams.n_probs > 0) {
-            if (!slot.params.stream && slot.stopped_word) {
-                const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);
-
-                size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size());
-                res.probs_output = std::vector<completion_token_output>(
-                    slot.generated_token_probs.begin(),
-                    slot.generated_token_probs.end() - safe_offset);
-            } else {
-                res.probs_output = std::vector<completion_token_output>(
-                    slot.generated_token_probs.begin(),
-                    slot.generated_token_probs.end());
-            }
+            res.probs_output = std::vector<completion_token_output>(
+                slot.generated_token_probs.begin(),
+                slot.generated_token_probs.end());
             res.data["completion_probabilities"] = probs_vector_to_json(ctx, res.probs_output);
         }
 
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 24dcb044..3c4b6dd1 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -466,6 +466,12 @@ static json oaicompat_chat_params_parse(const json& body) {
         throw std::runtime_error("Only no echo is supported");
     }
 
+    // Handle "logprobs" field
+    int n_probs = json_value(body, "logprobs", 0);
+    if (n_probs > 0) {
+        llama_params["n_probs"] = n_probs;
+    }
+
     // Params supported by OAI but unsupported by llama.cpp
     static const std::vector<std::string> unsupported_params{ "best_of", "suffix" };
     for (const auto& param : unsupported_params) {
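
For illustration (a standalone sketch, not part of the patch): the removed
branch dropped the last stop_word_toks.size() entries from the probs list
whenever a non-streaming request stopped on a word. The snippet below
reproduces that arithmetic with plain strings standing in for the server's
completion_token_output entries; the token data and variable names are
hypothetical.

    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
        // Hypothetical stand-in for slot.generated_token_probs: one entry
        // per generated token, with the stop token at the end.
        std::vector<std::string> generated_token_probs = {
            "Hello", ",", " world", "<|stop|>",
        };
        // Hypothetical stand-in for the tokenized slot.stopping_word.
        std::vector<std::string> stop_word_toks = { "<|stop|>" };

        // Removed behavior (ggml-org/llama.cpp#2849): drop the stop word's
        // tokens from the tail of the probs list.
        size_t safe_offset = std::min(generated_token_probs.size(), stop_word_toks.size());
        std::vector<std::string> old_probs(generated_token_probs.begin(),
                                           generated_token_probs.end() - safe_offset);

        // New behavior (this patch): always keep the full list.
        std::vector<std::string> new_probs(generated_token_probs.begin(),
                                           generated_token_probs.end());

        std::cout << "old: " << old_probs.size() << " entries\n";  // prints 3
        std::cout << "new: " << new_probs.size() << " entries\n";  // prints 4
    }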
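
Likewise, a minimal sketch of the new "logprobs" param handling, using
nlohmann/json's value() in place of the server's json_value() helper (an
assumption about that helper's semantics; the request body is made up):

    #include <iostream>
    #include <nlohmann/json.hpp>

    using json = nlohmann::json;

    int main() {
        // A text-completion request body with logprobs requested.
        json body = { { "prompt", "Hello" }, { "logprobs", 5 } };

        // Mirrors the utils.hpp hunk: map the OAI "logprobs" integer onto
        // llama.cpp's "n_probs" sampling parameter.
        json llama_params;
        int n_probs = body.value("logprobs", 0);
        if (n_probs > 0) {
            llama_params["n_probs"] = n_probs;
        }

        std::cout << llama_params.dump() << "\n";  // {"n_probs":5}
    }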