Bug fixes for completions and prompt caching in server (#906)

* Bug fixes for completions and prompt caching in server

* Fix compiler warning about redefinition

---------

Co-authored-by: firecoperana <firecoperana>
Author: firecoperana
Date: 2025-11-06 05:10:51 +00:00
Committed by: GitHub
parent 50f95d7bf3
commit 18f5a6caef
2 changed files with 96 additions and 63 deletions


@@ -15,13 +15,7 @@
 // crash the server in debug mode, otherwise send an http 500 error
 #define CPPHTTPLIB_NO_EXCEPTIONS 1
 #endif
-// increase max payload length to allow use of larger context size
-#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
-// disable Nagle's algorithm
-#define CPPHTTPLIB_TCP_NODELAY true
 #include "httplib.h"
-// Change JSON_ASSERT from assert() to GGML_ASSERT:
-#define JSON_ASSERT GGML_ASSERT
 #include <nlohmann/json.hpp>
 #include "index.html.gz.hpp"
 #include "index_llamacpp.html.gz.hpp"
@@ -3050,7 +3044,7 @@ struct server_context {
             GGML_ASSERT(slot.ga_n == 1);

             // reuse any previously computed tokens that are common with the new prompt
-            slot.n_past = common_part(slot.cache_tokens.tokens_data(), prompt_tokens.tokens_data());
+            slot.n_past = slot.cache_tokens.get_common_prefix(prompt_tokens);

             // push the prompt into the sampling context (do not apply grammar)
             for (int i = 0; i < slot.n_past; ++i) {
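This is the prompt-caching change: n_past counts the leading tokens of the new prompt that match what is already in the slot's KV cache, so only the tail has to be re-evaluated, and get_common_prefix moves that comparison into server_tokens, which knows the cache's own token representation. A minimal sketch of the underlying idea, assuming plain token-id vectors and a hypothetical free function (the real method also has to cope with multimodal chunks):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Count how many leading tokens two prompts share; KV-cache entries for
    // that prefix can be reused instead of being decoded again.
    static size_t common_prefix_len(const std::vector<int32_t> & a,
                                    const std::vector<int32_t> & b) {
        size_t i = 0;
        while (i < a.size() && i < b.size() && a[i] == b[i]) {
            ++i;
        }
        return i;
    }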
@@ -3137,7 +3131,6 @@ struct server_context {
                 {
                     const auto& chunk = slot.prompt_tokens.find_chunk(slot.n_past);
                     slot.cache_tokens.push_back(chunk.get()); // copy
-                    fprintf(stdout, slot.cache_tokens.detokenize(ctx, true).c_str());
                 }

                 slot.n_past += n_pos;
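Besides silencing debug noise, deleting this print removes a latent format-string bug: the detokenized cache contents were passed as fprintf's format argument, so any '%' produced by the model would have been interpreted as a conversion specifier. Had the print been kept, the safe form is:

    // Route untrusted text through an explicit "%s"; never pass it as the format.
    fprintf(stdout, "%s", slot.cache_tokens.detokenize(ctx, true).c_str());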
@@ -4293,14 +4286,15 @@ int main(int argc, char ** argv) {
             }

             const auto& prompt = data.at("prompt");
-            fprintf(stdout, prompt.get<std::string>().c_str());

             // process prompt
             std::vector<server_tokens> inputs;

             if (oaicompat && ctx_server.mctx != nullptr) {
                 // This is the case used by OAI compatible chat path with MTMD. TODO It can be moved to the path below.
-                printFilesInfo(files);
+#ifndef NDEBUG
+                print_files_info(files);
+#endif // !NDEBUG
                 inputs.push_back(process_mtmd_prompt(ctx_server.mctx, prompt.get<std::string>(), files));
             }
             else {
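Here the same fprintf-as-format problem is resolved by deleting the prompt echo outright, and the file-info dump is renamed to snake_case and compiled only into debug builds: NDEBUG is the standard macro that release configurations define, the same switch that disables assert. A self-contained illustration of that pattern, with a hypothetical trace helper:

    #include <cassert>
    #include <cstdio>

    // Debug-only tracing: the body compiles away entirely when NDEBUG is set.
    static void trace(const char * msg) {
    #ifndef NDEBUG
        fprintf(stderr, "[debug] %s\n", msg);
    #else
        (void) msg; // release build: no-op
    #endif
    }

    int main() {
        trace("processing multimodal files");
        assert(1 + 1 == 2); // likewise removed when NDEBUG is defined
        return 0;
    }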
@@ -4346,31 +4340,26 @@ int main(int argc, char ** argv) {
                 if (!result.error) {
                     result.oaicompat = oaicompat;
                     result.oaicompat_cmpl_id = completion_id;
-                    json result_array;
+                    json res_json;
                     if (oaicompat) {
                         if (result.final_result) {
-                            result_array = result.to_json_final();
+                            res_json = result.to_json_final();
                         }
                         else {
-                            result_array = result.to_json_partial();
+                            res_json = result.to_json_partial();
                         }
                     }
                     else {
                         // legacy completions
-                        result_array = result.data;
+                        res_json = result.data;
                     }
-                    if (result_array.is_array()) {
-                        for (auto it = result_array.begin(); it != result_array.end(); ++it) {
-                            if (!it->empty()) {
-                                const std::string str =
-                                    "data: " +
-                                    it->dump(-1, ' ', false, json::error_handler_t::replace) +
-                                    "\n\n";
-                                LOG_VERBOSE("data stream", { {"to_send", str} });
-                                if (!sink.write(str.c_str(), str.size())) {
-                                    ctx_server.queue_results.remove_waiting_task_id(id_task);
-                                    return false;
-                                }
+                    if (res_json.is_array()) {
+                        // chat completions and oai completions
+                        for (const auto& res : res_json) {
+                            if (!server_sent_event(sink, res)) {
+                                // sending failed (HTTP connection closed), cancel the generation
+                                ctx_server.queue_results.remove_waiting_task_id(id_task);
+                                return false;
                             }
                         }
                         if (result.stop) {
@@ -4378,14 +4367,19 @@ int main(int argc, char ** argv) {
                             break;
                         }
                     }
+                    else {
+                        // legacy completions
+                        if (!server_sent_event(sink, res_json)) {
+                            ctx_server.queue_results.remove_waiting_task_id(id_task);
+                            return false;
+                        }
+                        if (result.stop) {
+                            break;
+                        }
+                    }
                 }
                 else {
-                    const std::string str =
-                        "error: " +
-                        result.data.dump(-1, ' ', false, json::error_handler_t::replace) +
-                        "\n\n";
-                    LOG_VERBOSE("data stream", { {"to_send", str} });
-                    if (!sink.write(str.c_str(), str.size())) {
+                    if (!server_sent_event(sink, result.data)) {
                         ctx_server.queue_results.remove_waiting_task_id(id_task);
                         return false;
                     }
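Across these two hunks, every streamed payload (chat chunks, legacy completions, and errors alike) now goes through server_sent_event instead of hand-rolled "data: "/"error: " framing, and the new else branch fixes the completions bug: non-array results were previously dropped without ever being written to the sink, and never triggered the stop/break handling. A sketch of what the helper can look like, assuming the two-argument form used here (the exact signature is an assumption; upstream llama.cpp's helper also takes an event name):

    #include "httplib.h"
    #include <nlohmann/json.hpp>

    // Serialize one JSON payload as a server-sent-event frame and report
    // whether the client is still connected (write() fails once it closes).
    static bool server_sent_event(httplib::DataSink & sink, const nlohmann::json & data) {
        const std::string str =
            "data: " +
            data.dump(-1, ' ', false, nlohmann::json::error_handler_t::replace) +
            "\n\n";
        return sink.write(str.c_str(), str.size());
    }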
@@ -4436,7 +4430,7 @@ int main(int argc, char ** argv) {
                     data,
                     files,
                     res,
-                    OAICOMPAT_TYPE_CHAT);
+                    OAICOMPAT_TYPE_COMPLETION);
             };

     const auto handle_models = [&params, &model_meta](const httplib::Request & req, httplib::Response & res) {
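The last hunk corrects the compatibility mode this handler passes down (by its position just before handle_models, apparently the legacy completions endpoint): tagging it OAICOMPAT_TYPE_CHAT made results get serialized as chat-completion chunks. The practical difference is the payload shape; roughly, with abbreviated illustrative objects rather than the server's exact output:

    #include <cstdio>
    #include <nlohmann/json.hpp>

    using json = nlohmann::json;

    int main() {
        // A legacy completion chunk exposes the generated text directly...
        const json completion_chunk = {
            {"object", "text_completion"},
            {"choices", json::array({ { {"index", 0}, {"text", "Hello"} } })},
        };
        // ...while a chat chunk nests it under choices[].delta.content.
        const json chat_chunk = {
            {"object", "chat.completion.chunk"},
            {"choices", json::array({ { {"index", 0}, {"delta", { {"content", "Hello"} }} } })},
        };
        printf("%s\n%s\n", completion_chunk.dump(2).c_str(), chat_chunk.dump(2).c_str());
        return 0;
    }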