server: stop processing the prompt when client disconnects (#1134)

Implement generator-based API for task results

Update httplib.h to 0.27.0

Fix embedding error

Stop prompt processing when disconnected

Co-authored-by: firecoperana <firecoperana>
firecoperana
2026-01-12 23:56:59 -06:00
committed by GitHub
parent d3e3ad40f9
commit 1a461525d5
24 changed files with 7654 additions and 4549 deletions
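At a glance, the change works like this: the server calls the new llama_decode_reset() before it starts a task, watches the HTTP connection while the prompt is being decoded, and calls llama_decode_stop() as soon as the client goes away; llama_decode() then returns early with a new -3 code. A minimal sketch of that wiring, assuming a hypothetical request handler and connection probe (only the llama_* calls are real API touched by this commit):

// Sketch only: run_one_request() and request_is_alive() are illustrative
// stand-ins for the server's actual handler; llama_decode_reset(),
// llama_decode_stop() and the -3 return code are what this commit adds.
#include <atomic>
#include <chrono>
#include <functional>
#include <thread>

#include "llama.h"

void run_one_request(llama_context * ctx, llama_batch batch,
                     std::function<bool()> request_is_alive) {
    llama_decode_reset();                      // clear any stop flag left over from the previous task

    std::atomic<bool> done{false};
    std::thread watcher([&]() {                // poll the connection while decoding runs
        while (!done.load()) {
            if (!request_is_alive()) {
                llama_decode_stop();           // client disconnected: ask the decoder to bail out
                break;
            }
            std::this_thread::sleep_for(std::chrono::milliseconds(100));
        }
    });

    const int32_t ret = llama_decode(ctx, batch);
    done.store(true);
    watcher.join();

    if (ret == -3) {
        return;                                // prompt processing was aborted, drop the task
    }
    // ... otherwise sample and stream the result as usual ...
}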


@@ -368,9 +368,8 @@ ggml_cgraph * llm_build_context::append_pooling(struct ggml_cgraph * gf) {
         inp = gf->nodes[i];
         if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
             break;
-        } else {
-            inp = nullptr;
         }
+        inp = nullptr;
     }
     GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor");
@@ -1767,7 +1766,7 @@ static ggml_tensor * build_output(llama_context & lctx, ggml_context * ctx, ggml
         if (output_norm) {
             auto the_norm = split_output_norm ? split_output_norm->splits[id] : output_norm;
             auto cur_normed = llm_build_context::llm_build_norm(ctx, cur, lctx.model.hparams, the_norm, NULL, LLM_NORM_RMS, cb, -1);
-            cb(cur_normed, "output_normed", 1000*(id+1));
+            cb(cur_normed, "result_norm", 1000*(id+1));
             o.push_back(llm_build_context::llm_build_lora_mm(lctx, ctx, split, cur_normed));
         } else {
             o.push_back(llm_build_context::llm_build_lora_mm(lctx, ctx, split, cur));
@@ -1787,7 +1786,7 @@ static ggml_tensor * build_output(llama_context & lctx, ggml_context * ctx, ggml
     } else {
         if (output_norm) {
             cur = llm_build_context::llm_build_norm(ctx, cur, lctx.model.hparams, output_norm, NULL, LLM_NORM_RMS, cb, -1);
-            cb(cur, "output_normed", -1);
+            cb(cur, "result_norm", -1);
         }
         cur = llm_build_context::llm_build_lora_mm(lctx, ctx, output, cur);
     }


@@ -229,3 +229,6 @@ struct llama_split_tensor {
     std::vector<ggml_tensor *> tensor_splits;
     ggml_split_tensor_t ggml;
 };
+
+void llama_decode_reset();
+void llama_decode_stop();


@@ -142,6 +142,15 @@ static std::string trim(const std::string & str) {
     return str.substr(start, end - start);
 }
 
+static bool stop_internal_decode = false;
+
+void llama_decode_reset() {
+    stop_internal_decode = false;
+}
+
+void llama_decode_stop() {
+    stop_internal_decode = true;
+}
+
 static std::vector<std::string> string_split(const std::string& str, const std::string& delimiter) {
     std::vector<std::string> parts;
@@ -3077,6 +3086,10 @@ static int llama_decode_internal(
             //kv_self.n = llama_kv_cache_cell_max(kv_self);
         }
     }
+
+    if (stop_internal_decode) {
+        return -3;
+    }
 #if IK_PRINT_TIMING
     auto tim2 = ggml_time_us();
     printf("prelude(...): %d us\n", int(tim2-tim1));
@@ -5039,7 +5052,7 @@ int32_t llama_n_ctx_train(const struct llama_model * model) {
     return model->hparams.n_ctx_train;
 }
 
-int32_t llama_n_embd(const struct llama_model * model) {
+int32_t llama_model_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }
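The last hunk renames llama_n_embd to llama_model_n_embd, so callers that used the old accessor need the new name. A minimal usage sketch (the buffer handling is illustrative):

// Sketch only: the llama_* calls are real API, the buffer sizing is illustrative.
#include <vector>

#include "llama.h"

std::vector<float> alloc_embedding_buffer(llama_context * ctx) {
    const llama_model * model = llama_get_model(ctx);
    const int32_t n_embd = llama_model_n_embd(model);        // was llama_n_embd(model)
    return std::vector<float>((size_t) n_embd, 0.0f);        // room for one pooled embedding
}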