server: stop processing the prompt when client disconnects (#1134)

Implement generator-based API for task results; update httplib.h to 0.27.0; fix embedding error; stop prompt processing when disconnected. Co-authored-by: firecoperana <firecoperana>
2026-04-30 19:31:48 +00:00 · 2026-01-12 23:56:59 -06:00
parent d3e3ad40f9
commit 1a461525d5
24 changed files with 7654 additions and 4549 deletions
--- a/include/llama.h
+++ b/include/llama.h
@@ -602,13 +602,12 @@ extern "C" {
    LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
    LLAMA_API const struct llama_vocab* llama_get_model_vocab(const struct llama_model* model);
    LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
-    LLAMA_API int32_t llama_n_embd     (const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_embd     (const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_embd_inp(const struct llama_model* model);
    
    LLAMA_API int32_t llama_n_layer    (const struct llama_model * model);

    // Compat
-    static    int32_t     llama_model_n_embd(const struct llama_model * model) { return llama_n_embd(model); }
    LLAMA_API bool        llama_vocab_get_add_bos(const struct llama_vocab * vocab);
    LLAMA_API bool        llama_vocab_get_add_eos(const struct llama_vocab * vocab);
    LLAMA_API int32_t     llama_vocab_n_tokens(const struct llama_vocab * vocab);
@@ -1518,6 +1517,8 @@ std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
        const std::string & src,
        llama_partial_utf8 partial_start);

+
+
 // Randomly selects a token from the candidates based on their probabilities using given std::mt19937.
 // This is a temporary workaround in order to fix race conditions when sampling with multiple sequences.
 llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng);