server: stop processing the prompt when client disconnects (#1134)

Implement a generator-based API for task results (a sketch follows below)

Update httplib.h to 0.27.0

Fix embedding error

Stop prompt processing when the client disconnects (sketched below)
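
A minimal sketch of the disconnect check, assuming cpp-httplib's Request::is_connection_closed callback; the loop and function names below are illustrative, not the actual server code:

#include <cpp-httplib/httplib.h>
#include <functional>

// Hypothetical per-request loop: decode the prompt batch by batch and
// poll the connection in between, so a vanished client stops the work
// instead of wasting compute on a response nobody will read.
static void process_prompt(const httplib::Request & req,
                           const std::function<bool()> & decode_next_batch) {
    while (decode_next_batch()) {
        if (req.is_connection_closed()) {
            return; // client went away: abandon the remaining prompt
        }
    }
}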
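For the generator-based result API, a rough sketch; the type and method names are assumptions, not the commit's actual definitions:

#include <functional>
#include <memory>
#include <string>

// Hypothetical pull-style generator: the HTTP handler asks for one
// result at a time, which gives it a natural point to stop early.
struct task_result_generator {
    virtual ~task_result_generator() = default;
    // nullptr signals that the task has produced its last result.
    virtual std::unique_ptr<std::string> next() = 0;
};

static void stream_results(task_result_generator & gen,
                           const std::function<bool()> & connection_alive) {
    while (auto chunk = gen.next()) {
        if (!connection_alive()) {
            break; // disconnected: stop pulling further results
        }
        // ... write *chunk to the HTTP response ...
    }
}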

Co-authored-by: firecoperana <firecoperana>
Author: firecoperana
Date: 2026-01-12 23:56:59 -06:00
Committed by: GitHub
Parent: d3e3ad40f9
Commit: 1a461525d5
24 changed files with 7654 additions and 4549 deletions


@@ -27,15 +27,15 @@
 #include <random>
 #include <set>
-// increase max payload length to allow use of larger context size
-#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
-// increase backlog size to avoid connection resets for >> 1 slots
-#define CPPHTTPLIB_LISTEN_BACKLOG 512
-// increase max URI length to handle longer prompts in query string
-#define CPPHTTPLIB_REQUEST_URI_MAX_LENGTH 32768
-// disable Nagle's algorithm
-#define CPPHTTPLIB_TCP_NODELAY true
-#include "httplib.h"
+//// increase max payload length to allow use of larger context size
+//#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
+//// increase backlog size to avoid connection resets for >> 1 slots
+//#define CPPHTTPLIB_LISTEN_BACKLOG 512
+//// increase max URI length to handle longer prompts in query string
+//#define CPPHTTPLIB_REQUEST_URI_MAX_LENGTH 32768
+//// disable Nagle's algorithm
+//#define CPPHTTPLIB_TCP_NODELAY true
+#include <cpp-httplib/httplib.h>
 #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
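
These tuning macros are compile-time options: cpp-httplib reads them while the header is preprocessed, so they only take effect when defined before the first include. With the switch to the packaged <cpp-httplib/httplib.h> the old block is kept commented out; presumably the values now come from the build system instead. A minimal sketch of the ordering requirement, reusing two of the old values:

// The defines must precede the include; adding them afterwards is a
// no-op because the header has already been compiled with its defaults.
#define CPPHTTPLIB_LISTEN_BACKLOG 512 // avoid connection resets for >> 1 slots
#define CPPHTTPLIB_TCP_NODELAY true   // disable Nagle's algorithm
#include <cpp-httplib/httplib.h>
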
@@ -459,3 +459,6 @@ void print_files_info(const std::vector<raw_buffer>& files);
+bool prompt_cache_equal(llama_context* ctx, const server_tokens& cache_tokens,
+                        const server_tokens& prompt_tokens, size_t start, const common_prefix& prefix);
+std::string safe_json_to_str(const json& data);
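
safe_json_to_str presumably serializes a response without throwing when model output contains invalid UTF-8. A sketch of one way to implement it with nlohmann::json's replace error handler (an assumption, not necessarily the commit's actual body):

#include <string>
#include <nlohmann/json.hpp>
using json = nlohmann::ordered_json;

// Sketch: dump with the 'replace' handler so malformed byte sequences
// become U+FFFD instead of raising type_error.316 mid-response.
static std::string safe_json_to_str_sketch(const json & data) {
    return data.dump(-1, ' ', false, json::error_handler_t::replace);
}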