server: stop processing the prompt when client disconnects (#1134)

Implement generator-based API for task results

Update httplib.h to 0.27.0

Fix embedding error

Stop prompt processing when disconnected

Co-authored-by: firecoperana <firecoperana>
firecoperana
2026-01-12 23:56:59 -06:00
committed by GitHub
parent d3e3ad40f9
commit 1a461525d5
24 changed files with 7654 additions and 4549 deletions
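At a glance, the change works like this: the server calls the new llama_decode_reset() before it starts a task, watches the HTTP connection while the prompt is being decoded, and calls llama_decode_stop() as soon as the client goes away; llama_decode() then returns early with a new -3 code. A minimal sketch of that wiring, assuming a hypothetical request handler and connection probe (only the llama_* calls are real API touched by this commit):

// Sketch only: run_one_request() and request_is_alive() are illustrative
// stand-ins for the server's actual handler; llama_decode_reset(),
// llama_decode_stop() and the -3 return code are what this commit adds.
#include <atomic>
#include <chrono>
#include <functional>
#include <thread>

#include "llama.h"

void run_one_request(llama_context * ctx, llama_batch batch,
                     std::function<bool()> request_is_alive) {
    llama_decode_reset();                      // clear any stop flag left over from the previous task

    std::atomic<bool> done{false};
    std::thread watcher([&]() {                // poll the connection while decoding runs
        while (!done.load()) {
            if (!request_is_alive()) {
                llama_decode_stop();           // client disconnected: ask the decoder to bail out
                break;
            }
            std::this_thread::sleep_for(std::chrono::milliseconds(100));
        }
    });

    const int32_t ret = llama_decode(ctx, batch);
    done.store(true);
    watcher.join();

    if (ret == -3) {
        return;                                // prompt processing was aborted, drop the task
    }
    // ... otherwise sample and stream the result as usual ...
}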


@@ -368,9 +368,8 @@ ggml_cgraph * llm_build_context::append_pooling(struct ggml_cgraph * gf) {
         inp = gf->nodes[i];
         if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
             break;
-        } else {
-            inp = nullptr;
         }
+        inp = nullptr;
     }
     GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor");
@@ -1767,7 +1766,7 @@ static ggml_tensor * build_output(llama_context & lctx, ggml_context * ctx, ggml
         if (output_norm) {
             auto the_norm = split_output_norm ? split_output_norm->splits[id] : output_norm;
             auto cur_normed = llm_build_context::llm_build_norm(ctx, cur, lctx.model.hparams, the_norm, NULL, LLM_NORM_RMS, cb, -1);
-            cb(cur_normed, "output_normed", 1000*(id+1));
+            cb(cur_normed, "result_norm", 1000*(id+1));
             o.push_back(llm_build_context::llm_build_lora_mm(lctx, ctx, split, cur_normed));
         } else {
             o.push_back(llm_build_context::llm_build_lora_mm(lctx, ctx, split, cur));
@@ -1787,7 +1786,7 @@ static ggml_tensor * build_output(llama_context & lctx, ggml_context * ctx, ggml
     } else {
         if (output_norm) {
             cur = llm_build_context::llm_build_norm(ctx, cur, lctx.model.hparams, output_norm, NULL, LLM_NORM_RMS, cb, -1);
-            cb(cur, "output_normed", -1);
+            cb(cur, "result_norm", -1);
         }
         cur = llm_build_context::llm_build_lora_mm(lctx, ctx, output, cur);
     }


@@ -229,3 +229,6 @@ struct llama_split_tensor {
     std::vector<ggml_tensor *> tensor_splits;
     ggml_split_tensor_t ggml;
 };
+
+void llama_decode_reset();
+void llama_decode_stop();


@@ -142,6 +142,15 @@ static std::string trim(const std::string & str) {
     return str.substr(start, end - start);
 }
 
+static bool stop_internal_decode = false;
+
+void llama_decode_reset() {
+    stop_internal_decode = false;
+}
+
+void llama_decode_stop() {
+    stop_internal_decode = true;
+}
+
 static std::vector<std::string> string_split(const std::string& str, const std::string& delimiter) {
     std::vector<std::string> parts;
@@ -3077,6 +3086,10 @@ static int llama_decode_internal(
             //kv_self.n = llama_kv_cache_cell_max(kv_self);
         }
     }
+
+    if (stop_internal_decode) {
+        return -3;
+    }
 #if IK_PRINT_TIMING
     auto tim2 = ggml_time_us();
     printf("prelude(...): %d us\n", int(tim2-tim1));
@@ -5039,7 +5052,7 @@ int32_t llama_n_ctx_train(const struct llama_model * model) {
     return model->hparams.n_ctx_train;
 }
 
-int32_t llama_n_embd(const struct llama_model * model) {
+int32_t llama_model_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }
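The last hunk renames llama_n_embd to llama_model_n_embd, so callers that used the old accessor need the new name. A minimal usage sketch (the buffer handling is illustrative):

// Sketch only: the llama_* calls are real API, the buffer sizing is illustrative.
#include <vector>

#include "llama.h"

std::vector<float> alloc_embedding_buffer(llama_context * ctx) {
    const llama_model * model = llama_get_model(ctx);
    const int32_t n_embd = llama_model_n_embd(model);        // was llama_n_embd(model)
    return std::vector<float>((size_t) n_embd, 0.0f);        // room for one pooled embedding
}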