mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-04-29 10:51:51 +00:00
server: stop processing the prompt when client disconnects (#1134)
Implement a generator-based API for task results. Update httplib.h to 0.27.0. Fix the embedding error. Stop prompt processing when the client disconnects. Co-authored-by: firecoperana <firecoperana>
This commit is contained in:
@@ -368,9 +368,8 @@ ggml_cgraph * llm_build_context::append_pooling(struct ggml_cgraph * gf) {
|
||||
inp = gf->nodes[i];
|
||||
if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
|
||||
break;
|
||||
} else {
|
||||
inp = nullptr;
|
||||
}
|
||||
inp = nullptr;
|
||||
}
|
||||
GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor");
|
||||
|
||||
@@ -1767,7 +1766,7 @@ static ggml_tensor * build_output(llama_context & lctx, ggml_context * ctx, ggml
|
||||
if (output_norm) {
|
||||
auto the_norm = split_output_norm ? split_output_norm->splits[id] : output_norm;
|
||||
auto cur_normed = llm_build_context::llm_build_norm(ctx, cur, lctx.model.hparams, the_norm, NULL, LLM_NORM_RMS, cb, -1);
|
||||
cb(cur_normed, "output_normed", 1000*(id+1));
|
||||
cb(cur_normed, "result_norm", 1000*(id+1));
|
||||
o.push_back(llm_build_context::llm_build_lora_mm(lctx, ctx, split, cur_normed));
|
||||
} else {
|
||||
o.push_back(llm_build_context::llm_build_lora_mm(lctx, ctx, split, cur));
|
||||
@@ -1787,7 +1786,7 @@ static ggml_tensor * build_output(llama_context & lctx, ggml_context * ctx, ggml
|
||||
} else {
|
||||
if (output_norm) {
|
||||
cur = llm_build_context::llm_build_norm(ctx, cur, lctx.model.hparams, output_norm, NULL, LLM_NORM_RMS, cb, -1);
|
||||
cb(cur, "output_normed", -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
}
|
||||
cur = llm_build_context::llm_build_lora_mm(lctx, ctx, output, cur);
|
||||
}
|
||||
|
||||
@@ -229,3 +229,6 @@ struct llama_split_tensor {
|
||||
std::vector<ggml_tensor *> tensor_splits;
|
||||
ggml_split_tensor_t ggml;
|
||||
};
|
||||
|
||||
// Clears the internal stop flag (sets it back to false) so that the stop check
// in llama_decode_internal() no longer triggers an early return.
void llama_decode_reset();
|
||||
// Raises the internal stop flag. While the flag is set, llama_decode_internal()
// returns -3 when it reaches its stop check; the flag stays set until
// llama_decode_reset() is called.
void llama_decode_stop();
|
||||
|
||||
@@ -142,6 +142,15 @@ static std::string trim(const std::string & str) {
|
||||
return str.substr(start, end - start);
|
||||
}
|
||||
|
||||
#include <atomic> // for std::atomic<bool>; a no-op if the header is already included at the top of the file

// Cooperative stop flag for llama_decode_internal().
//
// llama_decode_stop() raises the flag, llama_decode_reset() clears it, and
// llama_decode_internal() polls it and returns -3 while it is set.
//
// The flag is atomic so that raising it from one thread while another thread is
// inside llama_decode_internal() is not a data race.
// NOTE(review): presumably llama_decode_stop() is invoked from the server's
// connection-handling thread when the client disconnects -- confirm against the
// call sites. The existing `if (stop_internal_decode)` check keeps working
// unchanged through std::atomic<bool>'s implicit conversion to bool.
static std::atomic<bool> stop_internal_decode{false};

// Clears a pending stop request so that subsequent decode calls are no longer
// aborted by the stop check.
void llama_decode_reset() {
    stop_internal_decode.store(false);
}

// Requests that decoding stop. The request stays pending until
// llama_decode_reset() is called.
void llama_decode_stop() {
    stop_internal_decode.store(true);
}
|
||||
|
||||
static std::vector<std::string> string_split(const std::string& str, const std::string& delimiter) {
|
||||
std::vector<std::string> parts;
|
||||
@@ -3077,6 +3086,10 @@ static int llama_decode_internal(
|
||||
//kv_self.n = llama_kv_cache_cell_max(kv_self);
|
||||
}
|
||||
}
|
||||
if (stop_internal_decode) {
|
||||
return -3;
|
||||
}
|
||||
|
||||
#if IK_PRINT_TIMING
|
||||
auto tim2 = ggml_time_us();
|
||||
printf("prelude(...): %d us\n", int(tim2-tim1));
|
||||
@@ -5039,7 +5052,7 @@ int32_t llama_n_ctx_train(const struct llama_model * model) {
|
||||
return model->hparams.n_ctx_train;
|
||||
}
|
||||
|
||||
int32_t llama_n_embd(const struct llama_model * model) {
|
||||
int32_t llama_model_n_embd(const struct llama_model * model) {
|
||||
return model->hparams.n_embd;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user