From e68dabc24247dc6ba3a04f5396388ba794604cfa Mon Sep 17 00:00:00 2001 From: Nexes the Elder <124105151+Nexesenex@users.noreply.github.com> Date: Tue, 28 Oct 2025 08:58:31 +0100 Subject: [PATCH] A few server commits from mainline. (#872) server : handle models with missing EOS token (#8997) server : fix segfault on long system prompt (#8987) * server : fix segfault on long system prompt * server : fix parallel generation with very small batch sizes * server : fix typo in comment server : init stop and error fields of the result struct (#9026) server : fix duplicated n_predict key in the generation_settings (#8994) server : support reading arguments from environment variables (#9105) * server : support reading arguments from environment variables * add -fa and -dt * readme : specify non-arg env var server : add some missing env variables (#9116) * server : add some missing env variables * add LLAMA_ARG_HOST to server dockerfile * also add LLAMA_ARG_CONT_BATCHING Credits are to the respective authors. Not a single merge conflict occurred. Compiled, then tested without bug. --- .devops/llama-server-cuda.Dockerfile | 2 + .devops/llama-server-intel.Dockerfile | 2 + .devops/llama-server-rocm.Dockerfile | 2 + .devops/llama-server-vulkan.Dockerfile | 2 + .devops/llama-server.Dockerfile | 2 + common/common.cpp | 71 +++++++++++++++++++++++--- common/common.h | 2 +- examples/server/README.md | 45 ++++++++++++++++ examples/server/server.cpp | 47 ++++++++--------- 9 files changed, 142 insertions(+), 33 deletions(-) diff --git a/.devops/llama-server-cuda.Dockerfile b/.devops/llama-server-cuda.Dockerfile index 67328cf1..18424898 100644 --- a/.devops/llama-server-cuda.Dockerfile +++ b/.devops/llama-server-cuda.Dockerfile @@ -24,6 +24,8 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} ENV GGML_CUDA=1 # Enable cURL ENV LLAMA_CURL=1 +# Must be set to 0.0.0.0 so it can listen to requests from host machine +ENV LLAMA_ARG_HOST=0.0.0.0 RUN make -j$(nproc) llama-server diff --git a/.devops/llama-server-intel.Dockerfile b/.devops/llama-server-intel.Dockerfile index f525658d..9c355b66 100644 --- a/.devops/llama-server-intel.Dockerfile +++ b/.devops/llama-server-intel.Dockerfile @@ -26,6 +26,8 @@ RUN apt-get update && \ COPY --from=build /app/build/bin/llama-server /llama-server ENV LC_ALL=C.utf8 +# Must be set to 0.0.0.0 so it can listen to requests from host machine +ENV LLAMA_ARG_HOST=0.0.0.0 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] diff --git a/.devops/llama-server-rocm.Dockerfile b/.devops/llama-server-rocm.Dockerfile index 763b4cd3..fd0e19ad 100644 --- a/.devops/llama-server-rocm.Dockerfile +++ b/.devops/llama-server-rocm.Dockerfile @@ -39,6 +39,8 @@ ENV GPU_TARGETS=${ROCM_DOCKER_ARCH} ENV GGML_HIPBLAS=1 ENV CC=/opt/rocm/llvm/bin/clang ENV CXX=/opt/rocm/llvm/bin/clang++ +# Must be set to 0.0.0.0 so it can listen to requests from host machine +ENV LLAMA_ARG_HOST=0.0.0.0 # Enable cURL ENV LLAMA_CURL=1 diff --git a/.devops/llama-server-vulkan.Dockerfile b/.devops/llama-server-vulkan.Dockerfile index 13a61ffd..93c5e0c2 100644 --- a/.devops/llama-server-vulkan.Dockerfile +++ b/.devops/llama-server-vulkan.Dockerfile @@ -23,6 +23,8 @@ RUN cp /app/build/bin/llama-server /llama-server && \ rm -rf /app ENV LC_ALL=C.utf8 +# Must be set to 0.0.0.0 so it can listen to requests from host machine +ENV LLAMA_ARG_HOST=0.0.0.0 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] diff --git a/.devops/llama-server.Dockerfile b/.devops/llama-server.Dockerfile index 513e33b3..f80fad01 100644 --- a/.devops/llama-server.Dockerfile +++ b/.devops/llama-server.Dockerfile @@ -21,6 +21,8 @@ RUN apt-get update && \ COPY --from=build /app/llama-server /llama-server ENV LC_ALL=C.utf8 +# Must be set to 0.0.0.0 so it can listen to requests from host machine +ENV LLAMA_ARG_HOST=0.0.0.0 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] diff --git a/common/common.cpp b/common/common.cpp index 7ad73b9e..1e761b6d 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -87,6 +87,41 @@ #endif using json = nlohmann::ordered_json; +// +// Environment variable utils +// + +template +static typename std::enable_if::value, void>::type +get_env(std::string name, T & target) { + char * value = std::getenv(name.c_str()); + target = value ? std::string(value) : target; +} + +template +static typename std::enable_if::value && std::is_integral::value, void>::type +get_env(std::string name, T & target) { + char * value = std::getenv(name.c_str()); + target = value ? std::stoi(value) : target; +} + +template +static typename std::enable_if::value, void>::type +get_env(std::string name, T & target) { + char * value = std::getenv(name.c_str()); + target = value ? std::stof(value) : target; +} + +template +static typename std::enable_if::value, void>::type +get_env(std::string name, T & target) { + char * value = std::getenv(name.c_str()); + if (value) { + std::string val(value); + target = val == "1" || val == "true"; + } +} + // // CPU utils // @@ -239,12 +274,6 @@ static std::string parse_device_list(const std::string& value) { // CLI argument parsing // -void gpt_params_handle_hf_token(gpt_params & params) { - if (params.hf_token.empty() && std::getenv("HF_TOKEN")) { - params.hf_token = std::getenv("HF_TOKEN"); - } -} - void gpt_params_handle_model_default(gpt_params & params) { if (!params.hf_repo.empty()) { // short-hand to avoid specifying --hf-file -> default it to --model @@ -292,7 +321,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { gpt_params_handle_model_default(params); - gpt_params_handle_hf_token(params); + if (params.hf_token.empty()) { + get_env("HF_TOKEN", params.hf_token); + } if (params.escape) { if (!params.prompt_is_binary) { @@ -329,6 +360,32 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { return true; } +void gpt_params_parse_from_env(gpt_params & params) { + // we only care about server-related params for now + get_env("LLAMA_ARG_MODEL", params.model); + get_env("LLAMA_ARG_MODEL_URL", params.model_url); + get_env("LLAMA_ARG_MODEL_ALIAS", params.model_alias); + get_env("LLAMA_ARG_HF_REPO", params.hf_repo); + get_env("LLAMA_ARG_HF_FILE", params.hf_file); + get_env("LLAMA_ARG_THREADS", params.n_threads); + get_env("LLAMA_ARG_CTX_SIZE", params.n_ctx); + get_env("LLAMA_ARG_N_PARALLEL", params.n_parallel); + get_env("LLAMA_ARG_BATCH", params.n_batch); + get_env("LLAMA_ARG_UBATCH", params.n_ubatch); + get_env("LLAMA_ARG_N_GPU_LAYERS", params.n_gpu_layers); + get_env("LLAMA_ARG_THREADS_HTTP", params.n_threads_http); + get_env("LLAMA_ARG_CHAT_TEMPLATE", params.chat_template); + get_env("LLAMA_ARG_N_PREDICT", params.n_predict); + get_env("LLAMA_ARG_ENDPOINT_METRICS", params.endpoint_metrics); + get_env("LLAMA_ARG_ENDPOINT_SLOTS", params.endpoint_slots); + get_env("LLAMA_ARG_EMBEDDINGS", params.embedding); + get_env("LLAMA_ARG_FLASH_ATTN", params.flash_attn); + get_env("LLAMA_ARG_DEFRAG_THOLD", params.defrag_thold); + get_env("LLAMA_ARG_CONT_BATCHING", params.cont_batching); + get_env("LLAMA_ARG_HOST", params.hostname); + get_env("LLAMA_ARG_PORT", params.port); +} + bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { const auto params_org = params; // the example can modify the default params diff --git a/common/common.h b/common/common.h index c3d50098..86f7a840 100644 --- a/common/common.h +++ b/common/common.h @@ -369,7 +369,7 @@ struct gpt_params { bool sweep_bench_output_jsonl = false; }; -void gpt_params_handle_hf_token(gpt_params & params); +void gpt_params_parse_from_env(gpt_params & params); void gpt_params_handle_model_default(gpt_params & params); bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params); diff --git a/examples/server/README.md b/examples/server/README.md index 3a73261f..cec5dc22 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -255,6 +255,51 @@ logging: --log-append Don't truncate the old log file. ``` +Available environment variables (if specified, these variables will override parameters specified in arguments): + +- `LLAMA_CACHE`: cache directory, used by `--hf-repo` +- `HF_TOKEN`: Hugging Face access token, used when accessing a gated model with `--hf-repo` +- `LLAMA_ARG_MODEL`: equivalent to `-m` +- `LLAMA_ARG_MODEL_URL`: equivalent to `-mu` +- `LLAMA_ARG_MODEL_ALIAS`: equivalent to `-a` +- `LLAMA_ARG_HF_REPO`: equivalent to `--hf-repo` +- `LLAMA_ARG_HF_FILE`: equivalent to `--hf-file` +- `LLAMA_ARG_THREADS`: equivalent to `-t` +- `LLAMA_ARG_CTX_SIZE`: equivalent to `-c` +- `LLAMA_ARG_N_PARALLEL`: equivalent to `-np` +- `LLAMA_ARG_BATCH`: equivalent to `-b` +- `LLAMA_ARG_UBATCH`: equivalent to `-ub` +- `LLAMA_ARG_N_GPU_LAYERS`: equivalent to `-ngl` +- `LLAMA_ARG_THREADS_HTTP`: equivalent to `--threads-http` +- `LLAMA_ARG_CHAT_TEMPLATE`: equivalent to `--chat-template` +- `LLAMA_ARG_N_PREDICT`: equivalent to `-n` +- `LLAMA_ARG_ENDPOINT_METRICS`: if set to `1`, it will enable metrics endpoint (equivalent to `--metrics`) +- `LLAMA_ARG_ENDPOINT_SLOTS`: if set to `0`, it will **disable** slots endpoint (equivalent to `--no-slots`). This feature is enabled by default. +- `LLAMA_ARG_EMBEDDINGS`: if set to `1`, it will enable embeddings endpoint (equivalent to `--embeddings`) +- `LLAMA_ARG_FLASH_ATTN`: if set to `1`, it will enable flash attention (equivalent to `-fa`) +- `LLAMA_ARG_CONT_BATCHING`: if set to `0`, it will **disable** continuous batching (equivalent to `--no-cont-batching`). This feature is enabled by default. +- `LLAMA_ARG_DEFRAG_THOLD`: equivalent to `-dt` +- `LLAMA_ARG_HOST`: equivalent to `--host` +- `LLAMA_ARG_PORT`: equivalent to `--port` + +Example usage of docker compose with environment variables: + +```yml +services: + llamacpp-server: + image: ghcr.io/ggerganov/llama.cpp:server + ports: + - 8080:8080 + volumes: + - ./models:/models + environment: + # alternatively, you can use "LLAMA_ARG_MODEL_URL" to download the model + LLAMA_ARG_MODEL: /models/my_model.gguf + LLAMA_ARG_CTX_SIZE: 4096 + LLAMA_ARG_N_PARALLEL: 2 + LLAMA_ARG_ENDPOINT_METRICS: 1 # to disable, either remove or set to 0 + LLAMA_ARG_PORT: 8080 +``` ## Build diff --git a/examples/server/server.cpp b/examples/server/server.cpp index eb7740b6..8845a206 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1146,6 +1146,7 @@ struct server_context { bool clean_kv_cache = true; bool add_bos_token = true; + bool has_eos_token = false; // For speculative decoding llama_model * model_draft = nullptr; @@ -1232,7 +1233,7 @@ struct server_context { n_ctx = llama_n_ctx(ctx); add_bos_token = llama_should_add_bos_token(model); - GGML_ASSERT(llama_add_eos_token(model) != 1); + has_eos_token = llama_add_eos_token(model) != 1; chat_templates = common_chat_templates_init(model, params.chat_template); try { @@ -1350,13 +1351,13 @@ struct server_context { default_generation_settings_for_props = get_formated_generation(slots.front()); default_generation_settings_for_props["seed"] = -1; - // the update_slots() logic will always submit a maximum of n_batch tokens + // the update_slots() logic will always submit a maximum of n_batch or n_parallel tokens // note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used) { const int32_t n_batch = llama_n_batch(ctx); // only a single seq_id per token is needed - batch = llama_batch_init(n_batch, 0, 1); + batch = llama_batch_init(std::max(n_batch, params.n_parallel), 0, 1); } metrics.init(); @@ -1784,7 +1785,7 @@ struct server_context { { slot.sparams.logit_bias.clear(); - if (json_value(data, "ignore_eos", false)) { + if (json_value(data, "ignore_eos", false) && has_eos_token) { slot.sparams.logit_bias[llama_token_eos(model)] = -INFINITY; } @@ -1889,28 +1890,19 @@ struct server_context { if (!system_prompt.empty()) { system_tokens = ::llama_tokenize(ctx, system_prompt, true); - llama_batch_clear(batch); - - for (int i = 0; i < (int)system_tokens.size(); ++i) { - llama_batch_add(batch, system_tokens[i], i, { 0 }, false); - } - const int32_t n_batch = llama_n_batch(ctx); + const int32_t n_tokens_prompt = system_tokens.size(); - for (int32_t i = 0; i < batch.n_tokens; i += n_batch) { - const int32_t n_tokens = std::min(params.n_batch, batch.n_tokens - i); - llama_batch batch_view = { - n_tokens, - batch.token + i, - nullptr, - batch.pos + i, - batch.n_seq_id + i, - batch.seq_id + i, - batch.logits + i, - 0, 0, 0, // unused - }; + for (int32_t i = 0; i < n_tokens_prompt; i += n_batch) { + const int32_t n_tokens = std::min(n_batch, n_tokens_prompt - i); - if (llama_decode(ctx, batch_view) != 0) { + llama_batch_clear(batch); + + for (int32_t j = 0; j < n_tokens; ++j) { + llama_batch_add(batch, system_tokens[i + j], i + j, { 0 }, false); + } + + if (llama_decode(ctx, batch) != 0) { LOG_ERROR("llama_decode() failed", {}); return; } @@ -2114,7 +2106,7 @@ struct server_context { return json { {"n_ctx", slot.n_ctx}, - {"n_predict", slot.n_predict}, + {"n_predict", slot.n_predict}, // Server configured n_predict {"model", params.model_alias}, {"seed", slot.sparams.seed}, {"temperature", slot.sparams.temp}, @@ -2141,7 +2133,7 @@ struct server_context { {"mirostat_eta", slot.sparams.mirostat_eta}, {"penalize_nl", slot.sparams.penalize_nl}, {"stop", slot.params.antiprompt}, - {"n_predict", slot.params.n_predict}, // TODO: fix duplicate key n_predict + {"max_tokens", slot.params.n_predict}, // User configured n_predict {"n_keep", slot.params.n_keep}, {"n_discard", slot.params.n_discard}, {"ignore_eos", ignore_eos}, @@ -2673,6 +2665,8 @@ struct server_context { llama_lora_adapters_apply(ctx, lora_adapters); server_task_result result; result.id = task.id; + result.stop = true; + result.error = false; result.data = json{{ "success", true }}; queue_results.send(result); } break; @@ -3619,6 +3613,9 @@ int main(int argc, char ** argv) { return 1; } + // parse arguments from environment variables + gpt_params_parse_from_env(params); + // TODO: not great to use extern vars server_log_json = params.log_json; server_verbose = params.verbosity > 0;