mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-01-26 17:20:01 +00:00
A few server commits from mainline. (#872)
server : handle models with missing EOS token (#8997) server : fix segfault on long system prompt (#8987) * server : fix segfault on long system prompt * server : fix parallel generation with very small batch sizes * server : fix typo in comment server : init stop and error fields of the result struct (#9026) server : fix duplicated n_predict key in the generation_settings (#8994) server : support reading arguments from environment variables (#9105) * server : support reading arguments from environment variables * add -fa and -dt * readme : specify non-arg env var server : add some missing env variables (#9116) * server : add some missing env variables * add LLAMA_ARG_HOST to server dockerfile * also add LLAMA_ARG_CONT_BATCHING Credits are to the respective authors. Not a single merge conflict occurred. Compiled, then tested without bug.
This commit is contained in:
@@ -24,6 +24,8 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
||||
ENV GGML_CUDA=1
|
||||
# Enable cURL
|
||||
ENV LLAMA_CURL=1
|
||||
# Must be set to 0.0.0.0 so it can listen to requests from host machine
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
RUN make -j$(nproc) llama-server
|
||||
|
||||
|
||||
@@ -26,6 +26,8 @@ RUN apt-get update && \
|
||||
COPY --from=build /app/build/bin/llama-server /llama-server
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
# Must be set to 0.0.0.0 so it can listen to requests from host machine
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||
|
||||
|
||||
@@ -39,6 +39,8 @@ ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
|
||||
ENV GGML_HIPBLAS=1
|
||||
ENV CC=/opt/rocm/llvm/bin/clang
|
||||
ENV CXX=/opt/rocm/llvm/bin/clang++
|
||||
# Must be set to 0.0.0.0 so it can listen to requests from host machine
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
# Enable cURL
|
||||
ENV LLAMA_CURL=1
|
||||
|
||||
@@ -23,6 +23,8 @@ RUN cp /app/build/bin/llama-server /llama-server && \
|
||||
rm -rf /app
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
# Must be set to 0.0.0.0 so it can listen to requests from host machine
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||
|
||||
|
||||
@@ -21,6 +21,8 @@ RUN apt-get update && \
|
||||
COPY --from=build /app/llama-server /llama-server
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
# Must be set to 0.0.0.0 so it can listen to requests from host machine
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||
|
||||
|
||||
@@ -87,6 +87,41 @@
|
||||
#endif
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
//
|
||||
// Environment variable utils
|
||||
//
|
||||
|
||||
template<typename T>
|
||||
static typename std::enable_if<std::is_same<T, std::string>::value, void>::type
|
||||
get_env(std::string name, T & target) {
|
||||
char * value = std::getenv(name.c_str());
|
||||
target = value ? std::string(value) : target;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static typename std::enable_if<!std::is_same<T, bool>::value && std::is_integral<T>::value, void>::type
|
||||
get_env(std::string name, T & target) {
|
||||
char * value = std::getenv(name.c_str());
|
||||
target = value ? std::stoi(value) : target;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static typename std::enable_if<std::is_floating_point<T>::value, void>::type
|
||||
get_env(std::string name, T & target) {
|
||||
char * value = std::getenv(name.c_str());
|
||||
target = value ? std::stof(value) : target;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static typename std::enable_if<std::is_same<T, bool>::value, void>::type
|
||||
get_env(std::string name, T & target) {
|
||||
char * value = std::getenv(name.c_str());
|
||||
if (value) {
|
||||
std::string val(value);
|
||||
target = val == "1" || val == "true";
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// CPU utils
|
||||
//
|
||||
@@ -239,12 +274,6 @@ static std::string parse_device_list(const std::string& value) {
|
||||
// CLI argument parsing
|
||||
//
|
||||
|
||||
void gpt_params_handle_hf_token(gpt_params & params) {
|
||||
if (params.hf_token.empty() && std::getenv("HF_TOKEN")) {
|
||||
params.hf_token = std::getenv("HF_TOKEN");
|
||||
}
|
||||
}
|
||||
|
||||
void gpt_params_handle_model_default(gpt_params & params) {
|
||||
if (!params.hf_repo.empty()) {
|
||||
// short-hand to avoid specifying --hf-file -> default it to --model
|
||||
@@ -292,7 +321,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
|
||||
gpt_params_handle_model_default(params);
|
||||
|
||||
gpt_params_handle_hf_token(params);
|
||||
if (params.hf_token.empty()) {
|
||||
get_env("HF_TOKEN", params.hf_token);
|
||||
}
|
||||
|
||||
if (params.escape) {
|
||||
if (!params.prompt_is_binary) {
|
||||
@@ -329,6 +360,32 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
return true;
|
||||
}
|
||||
|
||||
void gpt_params_parse_from_env(gpt_params & params) {
|
||||
// we only care about server-related params for now
|
||||
get_env("LLAMA_ARG_MODEL", params.model);
|
||||
get_env("LLAMA_ARG_MODEL_URL", params.model_url);
|
||||
get_env("LLAMA_ARG_MODEL_ALIAS", params.model_alias);
|
||||
get_env("LLAMA_ARG_HF_REPO", params.hf_repo);
|
||||
get_env("LLAMA_ARG_HF_FILE", params.hf_file);
|
||||
get_env("LLAMA_ARG_THREADS", params.n_threads);
|
||||
get_env("LLAMA_ARG_CTX_SIZE", params.n_ctx);
|
||||
get_env("LLAMA_ARG_N_PARALLEL", params.n_parallel);
|
||||
get_env("LLAMA_ARG_BATCH", params.n_batch);
|
||||
get_env("LLAMA_ARG_UBATCH", params.n_ubatch);
|
||||
get_env("LLAMA_ARG_N_GPU_LAYERS", params.n_gpu_layers);
|
||||
get_env("LLAMA_ARG_THREADS_HTTP", params.n_threads_http);
|
||||
get_env("LLAMA_ARG_CHAT_TEMPLATE", params.chat_template);
|
||||
get_env("LLAMA_ARG_N_PREDICT", params.n_predict);
|
||||
get_env("LLAMA_ARG_ENDPOINT_METRICS", params.endpoint_metrics);
|
||||
get_env("LLAMA_ARG_ENDPOINT_SLOTS", params.endpoint_slots);
|
||||
get_env("LLAMA_ARG_EMBEDDINGS", params.embedding);
|
||||
get_env("LLAMA_ARG_FLASH_ATTN", params.flash_attn);
|
||||
get_env("LLAMA_ARG_DEFRAG_THOLD", params.defrag_thold);
|
||||
get_env("LLAMA_ARG_CONT_BATCHING", params.cont_batching);
|
||||
get_env("LLAMA_ARG_HOST", params.hostname);
|
||||
get_env("LLAMA_ARG_PORT", params.port);
|
||||
}
|
||||
|
||||
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
const auto params_org = params; // the example can modify the default params
|
||||
|
||||
|
||||
@@ -369,7 +369,7 @@ struct gpt_params {
|
||||
bool sweep_bench_output_jsonl = false;
|
||||
};
|
||||
|
||||
void gpt_params_handle_hf_token(gpt_params & params);
|
||||
void gpt_params_parse_from_env(gpt_params & params);
|
||||
void gpt_params_handle_model_default(gpt_params & params);
|
||||
|
||||
bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
|
||||
|
||||
@@ -255,6 +255,51 @@ logging:
|
||||
--log-append Don't truncate the old log file.
|
||||
```
|
||||
|
||||
Available environment variables (if specified, these variables will override parameters specified in arguments):
|
||||
|
||||
- `LLAMA_CACHE`: cache directory, used by `--hf-repo`
|
||||
- `HF_TOKEN`: Hugging Face access token, used when accessing a gated model with `--hf-repo`
|
||||
- `LLAMA_ARG_MODEL`: equivalent to `-m`
|
||||
- `LLAMA_ARG_MODEL_URL`: equivalent to `-mu`
|
||||
- `LLAMA_ARG_MODEL_ALIAS`: equivalent to `-a`
|
||||
- `LLAMA_ARG_HF_REPO`: equivalent to `--hf-repo`
|
||||
- `LLAMA_ARG_HF_FILE`: equivalent to `--hf-file`
|
||||
- `LLAMA_ARG_THREADS`: equivalent to `-t`
|
||||
- `LLAMA_ARG_CTX_SIZE`: equivalent to `-c`
|
||||
- `LLAMA_ARG_N_PARALLEL`: equivalent to `-np`
|
||||
- `LLAMA_ARG_BATCH`: equivalent to `-b`
|
||||
- `LLAMA_ARG_UBATCH`: equivalent to `-ub`
|
||||
- `LLAMA_ARG_N_GPU_LAYERS`: equivalent to `-ngl`
|
||||
- `LLAMA_ARG_THREADS_HTTP`: equivalent to `--threads-http`
|
||||
- `LLAMA_ARG_CHAT_TEMPLATE`: equivalent to `--chat-template`
|
||||
- `LLAMA_ARG_N_PREDICT`: equivalent to `-n`
|
||||
- `LLAMA_ARG_ENDPOINT_METRICS`: if set to `1`, it will enable metrics endpoint (equivalent to `--metrics`)
|
||||
- `LLAMA_ARG_ENDPOINT_SLOTS`: if set to `0`, it will **disable** slots endpoint (equivalent to `--no-slots`). This feature is enabled by default.
|
||||
- `LLAMA_ARG_EMBEDDINGS`: if set to `1`, it will enable embeddings endpoint (equivalent to `--embeddings`)
|
||||
- `LLAMA_ARG_FLASH_ATTN`: if set to `1`, it will enable flash attention (equivalent to `-fa`)
|
||||
- `LLAMA_ARG_CONT_BATCHING`: if set to `0`, it will **disable** continuous batching (equivalent to `--no-cont-batching`). This feature is enabled by default.
|
||||
- `LLAMA_ARG_DEFRAG_THOLD`: equivalent to `-dt`
|
||||
- `LLAMA_ARG_HOST`: equivalent to `--host`
|
||||
- `LLAMA_ARG_PORT`: equivalent to `--port`
|
||||
|
||||
Example usage of docker compose with environment variables:
|
||||
|
||||
```yml
|
||||
services:
|
||||
llamacpp-server:
|
||||
image: ghcr.io/ggerganov/llama.cpp:server
|
||||
ports:
|
||||
- 8080:8080
|
||||
volumes:
|
||||
- ./models:/models
|
||||
environment:
|
||||
# alternatively, you can use "LLAMA_ARG_MODEL_URL" to download the model
|
||||
LLAMA_ARG_MODEL: /models/my_model.gguf
|
||||
LLAMA_ARG_CTX_SIZE: 4096
|
||||
LLAMA_ARG_N_PARALLEL: 2
|
||||
LLAMA_ARG_ENDPOINT_METRICS: 1 # to disable, either remove or set to 0
|
||||
LLAMA_ARG_PORT: 8080
|
||||
```
|
||||
|
||||
## Build
|
||||
|
||||
|
||||
@@ -1146,6 +1146,7 @@ struct server_context {
|
||||
|
||||
bool clean_kv_cache = true;
|
||||
bool add_bos_token = true;
|
||||
bool has_eos_token = false;
|
||||
|
||||
// For speculative decoding
|
||||
llama_model * model_draft = nullptr;
|
||||
@@ -1232,7 +1233,7 @@ struct server_context {
|
||||
n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
add_bos_token = llama_should_add_bos_token(model);
|
||||
GGML_ASSERT(llama_add_eos_token(model) != 1);
|
||||
has_eos_token = llama_add_eos_token(model) != 1;
|
||||
|
||||
chat_templates = common_chat_templates_init(model, params.chat_template);
|
||||
try {
|
||||
@@ -1350,13 +1351,13 @@ struct server_context {
|
||||
default_generation_settings_for_props = get_formated_generation(slots.front());
|
||||
default_generation_settings_for_props["seed"] = -1;
|
||||
|
||||
// the update_slots() logic will always submit a maximum of n_batch tokens
|
||||
// the update_slots() logic will always submit a maximum of n_batch or n_parallel tokens
|
||||
// note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used)
|
||||
{
|
||||
const int32_t n_batch = llama_n_batch(ctx);
|
||||
|
||||
// only a single seq_id per token is needed
|
||||
batch = llama_batch_init(n_batch, 0, 1);
|
||||
batch = llama_batch_init(std::max(n_batch, params.n_parallel), 0, 1);
|
||||
}
|
||||
|
||||
metrics.init();
|
||||
@@ -1784,7 +1785,7 @@ struct server_context {
|
||||
{
|
||||
slot.sparams.logit_bias.clear();
|
||||
|
||||
if (json_value(data, "ignore_eos", false)) {
|
||||
if (json_value(data, "ignore_eos", false) && has_eos_token) {
|
||||
slot.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
|
||||
}
|
||||
|
||||
@@ -1889,28 +1890,19 @@ struct server_context {
|
||||
if (!system_prompt.empty()) {
|
||||
system_tokens = ::llama_tokenize(ctx, system_prompt, true);
|
||||
|
||||
llama_batch_clear(batch);
|
||||
|
||||
for (int i = 0; i < (int)system_tokens.size(); ++i) {
|
||||
llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
|
||||
}
|
||||
|
||||
const int32_t n_batch = llama_n_batch(ctx);
|
||||
const int32_t n_tokens_prompt = system_tokens.size();
|
||||
|
||||
for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
|
||||
const int32_t n_tokens = std::min(params.n_batch, batch.n_tokens - i);
|
||||
llama_batch batch_view = {
|
||||
n_tokens,
|
||||
batch.token + i,
|
||||
nullptr,
|
||||
batch.pos + i,
|
||||
batch.n_seq_id + i,
|
||||
batch.seq_id + i,
|
||||
batch.logits + i,
|
||||
0, 0, 0, // unused
|
||||
};
|
||||
for (int32_t i = 0; i < n_tokens_prompt; i += n_batch) {
|
||||
const int32_t n_tokens = std::min(n_batch, n_tokens_prompt - i);
|
||||
|
||||
if (llama_decode(ctx, batch_view) != 0) {
|
||||
llama_batch_clear(batch);
|
||||
|
||||
for (int32_t j = 0; j < n_tokens; ++j) {
|
||||
llama_batch_add(batch, system_tokens[i + j], i + j, { 0 }, false);
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, batch) != 0) {
|
||||
LOG_ERROR("llama_decode() failed", {});
|
||||
return;
|
||||
}
|
||||
@@ -2114,7 +2106,7 @@ struct server_context {
|
||||
|
||||
return json {
|
||||
{"n_ctx", slot.n_ctx},
|
||||
{"n_predict", slot.n_predict},
|
||||
{"n_predict", slot.n_predict}, // Server configured n_predict
|
||||
{"model", params.model_alias},
|
||||
{"seed", slot.sparams.seed},
|
||||
{"temperature", slot.sparams.temp},
|
||||
@@ -2141,7 +2133,7 @@ struct server_context {
|
||||
{"mirostat_eta", slot.sparams.mirostat_eta},
|
||||
{"penalize_nl", slot.sparams.penalize_nl},
|
||||
{"stop", slot.params.antiprompt},
|
||||
{"n_predict", slot.params.n_predict}, // TODO: fix duplicate key n_predict
|
||||
{"max_tokens", slot.params.n_predict}, // User configured n_predict
|
||||
{"n_keep", slot.params.n_keep},
|
||||
{"n_discard", slot.params.n_discard},
|
||||
{"ignore_eos", ignore_eos},
|
||||
@@ -2673,6 +2665,8 @@ struct server_context {
|
||||
llama_lora_adapters_apply(ctx, lora_adapters);
|
||||
server_task_result result;
|
||||
result.id = task.id;
|
||||
result.stop = true;
|
||||
result.error = false;
|
||||
result.data = json{{ "success", true }};
|
||||
queue_results.send(result);
|
||||
} break;
|
||||
@@ -3619,6 +3613,9 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// parse arguments from environment variables
|
||||
gpt_params_parse_from_env(params);
|
||||
|
||||
// TODO: not great to use extern vars
|
||||
server_log_json = params.log_json;
|
||||
server_verbose = params.verbosity > 0;
|
||||
|
||||
Reference in New Issue
Block a user