Mirror of https://github.com/ikawrakow/ik_llama.cpp.git
server: add checkpoint tolerance and fix grammar_trigger init (#1346)
Co-authored-by: firecoperana <firecoperana>
@@ -2051,6 +2051,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.ctx_checkpoints_interval = std::stoi(argv[i]);
         return true;
     }
+    if (arg == "--ctx-checkpoints-tolerance") {
+        CHECK_ARG
+        params.ctx_checkpoints_tolerance = std::stoi(argv[i]);
+        return true;
+    }
     if (arg == "-cram" || arg == "--cache-ram") {
         CHECK_ARG
         params.cache_ram_mib = std::stoi(argv[i]);
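For context, the new flag is parsed the same way as the existing checkpoint options: CHECK_ARG consumes the next argv entry and std::stoi converts it. Below is a minimal standalone sketch of that parsing style; the helper name and the error handling are illustrative stand-ins, not the repo's actual macros.

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <string>

// Illustrative stand-ins for the fields touched by this commit.
struct checkpoint_params {
    int32_t ctx_checkpoints_interval  = 512;
    int32_t ctx_checkpoints_tolerance = 5;
};

// Hypothetical parser mimicking the gpt_params_find_arg pattern:
// each flag consumes the next argv entry and converts it with std::stoi.
static bool parse_checkpoint_arg(int argc, char ** argv, int & i, checkpoint_params & p) {
    const std::string arg = argv[i];
    if (arg == "--ctx-checkpoints-interval" || arg == "--ctx-checkpoints-tolerance") {
        if (++i >= argc) {                      // same job as CHECK_ARG: require a value
            std::fprintf(stderr, "missing value for %s\n", arg.c_str());
            std::exit(1);
        }
        const int32_t v = std::stoi(argv[i]);
        if (arg == "--ctx-checkpoints-interval") p.ctx_checkpoints_interval  = v;
        else                                     p.ctx_checkpoints_tolerance = v;
        return true;
    }
    return false;
}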
@@ -2248,6 +2253,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
 
     options.push_back({ "*", "--ctx-checkpoints N", "max number of context checkpoints to create per slot (default: %d)",params.ctx_checkpoints_n});
     options.push_back({ "*", "--ctx-checkpoints-interval N", "minimum number of tokens between each context checkpoint. (default: %d, <=0 disable)",params.ctx_checkpoints_interval});
+    options.push_back({ "*", "--ctx-checkpoints-tolerance N", "the number of tokens before the full prompt to create the checkpoint. (default: %d, <=0 disable)",params.ctx_checkpoints_tolerance});
     options.push_back({ "*", "-cram, --cache-ram N", "set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)",params.cache_ram_mib });
     options.push_back({ "*", "-crs, --cache-ram-similarity N", "max of similarity of prompt tokens to cache tokens that triggers prompt cache (default: %.2f).",params.cache_ram_similarity });
     options.push_back({ "*", "-cram-n-min --cache-ram-n-min N", "minimum number of the cached tokens that triggers prompt cache (default: %d).", params.cache_ram_n_min });
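Reading the help strings above, --ctx-checkpoints-interval spaces checkpoints apart while --ctx-checkpoints-tolerance allows one to be taken a few tokens before the full prompt. The sketch below is one plausible reading of those two descriptions, written as an illustration only; it is not the server's actual checkpoint logic, and the way the two knobs combine is an assumption.

#include <cstdint>

// Illustrative decision helper; names mirror the options above, but the logic
// is an assumption derived from the help text, not from the server code.
static bool should_create_checkpoint(int32_t n_past,          // tokens already processed
                                     int32_t n_prompt,        // full prompt length
                                     int32_t last_checkpoint, // position of the previous checkpoint
                                     int32_t interval,        // --ctx-checkpoints-interval (<=0 disables)
                                     int32_t tolerance) {     // --ctx-checkpoints-tolerance (<=0 disables)
    if (interval <= 0 || tolerance <= 0) return false;        // either knob can turn the feature off
    const bool spaced_out   = n_past - last_checkpoint >= interval;
    const bool near_the_end = n_prompt - n_past <= tolerance;  // within N tokens of the full prompt
    return spaced_out && near_the_end;
}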
@@ -281,7 +281,6 @@ struct gpt_params {
     int32_t banned_n = 1; // number of tokens that are banned in the phrase
     size_t n_buffer = 0; // number of token buffers for string ban
     bool can_ban_phrases = true; // whether to ban strings
-    bool do_checkpoint = false; // do checkpoint for recurrent models only
 
     std::vector<llama_model_kv_override> kv_overrides;
     std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
@@ -420,8 +419,10 @@ struct gpt_params {
 
     float slot_prompt_similarity = 0.1f;
 
+    bool do_checkpoint = false; // do checkpoint for recurrent models only
     int32_t ctx_checkpoints_n = 8; // max number of context checkpoints per slot
     int32_t ctx_checkpoints_interval = 512; // minimum number of tokens between each context checkpoints
+    int32_t ctx_checkpoints_tolerance = 5; // the number of tokens before the full prompt to create the checkpoint
     int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
     int32_t cache_ram_n_min = 0; // min number of tokens required to save in the ram
     float cache_ram_similarity = 0.5f; // similarity of tokens to cached tokens
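The defaults above allow 8 checkpoints per slot, at least 512 tokens apart, created within 5 tokens of the full prompt, next to the pre-existing RAM-cache knobs. As a rough illustration of how those cache_ram_* thresholds relate to each other, here is a small sketch based only on the field comments and help strings; the actual server cache code may combine them differently, and the comparison directions are assumptions.

#include <cstddef>
#include <cstdint>

// Illustrative thresholds mirroring the struct defaults above.
struct ram_cache_params {
    int32_t cache_ram_mib        = 8192;   // -1 = no limit, 0 = disable
    int32_t cache_ram_n_min      = 0;      // min cached tokens required to reuse
    float   cache_ram_similarity = 0.5f;   // similarity threshold for prompt vs. cache tokens
};

// Hypothetical reuse check; it only demonstrates how the three knobs could
// interact, not how the server actually applies them.
static bool may_reuse_cached_prompt(const ram_cache_params & p,
                                    size_t n_cached_tokens,
                                    float  similarity) {
    if (p.cache_ram_mib == 0)                          return false; // cache disabled
    if ((int32_t) n_cached_tokens < p.cache_ram_n_min) return false; // too few cached tokens
    return similarity >= p.cache_ram_similarity;                     // assumed: higher similarity triggers reuse
}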