mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-04-30 19:31:48 +00:00
server: cache prompt to host memory (#954)
* server : host-memory prompt caching. Changes: adjust the similarity calculation and the prompt-save conditions; remove an unneeded token limit; rename a variable; separate the prompt save and load logic; change default values; change a log message; remove the truncate-prompt logic. * add description * bug fixes * remove token limit in init --------- Co-authored-by: firecoperana <firecoperana>
This commit is contained in:
@@ -330,7 +330,10 @@ struct gpt_params {
|
||||
std::string sql_save_file;
|
||||
std::string sqlite_zstd_ext_file;
|
||||
|
||||
float slot_prompt_similarity = 0.5f;
|
||||
float slot_prompt_similarity = 0.1f;
|
||||
int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
|
||||
int32_t cache_ram_n_min = 0; // min number of tokens required to save in the ram
|
||||
float cache_ram_similarity = 0.5f; // similarity of tokens to cached tokens
|
||||
|
||||
// batched-bench params
|
||||
bool is_pp_shared = false;
|
||||
|
||||
Reference in New Issue
Block a user