server: cache prompt to host memory (#954)

* server : host-memory prompt caching

  change similarity calculation and prompt save conditions
  Remove unneeded token limit
  rename variable
  Separate prompt save and load logic
  change default values
  change log
  remove truncate prompt logic

* add description

* bug fixes

* remove token limit in init

---------

Co-authored-by: firecoperana <firecoperana>
2026-04-30 19:31:48 +00:00 · 2025-11-14 16:40:13 +00:00
parent 2642f48921
commit 0cb6dcc8c8
4 changed files with 347 additions and 50 deletions
--- a/common/common.h
+++ b/common/common.h
@@ -330,7 +330,10 @@ struct gpt_params {
    std::string sql_save_file;
    std::string sqlite_zstd_ext_file;

-    float slot_prompt_similarity = 0.5f;
+    float slot_prompt_similarity = 0.1f;
+    int32_t cache_ram_mib = 8192;   // -1 = no limit, 0 = disabled, 1 = 1 MiB, etc.
+    int32_t cache_ram_n_min = 0;     // min number of tokens required to save in the ram
+    float cache_ram_similarity = 0.5f; // similarity of tokens to cached tokens

    // batched-bench params
    bool is_pp_shared = false;