server: cache prompt to host memory (#954)

* server : host-memory prompt caching

change similarity calculation and prompt save conditions

Remove unneeded token limit

rename variable

Separate prompt save and load logic

change default values

change log

remove truncate prompt logic

* add description

* bug fixes

* remove token limit in init

---------

Co-authored-by: firecoperana <firecoperana>
Author: firecoperana
Date: 2025-11-14 16:40:13 +00:00
Committed by: GitHub
Parent: 2642f48921
Commit: 0cb6dcc8c8
4 changed files with 347 additions and 50 deletions
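
Before the diff: the three new options below control when a prompt saved to host memory is restored. The hunks only cover the command-line plumbing; the actual save/restore logic lives in the server code and is not shown here. As rough orientation, a minimal sketch of the kind of gating these parameters imply follows, assuming similarity is measured as the shared token prefix relative to the incoming prompt; the names prefix_similarity and should_restore and the metric itself are illustrative, not taken from the PR.

// Sketch only: plausible gating implied by --cache-ram-similarity and
// --cache-ram-n-min; the PR's real similarity calculation is not shown here.
#include <algorithm>
#include <cstdint>
#include <vector>

using llama_token = int32_t;

// Assumed metric: fraction of the incoming prompt covered by the longest
// common token prefix with a cached prompt.
static float prefix_similarity(const std::vector<llama_token> & prompt,
                               const std::vector<llama_token> & cached) {
    const size_t n = std::min(prompt.size(), cached.size());
    size_t i = 0;
    while (i < n && prompt[i] == cached[i]) {
        ++i;
    }
    return prompt.empty() ? 0.0f : float(i) / float(prompt.size());
}

// A cached prompt is only restored when it is long enough (-cram-n-min) and
// similar enough (-crs) to the incoming prompt.
static bool should_restore(const std::vector<llama_token> & prompt,
                           const std::vector<llama_token> & cached,
                           int cache_ram_n_min, float cache_ram_similarity) {
    if ((int) cached.size() < cache_ram_n_min) {
        return false;
    }
    return prefix_similarity(prompt, cached) >= cache_ram_similarity;
}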

@@ -1816,6 +1816,21 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.ctx_shift = false;
         return true;
     }
+    if (arg == "-cram" || arg == "--cache-ram") {
+        CHECK_ARG
+        params.cache_ram_mib = std::stoi(argv[i]);
+        return true;
+    }
+    if (arg == "-crs" || arg == "--cache-ram-similarity") {
+        CHECK_ARG
+        params.cache_ram_similarity = std::stof(argv[i]);
+        return true;
+    }
+    if (arg == "-cram-n-min" || arg == "--cache-ram-n-min") {
+        CHECK_ARG
+        params.cache_ram_n_min = std::stoi(argv[i]);
+        return true;
+    }
     if (arg == "--pos") {
         CHECK_ARG
         params.i_pos = std::stoi(argv[i]);
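
The three values parsed above are stored on gpt_params. Their declarations are not part of this hunk (they presumably sit in common.h, whose hunk is not shown); the sketch below is illustrative only, and the default values are placeholders rather than the ones chosen in this PR.

// Illustrative field declarations implied by the parsing code above; the
// defaults shown are placeholders, not the PR's actual values.
int32_t cache_ram_mib        = 8192;   // -cram: host-memory prompt cache budget in MiB (-1 - no limit, 0 - disable)
float   cache_ram_similarity = 0.50f;  // -crs: similarity that triggers restoring a cached prompt
int32_t cache_ram_n_min      = 32;     // -cram-n-min: minimum cached tokens required before a restore is attempted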
@@ -1995,6 +2010,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", "-c, --ctx-size N", "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx });
options.push_back({ "*", "-cd, --ctx-size-draft N", "size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.n_ctx_draft });
options.push_back({ "*", "-cram, --cache-ram N", "set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)",params.cache_ram_mib });
options.push_back({ "*", "-crs, --cache-ram-similarity N", "max of similarity of prompt tokens to cache tokens that triggers prompt cache (default: %.2f).",params.cache_ram_similarity });
options.push_back({ "*", "-cram-n-min --cache-ram-n-min N", "minimum number of the cached tokens that triggers prompt cache (default: %d).", params.cache_ram_n_min });
options.push_back({ "*", "-n, --predict N", "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict });
options.push_back({ "*", "-b, --batch-size N", "logical maximum batch size (default: %d)", params.n_batch });
options.push_back({ "*", "-ub, --ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch });
@@ -2007,7 +2025,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", "-no-fmoe, --no-fused-moe", "disable fused MoE (default: %s)", params.fused_moe_up_gate ? "enabled" : "disabled" });
options.push_back({ "*", "-ger, --grouped-expert-routing", "enable grouped expert routing (default: %s)", params.grouped_expert_routing ? "enabled" : "disabled" });
options.push_back({ "*", "-no-fug, --no-fused-up-gate", "disaable fused up-gate (default: %s)", params.fused_up_gate ? "enabled" : "disabled" });
options.push_back({ "*", "-no-mmad, --no-fused-mul-multiadd", "disaable fused mul-multi_add (default: %s)", params.fused_mmad? "enabled" : "disabled" });
options.push_back({ "*", "-no-mmad, --no-fused-mul-multiadd", "disable fused mul-multi_add (default: %s)", params.fused_mmad? "enabled" : "disabled" });
options.push_back({ "*", "-rcache, --rope-cache", "enable RoPE cache (default: %s)", params.rope_cache ? "enabled" : "disabled" });
options.push_back({ "*", "-gr, --graph-reuse", "enable graph reuse (default: %s)", params.graph_reuse ? "enabled" : "disabled" });
options.push_back({ "*", "-ser, --smart-expert-reduction,","experts reduction (default: %d,%g)", params.min_experts, params.thresh_experts});