server: enable checkpoint for recurrent models

create checkpoint after cancel

fix ban string and rm context during rewind

add checkpoint interval

only save recurrent cache
firecoperana
2026-02-22 15:10:56 -06:00
parent c77ec4b8b8
commit 233898704c
11 changed files with 181 additions and 107 deletions


@@ -2041,6 +2041,16 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        }
        return true;
    }
    if (arg == "--ctx-checkpoints") {
        CHECK_ARG
        params.ctx_checkpoints_n = std::stoi(argv[i]);
        return true;
    }
    if (arg == "--ctx-checkpoints-interval") {
        CHECK_ARG
        params.ctx_checkpoints_interval = std::stoi(argv[i]);
        return true;
    }
    if (arg == "-cram" || arg == "--cache-ram") {
        CHECK_ARG
        params.cache_ram_mib = std::stoi(argv[i]);
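The two new branches above only parse the flags into gpt_params; the server logic that acts on them lives in files not shown in this excerpt. Below is a minimal sketch of how an interval-gated checkpoint decision could look, assuming a per-slot counter of processed tokens. slot_state, should_create_checkpoint and all field names are hypothetical and not taken from this commit.

// Hypothetical sketch: how ctx_checkpoints_interval and ctx_checkpoints_n
// might gate checkpoint creation in the server's per-slot processing loop.
#include <cstdint>
#include <vector>

struct slot_state {
    int32_t n_past              = 0;  // tokens currently in this slot's context
    int32_t last_checkpoint_pos = 0;  // value of n_past when the last checkpoint was taken
    std::vector<int32_t> checkpoint_positions;  // stand-in for the saved checkpoints
};

static bool should_create_checkpoint(const slot_state & slot, int32_t interval, int32_t n_max) {
    if (interval <= 0) {
        return false;  // matches the documented "<= 0 = disabled" behaviour
    }
    if ((int32_t) slot.checkpoint_positions.size() >= n_max) {
        return false;  // per-slot cap reached; a real implementation might evict the oldest
    }
    return slot.n_past - slot.last_checkpoint_pos >= interval;
}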
@@ -2235,7 +2245,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", "-c, --ctx-size N", "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx });
options.push_back({ "*", "-cd, --ctx-size-draft N", "size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx });
options.push_back({ "*", "-cram, --cache-ram N", "set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)",params.cache_ram_mib });
options.push_back({ "*", "--ctx-checkpoints N", "max number of context checkpoints to create per slot (default: %d)",params.ctx_checkpoints_n});
options.push_back({ "*", "--ctx-checkpoints-interval N", "number of tokens between each context checkpoint. (default: %d, <=0 disable)",params.ctx_checkpoints_interval});
options.push_back({ "*", "-cram, --cache-ram N", "set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)",params.cache_ram_mib });
options.push_back({ "*", "-crs, --cache-ram-similarity N", "max of similarity of prompt tokens to cache tokens that triggers prompt cache (default: %.2f).",params.cache_ram_similarity });
options.push_back({ "*", "-cram-n-min --cache-ram-n-min N", "minimum number of the cached tokens that triggers prompt cache (default: %d).", params.cache_ram_n_min });
options.push_back({ "*", "-n, --predict N", "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict });


@@ -280,6 +280,7 @@ struct gpt_params {
    std::vector<std::string> ban_phrases; // strings that are banned in generation
    int32_t banned_n = 1;                 // number of tokens that are banned in the phrase
    size_t n_buffer = 0;                  // number of token buffers for string ban
    bool can_ban_phrases = true;          // whether to ban strings
    std::vector<llama_model_kv_override> kv_overrides;
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
@@ -418,7 +419,8 @@ struct gpt_params {
    float slot_prompt_similarity = 0.1f;
    int32_t n_ctx_checkpoints = 8;        // max number of context checkpoints per slot
    int32_t ctx_checkpoints_n = 8;        // max number of context checkpoints per slot
    int32_t ctx_checkpoints_interval = 0; // number of tokens between context checkpoints
    int32_t cache_ram_mib = 8192;         // -1 = no limit, 0 = disable, 1 = 1 MiB, etc.
    int32_t cache_ram_n_min = 0;          // min number of tokens required to save to RAM
    float cache_ram_similarity = 0.5f;    // similarity of prompt tokens to cached tokens
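The struct above only declares the new knobs; the checkpoint payload itself is created elsewhere in the server code. As a rough sketch of what saving a per-slot checkpoint could look like, assuming the llama_state_seq_* sequence-state API as in recent mainline llama.cpp (the fork's signatures may differ, and the commit's "only save recurrent cache" filtering happens in code not shown here), ctx_checkpoint and save_checkpoint below are hypothetical names:

// Hypothetical sketch: serialize one sequence's state as a checkpoint blob.
#include "llama.h"
#include <cstdint>
#include <vector>

struct ctx_checkpoint {
    int32_t              pos = 0;  // n_past at which the checkpoint was taken
    std::vector<uint8_t> data;     // serialized sequence state
};

static bool save_checkpoint(llama_context * ctx, llama_seq_id seq_id, int32_t pos, ctx_checkpoint & out) {
    const size_t size = llama_state_seq_get_size(ctx, seq_id);
    if (size == 0) {
        return false;
    }
    out.pos = pos;
    out.data.resize(size);
    const size_t written = llama_state_seq_get_data(ctx, out.data.data(), size, seq_id);
    return written == size;
}

Restoring on a rewind would be the mirror operation through llama_state_seq_set_data, presumably the path touched by the "rm context during rewind" fix named in the commit message.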