server: enable checkpoint for recurrent models (#1310)

* server: enable checkpoint for recurrent models

create checkpoint after cancel

fix ban string and remove context during rewind

add checkpoint interval

only save recurrent cache

* save checkpoint during prompt processing (pp)

---------

Co-authored-by: firecoperana <firecoperana>
This commit is contained in:
firecoperana
2026-02-25 23:51:18 -06:00
committed by GitHub
parent 216f44363f
commit 3fac78c48b
11 changed files with 204 additions and 111 deletions

View File

@@ -2041,6 +2041,16 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
return true;
}
if (arg == "--ctx-checkpoints") {
CHECK_ARG
params.ctx_checkpoints_n = std::stoi(argv[i]);
return true;
}
if (arg == "--ctx-checkpoints-interval") {
CHECK_ARG
params.ctx_checkpoints_interval = std::stoi(argv[i]);
return true;
}
if (arg == "-cram" || arg == "--cache-ram") {
CHECK_ARG
params.cache_ram_mib = std::stoi(argv[i]);
@@ -2235,7 +2245,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", "-c, --ctx-size N", "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx });
options.push_back({ "*", "-cd, --ctx-size-draft N", "size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx });
options.push_back({ "*", "-cram, --cache-ram N", "set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)",params.cache_ram_mib });
options.push_back({ "*", "--ctx-checkpoints N", "max number of context checkpoints to create per slot (default: %d)",params.ctx_checkpoints_n});
options.push_back({ "*", "--ctx-checkpoints-interval N", "minimum number of tokens between each context checkpoint. (default: %d, <=0 disable)",params.ctx_checkpoints_interval});
options.push_back({ "*", "-cram, --cache-ram N", "set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)",params.cache_ram_mib });
options.push_back({ "*", "-crs, --cache-ram-similarity N", "max of similarity of prompt tokens to cache tokens that triggers prompt cache (default: %.2f).",params.cache_ram_similarity });
options.push_back({ "*", "-cram-n-min --cache-ram-n-min N", "minimum number of the cached tokens that triggers prompt cache (default: %d).", params.cache_ram_n_min });
options.push_back({ "*", "-n, --predict N", "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict });