API + Model: Add support for specifying k/v cache size

This commit is contained in:
DocShotgun
2024-05-26 14:17:01 -07:00
parent d710a1b441
commit 767e6a798a
4 changed files with 51 additions and 5 deletions


@@ -77,6 +77,12 @@ model:
# Only use this if the model's base sequence length in config.json is incorrect (ex. Mistral 7B)
#override_base_seq_len:
# Size of the prompt cache to allocate (in number of tokens, must be a multiple of 256)
# A larger cache uses more VRAM, but allows more prompts to be cached and a larger batch of gens to proceed simultaneously
# The minimum size is max_seq_len, but we recommend setting this to the highest value that will fit on your GPU
# We recommend setting this to at least max_seq_len * 2 if you want to use CFG with full-length positive and negative prompts
#cache_size:
# Automatically allocate resources to GPUs (default: True)
# NOTE: Not parsed for single GPU users
#gpu_split_auto: True
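
The cache sizing rules in the comments above can be illustrated with a small config fragment. This is a sketch, not part of the commit: the max_seq_len value of 4096 and the cache_size of 8192 are illustrative choices, picked to satisfy the stated constraints (a multiple of 256, at least max_seq_len, and max_seq_len * 2 to accommodate CFG).

```yaml
model:
  # Illustrative values, assuming a model with a 4096-token context
  max_seq_len: 4096

  # 2x max_seq_len: leaves room for CFG's paired positive/negative
  # prompts, and is a multiple of 256 as required
  cache_size: 8192
```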