From 5cb8f3ed2c046e9420d604f2d3ef2384d44189f2 Mon Sep 17 00:00:00 2001
From: kingbri <8082010+kingbri1@users.noreply.github.com>
Date: Tue, 14 Oct 2025 23:04:36 -0400
Subject: [PATCH] Config: Fix comments for max_seq_len and cache_size

The default is the minimum between max_position_embeddings and
cache_size. On AMD GPUs and NVIDIA GPUs older than Ampere, cache_size
is ignored because batching on exl2 does not support it.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
---
 config_sample.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/config_sample.yml b/config_sample.yml
index 1dbc7d5..0b65f9e 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -78,11 +78,14 @@ model:
   # Options: exllamav2, exllamav3
   backend:
 
-  # Max sequence length (default: fetch from the model's config.json).
+  # Max sequence length (default: min(max_position_embeddings, cache_size)).
+  # Set to -1 to fetch from the model's config.json.
   max_seq_len:
 
   # Size of the key/value cache to allocate, in tokens (default: 4096).
   # Must be a multiple of 256.
+  # ExllamaV2 note: On AMD GPUs and NVIDIA GPUs older than Ampere, this value
+  # is ignored. Please use max_seq_len instead.
   cache_size:
 
   # Enable different cache modes for VRAM savings (default: FP16).