From 5cb8f3ed2c046e9420d604f2d3ef2384d44189f2 Mon Sep 17 00:00:00 2001
From: kingbri <8082010+kingbri1@users.noreply.github.com>
Date: Tue, 14 Oct 2025 23:04:36 -0400
Subject: [PATCH] Config: Fix comments for max_seq_len and cache_size

The default is the minimum between max_position_embeddings and
cache_size. On AMD GPUs and NVIDIA GPUs older than Ampere, cache_size
is ignored because batching on exl2 does not support it.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
---
 config_sample.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/config_sample.yml b/config_sample.yml
index 1dbc7d5..0b65f9e 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -78,11 +78,14 @@ model:
   # Options: exllamav2, exllamav3
   backend:
 
-  # Max sequence length (default: fetch from the model's config.json).
+  # Max sequence length (default: min(max_position_embeddings, cache_size)).
+  # Set to -1 to fetch from the model's config.json.
   max_seq_len:
 
   # Size of the key/value cache to allocate, in tokens (default: 4096).
   # Must be a multiple of 256.
+  # ExllamaV2 note: On AMD GPUs and NVIDIA GPUs older than Ampere, this value
+  # is ignored. Please use max_seq_len instead.
   cache_size:
 
   # Enable different cache modes for VRAM savings (default: FP16).