Config + Endpoints: Make cache_size more prominent

Since cache_size is now a more important parameter for multi-user setups, mark it as such by placing it below max_seq_len.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
2026-04-25 00:39:10 +00:00 · 2025-10-14 21:53:33 -04:00
parent 62e9fa217a
commit 69a25d7fa6
3 changed files with 31 additions and 31 deletions
--- a/common/config_models.py
+++ b/common/config_models.py
@@ -180,6 +180,25 @@ class ModelConfig(BaseConfigModel):
        ),
        ge=-1,
    )
    cache_size: Optional[int] = Field(
        None,
        description=(
            "Size of the prompt cache to allocate (default: max_seq_len).\n"
            "Must be a multiple of 256 and can't be less than max_seq_len.\n"
            "For CFG, set this to 2 * max_seq_len."
        ),
        multiple_of=256,
        gt=0,
    )
    cache_mode: Optional[CACHE_TYPE] = Field(
        "FP16",
        description=(
            "Enable different cache modes for VRAM savings (default: FP16).\n"
            f"Possible values for exllamav2: {str(CACHE_SIZES)[15:-1]}.\n"
            "For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits "
            "are integers from 2-8 (i.e. 8,8)."
        ),
    )
    tensor_parallel: Optional[bool] = Field(
        False,
        description=(
@@ -236,25 +255,6 @@ class ModelConfig(BaseConfigModel):
            "or auto-calculate."
        ),
    )
    cache_mode: Optional[CACHE_TYPE] = Field(
        "FP16",
        description=(
            "Enable different cache modes for VRAM savings (default: FP16).\n"
            f"Possible values for exllamav2: {str(CACHE_SIZES)[15:-1]}.\n"
            "For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits "
            "are integers from 2-8 (i.e. 8,8)."
        ),
    )
    cache_size: Optional[int] = Field(
        None,
        description=(
            "Size of the prompt cache to allocate (default: max_seq_len).\n"
            "Must be a multiple of 256 and can't be less than max_seq_len.\n"
            "For CFG, set this to 2 * max_seq_len."
        ),
        multiple_of=256,
        gt=0,
    )
    chunk_size: Optional[int] = Field(
        2048,
        description=(
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -81,6 +81,15 @@ model:
  # Max sequence length (default: fetch from the model's config.json).
  max_seq_len:
  # Size of the key/value cache to allocate, in tokens (default: 4096).
  # Must be a multiple of 256.
  cache_size:
  # Enable different cache modes for VRAM savings (default: FP16).
  # Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'.
  # For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (e.g. 8,8).
  cache_mode: FP16
  # Load model with tensor parallelism.
  # Falls back to autosplit if GPU split isn't provided.
  # This ignores the gpu_split_auto value.
@@ -118,15 +127,6 @@ model:
  # Leaving this value blank will either pull from the model or auto-calculate.
  rope_alpha:
  # Enable different cache modes for VRAM savings (default: FP16).
  # Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'.
  # For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (e.g. 8,8).
  cache_mode: FP16
  # Size of the key/value cache to allocate, in tokens (default: 4096).
  # Must be a multiple of 256.
  cache_size:
  # Chunk size for prompt ingestion (default: 2048).
  # A lower value reduces VRAM usage but decreases ingestion speed.
  # NOTE: Effects vary depending on the model.
--- a/endpoints/core/types/model.py
+++ b/endpoints/core/types/model.py
@@ -14,11 +14,11 @@ class ModelCardParameters(BaseModel):
    # Safe to do this since it's guaranteed to fetch a max seq len
    # from model_container
    max_seq_len: Optional[int] = None
    cache_size: Optional[int] = None
    cache_mode: Optional[str] = "FP16"
    rope_scale: Optional[float] = 1.0
    rope_alpha: Optional[float] = 1.0
    max_batch_size: Optional[int] = 1
    cache_size: Optional[int] = None
    cache_mode: Optional[str] = "FP16"
    chunk_size: Optional[int] = 2048
    prompt_template: Optional[str] = None
    prompt_template_content: Optional[str] = None
@@ -89,6 +89,7 @@ class ModelLoadRequest(BaseModel):
        default=None,
        examples=[4096],
    )
    cache_mode: Optional[str] = None
    tensor_parallel: Optional[bool] = None
    tensor_parallel_backend: Optional[str] = "native"
    gpu_split_auto: Optional[bool] = None
@@ -107,7 +108,6 @@ class ModelLoadRequest(BaseModel):
        default=None,
        examples=[1.0],
    )
    cache_mode: Optional[str] = None
    chunk_size: Optional[int] = None
    output_chunking: Optional[bool] = True
    prompt_template: Optional[str] = None