Config + Endpoints: Make cache_size more prominent

Since cache_size is a more important parameter now for multi-user
setups, mark it as such by placing it below max_seq_len.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
This commit is contained in:
kingbri
2025-10-14 21:53:33 -04:00
parent 62e9fa217a
commit 69a25d7fa6
3 changed files with 31 additions and 31 deletions

View File

@@ -180,6 +180,25 @@ class ModelConfig(BaseConfigModel):
),
ge=-1,
)
# Size of the prompt cache to allocate. Optional integer whose declared
# default is None; the description advertises max_seq_len as the effective
# default when the value is left unset.
# The Field constraints enforce only "multiple of 256" and "greater than 0".
# NOTE(review): the "can't be less than max_seq_len" rule stated in the
# description is not enforced by this Field - presumably validated
# elsewhere; confirm.
cache_size: Optional[int] = Field(
None,
description=(
"Size of the prompt cache to allocate (default: max_seq_len).\n"
"Must be a multiple of 256 and can't be less than max_seq_len.\n"
"For CFG, set this to 2 * max_seq_len."
),
multiple_of=256,
gt=0,
)
# Cache mode selector, typed as Optional[CACHE_TYPE] with a default of "FP16".
# The description string is built when this statement runs: the f-string
# slices str(CACHE_SIZES) to list the values accepted for exllamav2.
# NOTE(review): [15:-1] presumably strips a "typing.Literal[" prefix and the
# closing "]" from the repr - confirm against the definition of CACHE_SIZES.
# NOTE(review): "(i.e. 8,8)" reads as an example, so "e.g." is probably
# what was intended.
cache_mode: Optional[CACHE_TYPE] = Field(
"FP16",
description=(
"Enable different cache modes for VRAM savings (default: FP16).\n"
f"Possible values for exllamav2: {str(CACHE_SIZES)[15:-1]}.\n"
"For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits "
"are integers from 2-8 (i.e. 8,8)."
),
)
tensor_parallel: Optional[bool] = Field(
False,
description=(
@@ -236,25 +255,6 @@ class ModelConfig(BaseConfigModel):
"or auto-calculate."
),
)
# Selects the cache mode; the annotation is Optional[CACHE_TYPE] and the
# declared default is "FP16".
# The exllamav2 value list in the description is derived from
# str(CACHE_SIZES) at the time this statement is evaluated.
# NOTE(review): the [15:-1] slice looks like it trims a "typing.Literal["
# prefix plus the trailing "]" - verify against CACHE_SIZES.
# NOTE(review): "(i.e. 8,8)" appears to be an example; "e.g." is likely meant.
cache_mode: Optional[CACHE_TYPE] = Field(
"FP16",
description=(
"Enable different cache modes for VRAM savings (default: FP16).\n"
f"Possible values for exllamav2: {str(CACHE_SIZES)[15:-1]}.\n"
"For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits "
"are integers from 2-8 (i.e. 8,8)."
),
)
# Prompt cache allocation size. Optional integer, declared default None;
# per the description, an unset value means max_seq_len is used.
# Validation declared here: value must be a multiple of 256 and strictly
# positive.
# NOTE(review): the lower bound of max_seq_len mentioned in the description
# is not expressed as a Field constraint - presumably checked elsewhere;
# confirm.
cache_size: Optional[int] = Field(
None,
description=(
"Size of the prompt cache to allocate (default: max_seq_len).\n"
"Must be a multiple of 256 and can't be less than max_seq_len.\n"
"For CFG, set this to 2 * max_seq_len."
),
multiple_of=256,
gt=0,
)
chunk_size: Optional[int] = Field(
2048,
description=(