API + Model: Add support for specifying k/v cache size

commit 767e6a798a
parent d710a1b441
Author: DocShotgun
Date:   2024-05-26 14:17:01 -07:00

4 changed files with 51 additions and 5 deletions


@@ -15,6 +15,7 @@ class ModelCardParameters(BaseModel):
     max_seq_len: Optional[int] = None
     rope_scale: Optional[float] = 1.0
     rope_alpha: Optional[float] = 1.0
+    cache_size: Optional[int] = None
     cache_mode: Optional[str] = "FP16"
     chunk_size: Optional[int] = 2048
     prompt_template: Optional[str] = None
@@ -72,6 +73,13 @@ class ModelLoadRequest(BaseModel):
         default=None,
         examples=[4096],
     )
+    cache_size: Optional[int] = Field(
+        description=(
+            "Number in tokens, must be greater than or equal to max_seq_len"
+        ),
+        default=None,
+        examples=[4096],
+    )
     gpu_split_auto: Optional[bool] = True
     autosplit_reserve: Optional[List[float]] = [96]
     gpu_split: Optional[List[float]] = Field(
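
Usage note: the snippet below is a minimal, hypothetical sketch mirroring only the two fields this commit touches, not the full ModelLoadRequest from the repository. It illustrates how a client would pass the new cache_size alongside max_seq_len, assuming Pydantic v2 as the Field(description=..., examples=[...]) style suggests.

from typing import Optional

from pydantic import BaseModel, Field


class ModelLoadRequest(BaseModel):
    """Minimal sketch: the real request model defines many more fields."""

    max_seq_len: Optional[int] = Field(default=None, examples=[4096])
    cache_size: Optional[int] = Field(
        description=(
            "Number in tokens, must be greater than or equal to max_seq_len"
        ),
        default=None,
        examples=[4096],
    )


# Size the k/v cache at twice the context length, e.g. to keep more than
# one sequence's cache resident at once; the field's own description
# requires cache_size >= max_seq_len.
request = ModelLoadRequest(max_seq_len=4096, cache_size=8192)
print(request.model_dump(exclude_none=True))
# -> {'max_seq_len': 4096, 'cache_size': 8192}

Since cache_size defaults to None in both ModelCardParameters and ModelLoadRequest, omitting it presumably falls back to the previous behavior of sizing the cache to max_seq_len.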