mirror of
https://github.com/theroyallab/tabbyAPI.git
synced 2026-03-15 00:07:28 +00:00
API + Model: Add support for specifying k/v cache size
This commit is contained in:
@@ -15,6 +15,7 @@ class ModelCardParameters(BaseModel):
     max_seq_len: Optional[int] = None
     rope_scale: Optional[float] = 1.0
     rope_alpha: Optional[float] = 1.0
+    cache_size: Optional[int] = None
     cache_mode: Optional[str] = "FP16"
     chunk_size: Optional[int] = 2048
     prompt_template: Optional[str] = None
@@ -72,6 +73,13 @@ class ModelLoadRequest(BaseModel):
         default=None,
         examples=[4096],
     )
+    cache_size: Optional[int] = Field(
+        description=(
+            "Number in tokens, must be greater than or equal to max_seq_len"
+        ),
+        default=None,
+        examples=[4096],
+    )
     gpu_split_auto: Optional[bool] = True
     autosplit_reserve: Optional[List[float]] = [96]
     gpu_split: Optional[List[float]] = Field(
Reference in New Issue
Block a user