mirror of
https://github.com/theroyallab/tabbyAPI.git
synced 2026-04-25 00:39:10 +00:00
Config + Endpoints: Make cache_size more prominent
Since cache_size is now a more important parameter for multi-user setups, mark it as such by placing it below max_seq_len.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
This commit is contained in:
@@ -180,6 +180,25 @@ class ModelConfig(BaseConfigModel):
|
|||||||
),
|
),
|
||||||
ge=-1,
|
ge=-1,
|
||||||
)
|
)
|
||||||
|
cache_size: Optional[int] = Field(
|
||||||
|
None,
|
||||||
|
description=(
|
||||||
|
"Size of the prompt cache to allocate (default: max_seq_len).\n"
|
||||||
|
"Must be a multiple of 256 and can't be less than max_seq_len.\n"
|
||||||
|
"For CFG, set this to 2 * max_seq_len."
|
||||||
|
),
|
||||||
|
multiple_of=256,
|
||||||
|
gt=0,
|
||||||
|
)
|
||||||
|
cache_mode: Optional[CACHE_TYPE] = Field(
|
||||||
|
"FP16",
|
||||||
|
description=(
|
||||||
|
"Enable different cache modes for VRAM savings (default: FP16).\n"
|
||||||
|
f"Possible values for exllamav2: {str(CACHE_SIZES)[15:-1]}.\n"
|
||||||
|
"For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits "
|
||||||
|
"are integers from 2-8 (i.e. 8,8)."
|
||||||
|
),
|
||||||
|
)
|
||||||
tensor_parallel: Optional[bool] = Field(
|
tensor_parallel: Optional[bool] = Field(
|
||||||
False,
|
False,
|
||||||
description=(
|
description=(
|
||||||
@@ -236,25 +255,6 @@ class ModelConfig(BaseConfigModel):
|
|||||||
"or auto-calculate."
|
"or auto-calculate."
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
cache_mode: Optional[CACHE_TYPE] = Field(
|
|
||||||
"FP16",
|
|
||||||
description=(
|
|
||||||
"Enable different cache modes for VRAM savings (default: FP16).\n"
|
|
||||||
f"Possible values for exllamav2: {str(CACHE_SIZES)[15:-1]}.\n"
|
|
||||||
"For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits "
|
|
||||||
"are integers from 2-8 (i.e. 8,8)."
|
|
||||||
),
|
|
||||||
)
|
|
||||||
cache_size: Optional[int] = Field(
|
|
||||||
None,
|
|
||||||
description=(
|
|
||||||
"Size of the prompt cache to allocate (default: max_seq_len).\n"
|
|
||||||
"Must be a multiple of 256 and can't be less than max_seq_len.\n"
|
|
||||||
"For CFG, set this to 2 * max_seq_len."
|
|
||||||
),
|
|
||||||
multiple_of=256,
|
|
||||||
gt=0,
|
|
||||||
)
|
|
||||||
chunk_size: Optional[int] = Field(
|
chunk_size: Optional[int] = Field(
|
||||||
2048,
|
2048,
|
||||||
description=(
|
description=(
|
||||||
|
|||||||
@@ -81,6 +81,15 @@ model:
|
|||||||
# Max sequence length (default: fetch from the model's config.json).
|
# Max sequence length (default: fetch from the model's config.json).
|
||||||
max_seq_len:
|
max_seq_len:
|
||||||
|
|
||||||
|
# Size of the key/value cache to allocate, in tokens (default: 4096).
|
||||||
|
# Must be a multiple of 256.
|
||||||
|
cache_size:
|
||||||
|
|
||||||
|
# Enable different cache modes for VRAM savings (default: FP16).
|
||||||
|
# Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'.
|
||||||
|
# For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (i.e. 8,8).
|
||||||
|
cache_mode: FP16
|
||||||
|
|
||||||
# Load model with tensor parallelism.
|
# Load model with tensor parallelism.
|
||||||
# Falls back to autosplit if GPU split isn't provided.
|
# Falls back to autosplit if GPU split isn't provided.
|
||||||
# This ignores the gpu_split_auto value.
|
# This ignores the gpu_split_auto value.
|
||||||
@@ -118,15 +127,6 @@ model:
|
|||||||
# Leaving this value blank will either pull from the model or auto-calculate.
|
# Leaving this value blank will either pull from the model or auto-calculate.
|
||||||
rope_alpha:
|
rope_alpha:
|
||||||
|
|
||||||
# Enable different cache modes for VRAM savings (default: FP16).
|
|
||||||
# Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'.
|
|
||||||
# For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (i.e. 8,8).
|
|
||||||
cache_mode: FP16
|
|
||||||
|
|
||||||
# Size of the key/value cache to allocate, in tokens (default: 4096).
|
|
||||||
# Must be a multiple of 256.
|
|
||||||
cache_size:
|
|
||||||
|
|
||||||
# Chunk size for prompt ingestion (default: 2048).
|
# Chunk size for prompt ingestion (default: 2048).
|
||||||
# A lower value reduces VRAM usage but decreases ingestion speed.
|
# A lower value reduces VRAM usage but decreases ingestion speed.
|
||||||
# NOTE: Effects vary depending on the model.
|
# NOTE: Effects vary depending on the model.
|
||||||
|
|||||||
@@ -14,11 +14,11 @@ class ModelCardParameters(BaseModel):
|
|||||||
# Safe to do this since it's guaranteed to fetch a max seq len
|
# Safe to do this since it's guaranteed to fetch a max seq len
|
||||||
# from model_container
|
# from model_container
|
||||||
max_seq_len: Optional[int] = None
|
max_seq_len: Optional[int] = None
|
||||||
|
cache_size: Optional[int] = None
|
||||||
|
cache_mode: Optional[str] = "FP16"
|
||||||
rope_scale: Optional[float] = 1.0
|
rope_scale: Optional[float] = 1.0
|
||||||
rope_alpha: Optional[float] = 1.0
|
rope_alpha: Optional[float] = 1.0
|
||||||
max_batch_size: Optional[int] = 1
|
max_batch_size: Optional[int] = 1
|
||||||
cache_size: Optional[int] = None
|
|
||||||
cache_mode: Optional[str] = "FP16"
|
|
||||||
chunk_size: Optional[int] = 2048
|
chunk_size: Optional[int] = 2048
|
||||||
prompt_template: Optional[str] = None
|
prompt_template: Optional[str] = None
|
||||||
prompt_template_content: Optional[str] = None
|
prompt_template_content: Optional[str] = None
|
||||||
@@ -89,6 +89,7 @@ class ModelLoadRequest(BaseModel):
|
|||||||
default=None,
|
default=None,
|
||||||
examples=[4096],
|
examples=[4096],
|
||||||
)
|
)
|
||||||
|
cache_mode: Optional[str] = None
|
||||||
tensor_parallel: Optional[bool] = None
|
tensor_parallel: Optional[bool] = None
|
||||||
tensor_parallel_backend: Optional[str] = "native"
|
tensor_parallel_backend: Optional[str] = "native"
|
||||||
gpu_split_auto: Optional[bool] = None
|
gpu_split_auto: Optional[bool] = None
|
||||||
@@ -107,7 +108,6 @@ class ModelLoadRequest(BaseModel):
|
|||||||
default=None,
|
default=None,
|
||||||
examples=[1.0],
|
examples=[1.0],
|
||||||
)
|
)
|
||||||
cache_mode: Optional[str] = None
|
|
||||||
chunk_size: Optional[int] = None
|
chunk_size: Optional[int] = None
|
||||||
output_chunking: Optional[bool] = True
|
output_chunking: Optional[bool] = True
|
||||||
prompt_template: Optional[str] = None
|
prompt_template: Optional[str] = None
|
||||||
|
|||||||
Reference in New Issue
Block a user