diff --git a/common/config_models.py b/common/config_models.py index 340685e..0e71734 100644 --- a/common/config_models.py +++ b/common/config_models.py @@ -180,6 +180,25 @@ class ModelConfig(BaseConfigModel): ), ge=-1, ) + cache_size: Optional[int] = Field( + None, + description=( + "Size of the prompt cache to allocate (default: max_seq_len).\n" + "Must be a multiple of 256 and can't be less than max_seq_len.\n" + "For CFG, set this to 2 * max_seq_len." + ), + multiple_of=256, + gt=0, + ) + cache_mode: Optional[CACHE_TYPE] = Field( + "FP16", + description=( + "Enable different cache modes for VRAM savings (default: FP16).\n" + f"Possible values for exllamav2: {str(CACHE_SIZES)[15:-1]}.\n" + "For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits " + "are integers from 2-8 (i.e. 8,8)." + ), + ) tensor_parallel: Optional[bool] = Field( False, description=( @@ -236,25 +255,6 @@ class ModelConfig(BaseConfigModel): "or auto-calculate." ), ) - cache_mode: Optional[CACHE_TYPE] = Field( - "FP16", - description=( - "Enable different cache modes for VRAM savings (default: FP16).\n" - f"Possible values for exllamav2: {str(CACHE_SIZES)[15:-1]}.\n" - "For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits " - "are integers from 2-8 (i.e. 8,8)." - ), - ) - cache_size: Optional[int] = Field( - None, - description=( - "Size of the prompt cache to allocate (default: max_seq_len).\n" - "Must be a multiple of 256 and can't be less than max_seq_len.\n" - "For CFG, set this to 2 * max_seq_len." - ), - multiple_of=256, - gt=0, - ) chunk_size: Optional[int] = Field( 2048, description=( diff --git a/config_sample.yml b/config_sample.yml index 0d51719..1dbc7d5 100644 --- a/config_sample.yml +++ b/config_sample.yml @@ -81,6 +81,15 @@ model: # Max sequence length (default: fetch from the model's config.json). max_seq_len: + # Size of the key/value cache to allocate, in tokens (default: max_seq_len). + # Must be a multiple of 256 and can't be less than max_seq_len. 
+ cache_size: + + # Enable different cache modes for VRAM savings (default: FP16). + # Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'. + # For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (i.e. 8,8). + cache_mode: FP16 + # Load model with tensor parallelism. # Falls back to autosplit if GPU split isn't provided. # This ignores the gpu_split_auto value. @@ -118,15 +127,6 @@ model: # Leaving this value blank will either pull from the model or auto-calculate. rope_alpha: - # Enable different cache modes for VRAM savings (default: FP16). - # Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'. - # For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (i.e. 8,8). - cache_mode: FP16 - - # Size of the key/value cache to allocate, in tokens (default: 4096). - # Must be a multiple of 256. - cache_size: - # Chunk size for prompt ingestion (default: 2048). # A lower value reduces VRAM usage but decreases ingestion speed. # NOTE: Effects vary depending on the model. 
diff --git a/endpoints/core/types/model.py b/endpoints/core/types/model.py index 6e2e0c9..8422929 100644 --- a/endpoints/core/types/model.py +++ b/endpoints/core/types/model.py @@ -14,11 +14,11 @@ class ModelCardParameters(BaseModel): # Safe to do this since it's guaranteed to fetch a max seq len # from model_container max_seq_len: Optional[int] = None + cache_size: Optional[int] = None + cache_mode: Optional[str] = "FP16" rope_scale: Optional[float] = 1.0 rope_alpha: Optional[float] = 1.0 max_batch_size: Optional[int] = 1 - cache_size: Optional[int] = None - cache_mode: Optional[str] = "FP16" chunk_size: Optional[int] = 2048 prompt_template: Optional[str] = None prompt_template_content: Optional[str] = None @@ -89,6 +89,7 @@ class ModelLoadRequest(BaseModel): default=None, examples=[4096], ) + cache_mode: Optional[str] = None tensor_parallel: Optional[bool] = None tensor_parallel_backend: Optional[str] = "native" gpu_split_auto: Optional[bool] = None @@ -107,7 +108,6 @@ class ModelLoadRequest(BaseModel): default=None, examples=[1.0], ) - cache_mode: Optional[str] = None chunk_size: Optional[int] = None output_chunking: Optional[bool] = True prompt_template: Optional[str] = None