mirror of
https://github.com/theroyallab/tabbyAPI.git
synced 2026-03-14 15:57:27 +00:00
Model: Check for unsupported cache mode in exllamav2
This commit is contained in:
@@ -185,9 +185,17 @@ class ExllamaV2Container(BaseModelContainer):

        # MARK: User configuration

        # Get cache mode
        # TODO: Separate validation for Exl2 and Exl3 q-cache options
        self.cache_mode = unwrap(kwargs.get("cache_mode"), "FP16")

        # Catch exllamav3 cache_mode
        if not self.cache_mode.startswith("Q"):
            logger.warning(
                f"Provided cache mode '{self.cache_mode}' is not a "
                "valid choice for exllamav2, please check your settings. "
                "Defaulting to FP16."
            )
            self.cache_mode = "FP16"

        # Turn off GPU split if the user is using 1 GPU
        gpu_count = torch.cuda.device_count()
        gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
@@ -393,6 +401,15 @@ class ExllamaV2Container(BaseModelContainer):

        # Set draft cache mode
        self.draft_cache_mode = unwrap(draft_args.get("draft_cache_mode"), "FP16")

        # Catch exllamav3 draft_cache_mode
        if not self.draft_cache_mode.startswith("Q"):
            logger.warning(
                f"Provided draft cache mode '{self.draft_cache_mode}' is not a "
                "valid choice for exllamav2, please check your settings. "
                "Defaulting to FP16."
            )
            self.draft_cache_mode = "FP16"

        # Edit the draft config size
        if chunk_size:
            self.draft_config.max_input_len = chunk_size
@@ -229,7 +229,6 @@ class ModelConfig(BaseConfigModel):

            "or auto-calculate."
        ),
    )
    # TODO: Separate validation for Exl2 and Exl3 q-cache options
    cache_mode: Optional[CACHE_TYPE] = Field(
        "FP16",
        description=(
Reference in New Issue
Block a user