mirror of
https://github.com/theroyallab/tabbyAPI.git
synced 2026-03-15 00:07:28 +00:00
Model: Check for unsupported cache mode in exllamav2
This commit is contained in:
@@ -185,9 +185,17 @@ class ExllamaV2Container(BaseModelContainer):
|
|||||||
# MARK: User configuration
|
# MARK: User configuration
|
||||||
|
|
||||||
# Get cache mode
|
# Get cache mode
|
||||||
# TODO: Separate validation for Exl2 and Exl3 q-cache options
|
|
||||||
self.cache_mode = unwrap(kwargs.get("cache_mode"), "FP16")
|
self.cache_mode = unwrap(kwargs.get("cache_mode"), "FP16")
|
||||||
|
|
||||||
|
# Catch exllamav3 cache_mode
|
||||||
|
if not self.cache_mode.startswith("Q"):
|
||||||
|
logger.warning(
|
||||||
|
f"Provided cache mode '{self.cache_mode}' is not a "
|
||||||
|
"valid choice for exllamav2, please check your settings. "
|
||||||
|
"Defaulting to FP16."
|
||||||
|
)
|
||||||
|
self.cache_mode = "FP16"
|
||||||
|
|
||||||
# Turn off GPU split if the user is using 1 GPU
|
# Turn off GPU split if the user is using 1 GPU
|
||||||
gpu_count = torch.cuda.device_count()
|
gpu_count = torch.cuda.device_count()
|
||||||
gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
|
gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
|
||||||
@@ -393,6 +401,15 @@ class ExllamaV2Container(BaseModelContainer):
|
|||||||
# Set draft cache mode
|
# Set draft cache mode
|
||||||
self.draft_cache_mode = unwrap(draft_args.get("draft_cache_mode"), "FP16")
|
self.draft_cache_mode = unwrap(draft_args.get("draft_cache_mode"), "FP16")
|
||||||
|
|
||||||
|
# Catch exllamav3 draft_cache_mode
|
||||||
|
if not self.draft_cache_mode.startswith("Q"):
|
||||||
|
logger.warning(
|
||||||
|
f"Provided draft cache mode '{self.draft_cache_mode}' is not a "
|
||||||
|
"valid choice for exllamav2, please check your settings. "
|
||||||
|
"Defaulting to FP16."
|
||||||
|
)
|
||||||
|
self.draft_cache_mode = "FP16"
|
||||||
|
|
||||||
# Edit the draft config size
|
# Edit the draft config size
|
||||||
if chunk_size:
|
if chunk_size:
|
||||||
self.draft_config.max_input_len = chunk_size
|
self.draft_config.max_input_len = chunk_size
|
||||||
|
|||||||
@@ -229,7 +229,6 @@ class ModelConfig(BaseConfigModel):
|
|||||||
"or auto-calculate."
|
"or auto-calculate."
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
# TODO: Separate validation for Exl2 and Exl3 q-cache options
|
|
||||||
cache_mode: Optional[CACHE_TYPE] = Field(
|
cache_mode: Optional[CACHE_TYPE] = Field(
|
||||||
"FP16",
|
"FP16",
|
||||||
description=(
|
description=(
|
||||||
|
|||||||
Reference in New Issue
Block a user