Model: Check for unsupported cache mode in exllamav2

2026-03-15 00:07:28 +00:00 · 2025-05-06 01:18:15 -07:00
parent 45b966363e
commit 9dcde59c57
2 changed files with 18 additions and 2 deletions
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -185,9 +185,17 @@ class ExllamaV2Container(BaseModelContainer):
        # MARK: User configuration
        # Get cache mode (falls back to FP16 when the option is unset)
        # TODO: Separate validation for Exl2 and Exl3 q-cache options
        self.cache_mode = unwrap(kwargs.get("cache_mode"), "FP16")
        # Catch exllamav3 cache_mode; FP16 is the default and must not warn
        if not self.cache_mode.startswith(("Q", "FP16")):
            logger.warning(
                f"Provided cache mode '{self.cache_mode}' is not a "
                "valid choice for exllamav2, please check your settings. "
                "Defaulting to FP16."
            )
            self.cache_mode = "FP16"
        # Turn off GPU split if the user is using 1 GPU
        gpu_count = torch.cuda.device_count()
        gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
@@ -393,6 +401,15 @@ class ExllamaV2Container(BaseModelContainer):
            # Set draft cache mode (falls back to FP16 when the option is unset)
            self.draft_cache_mode = unwrap(draft_args.get("draft_cache_mode"), "FP16")
            # Catch exllamav3 draft_cache_mode; FP16 is the default and must not warn
            if not self.draft_cache_mode.startswith(("Q", "FP16")):
                logger.warning(
                    f"Provided draft cache mode '{self.draft_cache_mode}' is not a "
                    "valid choice for exllamav2, please check your settings. "
                    "Defaulting to FP16."
                )
                self.draft_cache_mode = "FP16"
            # Edit the draft config size
            if chunk_size:
                self.draft_config.max_input_len = chunk_size
--- a/common/config_models.py
+++ b/common/config_models.py
@@ -229,7 +229,6 @@ class ModelConfig(BaseConfigModel):
            "or auto-calculate."
        ),
    )
    # TODO: Separate validation for Exl2 and Exl3 q-cache options
    cache_mode: Optional[CACHE_TYPE] = Field(
        "FP16",
        description=(