Model: Remove dev wheel setting checks

Removes the tensor parallel (TP) and DRY sampler availability checks, since both features are now in the stable ExLlamaV2 release.

Signed-off-by: kingbri <bdashore3@proton.me>
2026-04-29 18:51:53 +00:00 · 2024-09-14 22:13:56 -04:00
parent 2d221832fb
commit 2a41910931
1 changed file with 5 additions and 27 deletions
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -17,6 +17,7 @@ from exllamav2 import (
    ExLlamaV2Cache_Q4,
    ExLlamaV2Cache_Q6,
    ExLlamaV2Cache_Q8,
    ExLlamaV2Cache_TP,
    ExLlamaV2Tokenizer,
    ExLlamaV2Lora,
 )
@@ -55,14 +56,6 @@ from common.templating import (
 from common.transformers_utils import GenerationConfig, HuggingFaceConfig
 from common.utils import coalesce, unwrap
 # Dynamic imports
 try:
    from exllamav2 import ExLlamaV2Cache_TP
    has_tp = True
 except ImportError:
    has_tp = False
 class ExllamaV2Container:
    """The model container class for ExLlamaV2 models."""
@@ -197,17 +190,10 @@ class ExllamaV2Container:
        else:
            # Set tensor parallel
            if use_tp:
-                if has_tp:
+                self.use_tp = True
                    self.use_tp = True
-                    # TP has its own autosplit loader
+                # TP has its own autosplit loader
-                    self.gpu_split_auto = False
+                self.gpu_split_auto = False
                else:
                    # TODO: Remove conditional with exl2 v0.1.9 release
                    logger.warning(
                        "Tensor parallelism is not supported in the "
                        "current ExllamaV2 version."
                    )
            # Enable manual GPU split if provided
            if gpu_split:
@@ -703,7 +689,7 @@ class ExllamaV2Container:
    ):
        """Utility function to create a model cache."""
-        if has_tp and use_tp:
+        if use_tp:
            return ExLlamaV2Cache_TP(
                model,
                base=cache_class,
@@ -967,14 +953,6 @@ class ExllamaV2Container:
        Meant for dev wheels!
        """
        if unwrap(kwargs.get("dry_allowed_length"), 0) > 0 and not hasattr(
            ExLlamaV2Sampler.Settings, "dry_multiplier"
        ):
            logger.warning(
                "DRY sampling is not supported by the currently "
                "installed ExLlamaV2 version."
            )
        return kwargs
    async def generate_gen(