Model: Add exl3 and associated load functions

Initial exl3 compat and loading functionality. Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
2026-04-26 09:18:53 +00:00 · 2025-04-28 23:54:55 -04:00
parent 7c6a053747
commit 0c1d794390
5 changed files with 357 additions and 67 deletions
--- a/backends/base_model_container.py
+++ b/backends/base_model_container.py
@@ -25,6 +25,10 @@ class BaseModelContainer(abc.ABC):
    prompt_template: Optional[PromptTemplate] = None
    generation_config: Optional[GenerationConfig] = None

+    # Optional features
+    use_draft_model: bool = False
+    use_vision: bool = False
+
    # Load synchronization
    # The bool is a master switch for accepting requests
    # The lock keeps load tasks sequential
@@ -65,7 +69,7 @@ class BaseModelContainer(abc.ABC):

    # NOTE: Might be an optional method
    @abc.abstractmethod
-    async def load_gen(self, progress_callback=None, **kwargs) -> AsyncIterator[Any]:
+    async def load_gen(self, progress_callback=None, **kwargs):
        """
        Loads the model into memory, yielding progress updates.

@@ -134,57 +138,6 @@ class BaseModelContainer(abc.ABC):

        pass

-    @abc.abstractmethod
-    async def generate(
-        self,
-        request_id: str,
-        prompt: str,
-        params: BaseSamplerRequest,
-        abort_event: Optional[asyncio.Event] = None,
-        mm_embeddings: Optional[MultimodalEmbeddingWrapper] = None,
-    ) -> Dict[str, Any]:
-        """
-        Generates a complete response for a given prompt and parameters.
-
-        Args:
-            request_id: Unique identifier for the generation request.
-            prompt: The input prompt string.
-            params: Sampling and generation parameters.
-            abort_event: An asyncio Event to signal cancellation.
-            mm_embeddings: Optional multimodal embeddings.
-
-        Returns:
-            A dictionary containing the generation info
-        """
-
-        pass
-
-    @abc.abstractmethod
-    async def stream_generate(
-        self,
-        request_id: str,
-        prompt: str,
-        params: BaseSamplerRequest,
-        abort_event: Optional[asyncio.Event] = None,
-        mm_embeddings: Optional[MultimodalEmbeddingWrapper] = None,
-    ) -> AsyncIterator[Dict[str, Any]]:
-        """
-        Generates a response iteratively (streaming) for a given prompt.
-
-        Args:
-            request_id: Unique identifier for the generation request.
-            prompt: The input prompt string.
-            params: Sampling and generation parameters.
-            abort_event: An asyncio Event to signal cancellation.
-            mm_embeddings: Optional multimodal embeddings.
-
-        Yields:
-            Generation chunks
-        """
-
-        if False:
-            yield
-
    @abc.abstractmethod
    def model_info(self) -> ModelCard:
        """
@@ -239,3 +192,54 @@ class BaseModelContainer(abc.ABC):
        """

        return []
+
+    @abc.abstractmethod
+    async def generate(
+        self,
+        request_id: str,
+        prompt: str,
+        params: BaseSamplerRequest,
+        abort_event: Optional[asyncio.Event] = None,
+        mm_embeddings: Optional[MultimodalEmbeddingWrapper] = None,
+    ) -> Dict[str, Any]:
+        """
+        Generates a complete response for a given prompt and parameters.
+
+        Args:
+            request_id: Unique identifier for the generation request.
+            prompt: The input prompt string.
+            params: Sampling and generation parameters.
+            abort_event: Optional asyncio Event to signal cancellation.
+            mm_embeddings: Optional multimodal embeddings.
+
+        Returns:
+            A dictionary containing the generation info.
+        """
+
+        pass  # abstract stub: the base class provides no implementation
+
+    @abc.abstractmethod
+    async def stream_generate(
+        self,
+        request_id: str,
+        prompt: str,
+        params: BaseSamplerRequest,
+        abort_event: Optional[asyncio.Event] = None,
+        mm_embeddings: Optional[MultimodalEmbeddingWrapper] = None,
+    ) -> AsyncIterator[Dict[str, Any]]:
+        """
+        Generates a response iteratively (streaming) for a given prompt.
+
+        Args:
+            request_id: Unique identifier for the generation request.
+            prompt: The input prompt string.
+            params: Sampling and generation parameters.
+            abort_event: Optional asyncio Event to signal cancellation.
+            mm_embeddings: Optional multimodal embeddings.
+
+        Yields:
+            Generation chunks, each a dictionary.
+        """
+
+        if False:  # unreachable; the yield makes this an async generator
+            yield