API: Add more methods to semaphore

The semaphore/queue model for Tabby is as follows:
- All load requests go through the semaphore by default
- Any load request can pass the skip_queue parameter to bypass
the semaphore
- Unload requests are executed immediately
- All completion requests go through the semaphore by default

This model preserves the parallelism of single-user mode while adding
convenience methods for queueing in multi-user mode. It also mitigates
problems that were previously present in the concurrency stack.
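The queue model above can be sketched with an asyncio semaphore. This is a minimal illustration, not TabbyAPI's actual code; the function names and the single-permit semaphore are assumptions for the sketch, with only skip_queue taken from the commit message.

```python
import asyncio

# One permit: requests holding the semaphore run serially.
request_semaphore = asyncio.Semaphore(1)


async def load_model(name: str, skip_queue: bool = False) -> str:
    """Load requests go through the semaphore unless skip_queue is set."""
    if skip_queue:
        # Bypass the queue entirely
        return f"loaded {name} (queue skipped)"
    async with request_semaphore:
        return f"loaded {name}"


async def unload_model() -> str:
    """Unload requests are executed immediately, never queued."""
    return "unloaded"


async def completion(prompt: str) -> str:
    """Completion requests also pass through the semaphore by default."""
    async with request_semaphore:
        return f"completion for {prompt!r}"
```

With a single permit, queued loads and completions serialize against each other, while skip_queue and unloads stay immediate, which matches the four rules listed above.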

Also change how the program's loop runs so it exits when the API thread
dies.
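The loop change described above can be sketched as a main loop that polls the API thread instead of spinning unconditionally. This is an illustrative sketch, not the commit's actual implementation; the function name and poll interval are assumptions.

```python
import threading
import time


def main_loop(api_thread: threading.Thread) -> None:
    """Run until the API thread dies, then return so the program can exit."""
    while api_thread.is_alive():
        time.sleep(0.01)  # poll interval is arbitrary for the sketch
```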

Signed-off-by: kingbri <bdashore3@proton.me>
Author: kingbri
Date: 2024-03-03 01:22:34 -05:00
Committed-by: Brian Dashore
Parent: c82697fef2
Commit: b0c295dd2f
5 changed files with 77 additions and 39 deletions


@@ -55,6 +55,7 @@ class ExllamaV2Container:
     autosplit_reserve: List[float] = [96 * 1024**2]

     # Load state
+    model_is_loading: bool = False
     model_loaded: bool = False

     def __init__(self, model_directory: pathlib.Path, quiet=False, **kwargs):
@@ -350,6 +351,9 @@ class ExllamaV2Container:
             def progress(loaded_modules: int, total_modules: int)
         """

+        # Notify that the model is being loaded
+        self.model_is_loading = True
+
         # Load tokenizer
         self.tokenizer = ExLlamaV2Tokenizer(self.config)
@@ -439,6 +443,7 @@ class ExllamaV2Container:
         torch.cuda.empty_cache()

         # Update model load state
+        self.model_is_loading = False
         self.model_loaded = True

         logger.info("Model successfully loaded.")
@@ -472,7 +477,7 @@ class ExllamaV2Container:
         # Update model load state
         self.model_loaded = False

-        logger.info("Model unloaded.")
+        logger.info("Loras unloaded." if loras_only else "Model unloaded.")
def encode_tokens(self, text: str, **kwargs):
"""Wrapper to encode tokens from a text string"""