Model: Bypass lock checks when shutting down

Previously, when a SIGINT was emitted while a model load was running, the API didn't shut down until the load finished because the unload was waiting for the load lock. However, when shutting down, the lock doesn't matter since the process is being killed anyway. Signed-off-by: kingbri <bdashore3@proton.me>
2026-03-14 15:57:27 +00:00 · 2024-08-03 16:05:34 -04:00
parent 65c16f2a7c
commit 2a33ebbf29
3 changed files with 15 additions and 10 deletions
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -734,11 +734,15 @@ class ExllamaV2Container:
        Free all VRAM resources used by this model
        """

-        try:
-            await self.load_lock.acquire()
+        # Shutdown immediately unloads and bypasses all locks
+        do_shutdown = kwargs.get("shutdown")

-            # Wait for other jobs to finish
-            await self.wait_for_jobs(kwargs.get("skip_wait"))
+        try:
+            if not do_shutdown:
+                await self.load_lock.acquire()
+
+                # Wait for other jobs to finish
+                await self.wait_for_jobs(kwargs.get("skip_wait"))

            # Delete references held in the grammar module
            clear_grammar_func_cache()
@@ -778,10 +782,11 @@ class ExllamaV2Container:

            logger.info("Loras unloaded." if loras_only else "Model unloaded.")
        finally:
-            self.load_lock.release()
+            if not do_shutdown:
+                self.load_lock.release()

-            async with self.load_condition:
-                self.load_condition.notify_all()
+                async with self.load_condition:
+                    self.load_condition.notify_all()

    def encode_tokens(self, text: str, **kwargs):
        """Wrapper to encode tokens from a text string"""