From f15ac1f69d0a18745d5fd7c73a56173389397d6a Mon Sep 17 00:00:00 2001
From: kingbri <8082010+kingbri1@users.noreply.github.com>
Date: Sat, 19 Apr 2025 22:34:06 -0400
Subject: [PATCH] Model: Reject model requests when unloading

If a model is being unloaded, that means it's being shut down and no
requests should be accepted from then on.

Also, remove model_is_loading since we simply check if the container
is None now.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
---
 backends/exllamav2/model.py | 25 +++++++++++++------------
 backends/infinity/model.py  |  3 ---
 2 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index 683e41c..4450c13 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -7,7 +7,6 @@ import math
 import pathlib
 import traceback
 import torch
-import uuid
 from exllamav2 import (
     ExLlamaV2,
     ExLlamaV2Config,
@@ -99,14 +98,12 @@ class ExllamaV2Container:
     use_vision: bool = False
     vision_model: Optional[ExLlamaV2VisionTower] = None
 
-    # Load state
-    model_is_loading: bool = False
-    model_loaded: bool = False
-
     # Load synchronization
+    # The bool is a master switch for accepting requests
     # The lock keeps load tasks sequential
     # The condition notifies any waiting tasks
     active_job_ids: Dict[str, Optional[ExLlamaV2DynamicJobAsync]] = {}
+    loaded: bool = False
     load_lock: asyncio.Lock = asyncio.Lock()
     load_condition: asyncio.Condition = asyncio.Condition()
 
@@ -560,7 +557,6 @@ class ExllamaV2Container:
         # Do this operation under the load lock's context
         try:
             await self.load_lock.acquire()
-            self.model_is_loading = True
 
             # Wait for existing generation jobs to finish
             await self.wait_for_jobs(kwargs.get("skip_wait"))
@@ -579,11 +575,10 @@ class ExllamaV2Container:
             torch.cuda.empty_cache()
 
             # Cleanup and update model load state
-            self.model_loaded = True
+            self.loaded = True
             logger.info("Model successfully loaded.")
         finally:
             self.load_lock.release()
-            self.model_is_loading = False
 
             async with self.load_condition:
                 self.load_condition.notify_all()
@@ -773,7 +768,7 @@ class ExllamaV2Container:
 
         try:
             # Don't acquire locks unless a model is loaded
-            if self.model_loaded:
+            if self.loaded:
                 await self.load_lock.acquire()
 
                 # Immediately cancel all jobs
@@ -796,7 +791,7 @@ class ExllamaV2Container:
         finally:
             # This means the generator is being recreated
             # The load lock is already released in the load function
-            if self.model_loaded:
+            if self.loaded:
                 self.load_lock.release()
 
             async with self.load_condition:
@@ -905,8 +900,7 @@ class ExllamaV2Container:
         self.generator = None
 
         # Set all model state variables to False
-        self.model_is_loading = False
-        self.model_loaded = False
+        self.loaded = False
 
         gc.collect()
         torch.cuda.empty_cache()
@@ -1233,9 +1227,16 @@ class ExllamaV2Container:
         """
 
         # Wait for load lock to be freed before processing
+        # Mainly used for loras and other operations where the class is available
         async with self.load_condition:
             await self.load_condition.wait_for(lambda: not self.load_lock.locked())
 
+        # If the model is being unloaded, don't accept new requests
+        if not self.loaded:
+            raise RuntimeError(
+                "Model is being unloaded. Cannot process new generation requests."
+            )
+
         # Mark that the job is running
         self.active_job_ids[request_id] = None
 
diff --git a/backends/infinity/model.py b/backends/infinity/model.py
index 660aab7..04698e5 100644
--- a/backends/infinity/model.py
+++ b/backends/infinity/model.py
@@ -14,7 +14,6 @@ if dependencies.extras:
 
 class InfinityContainer:
     model_dir: pathlib.Path
-    model_is_loading: bool = False
     model_loaded: bool = False
 
     # Use a runtime type hint here
@@ -24,8 +23,6 @@ class InfinityContainer:
         self.model_dir = model_directory
 
     async def load(self, **kwargs):
-        self.model_is_loading = True
-
         # Use cpu by default
         device = unwrap(kwargs.get("embeddings_device"), "cpu")
 
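
For reference, a minimal sketch of the gating pattern this patch introduces,
using a hypothetical stripped-down Container class rather than TabbyAPI's
actual ExllamaV2Container. Only the loaded flag, load_lock, load_condition,
and the RuntimeError check mirror the patch; everything else is stand-in
code.

    import asyncio


    class Container:
        """Toy container demonstrating the loaded-flag master switch."""

        def __init__(self):
            self.loaded = False
            self.load_lock = asyncio.Lock()
            self.load_condition = asyncio.Condition()
            self.active_job_ids = {}

        async def load(self):
            # Loads run sequentially under the lock
            async with self.load_lock:
                ...  # create the model and generator here
                self.loaded = True
            # Wake any tasks waiting on the condition
            async with self.load_condition:
                self.load_condition.notify_all()

        async def unload(self):
            async with self.load_lock:
                ...  # cancel jobs and free memory here
                # Clear the master switch while the lock is held, so
                # requests queued behind the unload get rejected
                self.loaded = False
            async with self.load_condition:
                self.load_condition.notify_all()

        async def generate(self, request_id: str):
            # Wait for any in-flight load/unload to finish
            async with self.load_condition:
                await self.load_condition.wait_for(
                    lambda: not self.load_lock.locked()
                )

            # The check this patch adds: a cleared flag means the model
            # is being (or has been) unloaded, so reject the request
            if not self.loaded:
                raise RuntimeError(
                    "Model is being unloaded. "
                    "Cannot process new generation requests."
                )

            # Mark the job as running, then generate
            self.active_job_ids[request_id] = None

Because generate() re-checks the flag only after the condition wait, a
request arriving mid-unload blocks until the lock is released and then
fails fast, instead of racing against a model that is being torn down.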