API: Auto-unload on a load request

Automatically unload the currently loaded model when /load is called, instead of requiring a separate unload request first. This has been requested many times and makes more sense in the long run. Signed-off-by: kingbri <bdashore3@proton.me>
2026-05-14 01:33:50 +00:00 · 2024-02-21 23:00:11 -05:00
parent 368eb2e2d9
commit bee26a2f2c
2 changed files with 14 additions and 4 deletions
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -464,6 +464,8 @@ class ExllamaV2Container:
        gc.collect()
        torch.cuda.empty_cache()

+        logger.info("Model unloaded.")
+
    def encode_tokens(self, text: str, **kwargs):
        """Wrapper to encode tokens from a text string"""

--- a/main.py
+++ b/main.py
@@ -172,11 +172,19 @@ async def load_model(request: Request, data: ModelLoadRequest):
    """Loads a model into the model container."""
    global MODEL_CONTAINER

-    if MODEL_CONTAINER and MODEL_CONTAINER.model:
-        raise HTTPException(400, "A model is already loaded! Please unload it first.")
-
    if not data.name:
-        raise HTTPException(400, "model_name not found.")
+        raise HTTPException(400, "A model name was not provided.")
+
+    # Unload the existing model
+    if MODEL_CONTAINER and MODEL_CONTAINER.model:
+        loaded_model_name = MODEL_CONTAINER.get_model_path().name
+
+        if loaded_model_name == data.name:
+            raise HTTPException(
+                400, f"Model \"{loaded_model_name}\"is already loaded! Aborting."
+            )
+        else:
+            MODEL_CONTAINER.unload()

    model_path = pathlib.Path(unwrap(get_model_config().get("model_dir"), "models"))
    model_path = model_path / data.name