OAI: Add API-based model loading/unloading and auth routes

Models can be loaded and unloaded via the API. Also add authentication
to use the API and for administrator tasks.

Each type of authorization uses a different key.

Also fix the unload function to properly free all used VRAM.

Signed-off-by: kingbri <bdashore3@proton.me>
This commit is contained in:
kingbri
2023-11-14 01:17:19 -05:00
parent 47343e2f1a
commit b625bface9
11 changed files with 195 additions and 55 deletions

View File

@@ -32,7 +32,7 @@ class ModelContainer:
gpu_split_auto: bool = True
gpu_split: list or None = None
def __init__(self, model_directory: str, quiet = False, **kwargs):
def __init__(self, model_directory: pathlib.Path, quiet = False, **kwargs):
"""
Create model container
@@ -62,11 +62,11 @@ class ModelContainer:
self.quiet = quiet
self.cache_fp8 = "cache_mode" in kwargs and kwargs["cache_mode"] == "FP8"
self.gpu_split_auto = kwargs.get("gpu_split_auto", True)
self.gpu_split = kwargs.get("gpu_split", None)
self.gpu_split_auto = self.gpu_split == "auto"
self.config = ExLlamaV2Config()
self.config.model_dir = model_directory
self.config.model_dir = str(model_directory.resolve())
self.config.prepare()
if "max_seq_len" in kwargs: self.config.max_seq_len = kwargs["max_seq_len"]
@@ -85,7 +85,7 @@ class ModelContainer:
if self.draft_enabled:
self.draft_config = ExLlamaV2Config()
self.draft_config.model_dir = kwargs["draft_model_directory"]
self.draft_config.model_dir = kwargs["draft_model_dir"]
self.draft_config.prepare()
self.draft_config.max_seq_len = self.config.max_seq_len
@@ -103,7 +103,7 @@ class ModelContainer:
def get_model_path(self):
model_path = pathlib.Path(self.draft_config.model_dir if self.draft_enabled else self.config.model_dir)
model_path = pathlib.Path(self.config.model_dir)
return model_path
@@ -185,9 +185,12 @@ class ModelContainer:
if self.model: self.model.unload()
self.model = None
if self.draft_model: self.draft_model.unload()
self.draft_model = None
self.config = None
self.cache = None
self.tokenizer = None
self.generator = None
gc.collect()
torch.cuda.empty_cache()