mirror of https://github.com/theroyallab/tabbyAPI.git
Model: Auto-detect a one GPU setup and fix gpu_split_auto
It makes more sense to use the GPU split parameters when the user has more than one GPU. Otherwise, disable gpu_split and gpu_split_auto to save the user some VRAM.

Signed-off-by: kingbri <bdashore3@proton.me>
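For context, a minimal standalone sketch of the detection logic this commit introduces. It assumes torch is installed; resolve_gpu_split and the inlined unwrap helper are hypothetical stand-ins written for illustration, not tabbyAPI's actual API.

import torch

def unwrap(value, default):
    # Hypothetical stand-in mirroring tabbyAPI's unwrap(): fall back when None
    return value if value is not None else default

def resolve_gpu_split(**kwargs):
    # Only honor GPU split options when more than one CUDA device exists;
    # on a single-GPU box, auto-splitting just reserves VRAM for nothing.
    gpu_split = None
    if torch.cuda.device_count() > 1:
        gpu_split = kwargs.get("gpu_split")
        gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
    else:
        gpu_split_auto = False
    return gpu_split, gpu_split_auto

# On a one-GPU machine this returns (None, False) regardless of kwargs:
# print(resolve_gpu_split(gpu_split_auto=True))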
@@ -103,8 +103,15 @@ class ExllamaV2Container:
         self.quiet = quiet

         self.cache_fp8 = "cache_mode" in kwargs and kwargs["cache_mode"] == "FP8"
-        self.gpu_split = kwargs.get("gpu_split")
-        self.gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), False)
+
+        # Turn off GPU split if the user is using 1 GPU
+        gpu_count = torch.cuda.device_count()
+        if gpu_count > 1:
+            self.gpu_split = kwargs.get("gpu_split")
+            self.gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
+        else:
+            self.gpu_split_auto = False
+            logger.info("Disabling GPU split because one GPU is in use.")

         self.config = ExLlamaV2Config()
         self.config.model_dir = str(model_directory.resolve())
@@ -354,9 +361,7 @@ class ExllamaV2Container:
         # Load model with manual split
         # Entrypoint for single GPU users
         if not self.gpu_split_auto:
-            logger.info(
-                "Loading with a manual GPU split (or a one GPU setup)"
-            )
+            logger.info("Loading with a manual GPU split (or a one GPU setup)")

             for value in self.model.load_gen(
                 self.gpu_split,