Model: Auto-detect a one GPU setup and fix gpu_split_auto

It makes more sense to use GPU split parameters when the user has
more than one GPU. Otherwise, set gpu_split and gpu_split_auto to
False and save the user some VRAM.

Signed-off-by: kingbri <bdashore3@proton.me>
This commit is contained in:
kingbri
2024-02-06 22:58:55 -05:00
parent 849179df17
commit c0ad647fa7
3 changed files with 15 additions and 9 deletions

View File

@@ -103,8 +103,15 @@ class ExllamaV2Container:
self.quiet = quiet
self.cache_fp8 = "cache_mode" in kwargs and kwargs["cache_mode"] == "FP8"
self.gpu_split = kwargs.get("gpu_split")
self.gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), False)
# Turn off GPU split if the user is using 1 GPU
gpu_count = torch.cuda.device_count()
if gpu_count > 1:
self.gpu_split = kwargs.get("gpu_split")
self.gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
else:
self.gpu_split_auto = False
logger.info("Disabling GPU split because one GPU is in use.")
self.config = ExLlamaV2Config()
self.config.model_dir = str(model_directory.resolve())
@@ -354,9 +361,7 @@ class ExllamaV2Container:
# Load model with manual split
# Entrypoint for single GPU users
if not self.gpu_split_auto:
logger.info(
"Loading with a manual GPU split (or a one GPU setup)"
)
logger.info("Loading with a manual GPU split (or a one GPU setup)")
for value in self.model.load_gen(
self.gpu_split,