diff --git a/OAI/types/model.py b/OAI/types/model.py index 22db757..c0ddd5d 100644 --- a/OAI/types/model.py +++ b/OAI/types/model.py @@ -70,7 +70,7 @@ class ModelLoadRequest(BaseModel): default=None, examples=[4096], ) - gpu_split_auto: Optional[bool] = False + gpu_split_auto: Optional[bool] = True gpu_split: Optional[List[float]] = Field( default_factory=list, examples=[[24.0, 20.0]] ) diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py index 615c3e1..4b535ed 100644 --- a/backends/exllamav2/model.py +++ b/backends/exllamav2/model.py @@ -103,8 +103,16 @@ class ExllamaV2Container: self.quiet = quiet self.cache_fp8 = "cache_mode" in kwargs and kwargs["cache_mode"] == "FP8" - self.gpu_split = kwargs.get("gpu_split") - self.gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), False) + + # Turn off GPU split if the user is using 1 GPU + gpu_count = torch.cuda.device_count() + if gpu_count > 1: + self.gpu_split = kwargs.get("gpu_split") + self.gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True) + else: + self.gpu_split = None  # keep attribute defined: the manual-split load path reads self.gpu_split + self.gpu_split_auto = False + logger.info("Disabling GPU split because one GPU is in use.") self.config = ExLlamaV2Config() self.config.model_dir = str(model_directory.resolve()) @@ -354,9 +361,7 @@ class ExllamaV2Container: # Load model with manual split # Entrypoint for single GPU users if not self.gpu_split_auto: - logger.info( "Loading with a manual GPU split (or a one GPU setup)" ) + logger.info("Loading with a manual GPU split (or a one GPU setup)") for value in self.model.load_gen( self.gpu_split, diff --git a/config_sample.yml b/config_sample.yml index ff35754..c50111c 100644 --- a/config_sample.yml +++ b/config_sample.yml @@ -68,11 +68,12 @@ model: # Only use this if the model's base sequence length in config.json is incorrect (ex. 
Mistral/Mixtral models) #override_base_seq_len: - # Automatically allocate resources to GPUs (default: False) - # WARNING: Will use more VRAM for single GPU users - #gpu_split_auto: False + # Automatically allocate resources to GPUs (default: True) + # NOTE: Not parsed for single GPU users + #gpu_split_auto: True # An integer array of GBs of vram to split between GPUs (default: []) + # NOTE: Not parsed for single GPU users #gpu_split: [20.6, 24] # Rope scale (default: 1.0)