Implement lora support (#24)

* Model: Implement basic lora support

* Add ability to load loras from config on launch
* Supports loading multiple loras and lora scaling
* Add function to unload loras
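
A hedged sketch of what the config bullets above boil down to once parsed: the
exact config.yml layout isn't shown in this commit, so the Python structure
below simply mirrors the 'lora_dir' and 'loras' kwargs documented in the model
container, and the lora names are made up.

    lora_config = {
        "lora_dir": "loras",  # directory holding the lora folders
        "loras": [
            {"name": "my-style-lora", "scaling": 0.8},  # explicit scaling
            {"name": "my-other-lora"},                  # scaling defaults to 1.0
        ],
    }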

* Colab: Update for basic lora support

* Model: Test vram alloc after lora load, add docs

* Git: Add loras folder to .gitignore

* API: Add basic lora-related endpoints

* Add /loras/ endpoint for querying available loras
* Add /model/lora endpoint for querying currently loaded loras
* Add /model/lora/load endpoint for loading loras
* Add /model/lora/unload endpoint for unloading loras
* Move lora config-checking logic to main.py for better compat with API endpoints

* Revert bad CRLF line ending changes

* API: Add basic lora-related endpoints (fixed)

* Add /loras/ endpoint for querying available loras
* Add /model/lora endpoint for querying currently loaded loras
* Add /model/lora/load endpoint for loading loras
* Add /model/lora/unload endpoint for unloading loras
* Move lora config-checking logic to main.py for better compat with API endpoints
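
A hedged client sketch for these endpoints (not part of this commit): only the
model-side load_loras function appears in the diff below, so the request
payload here is an assumption that mirrors its 'name'/'scaling' entries, and
the host/port is a placeholder.

    import requests

    base = "http://127.0.0.1:5000"

    print(requests.get(f"{base}/loras/").json())      # available loras
    print(requests.get(f"{base}/model/lora").json())  # currently loaded loras

    requests.post(f"{base}/model/lora/load",
                  json={"loras": [{"name": "my-style-lora", "scaling": 0.8}]})
    requests.post(f"{base}/model/lora/unload")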

* Model: Unload loras first when unloading model

* API + Models: Cleanup lora endpoints and functions

Condenses the endpoint and model load code. Also makes the lora routes behave
the same way as the model routes to avoid confusing the end user.

Signed-off-by: kingbri <bdashore3@proton.me>

* Loras: Optimize load endpoint

Return successes and failures, and consolidate the request handling into the
rewritten load_loras function.

Signed-off-by: kingbri <bdashore3@proton.me>
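
As a rough illustration of the consolidated flow, the endpoint now forwards
the request to load_loras and can report its result; assuming a hypothetical,
already-loaded ModelContainer called container and made-up lora names:

    import pathlib

    result = container.load_loras(
        pathlib.Path("loras"),
        loras=[{"name": "my-style-lora", "scaling": 0.8}, {"name": "my-other-lora"}],
    )
    print(result)  # e.g. {'success': ['my-style-lora', 'my-other-lora'], 'failure': []}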

---------

Co-authored-by: kingbri <bdashore3@proton.me>
Co-authored-by: DocShotgun <126566557+DocShotgun@users.noreply.github.com>
Authored by DocShotgun on 2023-12-08 20:36:40 -08:00
committed by kingbri
parent 161c9d2c19
commit 7380a3b79a
8 changed files with 197 additions and 19 deletions


@@ -6,6 +6,7 @@ from exllamav2 import(
     ExLlamaV2Cache,
     ExLlamaV2Cache_8bit,
     ExLlamaV2Tokenizer,
+    ExLlamaV2Lora
 )
 from exllamav2.generator import(
     ExLlamaV2StreamingGenerator,
@@ -30,6 +31,8 @@ class ModelContainer:
     cache_fp8: bool = False
     gpu_split_auto: bool = True
     gpu_split: list or None = None
 
+    active_loras: List[ExLlamaV2Lora] = []
+
     def __init__(self, model_directory: pathlib.Path, quiet = False, **kwargs):
         """
@@ -54,6 +57,8 @@ class ModelContainer:
             'draft_rope_alpha' (float): RoPE alpha (NTK) factor for draft model.
                 By default, the draft model's alpha value is calculated automatically to scale to the size of the
                 full model.
+            'lora_dir' (str): Lora directory
+            'loras' (list[dict]): List of loras to be loaded, consisting of 'name' and 'scaling'
             'gpu_split_auto' (bool): Automatically split model across available devices (default: True)
             'gpu_split' (list[float]): Allocation for weights and (some) tensors, per device
             'no_flash_attn' (bool): Turns off flash attention (increases vram usage) (default: False)
@@ -141,6 +146,32 @@ class ModelContainer:
"""
for _ in self.load_gen(progress_callback): pass
def load_loras(self, lora_directory: pathlib.Path, **kwargs):
"""
Load loras
"""
loras = kwargs.get("loras") or []
success: List[str] = []
failure: List[str] = []
for lora in loras:
lora_name = lora.get("name") or None
lora_scaling = lora.get("scaling") or 1.0
if lora_name is None:
print("One of your loras does not have a name. Please check your config.yml! Skipping lora load.")
failure.append(lora_name)
continue
print(f"Loading lora: {lora_name} at scaling {lora_scaling}")
lora_path = lora_directory / lora_name
self.active_loras.append(ExLlamaV2Lora.from_directory(self.model, lora_path, lora_scaling))
print("Lora successfully loaded.")
success.append(lora_name)
# Return success and failure names
return { 'success': success, 'failure': failure }
def load_gen(self, progress_callback = None):
"""
@@ -204,23 +235,30 @@ class ModelContainer:
print("Model successfully loaded.")
def unload(self):
def unload(self, loras_only: bool = False):
"""
Free all VRAM resources used by this model
"""
if self.model: self.model.unload()
self.model = None
if self.draft_model: self.draft_model.unload()
self.draft_model = None
self.config = None
self.cache = None
self.tokenizer = None
self.generator = None
for lora in self.active_loras:
lora.unload()
self.active_loras = []
# Unload the entire model if not just unloading loras
if not loras_only:
if self.model: self.model.unload()
self.model = None
if self.draft_model: self.draft_model.unload()
self.draft_model = None
self.config = None
self.cache = None
self.tokenizer = None
self.generator = None
gc.collect()
torch.cuda.empty_cache()
# Common function for token operations
def get_tokens(self, text: Optional[str], ids: Optional[List[int]], **kwargs):
if text:
@@ -381,7 +419,7 @@ class ModelContainer:
             active_ids = ids[:, max(0, overflow):]
             chunk_tokens = self.config.max_seq_len - active_ids.shape[-1]
 
-            self.generator.begin_stream(active_ids, gen_settings, token_healing = token_healing)
+            self.generator.begin_stream(active_ids, gen_settings, token_healing = token_healing, loras = self.active_loras)
 
             # Generate
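
Brief usage notes on the reworked unload and generation paths, again assuming
a hypothetical, already-loaded ModelContainer called container:

    container.unload(loras_only=True)  # drop only the lora weights, keep the model
    container.unload()                 # unload loras first, then the whole model

Once loras are loaded, no extra work is needed at generation time: the
streaming generator call above now receives self.active_loras, so subsequent
completions are generated with the active loras applied.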