Tree: Update to use ModelContainer and args

Use command-line arguments to load an initial model if necessary.
API routes are broken, but we should be using the container from
now on as the primary interface with the exllamav2 library.

Also, these args should be turned into a YAML configuration file in
the future.

Signed-off-by: kingbri <bdashore3@proton.me>
This commit is contained in:
kingbri
2023-11-10 23:19:54 -05:00
parent 9d34479e3e
commit 5d32aa02cd
4 changed files with 39 additions and 61 deletions

View File

@@ -1,4 +1,4 @@
import json, uuid, os, gc, time
import gc, time
import torch
from exllamav2 import(
@@ -34,6 +34,7 @@ class ModelContainer:
gpu_split: list or None = None
def __init__(self, model_directory: str, quiet = False, **kwargs):
print(kwargs)
"""
Create model container
@@ -57,6 +58,7 @@ class ModelContainer:
full model.
'gpu_split_auto' (bool): Automatically split model across available devices (default: True)
'gpu_split' (list): Allocation for weights and (some) tensors, per device
'no_flash_attn' (bool): Turns off flash attention (increases vram usage)
"""
self.quiet = quiet
@@ -72,6 +74,7 @@ class ModelContainer:
if "max_seq_len" in kwargs: self.config.max_seq_len = kwargs["max_seq_len"]
if "rope_scale" in kwargs: self.config.scale_pos_emb = kwargs["rope_scale"]
if "rope_alpha" in kwargs: self.config.scale_alpha_value = kwargs["rope_alpha"]
if "no_flash_attn" in kwargs: self.config.no_flash_attn = kwargs["no_flash_attn"]
chunk_size = min(kwargs.get("chunk_size", 2048), self.config.max_seq_len)
self.config.max_input_len = chunk_size