mirror of
https://github.com/theroyallab/tabbyAPI.git
synced 2026-03-14 15:57:27 +00:00
Tree: Update to use ModelContainer and args
Use command-line arguments to load an initial model if necessary. API routes are broken, but from now on we should be using the container as the primary interface to the exllamav2 library. Also, these args should be turned into a YAML configuration file in the future. Signed-off-by: kingbri <bdashore3@proton.me>
This commit is contained in:
5
model.py
5
model.py
@@ -1,4 +1,4 @@
|
||||
import json, uuid, os, gc, time
|
||||
import gc, time
|
||||
import torch
|
||||
|
||||
from exllamav2 import(
|
||||
@@ -34,6 +34,7 @@ class ModelContainer:
|
||||
gpu_split: list or None = None
|
||||
|
||||
def __init__(self, model_directory: str, quiet = False, **kwargs):
|
||||
print(kwargs)
|
||||
"""
|
||||
Create model container
|
||||
|
||||
@@ -57,6 +58,7 @@ class ModelContainer:
|
||||
full model.
|
||||
'gpu_split_auto' (bool): Automatically split model across available devices (default: True)
|
||||
'gpu_split' (list): Allocation for weights and (some) tensors, per device
|
||||
'no_flash_attn' (bool): Turns off flash attention (increases vram usage)
|
||||
"""
|
||||
|
||||
self.quiet = quiet
|
||||
@@ -72,6 +74,7 @@ class ModelContainer:
|
||||
if "max_seq_len" in kwargs: self.config.max_seq_len = kwargs["max_seq_len"]
|
||||
if "rope_scale" in kwargs: self.config.scale_pos_emb = kwargs["rope_scale"]
|
||||
if "rope_alpha" in kwargs: self.config.scale_alpha_value = kwargs["rope_alpha"]
|
||||
if "no_flash_attn" in kwargs: self.config.no_flash_attn = kwargs["no_flash_attn"]
|
||||
|
||||
chunk_size = min(kwargs.get("chunk_size", 2048), self.config.max_seq_len)
|
||||
self.config.max_input_len = chunk_size
|
||||
|
||||
Reference in New Issue
Block a user