Tree: Update to use ModelContainer and args

Use command-line arguments to load an initial model if necessary.
API routes are broken, but we should be using the container from
now on as the primary interface with the exllamav2 library.

Also, these args should be turned into a YAML configuration file in
the future.

Signed-off-by: kingbri <bdashore3@proton.me>
This commit is contained in:
kingbri
2023-11-10 23:19:54 -05:00
parent 9d34479e3e
commit 5d32aa02cd
4 changed files with 39 additions and 61 deletions

View File

@@ -1,4 +1,4 @@
import json, uuid, os, gc, time
import gc, time
import torch
from exllamav2 import(
@@ -34,6 +34,7 @@ class ModelContainer:
gpu_split: list or None = None
def __init__(self, model_directory: str, quiet = False, **kwargs):
print(kwargs)
"""
Create model container
@@ -57,6 +58,7 @@ class ModelContainer:
full model.
'gpu_split_auto' (bool): Automatically split model across available devices (default: True)
'gpu_split' (list): Allocation for weights and (some) tensors, per device
'no_flash_attn' (bool): Turns off flash attention (increases vram usage)
"""
self.quiet = quiet
@@ -72,6 +74,7 @@ class ModelContainer:
if "max_seq_len" in kwargs: self.config.max_seq_len = kwargs["max_seq_len"]
if "rope_scale" in kwargs: self.config.scale_pos_emb = kwargs["rope_scale"]
if "rope_alpha" in kwargs: self.config.scale_alpha_value = kwargs["rope_alpha"]
if "no_flash_attn" in kwargs: self.config.no_flash_attn = kwargs["no_flash_attn"]
chunk_size = min(kwargs.get("chunk_size", 2048), self.config.max_seq_len)
self.config.max_input_len = chunk_size