import json, uuid, os, gc
from typing import Optional

import torch

from pynvml import *

from exllamav2 import (
    ExLlamaV2,
    ExLlamaV2Config,
    ExLlamaV2Cache,
    ExLlamaV2Cache_8bit,
    ExLlamaV2Tokenizer,
)

from exllamav2.generator import (
    ExLlamaV2StreamingGenerator,
    ExLlamaV2Sampler
)

from exllamav2.attn import ExLlamaV2Attention
# from exllamav2.util import list_live_tensors

from backend.config import config_filename
from backend.util import *

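# Headroom (in bytes) subtracted from each device's reported free VRAM in
# ModelContainer.get_free_vram() below, roughly 512 MiB per GPU.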
auto_split_reserve_bytes = 512 * 1024**2
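
# Model settings, keyed by model_uuid and persisted to models.json via
# load_models()/save_models() below.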
models = {}


# Load/save config

def load_models():
    global models

    filename = config_filename("models.json")
    if os.path.exists(filename):
        with open(filename, "r") as f:
            models = json.load(f)
    else:
        models = {}


def save_models():
    global models

    filename = config_filename("models.json")
    models_json = json.dumps(models, indent = 4)
    with open(filename, "w") as outfile:
        outfile.write(models_json)


# List models

def list_models():
    global models

    models_list = {}
    for k, v in models.items(): models_list[k] = v["name"]
    current_model = loaded_model.get_uuid() if loaded_model is not None else None
    return models_list, current_model


# Get model

def get_model_info(data = None):
    global models

    if data is None: return None

    i = data["model_uuid"]
    if i is None: return None
    return models[i]


# Remove model config

def remove_model(data):
    global models

    i = data["model_uuid"]
    if i is None: return

    del models[i]
    save_models()


# Update model config

def update_model(data):
    global models

    print(data)

    if data["model_uuid"] is None or data["model_uuid"] == "new":
        new_model = {}
        i = str(uuid.uuid4())
        new_model["model_uuid"] = i
        new_model["name"] = data.get("name", "Unnamed model")
        new_model["model_directory"] = data.get("model_directory", "")
        models[i] = new_model
        prepare_model(new_model)
        save_models()
        return i

    i = data["model_uuid"]
    model = models[i]

    prev_model = model.copy()
    for k, v in data.items(): model[k] = v

    if model["model_directory"] != prev_model["model_directory"]:
        prepare_model(model)
    if model.get("draft_model_directory", "") != prev_model.get("draft_model_directory", "") \
            or model.get("draft_enabled", "") != prev_model.get("draft_enabled", ""):
        prepare_draft_model(model)

    save_models()
    return None


def prepare_draft_model(model):

    if "draft_enabled" not in model: model["draft_enabled"] = False
    if model["draft_enabled"]:

        prep_draft_config = ExLlamaV2Config()
        prep_draft_config.model_dir = expanduser(model.get("draft_model_directory", ""))
        try:
            prep_draft_config.prepare()
            model["draft_config_status"] = "ok"
            model["draft_config_status_error"] = None
        except Exception as e:
            model["draft_config_status"] = "error"
            model["draft_config_status_error"] = str(e)
            return

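        # Collect basic shape parameters from the prepared draft config into
        # model["draft_stats"].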
        draft_stats = {}
        draft_stats["hidden_size"] = prep_draft_config.hidden_size
        draft_stats["intermediate_size"] = prep_draft_config.intermediate_size
        draft_stats["num_attention_heads"] = prep_draft_config.num_attention_heads
        draft_stats["num_key_value_heads"] = prep_draft_config.num_key_value_heads
        draft_stats["num_hidden_layers"] = prep_draft_config.num_hidden_layers
        draft_stats["vocab_size"] = prep_draft_config.vocab_size
        draft_stats["head_dim"] = prep_draft_config.head_dim
        draft_stats["default_seq_len"] = prep_draft_config.max_seq_len
        model["draft_stats"] = draft_stats

    if "draft_rope_alpha" not in model: model["draft_rope_alpha"] = 1.0
    if "draft_rope_alpha_auto" not in model: model["draft_rope_alpha_auto"] = True


def prepare_model(model):

    prep_config = ExLlamaV2Config()
    prep_config.model_dir = expanduser(model["model_directory"])

    try:
        prep_config.prepare()
        model["config_status"] = "ok"
        model["config_status_error"] = None
    except Exception as e:
        model["config_status"] = "error"
        model["config_status_error"] = str(e)
        return

    stats = {}
    stats["hidden_size"] = prep_config.hidden_size
    stats["intermediate_size"] = prep_config.intermediate_size
    stats["num_attention_heads"] = prep_config.num_attention_heads
    stats["num_key_value_heads"] = prep_config.num_key_value_heads
    stats["num_hidden_layers"] = prep_config.num_hidden_layers
    stats["vocab_size"] = prep_config.vocab_size
    stats["head_dim"] = prep_config.head_dim
    stats["default_seq_len"] = prep_config.max_seq_len
    model["stats"] = stats

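    # Fill in user-tunable settings with defaults taken from the model's own
    # config where they haven't been set yet.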
model["default_seq_len"] = prep_config.max_seq_len
|
|
if "seq_len" not in model: model["seq_len"] = prep_config.max_seq_len
|
|
if "rope_scale" not in model: model["rope_scale"] = prep_config.scale_pos_emb
|
|
if "rope_alpha" not in model: model["rope_alpha"] = prep_config.scale_alpha_value
|
|
|
|
if "cache_mode" not in model: model["cache_mode"] = "FP16"
|
|
if "chunk_size" not in model: model["chunk_size"] = prep_config.max_input_len
|
|
if "gpu_split" not in model: model["gpu_split"] = ""
|
|
if "gpu_split_auto" not in model: model["gpu_split_auto"] = True
|
|
|
|
|
|
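
# Container for one loaded model: its config, cache, tokenizer and streaming
# generator, plus the optional draft model used for speculative decoding.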
class ModelContainer:

    config: Optional[ExLlamaV2Config] = None
    draft_config: Optional[ExLlamaV2Config] = None
    model: Optional[ExLlamaV2] = None
    draft_model: Optional[ExLlamaV2] = None
    cache: Optional[ExLlamaV2Cache] = None
    draft_cache: Optional[ExLlamaV2Cache] = None
    tokenizer: Optional[ExLlamaV2Tokenizer] = None
    generator: Optional[ExLlamaV2StreamingGenerator] = None
    model_dict = None

    cache_fp8: bool = False
    draft_enabled: bool = False

    def __init__(self, model, progress_callback = None):

        self.model_dict = model

        self.config = ExLlamaV2Config()
        self.config.model_dir = expanduser(model["model_directory"])
        self.config.prepare()

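        # Apply the user's settings on top of the defaults prepared by
        # ExLlamaV2Config.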
        self.config.max_seq_len = model["seq_len"]
        self.config.scale_pos_emb = model["rope_scale"]
        self.config.scale_alpha_value = model["rope_alpha"]
        self.config.max_input_len = model["chunk_size"]
        self.config.max_attn_size = model["chunk_size"] ** 2

        self.draft_enabled = self.model_dict["draft_enabled"] if "draft_enabled" in self.model_dict else False

        if self.draft_enabled:

            self.draft_config = ExLlamaV2Config()
            self.draft_config.model_dir = expanduser(model["draft_model_directory"])
            self.draft_config.prepare()

            self.draft_config.max_seq_len = model["seq_len"]

            alpha = model["draft_rope_alpha"]
            if model["draft_rope_alpha_auto"]:
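                # Assumed to be an empirical fit: derive a RoPE alpha (NTK-style
                # scaling) from the ratio of the target context length to the
                # draft model's native context length.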
                ratio = self.config.max_seq_len / self.draft_config.max_seq_len
                alpha = -0.13436 + 0.80541 * ratio + 0.28833 * ratio ** 2
                print(f" -- Applying draft model auto RoPE alpha = {alpha:.4f}")

            self.draft_config.scale_alpha_value = alpha
            self.draft_config.max_input_len = model["chunk_size"]
            self.draft_config.max_attn_size = model["chunk_size"] ** 2


    def load(self, progress_callback = None):

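        # Cache dtype: "FP8" stores the K/V cache in 8-bit (ExLlamaV2Cache_8bit),
        # roughly halving cache VRAM; "FP16" keeps the full-precision cache.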
        if self.model_dict["cache_mode"] == "FP8": self.cache_fp8 = True
        elif self.model_dict["cache_mode"] == "FP16": self.cache_fp8 = False
        else: raise ValueError("bad cache_mode: " + self.model_dict["cache_mode"])

        self.tokenizer = ExLlamaV2Tokenizer(self.config)

        # Load draft model

        if self.draft_enabled:

            self.draft_model = ExLlamaV2(self.draft_config)
            print("Loading draft model: " + self.draft_config.model_dir)

            self.draft_cache = ExLlamaV2Cache(self.draft_model, lazy = True)
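            # Hold back ~96 MiB on the first device during auto split; the
            # remaining entries are per additional device.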
            reserve = [96 * 1024**2] + [0] * 16
            yield from self.draft_model.load_autosplit_gen(self.draft_cache, reserve_vram = reserve, last_id_only = True, callback_gen = progress_callback)

            # Test VRAM allocation with a full-length forward pass

            input_ids = torch.zeros((1, self.config.max_input_len), dtype = torch.long)
            self.draft_model.forward(input_ids, cache = self.draft_cache, preprocess_only = True)

        # Load model

        self.model = ExLlamaV2(self.config)
        print("Loading model: " + self.config.model_dir)

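        # "gpu_split" is a comma-separated list of per-GPU allocations (in GB,
        # e.g. "16,24" -> [16.0, 24.0]); with "gpu_split_auto" the loader splits
        # the weights across devices automatically.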
        if self.model_dict["gpu_split_auto"]:
            auto_split = True
        elif self.model_dict["gpu_split"] is None or self.model_dict["gpu_split"].strip() == "":
            auto_split = False
            split = None
        else:
            auto_split = False
            split = [float(alloc) for alloc in self.model_dict["gpu_split"].split(",")]

        if not auto_split:
            for value in self.model.load_gen(split, callback_gen = progress_callback):
                if isinstance(value, str):
                    yield value

        if self.cache_fp8:
            self.cache = ExLlamaV2Cache_8bit(self.model, lazy = auto_split)
        else:
            self.cache = ExLlamaV2Cache(self.model, lazy = auto_split)

        if auto_split:
            reserve = [96 * 1024**2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            yield from self.model.load_autosplit_gen(self.cache, reserve_vram = reserve, last_id_only = True, callback_gen = progress_callback)

        # Test VRAM allocation with a full-length forward pass

        input_ids = torch.zeros((1, self.config.max_input_len), dtype = torch.long)
        self.model.forward(input_ids, cache = self.cache, preprocess_only = True)

        # Create generator

        self.generator = ExLlamaV2StreamingGenerator(self.model, self.cache, self.tokenizer, self.draft_model, self.draft_cache)

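    # Per-device free VRAM in bytes as reported by NVML, minus the
    # auto_split_reserve_bytes headroom defined at the top of this module.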
    def get_free_vram(self):
        global auto_split_reserve_bytes

        nvmlInit()
        device_count = torch.cuda.device_count()
        free_vram = []
        for i in range(device_count):
            handle = nvmlDeviceGetHandleByIndex(i)
            info = nvmlDeviceGetMemoryInfo(handle)
            free_vram.append(info.free - auto_split_reserve_bytes)

        return free_vram


    def get_uuid(self):

        return self.model_dict["model_uuid"]


    def unload(self):

        if self.model: self.model.unload()
        self.model = None
        self.config = None
        self.cache = None
        self.tokenizer = None

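
# Progress callback passed to the loaders above: yields one newline-delimited
# JSON packet per module loaded, e.g. {"result": "progress", "module": 3, "num_modules": 67}.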
def stream_progress(module, num_modules):

    packet = {
        "result": "progress",
        "module": module,
        "num_modules": num_modules
    }
    # print(json.dumps(packet))
    yield json.dumps(packet) + "\n"


loaded_model: Optional[ModelContainer] = None

def get_loaded_model():
    return loaded_model

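
# Unload any currently loaded model, then stream NDJSON progress packets while
# loading the requested one; the final packet is {"result": "ok"} or
# {"result": "fail", "error": ...}. A hypothetical caller (the actual route
# lives elsewhere in the backend) would do:
#
#   for line in load_model({"model_uuid": some_uuid}):
#       send_to_client(line)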
def load_model(data):
    global models, loaded_model

    if loaded_model is not None:
        loaded_model.unload()
        loaded_model = None

    gc.collect()
    torch.cuda.empty_cache()

    i = data["model_uuid"]
    model = models[i]

    try:
        loaded_model = ModelContainer(model)
        yield from loaded_model.load(progress_callback = stream_progress)
        success = True
    except Exception as e:
        loaded_model = None
        errormsg = type(e).__name__ + ":\n"
        errormsg += str(e)
        success = False

    if not success:
        gc.collect()
        torch.cuda.empty_cache()
        result = { "result": "fail", "error": errormsg }
        # print(json.dumps(result) + "\n")
        yield json.dumps(result) + "\n"
        return

    result = { "result": "ok" }
    # print(json.dumps(result) + "\n")
    yield json.dumps(result) + "\n"


def unload_model():
    global loaded_model

    if loaded_model is not None:
        loaded_model.unload()
        loaded_model = None

    gc.collect()
    torch.cuda.empty_cache()

    result = { "result": "ok" }
    return result