Initial commit

This commit is contained in:
turboderp
2023-10-29 20:55:38 +01:00
commit a647a51fea
22 changed files with 4396 additions and 0 deletions

42
backend/config.py Normal file
View File

@@ -0,0 +1,42 @@
import sys, os, json
# Directory holding all user data (settings, sessions, model configs).
# Placeholder value; overwritten by set_config_dir() at startup.
config_dir: str = "///"

def set_config_dir(config_dir_):
    """Set the global config directory, expanding '~', creating it if missing."""
    global config_dir
    config_dir = os.path.expanduser(config_dir_)
    # exist_ok avoids the race between the old exists() check and makedirs()
    os.makedirs(config_dir, exist_ok = True)

def config_filename(filename: str):
    """Return the absolute path of *filename* inside the config directory."""
    return os.path.join(config_dir, filename)
class GlobalState:
    """
    Application-wide persisted state, stored as state.json in the config dir.

    NOTE(review): currently a stub -- load() reads the file but discards the
    parsed result, and save() always writes an empty object.
    """

    def __init__(self):
        pass

    def load(self):
        # Read state.json if present; the parsed value `r` is currently unused
        filename = config_filename("state.json")
        if os.path.exists(filename):
            with open(filename, "r") as f:
                r = json.load(f)
        else:
            r = {}

    def save(self):
        # Placeholder: serializes an empty dict until real state exists
        r = {}
        filename = config_filename("state.json")
        r_json = json.dumps(r, indent = 4)
        with open(filename, "w") as outfile:
            outfile.write(r_json)

# Module-level singleton used by the server
global_state = GlobalState()

399
backend/models.py Normal file
View File

@@ -0,0 +1,399 @@
import json, uuid, os, gc
import torch
from pynvml import *
from exllamav2 import(
ExLlamaV2,
ExLlamaV2Config,
ExLlamaV2Cache,
ExLlamaV2Cache_8bit,
ExLlamaV2Tokenizer,
)
from exllamav2.generator import(
ExLlamaV2StreamingGenerator,
ExLlamaV2Sampler
)
from exllamav2.attn import ExLlamaV2Attention
# from exllamav2.util import list_live_tensors
from backend.config import config_filename
# Extra VRAM (bytes) held back per device when auto-splitting across GPUs
auto_split_reserve_bytes = 512 * 1024**2

# Model configurations, keyed by model UUID
models = {}

# Load/save config

def load_models():
    """Read the model registry from models.json, or reset it if absent."""
    global models
    filename = config_filename("models.json")
    if not os.path.exists(filename):
        models = {}
        return
    with open(filename, "r") as f:
        models = json.load(f)
def save_models():
    """Persist the model registry to models.json, pretty-printed."""
    global models
    with open(config_filename("models.json"), "w") as outfile:
        outfile.write(json.dumps(models, indent = 4))
# List models

def list_models():
    """Return ({uuid: display name}, uuid of the loaded model or None)."""
    models_list = { k: v["name"] for k, v in models.items() }
    if loaded_model is not None:
        current_model = loaded_model.get_uuid()
    else:
        current_model = None
    return models_list, current_model
# Get model

def get_model_info(data = None):
    """
    Look up a model config by data["model_uuid"].

    Returns the stored model dict, or None when no request data is given,
    no UUID is present, or the UUID is unknown (previously an uncaught
    KeyError, surfacing as a 500 instead of the API's "fail" response).
    """
    if data is None: return None
    i = data.get("model_uuid")
    if i is None: return None
    return models.get(i)
# Remove model config

def remove_model(data):
    """Delete the entry for data["model_uuid"] (no-op if absent) and save."""
    i = data.get("model_uuid")
    if i is None: return
    # pop() tolerates an already-removed / unknown UUID (was: del -> KeyError)
    models.pop(i, None)
    save_models()
# Update model config

def update_model(data):
    """
    Create or update a model config from an API payload.

    If data["model_uuid"] is None a new entry is created and its freshly
    generated UUID is returned; otherwise the existing entry is updated in
    place and None is returned. The registry is saved to disk in both cases.
    """
    global models
    if data["model_uuid"] is None:
        # New model: assign a UUID, probe the model directory, persist
        new_model = {}
        i = str(uuid.uuid4())
        new_model["model_uuid"] = i
        new_model["name"] = data["name"] or "Unnamed model"
        new_model["model_directory"] = data["model_directory"]
        models[i] = new_model
        prepare_model(new_model)
        save_models()
        return i

    i = data["model_uuid"]
    model = models[i]
    if "name" in data: model["name"] = data["name"]
    if "model_directory" in data: model["model_directory"] = data["model_directory"]
    # prepare_model() only fills in defaults for *missing* keys, so it must
    # run before the explicit per-key overrides below
    prepare_model(model)
    if "seq_len" in data: model["seq_len"] = data["seq_len"]
    if "rope_scale" in data: model["rope_scale"] = data["rope_scale"]
    if "rope_alpha" in data: model["rope_alpha"] = data["rope_alpha"]
    if "cache_mode" in data: model["cache_mode"] = data["cache_mode"]
    if "chunk_size" in data: model["chunk_size"] = data["chunk_size"]
    if "gpu_split" in data: model["gpu_split"] = data["gpu_split"]
    if "gpu_split_auto" in data: model["gpu_split_auto"] = data["gpu_split_auto"]
    if "draft_enabled" in data: model["draft_enabled"] = data["draft_enabled"]
    if "draft_model_directory" in data: model["draft_model_directory"] = data["draft_model_directory"]
    # Same ordering rule for the draft model defaults
    prepare_draft_model(model)
    if "draft_rope_alpha" in data: model["draft_rope_alpha"] = data["draft_rope_alpha"]
    if "draft_rope_alpha_auto" in data: model["draft_rope_alpha_auto"] = data["draft_rope_alpha_auto"]
    save_models()
    return None
def prepare_draft_model(model):
    """Probe the draft model directory; record its stats and config status."""
    model.setdefault("draft_enabled", False)
    if model["draft_enabled"]:
        prep_draft_config = ExLlamaV2Config()
        prep_draft_config.model_dir = model["draft_model_directory"]
        try:
            prep_draft_config.prepare()
        except Exception as e:
            model["draft_config_status"] = "error"
            model["draft_config_status_error"] = str(e)
            return
        model["draft_config_status"] = "ok"
        model["draft_config_status_error"] = None
        model["draft_stats"] = {
            "hidden_size": prep_draft_config.hidden_size,
            "intermediate_size": prep_draft_config.intermediate_size,
            "num_attention_heads": prep_draft_config.num_attention_heads,
            "num_key_value_heads": prep_draft_config.num_key_value_heads,
            "num_hidden_layers": prep_draft_config.num_hidden_layers,
            "vocab_size": prep_draft_config.vocab_size,
            "head_dim": prep_draft_config.head_dim,
        }
    model.setdefault("draft_rope_alpha", 1.0)
    model.setdefault("draft_rope_alpha_auto", True)
def prepare_model(model):
    """Probe the model directory; record stats, status and default settings."""
    prep_config = ExLlamaV2Config()
    prep_config.model_dir = model["model_directory"]
    try:
        prep_config.prepare()
    except Exception as e:
        model["config_status"] = "error"
        model["config_status_error"] = str(e)
        return
    model["config_status"] = "ok"
    model["config_status_error"] = None
    model["stats"] = {
        "hidden_size": prep_config.hidden_size,
        "intermediate_size": prep_config.intermediate_size,
        "num_attention_heads": prep_config.num_attention_heads,
        "num_key_value_heads": prep_config.num_key_value_heads,
        "num_hidden_layers": prep_config.num_hidden_layers,
        "vocab_size": prep_config.vocab_size,
        "head_dim": prep_config.head_dim,
    }
    model["default_seq_len"] = prep_config.max_seq_len
    # Only fill in settings the user has not set explicitly
    model.setdefault("seq_len", prep_config.max_seq_len)
    model.setdefault("rope_scale", prep_config.scale_pos_emb)
    model.setdefault("rope_alpha", prep_config.scale_alpha_value)
    model.setdefault("cache_mode", "FP16")
    model.setdefault("chunk_size", prep_config.max_input_len)
    model.setdefault("gpu_split", "")
    model.setdefault("gpu_split_auto", True)
class ModelContainer:
    """
    Owns one loaded model: configs, weights, cache(s), tokenizer and the
    streaming generator, built from a model registry entry.
    """

    # Populated by __init__/load(), torn down by unload()
    config: ExLlamaV2Config or None = None
    draft_config: ExLlamaV2Config or None = None
    model: ExLlamaV2 or None = None
    draft_model: ExLlamaV2 or None = None
    cache: ExLlamaV2Cache or None = None
    draft_cache: ExLlamaV2Cache or None = None
    tokenizer: ExLlamaV2Tokenizer or None = None
    generator: ExLlamaV2StreamingGenerator or None = None
    model_dict = None
    cache_fp8: bool = False
    draft_enabled: bool = False

    def __init__(self, model, progress_callback = None):
        """Prepare configs from a registry entry; does not load any weights."""
        self.model_dict = model

        self.config = ExLlamaV2Config()
        self.config.model_dir = model["model_directory"]
        self.config.prepare()
        self.config.max_seq_len = model["seq_len"]
        self.config.scale_pos_emb = model["rope_scale"]
        self.config.scale_alpha_value = model["rope_alpha"]
        self.config.max_input_len = model["chunk_size"]
        self.config.max_attn_size = model["chunk_size"] ** 2

        self.draft_enabled = self.model_dict["draft_enabled"] if "draft_enabled" in self.model_dict else False
        if self.draft_enabled:
            self.draft_config = ExLlamaV2Config()
            self.draft_config.model_dir = model["draft_model_directory"]
            self.draft_config.prepare()
            self.draft_config.max_seq_len = model["seq_len"]
            alpha = model["draft_rope_alpha"]
            if model["draft_rope_alpha_auto"]:
                # Empirical NTK-alpha curve fitted to the context extension ratio
                ratio = self.config.max_seq_len / self.draft_config.max_seq_len
                alpha = -0.13436 + 0.80541 * ratio + 0.28833 * ratio ** 2
                print(f" -- Applying draft model auto RoPE alpha = {alpha:.4f}")
            self.draft_config.scale_alpha_value = alpha
            self.draft_config.max_input_len = model["chunk_size"]
            self.draft_config.max_attn_size = model["chunk_size"] ** 2

    def load(self, progress_callback = None):
        """
        Generator: load tokenizer, optional draft model, main model, cache(s)
        and the streaming generator, yielding JSON progress packets.
        """
        if self.model_dict["cache_mode"] == "FP8": self.cache_fp8 = True
        elif self.model_dict["cache_mode"] == "FP16": self.cache_fp8 = False
        else: raise ValueError("bad cache_mode: " + self.model_dict["cache_mode"])

        self.tokenizer = ExLlamaV2Tokenizer(self.config)

        # Load draft model
        if self.draft_enabled:
            self.draft_model = ExLlamaV2(self.draft_config)
            print("Loading draft model: " + self.draft_config.model_dir)
            self.draft_cache = ExLlamaV2Cache(self.draft_model, lazy = True)
            reserve = [96 * 1024**2] + [0] * 16
            yield from self.draft_model.load_autosplit_gen(self.draft_cache, reserve_vram = reserve, last_id_only = True, callback_gen = progress_callback)

            # Test VRAM allocation with a full-length forward pass
            # (bugfix: previously passed self.cache, which is still None here;
            # the draft model must be tested against its own cache)
            input_ids = torch.zeros((1, self.config.max_input_len), dtype = torch.long)
            self.draft_model.forward(input_ids, cache = self.draft_cache, preprocess_only = True)

        # Load model
        self.model = ExLlamaV2(self.config)
        print("Loading model: " + self.config.model_dir)

        # Parse the manual GPU split unless auto split is requested
        if self.model_dict["gpu_split_auto"]:
            auto_split = True
        elif self.model_dict["gpu_split"] is None or self.model_dict["gpu_split"].strip() == "":
            auto_split = False
            split = None
        else:
            auto_split = False
            split = [float(alloc) for alloc in self.model_dict["gpu_split"].split(",")]

        if not auto_split:
            for value in self.model.load_gen(split, callback_gen = progress_callback):
                if isinstance(value, str):
                    yield value

        if self.cache_fp8:
            self.cache = ExLlamaV2Cache_8bit(self.model, lazy = auto_split)
        else:
            self.cache = ExLlamaV2Cache(self.model, lazy = auto_split)

        if auto_split:
            reserve = [96 * 1024**2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            yield from self.model.load_autosplit_gen(self.cache, reserve_vram = reserve, last_id_only = True, callback_gen = progress_callback)

            # Test VRAM allocation with a full-length forward pass
            input_ids = torch.zeros((1, self.config.max_input_len), dtype = torch.long)
            self.model.forward(input_ids, cache = self.cache, preprocess_only = True)

        # Create generator
        self.generator = ExLlamaV2StreamingGenerator(self.model, self.cache, self.tokenizer, self.draft_model, self.draft_cache)

    def get_free_vram(self):
        """Per-device free VRAM in bytes, minus the auto-split reserve."""
        nvmlInit()
        free_vram = []
        for i in range(torch.cuda.device_count()):
            handle = nvmlDeviceGetHandleByIndex(i)
            info = nvmlDeviceGetMemoryInfo(handle)
            free_vram.append(info.free - auto_split_reserve_bytes)
        return free_vram

    def get_uuid(self):
        """UUID of the registry entry this container was built from."""
        return self.model_dict["model_uuid"]

    def unload(self):
        """Release weights and caches; safe to call more than once."""
        if self.model: self.model.unload()
        self.model = None
        # bugfix: also release the draft model and generator so their VRAM
        # is freed with the container, not left to the GC
        if self.draft_model: self.draft_model.unload()
        self.draft_model = None
        self.draft_cache = None
        self.generator = None
        self.config = None
        self.cache = None
        self.tokenizer = None
def stream_progress(module, num_modules):
    """Yield one newline-terminated JSON progress packet for the client."""
    packet = {
        "result": "progress",
        "module": module,
        "num_modules": num_modules
    }
    yield json.dumps(packet) + "\n"
# The currently loaded model container, if any
loaded_model: ModelContainer or None = None

def get_loaded_model():
    """Return the active ModelContainer, or None when nothing is loaded."""
    return loaded_model
def load_model(data):
    """
    Generator: unload any current model, then load the model identified by
    data["model_uuid"], streaming progress packets as JSON lines.

    Ends with a {"result": "ok"} packet, or {"result": "fail", "error": ...}
    if construction/loading raised.
    """
    global models, loaded_model

    # Free the previous model before allocating the new one
    if loaded_model is not None:
        loaded_model.unload()
        loaded_model = None
        gc.collect()
        torch.cuda.empty_cache()

    i = data["model_uuid"]
    model = models[i]

    try:
        loaded_model = ModelContainer(model)
        yield from loaded_model.load(progress_callback = stream_progress)
        success = True
    except Exception as e:
        # Report the failure to the client instead of breaking the stream
        loaded_model = None
        errormsg = type(e).__name__ + ":\n"
        errormsg += str(e)
        success = False

    if not success:
        # Reclaim whatever the partial load allocated
        gc.collect()
        torch.cuda.empty_cache()
        result = { "result": "fail", "error": errormsg }
        # print(json.dumps(result) + "\n")
        yield json.dumps(result) + "\n"
        return ""

    result = { "result": "ok" }
    # print(json.dumps(result) + "\n")
    yield json.dumps(result) + "\n"
def unload_model():
    """Unload the active model (if any), reclaim VRAM, and report success."""
    global loaded_model
    if loaded_model is not None:
        loaded_model.unload()
        loaded_model = None
        gc.collect()
        torch.cuda.empty_cache()
    return { "result": "ok" }

164
backend/prompts.py Normal file
View File

@@ -0,0 +1,164 @@
class PromptFormat:
    """
    Base class for chat prompt templates. Subclasses define how prompts,
    responses and the system prompt are assembled, plus stop conditions.
    """

    # Default display names; sessions may override via their role settings
    botname = "Chatbort"
    username = "User"

    def __init__(self):
        pass

    def format(self, prompt, response, system_prompt, settings):
        """Return the formatted text for one prompt/response turn."""
        raise NotImplementedError

    def stop_conditions(self, tokenizer, settings):
        """Return the stop strings / token IDs for generation."""
        raise NotImplementedError

    def is_instruct(self):
        """True for instruct-style templates, False for raw chatlog mode."""
        raise NotImplementedError

    def encode_special_tokens(self):
        # Most formats rely on special tokens (e.g. <s>) being encoded
        return True
class PromptFormat_raw(PromptFormat):
    """Model-agnostic raw chatlog mode; no instruct template is applied."""

    description = "Model-agnostic mode simulating a raw chatlog between two or more users"

    def __init__(self):
        super().__init__()

    def is_instruct(self):
        return False

    def stop_conditions(self, tokenizer, settings):
        # Raw mode builds its stop conditions in the session instead
        raise NotImplementedError

    def format(self, prompt, response, system_prompt, settings):
        # Raw mode assembles the context directly from the chat history
        raise NotImplementedError

    def encode_special_tokens(self):
        return False

    # def default_system_prompt(self):
    #     return \
    #     f"""This is a conversation between a helpful AI assistant named {self.botname} and a """ + \
    #     (f"""user named {self.username}.""" if self.username != "User" else """user.""")
    #
    # def first_prompt(self):
    #     return \
    #     f"""<|system_prompt|>\n{self.username}: <|user_prompt|>\n{self.botname}:"""
    #
    # def subs_prompt(self):
    #     return \
    #     f"""{self.username}: <|user_prompt|>\n{self.botname}:"""
    #
    # def stop_conditions(self, tokenizer):
    #     return \
    #     [self.username + ":",
    #      self.username[0:1] + ":",
    #      self.username.upper() + ":",
    #      self.username.lower() + ":",
    #      tokenizer.eos_token_id]
    #
    # def encoding_options(self):
    #     return False, False, False
    #
    # def print_bot_name(self):
    #     return True
class PromptFormat_llama(PromptFormat):
    """Llama/Llama2-chat and Mistral-instruct [INST] template."""

    description = "Llama-chat, Llama2-chat and Mistral-instruct models"

    def __init__(self):
        super().__init__()

    def is_instruct(self):
        return True

    def stop_conditions(self, tokenizer, settings):
        return [tokenizer.eos_token_id]

    def format(self, prompt, response, system_prompt, settings):
        parts = ["<s>[INST] "]
        if system_prompt:
            parts.append("<<SYS>>\n")
            parts.append(system_prompt)
            parts.append("\n<</SYS>>\n\n ")
        parts.append(prompt)
        parts.append(" [/INST]")
        if response:
            parts.append(response)
            parts.append("</s>")
        return "".join(parts)

# class PromptFormat_codellama(PromptFormat_llama):
#
#     description = "CodeLlama-instruct"
#
#     def __init__(self):
#         super().__init__()
#         pass
#
#     def default_system_prompt(self):
#         return \
#         """You are a helpful coding assistant. Always answer as helpfully as possible."""
class PromptFormat_chatml(PromptFormat):
    """ChatML template (<|im_start|> / <|im_end|> delimited turns)."""

    description = "ChatML format, as used by e.g. (Mistral)Orca"

    def __init__(self):
        super().__init__()

    def is_instruct(self):
        return True

    def stop_conditions(self, tokenizer, settings):
        return [tokenizer.eos_token_id,
                """<|im_end|>"""]

    def format(self, prompt, response, system_prompt, settings):
        segments = []
        if system_prompt:
            segments += ["<|im_start|>system\n", system_prompt, "\n<|im_end|>\n"]
        segments += ["<|im_start|>user\n", prompt, "<|im_end|>\n"]
        segments.append("<|im_start|>assistant\n")
        if response:
            segments += [response, "<|im_end|>\n"]
        return "".join(segments)
class PromptFormat_tinyllama(PromptFormat_chatml):
    """ChatML template, but with special tokens left unencoded."""

    description = "ChatML format, but ignoring special/added tokens. Use for TinyLlama-chat v0.3"

    def encode_special_tokens(self):
        return False
# Registry of available prompt formats, keyed by UI display name
prompt_formats = {
    "Chat-RP": PromptFormat_raw,
    "Llama-chat": PromptFormat_llama,
    "ChatML": PromptFormat_chatml,
    "TinyLlama-chat": PromptFormat_tinyllama
}

def list_prompt_formats():
    """Return the display names of all registered prompt formats."""
    return list(prompt_formats.keys())

456
backend/sessions.py Normal file
View File

@@ -0,0 +1,456 @@
import json, uuid, os, gc, glob, time
import torch
from exllamav2 import (
ExLlamaV2,
ExLlamaV2Config,
ExLlamaV2Cache,
ExLlamaV2Cache_8bit,
ExLlamaV2Tokenizer,
)
from exllamav2.generator import (
ExLlamaV2StreamingGenerator,
ExLlamaV2Sampler
)
from exllamav2.generator.filters import (
ExLlamaV2SelectFilter
)
from backend.config import set_config_dir, global_state, config_filename
from backend.models import get_loaded_model
from backend.prompts import prompt_formats
from backend.util import MultiTimer
# Cache of known sessions: uuid -> (name, filename). None until first scan.
session_list: dict or None = None

# The active Session instance, or None
current_session = None

# List sessions

def list_sessions():
    """
    Return ({session_uuid: name}, uuid of the active session or None),
    scanning the config dir for session_*.json on first call and caching.
    """
    global session_list
    if session_list is None:
        session_list = {}
        s_pattern = config_filename("session_*.json")
        for s_file in sorted(glob.glob(s_pattern), key = os.path.getctime):
            with open(s_file, "r") as s:
                j = json.load(s)
            session_list[j["session_uuid"]] = (j["name"], s_file)
    sl = { k: v[0] for k, v in session_list.items() }
    return sl, current_session.session_uuid if current_session is not None else None
# Session

def get_session():
    """Return the active Session, or None."""
    return current_session
def set_session(data):
    """
    Load the session identified by data["session_uuid"] and make it current.

    The new session only replaces the current one after load() succeeds, so a
    missing/corrupt file no longer leaves a half-initialized current session.
    """
    global current_session
    session = Session(data["session_uuid"])
    session.load()
    current_session = session
    return current_session.to_json()
def new_session():
    """Create, persist and activate a fresh session; returns its JSON form."""
    global current_session, session_list
    current_session = Session()
    current_session.init_new()
    filename = current_session.save()
    # session_list is None until list_sessions() has scanned the config dir;
    # initialize it so a direct new_session() call cannot crash on None
    if session_list is None: session_list = {}
    session_list[current_session.session_uuid] = (current_session.name, filename)
    print(f"Created session {current_session.session_uuid}")
    return current_session.to_json()
def delete_session(d_session):
    """Remove a session file and registry entry; clear it if it was current."""
    global current_session, session_list
    if d_session in session_list:
        filename = session_list.pop(d_session)[1]
        os.remove(filename)
    if current_session is not None and current_session.session_uuid == d_session:
        current_session = None
def get_default_session_settings():
    """Return a fresh dict of default settings for a new session."""
    return {
        "prompt_format": "Chat-RP",
        "roles": [ "User", "Assistant", "", "", "", "", "", "" ],
        "system_prompt_default": True,
        "system_prompt": "This is a chat between a curious user and a helpful AI assistant.",
        "maxtokens": 1024,      # max tokens generated per response
        "chunktokens": 512,     # context budget reserved for generation
        "stop_newline": False,
        "temperature": 0.8,
        "top_k": 50,
        "top_p": 0.8,
        "typical": 0.0,
        "repp": 1.15,           # repetition penalty
        "repr": 1024,           # repetition penalty range
        "repd": 512,            # repetition penalty decay
    }
class Session:
    """
    One chat session: metadata, message history, per-session settings and the
    generation loop. Persisted as session_<uuid>.json in the config directory.
    """

    name: str = None
    session_uuid: str = None
    history: list or None = None    # list of block dicts: block_uuid/author/text
    settings: dict or None = None
    mode: str

    def __init__(self, session_uuid = None):
        self.session_uuid = session_uuid

    def filename(self):
        """Path of this session's JSON file."""
        return config_filename("session_" + self.session_uuid + ".json")

    def init_new(self):
        """Initialize as a fresh, unnamed session with default settings."""
        self.name = "Unnamed session"
        self.session_uuid = str(uuid.uuid4())
        self.history = []
        self.mode = ""
        self.settings = get_default_session_settings()

    def to_json(self):
        """Serialize to a plain dict."""
        j = {}
        j["session_uuid"] = self.session_uuid
        j["name"] = self.name
        j["history"] = self.history
        j["mode"] = self.mode
        j["settings"] = self.settings
        return j

    def from_json(self, j):
        """Populate from a dict as produced by to_json()."""
        self.name = j["name"]
        self.session_uuid = j["session_uuid"]
        self.history = j["history"]
        self.mode = j["mode"]
        # Older session files may predate the settings key
        self.settings = j.get("settings", get_default_session_settings())

    def load(self):
        """Read this session's file from disk."""
        print(f"Loading session: {self.filename()}")
        with open(self.filename(), "r") as s:
            j = json.load(s)
        self.from_json(j)

    def save(self):
        """Write the session to disk; returns the filename."""
        print(f"Saving session: {self.filename()}")
        jd = json.dumps(self.to_json(), indent = 4)
        with open(self.filename(), "w") as outfile:
            outfile.write(jd)
        return self.filename()

    def update_settings(self, data):
        """Replace the settings dict and persist."""
        self.settings = data
        self.save()

    def user_input(self, data):
        """Append a user message block to the history, persist, return it."""
        prompt_format = prompt_formats[self.settings["prompt_format"]]()
        input_text = data["user_input_text"]
        new_block = {}
        new_block["block_uuid"] = str(uuid.uuid4())
        new_block["author"] = "user"
        # In raw chat mode the username prefix is part of the stored text
        if prompt_format.is_instruct(): prefix = ""
        else: prefix = self.settings["roles"][0] + ": "
        new_block["text"] = prefix + input_text
        self.history.append(new_block)
        self.save()
        return new_block

    def create_context(self, prompt_format, max_len, prefix = ""):
        """
        Build the generation context from the history, dropping the oldest
        turns until the tokenized context fits within max_len tokens.
        Returns (context_str, context_ids).
        """
        if prompt_format.is_instruct():
            prompts = []
            responses = []
            # Create prompt-response pairs, pad in case of multiple prompts or
            # responses in a row
            for h in self.history:
                if h["author"] == "assistant":
                    if len(prompts) == len(responses): prompts.append("")
                    responses.append(h["text"])
                elif h["author"] == "user":
                    if len(prompts) != len(responses): responses.append("")
                    prompts.append(h["text"])
                else:
                    print("Unknown author")
            # Create context until we run out of space
            while True:
                context_str = ""
                for turn in range(len(prompts)):
                    p = prompts[turn]
                    r = responses[turn] if turn < len(responses) else None
                    # System prompt only precedes the first retained turn
                    sp = self.settings["system_prompt"] if context_str == "" else None
                    up_text = prompt_format.format(p, r, sp, self.settings)
                    context_str = context_str + up_text
                context_str += prefix
                context_ids = get_loaded_model().tokenizer.encode(context_str, encode_special_tokens = prompt_format.encode_special_tokens())
                if context_ids.shape[-1] < max_len: return context_str, context_ids
                prompts = prompts[1:]
                responses = responses[1:]
        # Non-instruct format
        else:
            history_copy = self.history
            while True:
                context_str = self.settings["system_prompt"] + "\n" + "\n".join([h["text"] for h in history_copy]) + "\n"
                context_str += prefix
                context_ids = get_loaded_model().tokenizer.encode(context_str, encode_special_tokens = prompt_format.encode_special_tokens())
                if context_ids.shape[-1] < max_len: return context_str, context_ids
                history_copy = history_copy[1:]

    def generate(self, data):
        """
        Generator: produce the assistant response for the current history,
        yielding JSON-line packets (begin_block / prompt_eval /
        stream_to_block / ok or fail). The finished block is appended to the
        history and saved.
        """
        mt = MultiTimer()

        if get_loaded_model() is None:
            packet = { "result": "fail", "error": "No model loaded." }
            yield json.dumps(packet) + "\n"
            return packet

        model = get_loaded_model().model
        generator = get_loaded_model().generator
        tokenizer = get_loaded_model().tokenizer
        cache = get_loaded_model().cache
        prompt_format = prompt_formats[self.settings["prompt_format"]]()

        # Create response block (instruct mode can announce it immediately;
        # raw mode must first generate the bot-name prefix below)
        if prompt_format.is_instruct():
            new_block = {}
            new_block["block_uuid"] = str(uuid.uuid4())
            new_block["author"] = "assistant"
            new_block["text"] = ""
            packet = {}
            packet["result"] = "begin_block"
            packet["block"] = new_block
            yield json.dumps(packet) + "\n"

        # Sampling settings
        gen_settings = ExLlamaV2Sampler.Settings()
        gen_settings.temperature = self.settings["temperature"]
        gen_settings.top_k = self.settings["top_k"]
        gen_settings.top_p = self.settings["top_p"]
        gen_settings.typical = self.settings["typical"]
        gen_settings.token_repetition_penalty = self.settings["repp"]
        gen_settings.token_repetition_range = self.settings["repr"]
        # bugfix: decay was read from settings["repr"]; "repd" is the decay
        # setting (and was otherwise unused)
        gen_settings.token_repetition_decay = self.settings["repd"]

        # Temperature of zero means greedy sampling
        if gen_settings.temperature == 0:
            gen_settings.temperature = 1.0
            gen_settings.top_k = 1
            gen_settings.top_p = 0
            gen_settings.typical = 0

        if prompt_format.is_instruct():
            generator.set_stop_conditions(prompt_format.stop_conditions(tokenizer, self.settings))
        else:
            if self.settings["stop_newline"]:
                generator.set_stop_conditions(["\n"])
            else:
                # Stop whenever any role label variant begins a new line
                stop = set()
                for r in self.settings["roles"]:
                    if r.strip() != "":
                        stop.add("\n" + r + ":")
                        stop.add("\n " + r + ":")
                        stop.add("\n" + r.upper() + ":")
                        stop.add("\n " + r.upper() + ":")
                        stop.add("\n" + r.lower() + ":")
                        stop.add("\n " + r.lower() + ":")
                generator.set_stop_conditions(list(stop) + [tokenizer.eos_token_id])

        # Begin response
        generated_tokens = 0
        max_new_tokens = self.settings["maxtokens"]
        chunk_tokens = 0
        last_chunk_time = time.time()
        full_response = ""  # TODO: Preload response
        # bugfix: was dtype = torch.bool; these are token IDs concatenated
        # with long tensors below (only worked via cat type promotion)
        save_tokens = torch.empty((1, 0), dtype = torch.long)
        chunk_buffer = ""

        # If not in instruct mode, generate bot name prefix
        prefix = ""
        if not prompt_format.is_instruct():
            bot_roles = []
            for r in self.settings["roles"][1:]:
                if r.strip() != "": bot_roles.append(r + ":")
            assert len(bot_roles) >= 1
            past_tokens = model.config.max_seq_len - self.settings["chunktokens"] - save_tokens.shape[-1]
            context_str, context_ids = self.create_context(prompt_format, past_tokens)
            # Constrain the first tokens to one of the bot role labels
            gen_settings.filters = [ExLlamaV2SelectFilter(model, tokenizer, bot_roles, case_insensitive = False)]
            mt.set_stage("prompt")
            generator.begin_stream(context_ids, gen_settings, token_healing = False)
            mt.stop()
            mt.set_stage("gen")
            while True:
                chunk, eos, tokens = generator.stream()
                prefix += chunk
                if eos: break
            mt.stop()
            gen_settings.filters = []

            # Begin block with bot name prefix
            new_block = {}
            new_block["block_uuid"] = str(uuid.uuid4())
            new_block["author"] = "assistant"
            new_block["text"] = prefix
            packet = {}
            packet["result"] = "begin_block"
            packet["block"] = new_block
            yield json.dumps(packet) + "\n"

        # Stream response
        mt.set_stage("gen")
        while True:

            # (Re)tokenize the context whenever the cache budget runs out
            if chunk_tokens == 0:
                packet = {}
                packet["result"] = "prompt_eval"
                packet["block_uuid"] = new_block["block_uuid"]
                yield json.dumps(packet) + "\n"
                past_tokens = model.config.max_seq_len - self.settings["chunktokens"] - save_tokens.shape[-1]
                context_str, context_ids = self.create_context(prompt_format, past_tokens, prefix)
                context_ids = torch.cat((context_ids, save_tokens), dim = -1)
                mt.set_stage("prompt")
                generator.begin_stream(context_ids, gen_settings, token_healing = False)
                chunk_tokens = model.config.max_seq_len - context_ids.shape[-1] - 1
                mt.set_stage("gen")

            chunk, eos, tokens = generator.stream()
            save_tokens = torch.cat((save_tokens, tokens), dim = -1)
            generated_tokens += 1
            chunk_tokens -= 1
            chunk_buffer += chunk

            # Flush the chunk buffer at most ~20 times per second
            now = time.time()
            elapsed = now - last_chunk_time
            if chunk_buffer != "" and (elapsed > 0.05 or eos or generated_tokens == max_new_tokens):
                packet = {}
                packet["result"] = "stream_to_block"
                packet["block_uuid"] = new_block["block_uuid"]
                packet["text"] = chunk_buffer
                yield json.dumps(packet) + "\n"
                full_response += chunk_buffer
                chunk_buffer = ""
                last_chunk_time = now

            if eos or generated_tokens == max_new_tokens: break

        # Compile metadata
        mt.stop()
        meta = {}
        meta["prompt_tokens"] = context_ids.shape[-1]
        meta["prompt_speed"] = context_ids.shape[-1] / (mt.stages["prompt"] + 1e-8)
        meta["gen_tokens"] = generated_tokens
        meta["gen_speed"] = generated_tokens / (mt.stages["gen"] + 1e-8)
        meta["overflow"] = max_new_tokens if generated_tokens == max_new_tokens else 0
        new_block["meta"] = meta

        # Save response block
        new_block["text"] = prefix + full_response.rstrip()
        self.history.append(new_block)
        self.save()

        # Done
        packet = { "result": "ok", "new_block": new_block }
        yield json.dumps(packet) + "\n"
        return packet

    def rename(self, data):
        """Rename the session and update the registry entry."""
        global session_list
        if "session_uuid" in data:
            assert data["session_uuid"] == self.session_uuid
        session_list[self.session_uuid] = (data["new_name"], session_list[self.session_uuid][1])
        self.name = data["new_name"]
        self.save()

    def delete_block(self, block_uuid):
        """Remove one history block by UUID and persist."""
        print(f"Deleting block: {block_uuid}")
        for h in self.history:
            if h["block_uuid"] == block_uuid:
                self.history.remove(h)
                break
        self.save()

    def edit_block(self, block):
        """Replace a history block (matched by block_uuid) and persist."""
        block_uuid = block['block_uuid']
        print(f"Editing block: {block_uuid}")
        for i in range(len(self.history)):
            if self.history[i]["block_uuid"] == block_uuid:
                self.history[i] = block
                break
        self.save()

26
backend/util.py Normal file
View File

@@ -0,0 +1,26 @@
import time
class MultiTimer:
    """
    Accumulates wall-clock time per named stage.

    Call set_stage(name) at each transition; the time elapsed since the last
    transition is credited to the stage that was previously active. stop()
    closes out the current stage.
    """

    def __init__(self):
        self.current_stage = ""
        self.stages = {}            # stage name -> accumulated seconds
        self.last = time.time()

    def set_stage(self, stage):
        """Switch to *stage*, crediting elapsed time to the previous stage."""
        now = time.time()
        elapsed = now - self.last
        self.last = now
        prev = self.current_stage
        self.current_stage = stage
        # bugfix: the idle "" pseudo-stage is no longer recorded in stages
        if prev != "":
            self.stages[prev] = self.stages.get(prev, 0.0) + elapsed

    def stop(self):
        """Close out the current stage."""
        self.set_stage("")

BIN
doc/icon.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 73 KiB

220
server.py Normal file
View File

@@ -0,0 +1,220 @@
import sys, os, json, argparse
from threading import Timer, Lock
from flask import Flask, render_template, request
from flask import Response, stream_with_context
from waitress import serve
import webbrowser
import torch

from backend.models import update_model, load_models, get_model_info, list_models, remove_model, load_model, unload_model
from backend.config import set_config_dir, global_state
from backend.sessions import list_sessions, set_session, get_session, get_default_session_settings, new_session, delete_session
from backend.prompts import list_prompt_formats

# Flask app serving both the static UI and the JSON API
app = Flask("ExUI")
app.static_folder = 'static'

# Serializes all API requests; the backend (model/session state) is not thread-safe
api_lock = Lock()

parser = argparse.ArgumentParser(description="ExUI, chatbot UI for ExLlamaV2")
parser.add_argument("-host", "--host", type = str, help = "IP:PORT eg, 0.0.0.0:5000", default = "localhost:5000")
parser.add_argument("-d", "--dir", type = str, help = "Location for user data and sessions, default: ~/exui", default = "~/exui")
args = parser.parse_args()
@app.route("/api/delete_block", methods=['POST'])
def api_delete_block():
    """Delete one chat block from the current session."""
    with api_lock:
        payload = request.get_json()
        get_session().delete_block(payload["block_uuid"])
        return json.dumps({ "result": "ok" }) + "\n"
@app.route("/api/edit_block", methods=['POST'])
def api_edit_block():
    """Replace an edited chat block in the current session."""
    with api_lock:
        payload = request.get_json()
        get_session().edit_block(payload["block"])
        return json.dumps({ "result": "ok" }) + "\n"
@app.route("/api/list_prompt_formats")
def api_glist_prompt_formats():
    """List the names of all available prompt formats."""
    with api_lock:
        resp = { "result": "ok", "prompt_formats": list_prompt_formats() }
        return json.dumps(resp) + "\n"
@app.route("/api/generate", methods=['POST'])
def api_generate():
    """Stream a generated response for the current session as JSON lines."""
    with api_lock:
        payload = request.get_json()
        stream = get_session().generate(payload)
        return Response(stream_with_context(stream), mimetype = 'application/json')
@app.route("/api/get_default_settings")
def api_get_default_settings():
    """Return default session settings plus the available prompt formats."""
    with api_lock:
        resp = {
            "result": "ok",
            "settings": get_default_session_settings(),
            "prompt_formats": list_prompt_formats()
        }
        return json.dumps(resp) + "\n"
@app.route("/api/update_settings", methods=['POST'])
def api_update_settings():
    """Overwrite the current session's settings and persist them."""
    with api_lock:
        payload = request.get_json()
        get_session().update_settings(payload["settings"])
        return json.dumps({ "result": "ok" }) + "\n"
@app.route("/api/user_input", methods=['POST'])
def api_user_input():
    """Append a user message to the current session; return the new block."""
    with api_lock:
        payload = request.get_json()
        block = get_session().user_input(payload)
        return json.dumps({ "result": "ok", "new_block": block }) + "\n"
@app.route("/api/list_sessions")
def api_list_sessions():
    """List stored sessions and identify the active one."""
    with api_lock:
        sessions, current = list_sessions()
        resp = { "result": "ok", "sessions": sessions, "current_session": current }
        return json.dumps(resp) + "\n"
@app.route("/api/new_session", methods=['POST'])
def api_new_session():
    """Create a new session, optionally applying initial settings and a name."""
    with api_lock:
        payload = request.get_json()
        session = new_session()
        if "settings" in payload: get_session().update_settings(payload["settings"])
        if "new_name" in payload: get_session().rename(payload)
        return json.dumps({ "result": "ok", "session": session }) + "\n"
@app.route("/api/rename_session", methods=['POST'])
def api_rename_session():
    """Rename the current session."""
    with api_lock:
        payload = request.get_json()
        get_session().rename(payload)
        return json.dumps({ "result": "ok" }) + "\n"
@app.route("/api/delete_session", methods=['POST'])
def api_delete_session():
    """Delete a stored session by UUID."""
    with api_lock:
        payload = request.get_json()
        delete_session(payload["session_uuid"])
        return json.dumps({ "result": "ok" }) + "\n"
@app.route("/api/set_session", methods=['POST'])
def api_set_session():
    """Activate a stored session and return its contents."""
    with api_lock:
        payload = request.get_json()
        session = set_session(payload)
        if session is None:
            resp = { "result": "fail" }
        else:
            resp = {
                "result": "ok",
                "session": session,
                "prompt_formats": list_prompt_formats()
            }
        return json.dumps(resp) + "\n"
@app.route("/api/list_models")
def api_list_models():
    """List configured models and identify the currently loaded one."""
    with api_lock:
        model_names, current = list_models()
        resp = { "result": "ok", "models": model_names, "current_model": current }
        return json.dumps(resp) + "\n"
@app.route("/api/update_model", methods=['POST'])
def api_update_model():
    """Create or update a model config; returns the new UUID when created."""
    with api_lock:
        payload = request.get_json()
        new_uuid = update_model(payload)
        return json.dumps({ "result": "ok", "new_model_uuid": new_uuid }) + "\n"
@app.route("/api/remove_model", methods=['POST'])
def api_remove_model():
    """Delete a model config."""
    with api_lock:
        payload = request.get_json()
        remove_model(payload)
        return json.dumps({ "result": "ok" }) + "\n"
@app.route("/api/load_model", methods=['POST'])
def api_load_model():
    """Load a model, streaming progress packets as JSON lines."""
    with api_lock:
        payload = request.get_json()
        return Response(stream_with_context(load_model(payload)), mimetype = 'application/json')
@app.route("/api/unload_model")
def api_unload_model():
    """Unload the active model."""
    with api_lock:
        return json.dumps(unload_model()) + "\n"
@app.route("/api/get_model_info", methods=['POST'])
def api_get_model_info():
    """Return the stored config for one model, or fail if unknown."""
    with api_lock:
        payload = request.get_json()
        info = get_model_info(payload)
        if info:
            resp = { "result": "ok", "model_info": info }
        else:
            resp = { "result": "fail" }
        return json.dumps(resp) + "\n"
@app.route("/")
def home():
    """Serve the single-page UI."""
    with api_lock:
        return render_template("index.html")
# Prepare torch
# torch.cuda._lazy_init()

# Prepare config: point the backend at the user dir, then restore saved state and models
print(f" -- User dir: {args.dir}")
set_config_dir(args.dir)
global_state.load()
load_models()

# Start server
machine = args.host
# rsplit on the last colon so a host that itself contains colons (e.g. an IPv6
# literal) still parses; plain split(":") would raise ValueError on extra colons.
host, port = machine.rsplit(":", 1)
if host == "localhost":
    # Give the server a moment to start before pointing the browser at it
    Timer(1, lambda: webbrowser.open(f'http://{machine}/')).start()
# int() fails fast with a clear error if the port part is not numeric
serve(app, host = host, port = int(port))

1930
static/controls.js vendored Normal file

File diff suppressed because it is too large Load Diff

BIN
static/gfx/avatar_cat.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

BIN
static/gfx/avatar_dog.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

BIN
static/gfx/avatar_frog.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

BIN
static/gfx/avatar_monke.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.9 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 13 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 16 KiB

BIN
static/gfx/favicon.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

BIN
static/gfx/icon_chat.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 15 KiB

BIN
static/gfx/icon_model.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

1044
static/style.css Normal file

File diff suppressed because it is too large Load Diff

57
templates/index.html Normal file
View File

@@ -0,0 +1,57 @@
<!-- ExUI single-page app shell. All element ids below are hooks used by
     static/controls.js and styled by static/style.css — do not rename them. -->
<!DOCTYPE html>
<html>
<head>
    <title>ExUI</title>
    <link rel="stylesheet" type="text/css" href="{{ url_for('static', filename='style.css') }}">
    <link rel="icon" type="image/png" sizes="32x32" href="{{ url_for('static', filename='gfx/favicon.png') }}">
    <!-- Inline SVG symbol definitions, referenced by id from controls.js -->
    {% include 'svg_icons.html' %}
</head>
<body>
    <!-- Markdown renderer for chat output (CDN dependency) -->
    <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
    <!-- Modal overlays shown while the UI is blocked (busy spinner / load progress bar) -->
    <div id="page-disabled">
        <div id="busy" class="busy">
            <p>Please wait...</p>
            <div id="busy-anim"></div>
        </div>
        <div id="loading" class="loading">
            <p>Loading</p>
            <div id="loading-progress-container">
                <div id="loading-progress"></div>
            </div>
        </div>
    </div>
    <!-- Main application layout: left menu plus swappable pages -->
    <div id="page-enabled">
        <div class="vflex">
            <div id="mainmenu" class="mainmenu">
            </div>
            <div id="mainbody" class="mainbody">
                <!-- Model management page (hidden until selected from the menu) -->
                <div id="model-page" class="model-page hidden">
                    <div class="vflex">
                        <div id="model-list" class="model-list"></div>
                        <div id="model-view" class="model-view"></div>
                    </div>
                </div>
                <!-- Chat page: session list, chat history + input box, and settings pane -->
                <div id="chat-page" class="chat-page hidden">
                    <div class="vflex">
                        <div class="hflex">
                            <div id="session-list" class="session-list"></div>
                            <div id="session-list-controls" class="session-list-controls"></div>
                        </div>
                        <div class="hflex">
                            <div id="session-view" class="session-view">
                                <div id="session-view-history" class="session-view-history"></div>
                                <div class="session-input-surround">
                                    <textarea id="session-input" class="session-input" placeholder="Type here..." rows="1"></textarea>
                                </div>
                            </div>
                        </div>
                        <div id="session-settings" class="session-settings">
                        </div>
                    </div>
                </div>
            </div>
        </div>
    </div>
    <!-- Application logic; loaded last so all ids above exist -->
    <script src="{{ url_for('static', filename='controls.js') }}"></script>
</body>
</html>

58
templates/svg_icons.html Normal file
View File

@@ -0,0 +1,58 @@
<svg style="display: none;">
<defs>
<symbol id="pencil-icon" viewBox="0 0 16 16">
<path d="M16.84,2.73C16.45,2.73 16.07,2.88 15.77,3.17L13.65,5.29L18.95,10.6L21.07,8.5C21.67,7.89 21.67,6.94 21.07,6.36L17.9,3.17C17.6,2.88 17.22,2.73 16.84,2.73M12.94,6L4.84,14.11L7.4,14.39L7.58,16.68L9.86,16.85L10.15,19.41L18.25,11.3M4.25,15.04L2.5,21.73L9.2,19.94L8.96,17.78L6.65,17.61L6.47,15.29"/>
</symbol>
</defs>
</svg>
<svg style="display: none;">
<defs>
<symbol id="delete-icon" viewBox="0 0 24 24">
<path d="M9,3V4H4V6H5V19C5,20.1 5.9,21 7,21H17C18.1,21 19,20.1 19,19V6H20V4H15V3H9M7,6H17V19H7V6M9,8V17H11V8H9M13,8V17H15V8H13Z"/>
</symbol>
</defs>
</svg>
<svg style="display: none;">
<defs>
<symbol id="model-icon" viewBox="0 0 24 24">
<path d="M12.414 5H21a1 1 0 0 1 1 1v14a1 1 0 0 1-1 1H3a1 1 0 0 1-1-1V4a1 1 0 0 1 1-1h7.414l2 2zM20 11H4v8h16v-8zm0-2V7h-8.414l-2-2H4v4h16z"/>
</symbol>
</defs>
</svg>
<svg style="display: none;">
<defs>
<symbol id="model-plus-icon" viewBox="0 0 24 24">
<path d="M12.414 5H21a1 1 0 0 1 1 1v14a1 1 0 0 1-1 1H3a1 1 0 0 1-1-1V4a1 1 0 0 1 1-1h7.414l2 2zM4 5v14h16V7h-8.414l-2-2H4zm7 7V9h2v3h3v2h-3v3h-2v-3H8v-2h3z"/>
</symbol>
</defs>
</svg>
<svg style="display: none;">
<defs>
<symbol id="model-loaded-icon" viewBox="0 0 14 16">
<path d="M0 .984v14.032a1 1 0 0 0 1.506.845l12.006-7.016a.974.974 0 0 0 0-1.69L1.506.139A1 1 0 0 0 0 .984Z"/>
</symbol>
</defs>
</svg>
<svg style="display: none;">
<defs>
<symbol id="session-icon" viewBox="0 0 20 18">
<path d="M18 4H16V9C16 10.0609 15.5786 11.0783 14.8284 11.8284C14.0783 12.5786 13.0609 13 12 13H9L6.846 14.615C7.17993 14.8628 7.58418 14.9977 8 15H11.667L15.4 17.8C15.5731 17.9298 15.7836 18 16 18C16.2652 18 16.5196 17.8946 16.7071 17.7071C16.8946 17.5196 17 17.2652 17 17V15H18C18.5304 15 19.0391 14.7893 19.4142 14.4142C19.7893 14.0391 20 13.5304 20 13V6C20 5.46957 19.7893 4.96086 19.4142 4.58579C19.0391 4.21071 18.5304 4 18 4Z" fill="currentColor"/>
<path d="M12 0H2C1.46957 0 0.960859 0.210714 0.585786 0.585786C0.210714 0.960859 0 1.46957 0 2V9C0 9.53043 0.210714 10.0391 0.585786 10.4142C0.960859 10.7893 1.46957 11 2 11H3V13C3 13.1857 3.05171 13.3678 3.14935 13.5257C3.24698 13.6837 3.38668 13.8114 3.55279 13.8944C3.71889 13.9775 3.90484 14.0126 4.08981 13.996C4.27477 13.9793 4.45143 13.9114 4.6 13.8L8.333 11H12C12.5304 11 13.0391 10.7893 13.4142 10.4142C13.7893 10.0391 14 9.53043 14 9V2C14 1.46957 13.7893 0.960859 13.4142 0.585786C13.0391 0.210714 12.5304 0 12 0Z" fill="currentColor"/>
</symbol>
</defs>
</svg>
<svg style="display: none;">
<defs>
<symbol id="session-new-icon" viewBox="0 0 20 18" aria-hidden="true" fill="none">
<path stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M16 5h2a1 1 0 0 1 1 1v7a1 1 0 0 1-1 1h-2v3l-4-3H8m4-13H2a1 1 0 0 0-1 1v7a1 1 0 0 0 1 1h2v3l4-3h4a1 1 0 0 0 1-1V2a1 1 0 0 0-1-1Z"/>
</symbol>
</defs>
</svg>

After

Width:  |  Height:  |  Size: 3.1 KiB