Initial commit

This commit is contained in:
turboderp
2023-10-29 20:55:38 +01:00
commit a647a51fea
22 changed files with 4396 additions and 0 deletions

42
backend/config.py Normal file
View File

@@ -0,0 +1,42 @@
import sys, os, json
# Directory holding all user data (settings, sessions, model configs).
# Placeholder value; overwritten by set_config_dir() at startup.
config_dir: str = "///"

def set_config_dir(config_dir_):
    """Set the global config directory, expanding '~', creating it if missing."""
    global config_dir
    config_dir = os.path.expanduser(config_dir_)
    # exist_ok avoids the race between the old exists() check and makedirs()
    os.makedirs(config_dir, exist_ok = True)

def config_filename(filename: str):
    """Return the absolute path of *filename* inside the config directory."""
    return os.path.join(config_dir, filename)
class GlobalState:
    """
    Application-wide persisted state, stored as state.json in the config dir.

    NOTE(review): currently a stub -- load() reads the file but discards the
    parsed result, and save() always writes an empty object.
    """

    def __init__(self):
        pass

    def load(self):
        # Read state.json if present; the parsed value `r` is currently unused
        filename = config_filename("state.json")
        if os.path.exists(filename):
            with open(filename, "r") as f:
                r = json.load(f)
        else:
            r = {}

    def save(self):
        # Placeholder: serializes an empty dict until real state exists
        r = {}
        filename = config_filename("state.json")
        r_json = json.dumps(r, indent = 4)
        with open(filename, "w") as outfile:
            outfile.write(r_json)

# Module-level singleton used by the server
global_state = GlobalState()

399
backend/models.py Normal file
View File

@@ -0,0 +1,399 @@
import json, uuid, os, gc
import torch
from pynvml import *
from exllamav2 import(
ExLlamaV2,
ExLlamaV2Config,
ExLlamaV2Cache,
ExLlamaV2Cache_8bit,
ExLlamaV2Tokenizer,
)
from exllamav2.generator import(
ExLlamaV2StreamingGenerator,
ExLlamaV2Sampler
)
from exllamav2.attn import ExLlamaV2Attention
# from exllamav2.util import list_live_tensors
from backend.config import config_filename
# Extra VRAM (bytes) held back per device when auto-splitting across GPUs
auto_split_reserve_bytes = 512 * 1024**2

# Model configurations, keyed by model UUID
models = {}

# Load/save config

def load_models():
    """Read the model registry from models.json, or reset it if absent."""
    global models
    filename = config_filename("models.json")
    if not os.path.exists(filename):
        models = {}
        return
    with open(filename, "r") as f:
        models = json.load(f)
def save_models():
    """Persist the model registry to models.json, pretty-printed."""
    global models
    with open(config_filename("models.json"), "w") as outfile:
        outfile.write(json.dumps(models, indent = 4))
# List models

def list_models():
    """Return ({uuid: display name}, uuid of the loaded model or None)."""
    models_list = { k: v["name"] for k, v in models.items() }
    if loaded_model is not None:
        current_model = loaded_model.get_uuid()
    else:
        current_model = None
    return models_list, current_model
# Get model

def get_model_info(data = None):
    """
    Look up a model config by data["model_uuid"].

    Returns the stored model dict, or None when no request data is given,
    no UUID is present, or the UUID is unknown (previously an uncaught
    KeyError, surfacing as a 500 instead of the API's "fail" response).
    """
    if data is None: return None
    i = data.get("model_uuid")
    if i is None: return None
    return models.get(i)
# Remove model config

def remove_model(data):
    """Delete the entry for data["model_uuid"] (no-op if absent) and save."""
    i = data.get("model_uuid")
    if i is None: return
    # pop() tolerates an already-removed / unknown UUID (was: del -> KeyError)
    models.pop(i, None)
    save_models()
# Update model config

def update_model(data):
    """
    Create or update a model config from an API payload.

    If data["model_uuid"] is None a new entry is created and its freshly
    generated UUID is returned; otherwise the existing entry is updated in
    place and None is returned. The registry is saved to disk in both cases.
    """
    global models
    if data["model_uuid"] is None:
        # New model: assign a UUID, probe the model directory, persist
        new_model = {}
        i = str(uuid.uuid4())
        new_model["model_uuid"] = i
        new_model["name"] = data["name"] or "Unnamed model"
        new_model["model_directory"] = data["model_directory"]
        models[i] = new_model
        prepare_model(new_model)
        save_models()
        return i

    i = data["model_uuid"]
    model = models[i]
    if "name" in data: model["name"] = data["name"]
    if "model_directory" in data: model["model_directory"] = data["model_directory"]
    # prepare_model() only fills in defaults for *missing* keys, so it must
    # run before the explicit per-key overrides below
    prepare_model(model)
    if "seq_len" in data: model["seq_len"] = data["seq_len"]
    if "rope_scale" in data: model["rope_scale"] = data["rope_scale"]
    if "rope_alpha" in data: model["rope_alpha"] = data["rope_alpha"]
    if "cache_mode" in data: model["cache_mode"] = data["cache_mode"]
    if "chunk_size" in data: model["chunk_size"] = data["chunk_size"]
    if "gpu_split" in data: model["gpu_split"] = data["gpu_split"]
    if "gpu_split_auto" in data: model["gpu_split_auto"] = data["gpu_split_auto"]
    if "draft_enabled" in data: model["draft_enabled"] = data["draft_enabled"]
    if "draft_model_directory" in data: model["draft_model_directory"] = data["draft_model_directory"]
    # Same ordering rule for the draft model defaults
    prepare_draft_model(model)
    if "draft_rope_alpha" in data: model["draft_rope_alpha"] = data["draft_rope_alpha"]
    if "draft_rope_alpha_auto" in data: model["draft_rope_alpha_auto"] = data["draft_rope_alpha_auto"]
    save_models()
    return None
def prepare_draft_model(model):
    """Probe the draft model directory; record its stats and config status."""
    model.setdefault("draft_enabled", False)
    if model["draft_enabled"]:
        prep_draft_config = ExLlamaV2Config()
        prep_draft_config.model_dir = model["draft_model_directory"]
        try:
            prep_draft_config.prepare()
        except Exception as e:
            model["draft_config_status"] = "error"
            model["draft_config_status_error"] = str(e)
            return
        model["draft_config_status"] = "ok"
        model["draft_config_status_error"] = None
        model["draft_stats"] = {
            "hidden_size": prep_draft_config.hidden_size,
            "intermediate_size": prep_draft_config.intermediate_size,
            "num_attention_heads": prep_draft_config.num_attention_heads,
            "num_key_value_heads": prep_draft_config.num_key_value_heads,
            "num_hidden_layers": prep_draft_config.num_hidden_layers,
            "vocab_size": prep_draft_config.vocab_size,
            "head_dim": prep_draft_config.head_dim,
        }
    model.setdefault("draft_rope_alpha", 1.0)
    model.setdefault("draft_rope_alpha_auto", True)
def prepare_model(model):
    """Probe the model directory; record stats, status and default settings."""
    prep_config = ExLlamaV2Config()
    prep_config.model_dir = model["model_directory"]
    try:
        prep_config.prepare()
    except Exception as e:
        model["config_status"] = "error"
        model["config_status_error"] = str(e)
        return
    model["config_status"] = "ok"
    model["config_status_error"] = None
    model["stats"] = {
        "hidden_size": prep_config.hidden_size,
        "intermediate_size": prep_config.intermediate_size,
        "num_attention_heads": prep_config.num_attention_heads,
        "num_key_value_heads": prep_config.num_key_value_heads,
        "num_hidden_layers": prep_config.num_hidden_layers,
        "vocab_size": prep_config.vocab_size,
        "head_dim": prep_config.head_dim,
    }
    model["default_seq_len"] = prep_config.max_seq_len
    # Only fill in settings the user has not set explicitly
    model.setdefault("seq_len", prep_config.max_seq_len)
    model.setdefault("rope_scale", prep_config.scale_pos_emb)
    model.setdefault("rope_alpha", prep_config.scale_alpha_value)
    model.setdefault("cache_mode", "FP16")
    model.setdefault("chunk_size", prep_config.max_input_len)
    model.setdefault("gpu_split", "")
    model.setdefault("gpu_split_auto", True)
class ModelContainer:
    """
    Owns one loaded model: configs, weights, cache(s), tokenizer and the
    streaming generator, built from a model registry entry.
    """

    # Populated by __init__/load(), torn down by unload()
    config: ExLlamaV2Config or None = None
    draft_config: ExLlamaV2Config or None = None
    model: ExLlamaV2 or None = None
    draft_model: ExLlamaV2 or None = None
    cache: ExLlamaV2Cache or None = None
    draft_cache: ExLlamaV2Cache or None = None
    tokenizer: ExLlamaV2Tokenizer or None = None
    generator: ExLlamaV2StreamingGenerator or None = None
    model_dict = None
    cache_fp8: bool = False
    draft_enabled: bool = False

    def __init__(self, model, progress_callback = None):
        """Prepare configs from a registry entry; does not load any weights."""
        self.model_dict = model

        self.config = ExLlamaV2Config()
        self.config.model_dir = model["model_directory"]
        self.config.prepare()
        self.config.max_seq_len = model["seq_len"]
        self.config.scale_pos_emb = model["rope_scale"]
        self.config.scale_alpha_value = model["rope_alpha"]
        self.config.max_input_len = model["chunk_size"]
        self.config.max_attn_size = model["chunk_size"] ** 2

        self.draft_enabled = self.model_dict["draft_enabled"] if "draft_enabled" in self.model_dict else False
        if self.draft_enabled:
            self.draft_config = ExLlamaV2Config()
            self.draft_config.model_dir = model["draft_model_directory"]
            self.draft_config.prepare()
            self.draft_config.max_seq_len = model["seq_len"]
            alpha = model["draft_rope_alpha"]
            if model["draft_rope_alpha_auto"]:
                # Empirical NTK-alpha curve fitted to the context extension ratio
                ratio = self.config.max_seq_len / self.draft_config.max_seq_len
                alpha = -0.13436 + 0.80541 * ratio + 0.28833 * ratio ** 2
                print(f" -- Applying draft model auto RoPE alpha = {alpha:.4f}")
            self.draft_config.scale_alpha_value = alpha
            self.draft_config.max_input_len = model["chunk_size"]
            self.draft_config.max_attn_size = model["chunk_size"] ** 2

    def load(self, progress_callback = None):
        """
        Generator: load tokenizer, optional draft model, main model, cache(s)
        and the streaming generator, yielding JSON progress packets.
        """
        if self.model_dict["cache_mode"] == "FP8": self.cache_fp8 = True
        elif self.model_dict["cache_mode"] == "FP16": self.cache_fp8 = False
        else: raise ValueError("bad cache_mode: " + self.model_dict["cache_mode"])

        self.tokenizer = ExLlamaV2Tokenizer(self.config)

        # Load draft model
        if self.draft_enabled:
            self.draft_model = ExLlamaV2(self.draft_config)
            print("Loading draft model: " + self.draft_config.model_dir)
            self.draft_cache = ExLlamaV2Cache(self.draft_model, lazy = True)
            reserve = [96 * 1024**2] + [0] * 16
            yield from self.draft_model.load_autosplit_gen(self.draft_cache, reserve_vram = reserve, last_id_only = True, callback_gen = progress_callback)

            # Test VRAM allocation with a full-length forward pass
            # (bugfix: previously passed self.cache, which is still None here;
            # the draft model must be tested against its own cache)
            input_ids = torch.zeros((1, self.config.max_input_len), dtype = torch.long)
            self.draft_model.forward(input_ids, cache = self.draft_cache, preprocess_only = True)

        # Load model
        self.model = ExLlamaV2(self.config)
        print("Loading model: " + self.config.model_dir)

        # Parse the manual GPU split unless auto split is requested
        if self.model_dict["gpu_split_auto"]:
            auto_split = True
        elif self.model_dict["gpu_split"] is None or self.model_dict["gpu_split"].strip() == "":
            auto_split = False
            split = None
        else:
            auto_split = False
            split = [float(alloc) for alloc in self.model_dict["gpu_split"].split(",")]

        if not auto_split:
            for value in self.model.load_gen(split, callback_gen = progress_callback):
                if isinstance(value, str):
                    yield value

        if self.cache_fp8:
            self.cache = ExLlamaV2Cache_8bit(self.model, lazy = auto_split)
        else:
            self.cache = ExLlamaV2Cache(self.model, lazy = auto_split)

        if auto_split:
            reserve = [96 * 1024**2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            yield from self.model.load_autosplit_gen(self.cache, reserve_vram = reserve, last_id_only = True, callback_gen = progress_callback)

            # Test VRAM allocation with a full-length forward pass
            input_ids = torch.zeros((1, self.config.max_input_len), dtype = torch.long)
            self.model.forward(input_ids, cache = self.cache, preprocess_only = True)

        # Create generator
        self.generator = ExLlamaV2StreamingGenerator(self.model, self.cache, self.tokenizer, self.draft_model, self.draft_cache)

    def get_free_vram(self):
        """Per-device free VRAM in bytes, minus the auto-split reserve."""
        nvmlInit()
        free_vram = []
        for i in range(torch.cuda.device_count()):
            handle = nvmlDeviceGetHandleByIndex(i)
            info = nvmlDeviceGetMemoryInfo(handle)
            free_vram.append(info.free - auto_split_reserve_bytes)
        return free_vram

    def get_uuid(self):
        """UUID of the registry entry this container was built from."""
        return self.model_dict["model_uuid"]

    def unload(self):
        """Release weights and caches; safe to call more than once."""
        if self.model: self.model.unload()
        self.model = None
        # bugfix: also release the draft model and generator so their VRAM
        # is freed with the container, not left to the GC
        if self.draft_model: self.draft_model.unload()
        self.draft_model = None
        self.draft_cache = None
        self.generator = None
        self.config = None
        self.cache = None
        self.tokenizer = None
def stream_progress(module, num_modules):
    """Yield one newline-terminated JSON progress packet for the client."""
    packet = {
        "result": "progress",
        "module": module,
        "num_modules": num_modules
    }
    yield json.dumps(packet) + "\n"
# The currently loaded model container, if any
loaded_model: ModelContainer or None = None

def get_loaded_model():
    """Return the active ModelContainer, or None when nothing is loaded."""
    return loaded_model
def load_model(data):
    """
    Generator: unload any current model, then load the model identified by
    data["model_uuid"], streaming progress packets as JSON lines.

    Ends with a {"result": "ok"} packet, or {"result": "fail", "error": ...}
    if construction/loading raised.
    """
    global models, loaded_model

    # Free the previous model before allocating the new one
    if loaded_model is not None:
        loaded_model.unload()
        loaded_model = None
        gc.collect()
        torch.cuda.empty_cache()

    i = data["model_uuid"]
    model = models[i]

    try:
        loaded_model = ModelContainer(model)
        yield from loaded_model.load(progress_callback = stream_progress)
        success = True
    except Exception as e:
        # Report the failure to the client instead of breaking the stream
        loaded_model = None
        errormsg = type(e).__name__ + ":\n"
        errormsg += str(e)
        success = False

    if not success:
        # Reclaim whatever the partial load allocated
        gc.collect()
        torch.cuda.empty_cache()
        result = { "result": "fail", "error": errormsg }
        # print(json.dumps(result) + "\n")
        yield json.dumps(result) + "\n"
        return ""

    result = { "result": "ok" }
    # print(json.dumps(result) + "\n")
    yield json.dumps(result) + "\n"
def unload_model():
    """Unload the active model (if any), reclaim VRAM, and report success."""
    global loaded_model
    if loaded_model is not None:
        loaded_model.unload()
        loaded_model = None
        gc.collect()
        torch.cuda.empty_cache()
    return { "result": "ok" }

164
backend/prompts.py Normal file
View File

@@ -0,0 +1,164 @@
class PromptFormat:
    """
    Base class for chat prompt templates. Subclasses define how prompts,
    responses and the system prompt are assembled, plus stop conditions.
    """

    # Default display names; sessions may override via their role settings
    botname = "Chatbort"
    username = "User"

    def __init__(self):
        pass

    def format(self, prompt, response, system_prompt, settings):
        """Return the formatted text for one prompt/response turn."""
        raise NotImplementedError

    def stop_conditions(self, tokenizer, settings):
        """Return the stop strings / token IDs for generation."""
        raise NotImplementedError

    def is_instruct(self):
        """True for instruct-style templates, False for raw chatlog mode."""
        raise NotImplementedError

    def encode_special_tokens(self):
        # Most formats rely on special tokens (e.g. <s>) being encoded
        return True
class PromptFormat_raw(PromptFormat):
    """Model-agnostic raw chatlog mode; no instruct template is applied."""

    description = "Model-agnostic mode simulating a raw chatlog between two or more users"

    def __init__(self):
        super().__init__()

    def is_instruct(self):
        return False

    def stop_conditions(self, tokenizer, settings):
        # Raw mode builds its stop conditions in the session instead
        raise NotImplementedError

    def format(self, prompt, response, system_prompt, settings):
        # Raw mode assembles the context directly from the chat history
        raise NotImplementedError

    def encode_special_tokens(self):
        return False

    # def default_system_prompt(self):
    #     return \
    #     f"""This is a conversation between a helpful AI assistant named {self.botname} and a """ + \
    #     (f"""user named {self.username}.""" if self.username != "User" else """user.""")
    #
    # def first_prompt(self):
    #     return \
    #     f"""<|system_prompt|>\n{self.username}: <|user_prompt|>\n{self.botname}:"""
    #
    # def subs_prompt(self):
    #     return \
    #     f"""{self.username}: <|user_prompt|>\n{self.botname}:"""
    #
    # def stop_conditions(self, tokenizer):
    #     return \
    #     [self.username + ":",
    #      self.username[0:1] + ":",
    #      self.username.upper() + ":",
    #      self.username.lower() + ":",
    #      tokenizer.eos_token_id]
    #
    # def encoding_options(self):
    #     return False, False, False
    #
    # def print_bot_name(self):
    #     return True
class PromptFormat_llama(PromptFormat):
    """Llama/Llama2-chat and Mistral-instruct [INST] template."""

    description = "Llama-chat, Llama2-chat and Mistral-instruct models"

    def __init__(self):
        super().__init__()

    def is_instruct(self):
        return True

    def stop_conditions(self, tokenizer, settings):
        return [tokenizer.eos_token_id]

    def format(self, prompt, response, system_prompt, settings):
        parts = ["<s>[INST] "]
        if system_prompt:
            parts.append("<<SYS>>\n")
            parts.append(system_prompt)
            parts.append("\n<</SYS>>\n\n ")
        parts.append(prompt)
        parts.append(" [/INST]")
        if response:
            parts.append(response)
            parts.append("</s>")
        return "".join(parts)

# class PromptFormat_codellama(PromptFormat_llama):
#
#     description = "CodeLlama-instruct"
#
#     def __init__(self):
#         super().__init__()
#         pass
#
#     def default_system_prompt(self):
#         return \
#         """You are a helpful coding assistant. Always answer as helpfully as possible."""
class PromptFormat_chatml(PromptFormat):
    """ChatML template (<|im_start|> / <|im_end|> delimited turns)."""

    description = "ChatML format, as used by e.g. (Mistral)Orca"

    def __init__(self):
        super().__init__()

    def is_instruct(self):
        return True

    def stop_conditions(self, tokenizer, settings):
        return [tokenizer.eos_token_id,
                """<|im_end|>"""]

    def format(self, prompt, response, system_prompt, settings):
        segments = []
        if system_prompt:
            segments += ["<|im_start|>system\n", system_prompt, "\n<|im_end|>\n"]
        segments += ["<|im_start|>user\n", prompt, "<|im_end|>\n"]
        segments.append("<|im_start|>assistant\n")
        if response:
            segments += [response, "<|im_end|>\n"]
        return "".join(segments)
class PromptFormat_tinyllama(PromptFormat_chatml):
    """ChatML template, but with special tokens left unencoded."""

    description = "ChatML format, but ignoring special/added tokens. Use for TinyLlama-chat v0.3"

    def encode_special_tokens(self):
        return False
# Registry of available prompt formats, keyed by UI display name
prompt_formats = {
    "Chat-RP": PromptFormat_raw,
    "Llama-chat": PromptFormat_llama,
    "ChatML": PromptFormat_chatml,
    "TinyLlama-chat": PromptFormat_tinyllama
}

def list_prompt_formats():
    """Return the display names of all registered prompt formats."""
    return list(prompt_formats.keys())

456
backend/sessions.py Normal file
View File

@@ -0,0 +1,456 @@
import json, uuid, os, gc, glob, time
import torch
from exllamav2 import (
ExLlamaV2,
ExLlamaV2Config,
ExLlamaV2Cache,
ExLlamaV2Cache_8bit,
ExLlamaV2Tokenizer,
)
from exllamav2.generator import (
ExLlamaV2StreamingGenerator,
ExLlamaV2Sampler
)
from exllamav2.generator.filters import (
ExLlamaV2SelectFilter
)
from backend.config import set_config_dir, global_state, config_filename
from backend.models import get_loaded_model
from backend.prompts import prompt_formats
from backend.util import MultiTimer
# Cache of known sessions: uuid -> (name, filename). None until first scan.
session_list: dict or None = None

# The active Session instance, or None
current_session = None

# List sessions

def list_sessions():
    """
    Return ({session_uuid: name}, uuid of the active session or None),
    scanning the config dir for session_*.json on first call and caching.
    """
    global session_list
    if session_list is None:
        session_list = {}
        s_pattern = config_filename("session_*.json")
        for s_file in sorted(glob.glob(s_pattern), key = os.path.getctime):
            with open(s_file, "r") as s:
                j = json.load(s)
            session_list[j["session_uuid"]] = (j["name"], s_file)
    sl = { k: v[0] for k, v in session_list.items() }
    return sl, current_session.session_uuid if current_session is not None else None
# Session

def get_session():
    """Return the active Session, or None."""
    return current_session
def set_session(data):
    """
    Load the session identified by data["session_uuid"] and make it current.

    The new session only replaces the current one after load() succeeds, so a
    missing/corrupt file no longer leaves a half-initialized current session.
    """
    global current_session
    session = Session(data["session_uuid"])
    session.load()
    current_session = session
    return current_session.to_json()
def new_session():
    """Create, persist and activate a fresh session; returns its JSON form."""
    global current_session, session_list
    current_session = Session()
    current_session.init_new()
    filename = current_session.save()
    # session_list is None until list_sessions() has scanned the config dir;
    # initialize it so a direct new_session() call cannot crash on None
    if session_list is None: session_list = {}
    session_list[current_session.session_uuid] = (current_session.name, filename)
    print(f"Created session {current_session.session_uuid}")
    return current_session.to_json()
def delete_session(d_session):
    """Remove a session file and registry entry; clear it if it was current."""
    global current_session, session_list
    if d_session in session_list:
        filename = session_list.pop(d_session)[1]
        os.remove(filename)
    if current_session is not None and current_session.session_uuid == d_session:
        current_session = None
def get_default_session_settings():
    """Return a fresh dict of default settings for a new session."""
    return {
        "prompt_format": "Chat-RP",
        "roles": [ "User", "Assistant", "", "", "", "", "", "" ],
        "system_prompt_default": True,
        "system_prompt": "This is a chat between a curious user and a helpful AI assistant.",
        "maxtokens": 1024,      # max tokens generated per response
        "chunktokens": 512,     # context budget reserved for generation
        "stop_newline": False,
        "temperature": 0.8,
        "top_k": 50,
        "top_p": 0.8,
        "typical": 0.0,
        "repp": 1.15,           # repetition penalty
        "repr": 1024,           # repetition penalty range
        "repd": 512,            # repetition penalty decay
    }
class Session:
    """
    One chat session: metadata, message history, per-session settings and the
    generation loop. Persisted as session_<uuid>.json in the config directory.
    """

    name: str = None
    session_uuid: str = None
    history: list or None = None    # list of block dicts: block_uuid/author/text
    settings: dict or None = None
    mode: str

    def __init__(self, session_uuid = None):
        self.session_uuid = session_uuid

    def filename(self):
        """Path of this session's JSON file."""
        return config_filename("session_" + self.session_uuid + ".json")

    def init_new(self):
        """Initialize as a fresh, unnamed session with default settings."""
        self.name = "Unnamed session"
        self.session_uuid = str(uuid.uuid4())
        self.history = []
        self.mode = ""
        self.settings = get_default_session_settings()

    def to_json(self):
        """Serialize to a plain dict."""
        j = {}
        j["session_uuid"] = self.session_uuid
        j["name"] = self.name
        j["history"] = self.history
        j["mode"] = self.mode
        j["settings"] = self.settings
        return j

    def from_json(self, j):
        """Populate from a dict as produced by to_json()."""
        self.name = j["name"]
        self.session_uuid = j["session_uuid"]
        self.history = j["history"]
        self.mode = j["mode"]
        # Older session files may predate the settings key
        self.settings = j.get("settings", get_default_session_settings())

    def load(self):
        """Read this session's file from disk."""
        print(f"Loading session: {self.filename()}")
        with open(self.filename(), "r") as s:
            j = json.load(s)
        self.from_json(j)

    def save(self):
        """Write the session to disk; returns the filename."""
        print(f"Saving session: {self.filename()}")
        jd = json.dumps(self.to_json(), indent = 4)
        with open(self.filename(), "w") as outfile:
            outfile.write(jd)
        return self.filename()

    def update_settings(self, data):
        """Replace the settings dict and persist."""
        self.settings = data
        self.save()

    def user_input(self, data):
        """Append a user message block to the history, persist, return it."""
        prompt_format = prompt_formats[self.settings["prompt_format"]]()
        input_text = data["user_input_text"]
        new_block = {}
        new_block["block_uuid"] = str(uuid.uuid4())
        new_block["author"] = "user"
        # In raw chat mode the username prefix is part of the stored text
        if prompt_format.is_instruct(): prefix = ""
        else: prefix = self.settings["roles"][0] + ": "
        new_block["text"] = prefix + input_text
        self.history.append(new_block)
        self.save()
        return new_block

    def create_context(self, prompt_format, max_len, prefix = ""):
        """
        Build the generation context from the history, dropping the oldest
        turns until the tokenized context fits within max_len tokens.
        Returns (context_str, context_ids).
        """
        if prompt_format.is_instruct():
            prompts = []
            responses = []
            # Create prompt-response pairs, pad in case of multiple prompts or
            # responses in a row
            for h in self.history:
                if h["author"] == "assistant":
                    if len(prompts) == len(responses): prompts.append("")
                    responses.append(h["text"])
                elif h["author"] == "user":
                    if len(prompts) != len(responses): responses.append("")
                    prompts.append(h["text"])
                else:
                    print("Unknown author")
            # Create context until we run out of space
            while True:
                context_str = ""
                for turn in range(len(prompts)):
                    p = prompts[turn]
                    r = responses[turn] if turn < len(responses) else None
                    # System prompt only precedes the first retained turn
                    sp = self.settings["system_prompt"] if context_str == "" else None
                    up_text = prompt_format.format(p, r, sp, self.settings)
                    context_str = context_str + up_text
                context_str += prefix
                context_ids = get_loaded_model().tokenizer.encode(context_str, encode_special_tokens = prompt_format.encode_special_tokens())
                if context_ids.shape[-1] < max_len: return context_str, context_ids
                prompts = prompts[1:]
                responses = responses[1:]
        # Non-instruct format
        else:
            history_copy = self.history
            while True:
                context_str = self.settings["system_prompt"] + "\n" + "\n".join([h["text"] for h in history_copy]) + "\n"
                context_str += prefix
                context_ids = get_loaded_model().tokenizer.encode(context_str, encode_special_tokens = prompt_format.encode_special_tokens())
                if context_ids.shape[-1] < max_len: return context_str, context_ids
                history_copy = history_copy[1:]

    def generate(self, data):
        """
        Generator: produce the assistant response for the current history,
        yielding JSON-line packets (begin_block / prompt_eval /
        stream_to_block / ok or fail). The finished block is appended to the
        history and saved.
        """
        mt = MultiTimer()

        if get_loaded_model() is None:
            packet = { "result": "fail", "error": "No model loaded." }
            yield json.dumps(packet) + "\n"
            return packet

        model = get_loaded_model().model
        generator = get_loaded_model().generator
        tokenizer = get_loaded_model().tokenizer
        cache = get_loaded_model().cache
        prompt_format = prompt_formats[self.settings["prompt_format"]]()

        # Create response block (instruct mode can announce it immediately;
        # raw mode must first generate the bot-name prefix below)
        if prompt_format.is_instruct():
            new_block = {}
            new_block["block_uuid"] = str(uuid.uuid4())
            new_block["author"] = "assistant"
            new_block["text"] = ""
            packet = {}
            packet["result"] = "begin_block"
            packet["block"] = new_block
            yield json.dumps(packet) + "\n"

        # Sampling settings
        gen_settings = ExLlamaV2Sampler.Settings()
        gen_settings.temperature = self.settings["temperature"]
        gen_settings.top_k = self.settings["top_k"]
        gen_settings.top_p = self.settings["top_p"]
        gen_settings.typical = self.settings["typical"]
        gen_settings.token_repetition_penalty = self.settings["repp"]
        gen_settings.token_repetition_range = self.settings["repr"]
        # bugfix: decay was read from settings["repr"]; "repd" is the decay
        # setting (and was otherwise unused)
        gen_settings.token_repetition_decay = self.settings["repd"]

        # Temperature of zero means greedy sampling
        if gen_settings.temperature == 0:
            gen_settings.temperature = 1.0
            gen_settings.top_k = 1
            gen_settings.top_p = 0
            gen_settings.typical = 0

        if prompt_format.is_instruct():
            generator.set_stop_conditions(prompt_format.stop_conditions(tokenizer, self.settings))
        else:
            if self.settings["stop_newline"]:
                generator.set_stop_conditions(["\n"])
            else:
                # Stop whenever any role label variant begins a new line
                stop = set()
                for r in self.settings["roles"]:
                    if r.strip() != "":
                        stop.add("\n" + r + ":")
                        stop.add("\n " + r + ":")
                        stop.add("\n" + r.upper() + ":")
                        stop.add("\n " + r.upper() + ":")
                        stop.add("\n" + r.lower() + ":")
                        stop.add("\n " + r.lower() + ":")
                generator.set_stop_conditions(list(stop) + [tokenizer.eos_token_id])

        # Begin response
        generated_tokens = 0
        max_new_tokens = self.settings["maxtokens"]
        chunk_tokens = 0
        last_chunk_time = time.time()
        full_response = ""  # TODO: Preload response
        # bugfix: was dtype = torch.bool; these are token IDs concatenated
        # with long tensors below (only worked via cat type promotion)
        save_tokens = torch.empty((1, 0), dtype = torch.long)
        chunk_buffer = ""

        # If not in instruct mode, generate bot name prefix
        prefix = ""
        if not prompt_format.is_instruct():
            bot_roles = []
            for r in self.settings["roles"][1:]:
                if r.strip() != "": bot_roles.append(r + ":")
            assert len(bot_roles) >= 1
            past_tokens = model.config.max_seq_len - self.settings["chunktokens"] - save_tokens.shape[-1]
            context_str, context_ids = self.create_context(prompt_format, past_tokens)
            # Constrain the first tokens to one of the bot role labels
            gen_settings.filters = [ExLlamaV2SelectFilter(model, tokenizer, bot_roles, case_insensitive = False)]
            mt.set_stage("prompt")
            generator.begin_stream(context_ids, gen_settings, token_healing = False)
            mt.stop()
            mt.set_stage("gen")
            while True:
                chunk, eos, tokens = generator.stream()
                prefix += chunk
                if eos: break
            mt.stop()
            gen_settings.filters = []

            # Begin block with bot name prefix
            new_block = {}
            new_block["block_uuid"] = str(uuid.uuid4())
            new_block["author"] = "assistant"
            new_block["text"] = prefix
            packet = {}
            packet["result"] = "begin_block"
            packet["block"] = new_block
            yield json.dumps(packet) + "\n"

        # Stream response
        mt.set_stage("gen")
        while True:

            # (Re)tokenize the context whenever the cache budget runs out
            if chunk_tokens == 0:
                packet = {}
                packet["result"] = "prompt_eval"
                packet["block_uuid"] = new_block["block_uuid"]
                yield json.dumps(packet) + "\n"
                past_tokens = model.config.max_seq_len - self.settings["chunktokens"] - save_tokens.shape[-1]
                context_str, context_ids = self.create_context(prompt_format, past_tokens, prefix)
                context_ids = torch.cat((context_ids, save_tokens), dim = -1)
                mt.set_stage("prompt")
                generator.begin_stream(context_ids, gen_settings, token_healing = False)
                chunk_tokens = model.config.max_seq_len - context_ids.shape[-1] - 1
                mt.set_stage("gen")

            chunk, eos, tokens = generator.stream()
            save_tokens = torch.cat((save_tokens, tokens), dim = -1)
            generated_tokens += 1
            chunk_tokens -= 1
            chunk_buffer += chunk

            # Flush the chunk buffer at most ~20 times per second
            now = time.time()
            elapsed = now - last_chunk_time
            if chunk_buffer != "" and (elapsed > 0.05 or eos or generated_tokens == max_new_tokens):
                packet = {}
                packet["result"] = "stream_to_block"
                packet["block_uuid"] = new_block["block_uuid"]
                packet["text"] = chunk_buffer
                yield json.dumps(packet) + "\n"
                full_response += chunk_buffer
                chunk_buffer = ""
                last_chunk_time = now

            if eos or generated_tokens == max_new_tokens: break

        # Compile metadata
        mt.stop()
        meta = {}
        meta["prompt_tokens"] = context_ids.shape[-1]
        meta["prompt_speed"] = context_ids.shape[-1] / (mt.stages["prompt"] + 1e-8)
        meta["gen_tokens"] = generated_tokens
        meta["gen_speed"] = generated_tokens / (mt.stages["gen"] + 1e-8)
        meta["overflow"] = max_new_tokens if generated_tokens == max_new_tokens else 0
        new_block["meta"] = meta

        # Save response block
        new_block["text"] = prefix + full_response.rstrip()
        self.history.append(new_block)
        self.save()

        # Done
        packet = { "result": "ok", "new_block": new_block }
        yield json.dumps(packet) + "\n"
        return packet

    def rename(self, data):
        """Rename the session and update the registry entry."""
        global session_list
        if "session_uuid" in data:
            assert data["session_uuid"] == self.session_uuid
        session_list[self.session_uuid] = (data["new_name"], session_list[self.session_uuid][1])
        self.name = data["new_name"]
        self.save()

    def delete_block(self, block_uuid):
        """Remove one history block by UUID and persist."""
        print(f"Deleting block: {block_uuid}")
        for h in self.history:
            if h["block_uuid"] == block_uuid:
                self.history.remove(h)
                break
        self.save()

    def edit_block(self, block):
        """Replace a history block (matched by block_uuid) and persist."""
        block_uuid = block['block_uuid']
        print(f"Editing block: {block_uuid}")
        for i in range(len(self.history)):
            if self.history[i]["block_uuid"] == block_uuid:
                self.history[i] = block
                break
        self.save()

26
backend/util.py Normal file
View File

@@ -0,0 +1,26 @@
import time
class MultiTimer:
    """
    Accumulates wall-clock time per named stage.

    Call set_stage(name) at each transition; the time elapsed since the last
    transition is credited to the stage that was previously active. stop()
    closes out the current stage.
    """

    def __init__(self):
        self.current_stage = ""
        self.stages = {}            # stage name -> accumulated seconds
        self.last = time.time()

    def set_stage(self, stage):
        """Switch to *stage*, crediting elapsed time to the previous stage."""
        now = time.time()
        elapsed = now - self.last
        self.last = now
        prev = self.current_stage
        self.current_stage = stage
        # bugfix: the idle "" pseudo-stage is no longer recorded in stages
        if prev != "":
            self.stages[prev] = self.stages.get(prev, 0.0) + elapsed

    def stop(self):
        """Close out the current stage."""
        self.set_stage("")

BIN
doc/icon.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 73 KiB

220
server.py Normal file
View File

@@ -0,0 +1,220 @@
import sys, os, json, argparse
from threading import Timer, Lock
from flask import Flask, render_template, request
from flask import Response, stream_with_context
from waitress import serve
import webbrowser
import torch

from backend.models import update_model, load_models, get_model_info, list_models, remove_model, load_model, unload_model
from backend.config import set_config_dir, global_state
from backend.sessions import list_sessions, set_session, get_session, get_default_session_settings, new_session, delete_session
from backend.prompts import list_prompt_formats

# Flask app serving both the static UI and the JSON API
app = Flask("ExUI")
app.static_folder = 'static'

# Serializes all API requests; the backend (model/session state) is not thread-safe
api_lock = Lock()

parser = argparse.ArgumentParser(description="ExUI, chatbot UI for ExLlamaV2")
parser.add_argument("-host", "--host", type = str, help = "IP:PORT eg, 0.0.0.0:5000", default = "localhost:5000")
parser.add_argument("-d", "--dir", type = str, help = "Location for user data and sessions, default: ~/exui", default = "~/exui")
args = parser.parse_args()
@app.route("/api/delete_block", methods=['POST'])
def api_delete_block():
    """Delete one chat block from the current session."""
    with api_lock:
        payload = request.get_json()
        get_session().delete_block(payload["block_uuid"])
        return json.dumps({ "result": "ok" }) + "\n"
@app.route("/api/edit_block", methods=['POST'])
def api_edit_block():
    """Replace an edited chat block in the current session."""
    with api_lock:
        payload = request.get_json()
        get_session().edit_block(payload["block"])
        return json.dumps({ "result": "ok" }) + "\n"
@app.route("/api/list_prompt_formats")
def api_glist_prompt_formats():
    """List the names of all available prompt formats."""
    with api_lock:
        resp = { "result": "ok", "prompt_formats": list_prompt_formats() }
        return json.dumps(resp) + "\n"
@app.route("/api/generate", methods=['POST'])
def api_generate():
    """Stream a generated response for the current session as JSON lines."""
    with api_lock:
        payload = request.get_json()
        stream = get_session().generate(payload)
        return Response(stream_with_context(stream), mimetype = 'application/json')
@app.route("/api/get_default_settings")
def api_get_default_settings():
    """Return default session settings plus the available prompt formats."""
    with api_lock:
        resp = {
            "result": "ok",
            "settings": get_default_session_settings(),
            "prompt_formats": list_prompt_formats()
        }
        return json.dumps(resp) + "\n"
@app.route("/api/update_settings", methods=['POST'])
def api_update_settings():
    """Overwrite the current session's settings and persist them."""
    with api_lock:
        payload = request.get_json()
        get_session().update_settings(payload["settings"])
        return json.dumps({ "result": "ok" }) + "\n"
@app.route("/api/user_input", methods=['POST'])
def api_user_input():
    """Append a user message to the current session; return the new block."""
    with api_lock:
        payload = request.get_json()
        block = get_session().user_input(payload)
        return json.dumps({ "result": "ok", "new_block": block }) + "\n"
@app.route("/api/list_sessions")
def api_list_sessions():
    """List stored sessions and identify the active one."""
    with api_lock:
        sessions, current = list_sessions()
        resp = { "result": "ok", "sessions": sessions, "current_session": current }
        return json.dumps(resp) + "\n"
@app.route("/api/new_session", methods=['POST'])
def api_new_session():
    """Create a new session, optionally applying initial settings and a name."""
    with api_lock:
        payload = request.get_json()
        session = new_session()
        if "settings" in payload: get_session().update_settings(payload["settings"])
        if "new_name" in payload: get_session().rename(payload)
        return json.dumps({ "result": "ok", "session": session }) + "\n"
@app.route("/api/rename_session", methods=['POST'])
def api_rename_session():
    """Rename the current session."""
    with api_lock:
        payload = request.get_json()
        get_session().rename(payload)
        return json.dumps({ "result": "ok" }) + "\n"
@app.route("/api/delete_session", methods=['POST'])
def api_delete_session():
    """Delete a stored session by UUID."""
    with api_lock:
        payload = request.get_json()
        delete_session(payload["session_uuid"])
        return json.dumps({ "result": "ok" }) + "\n"
@app.route("/api/set_session", methods=['POST'])
def api_set_session():
    """Activate a stored session and return its contents."""
    with api_lock:
        payload = request.get_json()
        session = set_session(payload)
        if session is None:
            resp = { "result": "fail" }
        else:
            resp = {
                "result": "ok",
                "session": session,
                "prompt_formats": list_prompt_formats()
            }
        return json.dumps(resp) + "\n"
@app.route("/api/list_models")
def api_list_models():
    """List configured models and identify the currently loaded one."""
    with api_lock:
        model_names, current = list_models()
        resp = { "result": "ok", "models": model_names, "current_model": current }
        return json.dumps(resp) + "\n"
@app.route("/api/update_model", methods=['POST'])
def api_update_model():
    """Create or update a model config; returns the new UUID when created."""
    with api_lock:
        payload = request.get_json()
        new_uuid = update_model(payload)
        return json.dumps({ "result": "ok", "new_model_uuid": new_uuid }) + "\n"
@app.route("/api/remove_model", methods=['POST'])
def api_remove_model():
    """Delete a model config."""
    with api_lock:
        payload = request.get_json()
        remove_model(payload)
        return json.dumps({ "result": "ok" }) + "\n"
@app.route("/api/load_model", methods=['POST'])
def api_load_model():
    """Load a model, streaming progress packets as JSON lines."""
    with api_lock:
        payload = request.get_json()
        return Response(stream_with_context(load_model(payload)), mimetype = 'application/json')
@app.route("/api/unload_model")
def api_unload_model():
    """Unload the active model."""
    with api_lock:
        return json.dumps(unload_model()) + "\n"
@app.route("/api/get_model_info", methods=['POST'])
def api_get_model_info():
    """Return the stored config for one model, or fail if unknown."""
    with api_lock:
        payload = request.get_json()
        info = get_model_info(payload)
        if info:
            resp = { "result": "ok", "model_info": info }
        else:
            resp = { "result": "fail" }
        return json.dumps(resp) + "\n"
@app.route("/")
def home():
    """Serve the single-page UI."""
    with api_lock:
        return render_template("index.html")
# Prepare torch
# torch.cuda._lazy_init()

# Prepare config: point the backend at the user dir, then restore saved state and models
print(f" -- User dir: {args.dir}")
set_config_dir(args.dir)
global_state.load()
load_models()

# Start server
machine = args.host
# rsplit on the last colon so a host that itself contains colons (e.g. an IPv6
# literal) still parses; plain split(":") would raise ValueError on extra colons.
host, port = machine.rsplit(":", 1)
if host == "localhost":
    # Give the server a moment to start before pointing the browser at it
    Timer(1, lambda: webbrowser.open(f'http://{machine}/')).start()
# int() fails fast with a clear error if the port part is not numeric
serve(app, host = host, port = int(port))

1930
static/controls.js vendored Normal file

File diff suppressed because it is too large Load Diff

BIN
static/gfx/avatar_cat.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

BIN
static/gfx/avatar_dog.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

BIN
static/gfx/avatar_frog.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

BIN
static/gfx/avatar_monke.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.9 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 13 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 16 KiB

BIN
static/gfx/favicon.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

BIN
static/gfx/icon_chat.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 15 KiB

BIN
static/gfx/icon_model.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

1044
static/style.css Normal file

File diff suppressed because it is too large Load Diff

57
templates/index.html Normal file
View File

@@ -0,0 +1,57 @@
<!-- ExUI single-page app shell. All element ids below are hooks used by
     static/controls.js and styled by static/style.css — do not rename them. -->
<!DOCTYPE html>
<html>
<head>
    <title>ExUI</title>
    <link rel="stylesheet" type="text/css" href="{{ url_for('static', filename='style.css') }}">
    <link rel="icon" type="image/png" sizes="32x32" href="{{ url_for('static', filename='gfx/favicon.png') }}">
    <!-- Inline SVG symbol definitions, referenced by id from controls.js -->
    {% include 'svg_icons.html' %}
</head>
<body>
    <!-- Markdown renderer for chat output (CDN dependency) -->
    <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
    <!-- Modal overlays shown while the UI is blocked (busy spinner / load progress bar) -->
    <div id="page-disabled">
        <div id="busy" class="busy">
            <p>Please wait...</p>
            <div id="busy-anim"></div>
        </div>
        <div id="loading" class="loading">
            <p>Loading</p>
            <div id="loading-progress-container">
                <div id="loading-progress"></div>
            </div>
        </div>
    </div>
    <!-- Main application layout: left menu plus swappable pages -->
    <div id="page-enabled">
        <div class="vflex">
            <div id="mainmenu" class="mainmenu">
            </div>
            <div id="mainbody" class="mainbody">
                <!-- Model management page (hidden until selected from the menu) -->
                <div id="model-page" class="model-page hidden">
                    <div class="vflex">
                        <div id="model-list" class="model-list"></div>
                        <div id="model-view" class="model-view"></div>
                    </div>
                </div>
                <!-- Chat page: session list, chat history + input box, and settings pane -->
                <div id="chat-page" class="chat-page hidden">
                    <div class="vflex">
                        <div class="hflex">
                            <div id="session-list" class="session-list"></div>
                            <div id="session-list-controls" class="session-list-controls"></div>
                        </div>
                        <div class="hflex">
                            <div id="session-view" class="session-view">
                                <div id="session-view-history" class="session-view-history"></div>
                                <div class="session-input-surround">
                                    <textarea id="session-input" class="session-input" placeholder="Type here..." rows="1"></textarea>
                                </div>
                            </div>
                        </div>
                        <div id="session-settings" class="session-settings">
                        </div>
                    </div>
                </div>
            </div>
        </div>
    </div>
    <!-- Application logic; loaded last so all ids above exist -->
    <script src="{{ url_for('static', filename='controls.js') }}"></script>
</body>
</html>

58
templates/svg_icons.html Normal file
View File

@@ -0,0 +1,58 @@
<svg style="display: none;">
<defs>
<symbol id="pencil-icon" viewBox="0 0 16 16">
<path d="M16.84,2.73C16.45,2.73 16.07,2.88 15.77,3.17L13.65,5.29L18.95,10.6L21.07,8.5C21.67,7.89 21.67,6.94 21.07,6.36L17.9,3.17C17.6,2.88 17.22,2.73 16.84,2.73M12.94,6L4.84,14.11L7.4,14.39L7.58,16.68L9.86,16.85L10.15,19.41L18.25,11.3M4.25,15.04L2.5,21.73L9.2,19.94L8.96,17.78L6.65,17.61L6.47,15.29"/>
</symbol>
</defs>
</svg>
<svg style="display: none;">
<defs>
<symbol id="delete-icon" viewBox="0 0 24 24">
<path d="M9,3V4H4V6H5V19C5,20.1 5.9,21 7,21H17C18.1,21 19,20.1 19,19V6H20V4H15V3H9M7,6H17V19H7V6M9,8V17H11V8H9M13,8V17H15V8H13Z"/>
</symbol>
</defs>
</svg>
<svg style="display: none;">
<defs>
<symbol id="model-icon" viewBox="0 0 24 24">
<path d="M12.414 5H21a1 1 0 0 1 1 1v14a1 1 0 0 1-1 1H3a1 1 0 0 1-1-1V4a1 1 0 0 1 1-1h7.414l2 2zM20 11H4v8h16v-8zm0-2V7h-8.414l-2-2H4v4h16z"/>
</symbol>
</defs>
</svg>
<svg style="display: none;">
<defs>
<symbol id="model-plus-icon" viewBox="0 0 24 24">
<path d="M12.414 5H21a1 1 0 0 1 1 1v14a1 1 0 0 1-1 1H3a1 1 0 0 1-1-1V4a1 1 0 0 1 1-1h7.414l2 2zM4 5v14h16V7h-8.414l-2-2H4zm7 7V9h2v3h3v2h-3v3h-2v-3H8v-2h3z"/>
</symbol>
</defs>
</svg>
<svg style="display: none;">
<defs>
<symbol id="model-loaded-icon" viewBox="0 0 14 16">
<path d="M0 .984v14.032a1 1 0 0 0 1.506.845l12.006-7.016a.974.974 0 0 0 0-1.69L1.506.139A1 1 0 0 0 0 .984Z"/>
</symbol>
</defs>
</svg>
<svg style="display: none;">
<defs>
<symbol id="session-icon" viewBox="0 0 20 18">
<path d="M18 4H16V9C16 10.0609 15.5786 11.0783 14.8284 11.8284C14.0783 12.5786 13.0609 13 12 13H9L6.846 14.615C7.17993 14.8628 7.58418 14.9977 8 15H11.667L15.4 17.8C15.5731 17.9298 15.7836 18 16 18C16.2652 18 16.5196 17.8946 16.7071 17.7071C16.8946 17.5196 17 17.2652 17 17V15H18C18.5304 15 19.0391 14.7893 19.4142 14.4142C19.7893 14.0391 20 13.5304 20 13V6C20 5.46957 19.7893 4.96086 19.4142 4.58579C19.0391 4.21071 18.5304 4 18 4Z" fill="currentColor"/>
<path d="M12 0H2C1.46957 0 0.960859 0.210714 0.585786 0.585786C0.210714 0.960859 0 1.46957 0 2V9C0 9.53043 0.210714 10.0391 0.585786 10.4142C0.960859 10.7893 1.46957 11 2 11H3V13C3 13.1857 3.05171 13.3678 3.14935 13.5257C3.24698 13.6837 3.38668 13.8114 3.55279 13.8944C3.71889 13.9775 3.90484 14.0126 4.08981 13.996C4.27477 13.9793 4.45143 13.9114 4.6 13.8L8.333 11H12C12.5304 11 13.0391 10.7893 13.4142 10.4142C13.7893 10.0391 14 9.53043 14 9V2C14 1.46957 13.7893 0.960859 13.4142 0.585786C13.0391 0.210714 12.5304 0 12 0Z" fill="currentColor"/>
</symbol>
</defs>
</svg>
<svg style="display: none;">
<defs>
<symbol id="session-new-icon" viewBox="0 0 20 18" aria-hidden="true" fill="none">
<path stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M16 5h2a1 1 0 0 1 1 1v7a1 1 0 0 1-1 1h-2v3l-4-3H8m4-13H2a1 1 0 0 0-1 1v7a1 1 0 0 0 1 1h2v3l4-3h4a1 1 0 0 0 1-1V2a1 1 0 0 0-1-1Z"/>
</symbol>
</defs>
</svg>

After

Width:  |  Height:  |  Size: 3.1 KiB