# Mirror of https://github.com/turboderp-org/exllamav2.git
# Synced 2026-04-20 06:19:00 +00:00
# 39 lines, 972 B, Python
"""WebSocket server example.

Parses command-line options, initializes a model and cache through
``exllamav2.model_init``, then serves them with ``ExLlamaV2WebSocketServer``.
"""

import argparse
import os
import sys

# Add the parent of this script's directory to the import path before the
# exllamav2 imports in main() run (presumably so the example works from a
# source checkout without installing the package -- confirm against repo layout).
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Port used when --host is given without an explicit ":PORT" suffix. Matches the
# port in the --host default below.
DEFAULT_PORT = 7862


def parse_host_port(host, default_port = DEFAULT_PORT):
    """Split an ``IP:PORT`` string into an ``(ip, port)`` tuple.

    The split happens on the *last* colon, so an address that itself contains
    colons (e.g. ``::1:9000``) keeps everything before the final colon as the
    address. A string with no colon at all is treated as a bare address and
    paired with ``default_port``.

    Args:
        host: Address string, normally of the form ``IP:PORT``.
        default_port: Port to use when ``host`` contains no colon.

    Returns:
        Tuple ``(ip, port)`` where ``ip`` is a str and ``port`` is an int.

    Raises:
        ValueError: If the text after the last colon is not a valid integer.
    """
    ip, separator, port_text = host.rpartition(":")

    # rpartition() leaves the separator empty when no colon was found; in that
    # case the whole input is the address.
    if not separator:
        return host, default_port

    try:
        return ip, int(port_text)
    except ValueError:
        raise ValueError(f"invalid port in host argument: {host!r}") from None


def main():
    """Load the model, build its cache and run the WebSocket server."""

    # Imported here rather than at module top so the sys.path tweak above has
    # already taken effect.
    from exllamav2 import ExLlamaV2Cache, model_init
    from exllamav2.server import ExLlamaV2WebSocketServer

    # Configure and init model

    parser = argparse.ArgumentParser(description = "WebSocket server example")
    parser.add_argument("-host", "--host", type = str, default = "0.0.0.0:7862", help = "IP:PORT eg, 0.0.0.0:7862")

    model_init.add_args(parser)
    args = parser.parse_args()

    # Validate the listen address up front so a malformed --host is reported as
    # a normal usage error instead of a traceback after the model has loaded.
    try:
        ip, port = parse_host_port(args.host)
    except ValueError as err:
        parser.error(str(err))

    model_init.check_args(args)
    model_init.print_options(args)
    model, tokenizer = model_init.init(args, allow_auto_split = True)

    # Load model after cache if --gpu_split auto

    if not model.loaded:
        cache = ExLlamaV2Cache(model, lazy = True)
        model.load_autosplit(cache)

    # Else create cache

    else:
        cache = ExLlamaV2Cache(model)

    # Create server

    server = ExLlamaV2WebSocketServer(ip, port, model, tokenizer, cache)
    server.serve()


if __name__ == "__main__":
    main()