from exllamav2 import *
from exllamav2.generator import *

import sys, torch

# Read the model config from the quantized model directory
config = ExLlamaV2Config()
config.model_dir = "/mnt/str/models/mixtral-8x7b-instruct-exl2/3.0bpw/"
config.prepare()
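
# Assumption, not part of the original script: values read by prepare() can be
# overridden here, before the model is instantiated, e.g. a shorter context
# length to reduce cache VRAM:
#
# config.max_seq_len = 4096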

# Instantiate the model with a lazily allocated cache, so load_autosplit()
# can size the cache while splitting layers across the available GPUs
model = ExLlamaV2(config)
cache = ExLlamaV2Cache(model, lazy = True)
print("Loading model...")
|
|
model.load_autosplit(cache)
|
|
|
|
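
# Assumption, not part of the original script: as an alternative to autosplit,
# the model can be loaded with an explicit per-GPU split in GB (in that case
# the cache would be created non-lazily, after loading):
#
# model.load(gpu_split = [16, 24])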

tokenizer = ExLlamaV2Tokenizer(config)

# Streaming generator that stops when the model emits its EOS token
generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)
generator.set_stop_conditions([tokenizer.eos_token_id])

gen_settings = ExLlamaV2Sampler.Settings()
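
# Assumption, not part of the original script: sampling can be tuned on the
# Settings object before generating; the values here are only illustrative:
#
# gen_settings.temperature = 0.8
# gen_settings.top_k = 50
# gen_settings.top_p = 0.8
# gen_settings.token_repetition_penalty = 1.05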

while True:

    # Read the user's next message
    print()
    instruction = input("User: ")
    print()
    print("Assistant:", end = "")

    # Wrap the instruction in Mistral-style [INST] tags and append it to the
    # accumulated sequence so previous turns remain in context
    instruction_ids = tokenizer.encode(f"[INST] {instruction} [/INST]", add_bos = True)
    context_ids = instruction_ids if generator.sequence_ids is None \
        else torch.cat([generator.sequence_ids, instruction_ids], dim = -1)

    generator.begin_stream_ex(context_ids, gen_settings)

    # Stream the response token by token until a stop condition fires
    while True:
        res = generator.stream_ex()
        if res["eos"]: break
        print(res["chunk"], end = "")
        sys.stdout.flush()

    print()
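
# Assumption, not part of the original script: the streaming loop above runs
# until EOS with no upper bound. A token budget could be added like this:
#
# max_new_tokens = 512
# generated = 0
# while True:
#     res = generator.stream_ex()
#     generated += 1
#     if res["eos"] or generated >= max_new_tokens: break
#     print(res["chunk"], end = "")
#     sys.stdout.flush()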