mirror of
https://github.com/turboderp-org/exllamav2.git
synced 2026-04-20 14:29:28 +00:00
127 lines
3.0 KiB
Python
127 lines
3.0 KiB
Python
import sys, os, gc
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
|
from human_eval.data import write_jsonl, read_problems
|
|
|
|
from exllamav2 import(
|
|
ExLlamaV2,
|
|
ExLlamaV2Config,
|
|
ExLlamaV2Cache,
|
|
ExLlamaV2Tokenizer,
|
|
)
|
|
|
|
from exllamav2.generator import(
|
|
ExLlamaV2BaseGenerator,
|
|
ExLlamaV2Sampler
|
|
)
|
|
|
|
import torch
|
|
|
|
# Models to test
#
# `model_base` is the directory holding one sub-directory per quantized variant and
# `variants` lists the sub-directories to evaluate. Alternative setups that were used
# previously are kept below for reference:
#
#   model_base = "/mnt/str/models/"
#   variants = ["mistral-7b-instruct"]
#
#   model_base = "/mnt/str/models/mistral-7b-instruct-exl3"
#   variants = ["8.0bpw"]
#
#   model_base = "/mnt/str/models/tiefighter-13b-exl4/"
#
#   variants = [v for v in os.listdir(model_base) if os.path.isdir(os.path.join(model_base, v))]
#
#   variants = ["2.4bpw", "2.5bpw", "2.7bpw", "3.0bpw", "4.0bpw", "6.0bpw", "8.0bpw"]

model_base = "/mnt/str/models/mixtral-8x7b-instruct-exl2/"
variants = [
    "4.0bpw",
]

# Forwarded unchanged to the model's load() call; presumably a per-GPU memory
# allotment (one entry per device) -- TODO confirm units against the loader.
gpu_split = (
    16,
    16,
    24,
)
|
|
|
|
# Load model
|
|
|
|
def get_model(base, variant_, gpu_split_, batch_size_):
    """Load one model variant together with its tokenizer and cache.

    Args:
        base: Directory that contains the variant sub-directories.
        variant_: Name of the sub-directory (under ``base``) to load.
        gpu_split_: Passed straight through to ``model.load()``; presumably the
            per-GPU memory split -- confirm against the loader.
        batch_size_: Batch size used both for ``config.max_batch_size`` and for
            the cache allocation.

    Returns:
        A ``(model, cache, tokenizer)`` tuple.
    """

    model_dir = os.path.join(base, variant_)

    config = ExLlamaV2Config()
    config.model_dir = model_dir
    config.prepare()
    # Overrides applied after prepare() so they are not overwritten by it.
    config.max_seq_len = 2048
    config.max_batch_size = batch_size_

    model_ = ExLlamaV2(config)
    print(" -- Loading model: " + model_dir)

    model_.load(gpu_split_)

    tokenizer_ = ExLlamaV2Tokenizer(config)

    # Use the parameter, not the module-level `batch_size` global, so the cache
    # always matches config.max_batch_size regardless of who calls this function.
    cache_ = ExLlamaV2Cache(model_, batch_size = batch_size_)
    # cache_ = None

    return model_, cache_, tokenizer_
|
|
|
|
|
|
def _crop_completion(completion):
    """Cut a generated completion off at the first non-indented line.

    Simplified cleanup: the first line is always kept; after that, everything
    from the first non-empty line that does not start with a space, a tab or
    ``#`` is discarded, i.e. exactly one function body is kept.
    """
    lines = completion.split("\n")
    for index, line in enumerate(lines[1:], start = 1):
        if line and not line.startswith((" ", "\t", "#")):
            return "\n".join(lines[:index])
    return completion


problems = read_problems()

for variant in variants:

    # Model
    #
    # Drop every reference to the previous variant before collecting garbage.
    # `gen` is cleared as well: it was constructed from the previous model, cache
    # and tokenizer and presumably still references them, which would keep the old
    # weights resident while the next variant is being loaded.

    model = None
    cache = None
    tokenizer = None
    gen = None

    gc.collect()
    torch.cuda.empty_cache()
    gc.collect()

    batch_size = 10
    num_samples_per_task = 1
    samples = []

    model, cache, tokenizer = get_model(model_base, variant, gpu_split, batch_size)

    gen = ExLlamaV2BaseGenerator(model, cache, tokenizer)
    gen_settings = ExLlamaV2Sampler.Settings()
    # gen_settings.top_k = 1

    for task_id in problems:
        print(task_id)
        prompt = problems[task_id]["prompt"]

        for _ in range(num_samples_per_task):

            # Get problem and batch of completions: the same prompt repeated
            # batch_size times, so each pass adds batch_size samples for the task.

            responses = gen.generate_simple(
                [prompt] * batch_size,
                gen_settings,
                500,
                stop_token = tokenizer.eos_token_id,
            )

            for response in responses:

                # The response is sliced at len(prompt), i.e. it is assumed to
                # start with the prompt text; only the continuation is stored.

                completion = _crop_completion(response[len(prompt):])

                # Store sample

                samples.append(dict(task_id = task_id, completion = completion))

    # Save output (one file per variant)

    write_jsonl(f"samples-{variant}.jsonl", samples)
|