import sys, os, gc

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from exllamav2 import (
    ExLlamaV2,
    ExLlamaV2Config,
    ExLlamaV2Cache,
    ExLlamaV2Tokenizer,
)

from datasets import load_dataset
import torch

# Models to test

# model_base = "/mnt/str/models/_exl2"
# model_base = "/mnt/str/models/mixtral-8x7b-instruct-exl2/"
model_base = "/mnt/str/models/tiefighter-13b-exl4/"

variants = [v for v in os.listdir(model_base) if os.path.isdir(os.path.join(model_base, v))]
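# (every subdirectory of model_base is treated as one quantization variant to
# test; the explicit list below can be used instead to test a fixed subset)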

# variants = \
# [
#     "2.4bpw",
#     "2.5bpw",
#     "3.0bpw",
#     "4.0bpw",
#     "6.0bpw",
# ]

gpu_split = (20, 21.3, 24)
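# (gpu_split is the per-GPU VRAM allowance in GB passed to ExLlamaV2.load();
# adjust it to match the number and size of the local GPUs)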

qa_set = "cais/mmlu"
qa_split = "test"

categories = \
[
    "anatomy",
    "computer_security",
    "formal_logic",
    "logical_fallacies",
    "philosophy",
    "nutrition",
]

examples_per_category = 3
questions_per_category = 97
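# (each category consumes examples_per_category + questions_per_category = 100
# rows from the split: the first 3 as few-shot examples, the next 97 scored)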

# Load model

def get_model(base, variant_, gpu_split_, batch_size_):

    model_dir = os.path.join(base, variant_)

    config = ExLlamaV2Config()
    config.model_dir = model_dir
    config.prepare()
    config.max_seq_len = 2048
    config.max_batch_size = batch_size_

    model_ = ExLlamaV2(config)
    print(" -- Loading model: " + model_dir)

    model_.load(gpu_split_)

    tokenizer_ = ExLlamaV2Tokenizer(config)

    # No cache is needed: every prompt is scored with a single forward pass
    # cache_ = ExLlamaV2Cache(model_)
    cache_ = None

    return model_, cache_, tokenizer_


# Load questions

def format_question(question, options, answer, ex = False):

    clabels = "ABCD"
    text = "Question:\n"
    text += question
    text += "\n\nChoices:\n"
    for i, o in enumerate(options):
        text += clabels[i] + ": " + o + "\n"
    text += "\nAnswer: " + clabels[answer]
    # if ex:
    #     text += ", " + options[answer]
    return text
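
# Each formatted question ends with "Answer: X" where X is the correct letter:
#
#   Question:
#   ...
#
#   Choices:
#   A: ...
#   B: ...
#   C: ...
#   D: ...
#
#   Answer: B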


def get_dataset(ds_name, category_, split_):

    print(f" -- Loading dataset: {ds_name}/{category_}...")
    dataset_ = load_dataset(ds_name, category_, split = split_)
    return dataset_


# Prepare the prompts

prep_prompts = {}
for category in categories:

    dataset = get_dataset(qa_set, category, qa_split)

    rows = []
    for example in dataset:
        rows.append(example)
        if len(rows) == questions_per_category + examples_per_category: break

    examples_prompt = ""
    for i in range(examples_per_category):
        examples_prompt += format_question(rows[i]["question"], rows[i]["choices"], rows[i]["answer"], ex = True)
        examples_prompt += "\n\n"

    prompts = []
    labels = []
    for j in range(questions_per_category):
        i = j + examples_per_category
        q_prompt = format_question(rows[i]["question"], rows[i]["choices"], rows[i]["answer"])
        prompts.append(examples_prompt + q_prompt)
        labels.append(rows[i]["answer"])

    prep = {"prompts": prompts,
            "labels": labels}

    prep_prompts[category] = prep
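
# prep_prompts now maps each category to its prompts (shared few-shot prefix +
# question, ending in the correct answer letter) and the matching label indices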


# Do the test

results = ";".join([""] + categories) + "\n"

for variant in variants:

    # Model

    # Drop any previous model and flush the CUDA allocator so the next
    # variant can be loaded into the same gpu_split budget
    model = None
    cache = None
    tokenizer = None

    gc.collect()
    torch.cuda.empty_cache()
    gc.collect()

    model, cache, tokenizer = get_model(model_base, variant, gpu_split, 1)

    # Logit positions corresponding to valid answers

    answer_logits = []
    llabels = "ABCD"
    for i in range(4):
        answer_ = "The answer is: " + llabels[i]
        answer_logits.append(tokenizer.tokenizer.encode(answer_)[-1])
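    # (this assumes each letter encodes as the single final token of the string
    # above, so its id can be used to index the vocabulary dimension below)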

    # Categories

    cat_results = []

    for category in categories:

        print(f" -- Testing: {category}...")

        prompts = prep_prompts[category]["prompts"]
        labels = prep_prompts[category]["labels"]

        # Evaluate prompts

        score = 0.0
        # for prompt_ids, mask in zip(prompt_ids_list, mask_list):

        for prompt, label in zip(prompts, labels):

            prompt_ids = tokenizer.encode(prompt)
            prompt_ids = prompt_ids[:, :-1]
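            # (the prompt ends with the correct answer letter; stripping the
            # final token makes the model predict it from the context)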

            logits = model.forward(prompt_ids, last_id_only = True)
            logits = logits.float()

            logits_ans = logits[:, :, answer_logits]
            prob_ans = torch.softmax(logits_ans, dim = -1)

            score += prob_ans[0, 0, label]
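            # (softmax is taken over just the four answer tokens, so the total
            # is a soft, probability-weighted score rather than top-1 accuracy)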

        score /= questions_per_category
        print(f" -- Score: {score:.4f}")

        cat_results.append(f"{score:.4f}")

    results += ";".join([variant] + cat_results) + "\n"

print(" -- Finished")
print()
print(results)