import sys, os

# Make the repository root importable when this script is run directly
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from exllamav3 import Config, Model, Cache, Tokenizer, Generator, Job, TopPSampler
from common import format_prompt, get_stop_conditions

"""
|
|
A couple of examples showing uses of the generator
|
|
"""
|
|
|
|
prompt_format = "llama3"  # see common.py
model_dir = "/mnt/str/eval_models/llama3.1-8b-instruct/exl3/4.0bpw/"
cache_size = 16384  # total cache capacity in tokens, shared by all active jobs

system_prompt = "You are a very nice language model."

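# format_prompt() and get_stop_conditions() (from common.py) wrap each instruction in the
# chat template named above and pick matching stop tokens. For "llama3" that is presumably
# the usual Llama 3 instruct template (header tags plus <|eot_id|>), but see common.py for
# the exact strings.
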
instructions = [
    "Write a short story beginning with the words 'Once in a while, when you least expect it'.",
    "Why are cats so awesome?",
    "Who was the tallest president of the United States?",
    "Why are there so many different kinds of screws?",
    "oinnvdoehwemnascnawwui8dh2",
    "Write a haiku about catnip."
]

# Generate a single completion for a single prompt
def generate_single(generator, tokenizer):
    instruction = instructions[0]
    print("------------------")
    print("Prompt: " + instruction)
    print()
    response = generator.generate(
        prompt = format_prompt(prompt_format, system_prompt, instruction),
        stop_conditions = get_stop_conditions(prompt_format, tokenizer),
        max_new_tokens = 500,
        completion_only = True,  # return only the completion, without echoing the prompt
        add_bos = True
    )
    print("Response: " + response)
    print()


# Generate multiple batched completions
def generate_batched(generator, tokenizer):
    print("------------------")
    responses = generator.generate(
        prompt = [format_prompt(prompt_format, system_prompt, instruction) for instruction in instructions],
        stop_conditions = get_stop_conditions(prompt_format, tokenizer),
        max_new_tokens = 100,
        completion_only = True,
        add_bos = True
    )
    for idx, response in enumerate(responses):
        print(f"#{idx + 1}: {response}")
        print("------------------")


# Create a job and generate a stream of tokens
def generate_streaming(generator, tokenizer):
    instruction = instructions[0]
    print("------------------")
    print("Prompt: " + instruction)
    print()
    print("Response: ", end = "", flush = True)

    # Create the job and enqueue it
    formatted_prompt = format_prompt(prompt_format, system_prompt, instruction)
    job = Job(
        input_ids = tokenizer.encode(formatted_prompt, add_bos = True),
        max_new_tokens = 400,
        stop_conditions = get_stop_conditions(prompt_format, tokenizer),
    )
    generator.enqueue(job)

    # Keep iterating until the generator has no more jobs
    while generator.num_remaining_jobs():
        results = generator.iterate()

        # Each iteration returns a list of results, each of which may contain output tokens for a
        # running job. We only care about the "text" field here.
        for result in results:
            text = result.get("text", "")
            print(text, end = "", flush = True)

    print()


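# A small variant of the above, not called from main(): the same Job/iterate loop, but
# collecting the streamed chunks into one string instead of printing them. Only the "text"
# field demonstrated above is relied on here.
def generate_streaming_collected(generator, tokenizer, instruction):
    formatted_prompt = format_prompt(prompt_format, system_prompt, instruction)
    job = Job(
        input_ids = tokenizer.encode(formatted_prompt, add_bos = True),
        max_new_tokens = 400,
        stop_conditions = get_stop_conditions(prompt_format, tokenizer),
    )
    generator.enqueue(job)
    collected = ""
    while generator.num_remaining_jobs():
        for result in generator.iterate():
            collected += result.get("text", "")
    return collected

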
# Create a batch of jobs and stream the results
def generate_streaming_batched(generator, tokenizer):

    # Some buffers for collecting results
    responses = [""] * len(instructions)

    for idx, instruction in enumerate(instructions):

        # Only print the second job to the console
        if idx == 1:
            print("------------------")
            print("Prompt: " + instruction)
            print()
            print("Streamed response: ", end = "", flush = True)

        # Create each job and enqueue it. Since one iteration of the generator can return multiple
        # results, adding an identifier argument lets us track which sequence each chunk of output
        # pertains to. The identifier can be any object, but a simple index will work here
        formatted_prompt = format_prompt(prompt_format, system_prompt, instruction)
        job = Job(
            input_ids = tokenizer.encode(formatted_prompt, add_bos = True),
            max_new_tokens = 400,
            stop_conditions = get_stop_conditions(prompt_format, tokenizer),
            identifier = idx,
        )
        generator.enqueue(job)

    # Keep iterating until the generator has no more jobs
    while generator.num_remaining_jobs():
        results = generator.iterate()

        for result in results:
            text = result.get("text", "")
            idx = result["identifier"]

            # If this result is from the second job, stream to the console
            if idx == 1:
                print(text, end = "", flush = True)

            # Collect results
            responses[idx] += text

    print()
    print("------------------")

    # Finally, print all the collected results
    for idx, response in enumerate(responses):
        print(f"#{idx + 1}: {response}")
        print("------------------")


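# The identifier round-trips through the generator unchanged, so any object works as
# metadata. A sketch; the dict shape here is arbitrary, not part of the API:
#
#   job = Job(input_ids = ids, max_new_tokens = 400, identifier = {"idx": idx, "user": "demo"})
#   ...
#   meta = result["identifier"]  # the same object, handed back with each result

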
# Generate a series of completions with increasing temperature
def generate_temperature(generator, tokenizer):
    instruction = instructions[5]
    print("------------------")
    print("Prompt: " + instruction)
    print()
    temperature = 0.0
    while temperature <= 3.01:  # 3.01 rather than 3.0 to absorb floating-point accumulation error
        print(f"Temperature = {temperature:.2f}: ", end = "", flush = True)
        response = generator.generate(
            prompt = format_prompt(prompt_format, system_prompt, instruction),
            stop_conditions = get_stop_conditions(prompt_format, tokenizer),
            sampler = TopPSampler(temperature = temperature, top_p = 0.95, temperature_last = True),
            max_new_tokens = 100,
            completion_only = True,
            add_bos = True
        )
        print(response)
        print()
        temperature += 0.25


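# Note on the sweep above: temperature_last = True is taken to mean that temperature
# scaling is applied after the top-p truncation rather than before it, so high
# temperatures do not flatten the distribution before the top-p cutoff is chosen. That
# reading follows the usual sampler convention; check TopPSampler for the exact behavior.

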
def main():

    # Load a model with cache
    config = Config.from_directory(model_dir)
    model = Model.from_config(config)
    cache = Cache(model, max_num_tokens = cache_size)
    model.load(progressbar = True)
    tokenizer = Tokenizer.from_config(config)

    # Create generator
    generator = Generator(
        model = model,
        cache = cache,
        tokenizer = tokenizer,
    )

    # Run the examples
    generate_single(generator, tokenizer)
    generate_batched(generator, tokenizer)
    generate_streaming(generator, tokenizer)
    generate_streaming_batched(generator, tokenizer)
    generate_temperature(generator, tokenizer)


if __name__ == "__main__":
    main()