# Source: exllamav3/examples/generator.py (2025-04-06 14:42:49 +02:00)
# Original file listing: 187 lines, 6.1 KiB, Python

import sys, os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from exllamav3 import Config, Model, Cache, Tokenizer, Generator, Job, TopPSampler
from common import format_prompt, get_stop_conditions
"""
A couple of examples showing uses of the generator
"""
# Name of the prompt template handed to format_prompt() and get_stop_conditions(); see common.py
prompt_format = "llama3"

# Directory the model config is read from in main()
model_dir = "/mnt/str/eval_models/llama3.1-8b-instruct/exl3/4.0bpw/"

# Passed to Cache as max_num_tokens (16384)
cache_size = 16 * 1024

# System prompt combined with every instruction by format_prompt()
system_prompt = "You are a very nice language model."

# Instructions used by the examples below; individual examples pick entries by index
instructions = [
    "Write a short story beginning with the words 'Once in a while, when you least expect it'.",
    "Why are cats so awesome?",
    "Who was the tallest president of the United States?",
    "Why are there so many different kinds of screws?",
    "oinnvdoehwemnascnawwui8dh2",
    "Write a haiku about catnip.",
]
# Generate a single completion to a single prompt
def generate_single(generator, tokenizer):
    """
    Run one blocking generate() call for the first instruction and print the prompt and the response.

    :param generator:
        Object providing generate(); in this script, the Generator built in main()

    :param tokenizer:
        Tokenizer forwarded to get_stop_conditions()
    """

    instruction = instructions[0]

    print("------------------")
    print("Prompt: " + instruction)
    print()

    # Arguments are prepared in the same order the generate() call consumes them
    formatted_prompt = format_prompt(prompt_format, system_prompt, instruction)
    stop_conditions = get_stop_conditions(prompt_format, tokenizer)
    response = generator.generate(
        prompt = formatted_prompt,
        stop_conditions = stop_conditions,
        max_new_tokens = 500,
        completion_only = True,
        add_bos = True
    )

    print("Response: " + response)
    print()
# Generate multiple batched completions
def generate_batched(generator, tokenizer):
    """
    Pass all instructions to a single generate() call as a list of prompts and print the numbered responses.

    :param generator:
        Object providing generate(); in this script, the Generator built in main()

    :param tokenizer:
        Tokenizer forwarded to get_stop_conditions()
    """

    print("------------------")

    # One formatted prompt per instruction, in list order
    prompts = []
    for instruction in instructions:
        prompts.append(format_prompt(prompt_format, system_prompt, instruction))
    stop_conditions = get_stop_conditions(prompt_format, tokenizer)

    responses = generator.generate(
        prompt = prompts,
        stop_conditions = stop_conditions,
        max_new_tokens = 100,
        completion_only = True,
        add_bos = True
    )

    # Responses are numbered from 1, each followed by a separator line
    for number, response in enumerate(responses, start = 1):
        print(f"#{number}: {response}")
        print("------------------")
# Create a job and generate a stream of tokens
def generate_streaming(generator, tokenizer):
    """
    Enqueue a single Job for the first instruction and print its text output as the generator produces it.

    :param generator:
        Object providing enqueue(), num_remaining_jobs() and iterate()

    :param tokenizer:
        Tokenizer used to encode the prompt and forwarded to get_stop_conditions()
    """

    instruction = instructions[0]

    print("------------------")
    print("Prompt: " + instruction)
    print()
    print("Response: ", end = "", flush = True)

    # Build the job from the encoded prompt and hand it to the generator
    prompt_text = format_prompt(prompt_format, system_prompt, instruction)
    input_ids = tokenizer.encode(prompt_text, add_bos = True)
    stop_conditions = get_stop_conditions(prompt_format, tokenizer)
    generator.enqueue(
        Job(
            input_ids = input_ids,
            max_new_tokens = 400,
            stop_conditions = stop_conditions,
        )
    )

    # Pump the generator until it reports no remaining jobs. Every iteration yields a list of results; a result
    # without a "text" field contributes an empty string to the output.
    while generator.num_remaining_jobs():
        for result in generator.iterate():
            print(result.get("text", ""), end = "", flush = True)

    print()
# Create a batch of jobs and stream the results
def generate_streaming_batched(generator, tokenizer):
    """
    Enqueue one Job per instruction, stream the second job's text to the console while collecting the output of
    every job, then print all collected responses.

    :param generator:
        Object providing enqueue(), num_remaining_jobs() and iterate()

    :param tokenizer:
        Tokenizer used to encode each prompt and forwarded to get_stop_conditions()
    """

    # Index of the one job whose output is echoed live (the second instruction)
    streamed_idx = 1

    # One text buffer per instruction
    collected = ["" for _ in instructions]

    for job_idx, instruction in enumerate(instructions):

        # Announce only the job that will be streamed to the console
        if job_idx == streamed_idx:
            print("------------------")
            print("Prompt: " + instruction)
            print()
            print("Streamed response: ", end = "", flush = True)

        # A single iteration of the generator can return results for several jobs, so each job is tagged with an
        # identifier that comes back with its results. Here the identifier is simply the instruction's index.
        prompt_text = format_prompt(prompt_format, system_prompt, instruction)
        input_ids = tokenizer.encode(prompt_text, add_bos = True)
        stop_conditions = get_stop_conditions(prompt_format, tokenizer)
        generator.enqueue(
            Job(
                input_ids = input_ids,
                max_new_tokens = 400,
                stop_conditions = stop_conditions,
                identifier = job_idx,
            )
        )

    # Pump the generator until it reports no remaining jobs
    while generator.num_remaining_jobs():
        for result in generator.iterate():
            chunk = result.get("text", "")
            source_idx = result["identifier"]

            # Echo the chunk immediately if it belongs to the streamed (second) job
            if source_idx == streamed_idx:
                print(chunk, end = "", flush = True)

            # Every job's output is accumulated, streamed or not
            collected[source_idx] += chunk

    print()
    print("--------------")

    # Finally print everything that was collected, numbered from 1, each followed by a separator line
    for number, response in enumerate(collected, start = 1):
        print(f"#{number}: {response}")
        print("------------------")
# Generate a series of completions with increasing temperature
def generate_temperature(generator, tokenizer):
    """
    Complete the last instruction (index 5) repeatedly, raising the sampling temperature from 0.00 to 3.00 in
    steps of 0.25, and print each response next to the temperature that produced it.

    :param generator:
        Object providing generate(); in this script, the Generator built in main()

    :param tokenizer:
        Tokenizer forwarded to get_stop_conditions()
    """

    instruction = instructions[5]

    print("------------------")
    print("Prompt: " + instruction)
    print()

    # Multiples of 0.25 are exact in binary floating point, so stepping by integer index produces the same
    # thirteen values (0.00, 0.25, ..., 3.00) as repeatedly adding 0.25
    step = 0.25
    max_temperature = 3.0
    num_steps = int(max_temperature / step) + 1

    for step_idx in range(num_steps):
        temperature = step_idx * step
        print(f"Temperature = {temperature:.2f}: ", end = "", flush = True)

        # Prompt, stop conditions and sampler are rebuilt for every run
        formatted_prompt = format_prompt(prompt_format, system_prompt, instruction)
        stop_conditions = get_stop_conditions(prompt_format, tokenizer)
        sampler = TopPSampler(temperature = temperature, top_p = 0.95, temperature_last = True)
        response = generator.generate(
            prompt = formatted_prompt,
            stop_conditions = stop_conditions,
            sampler = sampler,
            max_new_tokens = 100,
            completion_only = True,
            add_bos = True
        )

        print(response)
        print()
def main():
    """
    Load the model from model_dir together with a cache and tokenizer, create a Generator and run every example
    in this file once, in order.
    """

    # Load a model with cache. The cache is created before model.load() is called; that order is kept as is.
    config = Config.from_directory(model_dir)
    model = Model.from_config(config)
    cache = Cache(model, max_num_tokens = cache_size)
    model.load(progressbar = True)
    tokenizer = Tokenizer.from_config(config)

    # Create generator
    generator = Generator(model = model, cache = cache, tokenizer = tokenizer)

    # Run the examples one after another against the same generator and tokenizer
    examples = (
        generate_single,
        generate_batched,
        generate_streaming,
        generate_streaming_batched,
        generate_temperature,
    )
    for example in examples:
        example(generator, tokenizer)


# Run the examples only when the file is executed as a script
if __name__ == "__main__":
    main()