# exllamav3/examples/generator.py

import sys, os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from exllamav3 import Config, Model, Cache, Tokenizer, Generator, Job, TopPSampler
from common import format_prompt, get_stop_conditions

"""
A couple of examples showing uses of the generator
"""

# Name of the prompt template to apply (see common.py)
prompt_format = "llama3"

# Directory the model config is read from
model_dir = "/mnt/str/eval_models/llama3.1-8b-instruct/exl3/4.0bpw/"

# Value passed to the cache as max_num_tokens
cache_size = 16 * 1024

# System prompt shared by every example
system_prompt = "You are a very nice language model."

# User instructions the examples draw from; some examples pick a single entry by index
instructions = [
    "Write a short story beginning with the words 'Once in a while, when you least expect it'.",
    "Why are cats so awesome?",
    "Who was the tallest president of the United States?",
    "Why are there so many different kinds of screws?",
    "oinnvdoehwemnascnawwui8dh2",
    "Write a haiku about catnip.",
]

# Generate a single completion to a single prompt
def generate_single(generator, tokenizer):
    """
    Run one blocking generate() call for the first entry of `instructions` and print the result.

    Args:
        generator: object whose generate() method is called with the formatted prompt
        tokenizer: tokenizer handed to get_stop_conditions()
    """
    user_prompt = instructions[0]

    # Header
    print("------------------")
    print(f"Prompt: {user_prompt}")
    print()

    # Build the arguments up front, prompt first and stop conditions second
    full_prompt = format_prompt(prompt_format, system_prompt, user_prompt)
    stops = get_stop_conditions(prompt_format, tokenizer)

    response = generator.generate(
        prompt = full_prompt,
        stop_conditions = stops,
        max_new_tokens = 500,
        completion_only = True,
        add_bos = True
    )

    print("Response: " + response)
    print()


# Generate multiple batched completions
def generate_batched(generator, tokenizer):
    """
    Pass every entry of `instructions` to a single generate() call and print the numbered responses.

    Args:
        generator: object whose generate() method is called with the list of formatted prompts
        tokenizer: tokenizer handed to get_stop_conditions()
    """
    print("------------------")

    # One formatted prompt per instruction, all submitted in the same call
    prompts = [format_prompt(prompt_format, system_prompt, text) for text in instructions]
    stops = get_stop_conditions(prompt_format, tokenizer)

    completions = generator.generate(
        prompt = prompts,
        stop_conditions = stops,
        max_new_tokens = 100,
        completion_only = True,
        add_bos = True
    )

    # Number the responses from 1 for display
    for number, completion in enumerate(completions, start = 1):
        print(f"#{number}: {completion}")
        print("------------------")


# Create a job and generate a stream of tokens
def generate_streaming(generator, tokenizer):
    """
    Enqueue a single job for the first entry of `instructions` and print its text as it is produced.

    Args:
        generator: object providing enqueue(), num_remaining_jobs() and iterate()
        tokenizer: tokenizer used to encode the prompt and handed to get_stop_conditions()
    """
    user_prompt = instructions[0]

    print("------------------")
    print(f"Prompt: {user_prompt}")
    print()
    print("Response: ", end = "", flush = True)

    # Encode the formatted prompt and wrap it in a job
    prompt_text = format_prompt(prompt_format, system_prompt, user_prompt)
    token_ids = tokenizer.encode(prompt_text, add_bos = True)
    stops = get_stop_conditions(prompt_format, tokenizer)
    generator.enqueue(
        Job(
            input_ids = token_ids,
            max_new_tokens = 400,
            stop_conditions = stops,
        )
    )

    # Drive the generator until it reports no remaining jobs. Each iterate() call yields a list of result
    # dicts; only the "text" entry is used here, and results without one print as an empty string.
    while generator.num_remaining_jobs():
        for chunk in generator.iterate():
            print(chunk.get("text", ""), end = "", flush = True)

    print()


# Create a batch of jobs and stream the results
def generate_streaming_batched(generator, tokenizer):
    """
    Enqueue one job per entry of `instructions`, stream one of them to the console while the batch runs,
    then print every collected response.

    Args:
        generator: object providing enqueue(), num_remaining_jobs() and iterate()
        tokenizer: tokenizer used to encode each prompt and handed to get_stop_conditions()
    """

    # Index of the single job whose output is echoed live (1 = the second instruction). All jobs,
    # including this one, are also collected and printed at the end.
    streamed_idx = 1

    # One list of text chunks per job; joined once at the end rather than growing a string per chunk
    collected = [[] for _ in instructions]

    for idx, instruction in enumerate(instructions):

        # Only announce the job that will be streamed to the console
        if idx == streamed_idx:
            print("------------------")
            print("Prompt: " + instruction)
            print()
            print("Streamed response: ", end = "", flush = True)

        # Create each job and enqueue it. Since one iteration of the generator can return multiple results, adding
        # an identifier argument lets us track which sequence each chunk of output pertains to. The identifier can
        # be any object, but a simple index will work here
        formatted_prompt = format_prompt(prompt_format, system_prompt, instruction)
        job = Job(
            input_ids = tokenizer.encode(formatted_prompt, add_bos = True),
            max_new_tokens = 400,
            stop_conditions = get_stop_conditions(prompt_format, tokenizer),
            identifier = idx,
        )
        generator.enqueue(job)

    # Keep iterating until the generator has no more jobs
    while generator.num_remaining_jobs():
        for result in generator.iterate():
            text = result.get("text", "")
            idx = result["identifier"]

            # If this result belongs to the streamed job, echo it to the console right away
            if idx == streamed_idx:
                print(text, end = "", flush = True)

            # Collect the chunk under the job it belongs to
            collected[idx].append(text)

    print()
    print("--------------")

    # Finally print all the collected results
    for idx, parts in enumerate(collected):
        response = "".join(parts)
        print(f"#{idx + 1}: {response}")
        print("------------------")


# Generate a series of completions with increasing temperature
def generate_temperature(generator, tokenizer):
    """
    Generate the same prompt (the entry at index 5 of `instructions`) at temperatures 0.00 through 3.00
    in steps of 0.25, printing each response.

    Args:
        generator: object whose generate() method is called once per temperature
        tokenizer: tokenizer handed to get_stop_conditions()
    """
    user_prompt = instructions[5]

    print("------------------")
    print(f"Prompt: {user_prompt}")
    print()

    # 13 steps of 0.25 give 0.00, 0.25, ..., 3.00. Multiples of 0.25 are exact in binary floating point,
    # so stepping by integer index yields the same values as repeatedly adding 0.25.
    for step in range(13):
        temperature = step * 0.25
        print(f"Temperature = {temperature:.2f}: ", end = "", flush = True)

        # Arguments are rebuilt on every pass, in the order prompt, stop conditions, sampler
        full_prompt = format_prompt(prompt_format, system_prompt, user_prompt)
        stops = get_stop_conditions(prompt_format, tokenizer)
        sampler = TopPSampler(temperature = temperature, top_p = 0.95, temperature_last = True)

        response = generator.generate(
            prompt = full_prompt,
            stop_conditions = stops,
            sampler = sampler,
            max_new_tokens = 100,
            completion_only = True,
            add_bos = True
        )
        print(response)
        print()


def main():
    """
    Load the model from `model_dir`, build a generator around it and run each example in turn.
    """

    # Model setup. The cache is created from the model before model.load() is called; this order is
    # kept as-is (NOTE(review): presumably the library expects the cache to exist at load time - confirm).
    config = Config.from_directory(model_dir)
    model = Model.from_config(config)
    cache = Cache(model, max_num_tokens = cache_size)
    model.load(progressbar = True)
    tokenizer = Tokenizer.from_config(config)

    # Generator tying together the model, its cache and the tokenizer
    generator = Generator(model = model, cache = cache, tokenizer = tokenizer)

    # Run the examples in order, each with the same generator and tokenizer
    examples = (
        generate_single,
        generate_batched,
        generate_streaming,
        generate_streaming_batched,
        generate_temperature,
    )
    for example in examples:
        example(generator, tokenizer)


# Run the examples only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()