mirror of
https://github.com/turboderp-org/exllamav2.git
synced 2026-03-14 15:57:25 +00:00
Pixtral example
142 examples/multimodal_pixtral.py Normal file
@@ -0,0 +1,142 @@
import sys, os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from exllamav2 import (
    ExLlamaV2,
    ExLlamaV2Config,
    ExLlamaV2Cache,
    ExLlamaV2Tokenizer,
    ExLlamaV2VisionTower,
)

from exllamav2.generator import (
    ExLlamaV2DynamicGenerator,
    ExLlamaV2DynamicJob,
    ExLlamaV2Sampler,
)

from PIL import Image
import requests

# Model used:
#
# Quantized: https://huggingface.co/turboderp/pixtral-12b-exl2
# Unquantized: https://huggingface.co/mistral-community/pixtral-12b/

model_directory = "/mnt/str/models/pixtral-12b-exl2/6.0bpw"
config = ExLlamaV2Config(model_directory)
config.max_seq_len = 16384  # default is 1M

# Load vision model and multimodal projector and initialize preprocessor

vision_model = ExLlamaV2VisionTower(config)
vision_model.load(progress = True)

# Load EXL2 model

model = ExLlamaV2(config)
cache = ExLlamaV2Cache(model, lazy = True, max_seq_len = 16384)
model.load_autosplit(cache, progress = True)
tokenizer = ExLlamaV2Tokenizer(config)

# Create generator

generator = ExLlamaV2DynamicGenerator(
    model = model,
    cache = cache,
    tokenizer = tokenizer
)

# Util function to get a PIL image from a URL or from a file in the script's directory

def get_image(file = None, url = None):
    assert (file or url) and not (file and url)
    if file:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        file_path = os.path.join(script_dir, file)
        return Image.open(file_path)
    elif url:
        return Image.open(requests.get(url, stream = True).raw)

# Convert image(s) to embeddings

image_embeddings = [
    vision_model.get_image_embeddings(
        model = model,
        tokenizer = tokenizer,
        image = img,
        text_alias = alias,
    )
    for (alias, img) in [
        ("{{IMAGE_1}}", get_image(file = "test_image_1.jpg")),
        ("{{IMAGE_2}}", get_image(file = "test_image_2.jpg")),
        # ("{{IMAGE_3}}", get_image(url = "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRSERy82bn3jpYKr1cNxMLXTyEsVvSt2wZOIQ&s")),
    ]
]

# Define a prompt using the aliases above as placeholders for image tokens. The tokenizer will replace each
# alias with a range of temporary token IDs, and the model will embed those temporary IDs from their
# respective sources rather than from the model's text embedding table.
#
# The temporary IDs are unique for the lifetime of the process and persist for as long as a reference is held
# to the corresponding ExLlamaV2Embedding object. This way, images can be reused between generations, or used
# multiple times for multiple jobs in a batch, and the generator will be able to apply prompt caching and
# deduplication to image tokens as well as text tokens. (A reuse sketch follows at the end of this script.)
#
# Image token IDs are assigned sequentially, however, so two ExLlamaV2Embedding objects created from the same
# source image will not be recognized as the same image for purposes of prompt caching etc.

prompt = "[INST]{{IMAGE_1}}{{IMAGE_2}}\n" + \
|
||||
"What are the similarities and differences between these two experiments?[/INST]"
|
||||
|
||||
# Generate
|
||||
|
||||
streaming = True
|
||||
greedy = True
|
||||
|
||||
if streaming:
|
||||
|
||||
input_ids = tokenizer.encode(
|
||||
prompt,
|
||||
add_bos = True,
|
||||
encode_special_tokens = True,
|
||||
embeddings = image_embeddings,
|
||||
)
|
||||
|
||||
job = ExLlamaV2DynamicJob(
|
||||
input_ids = input_ids,
|
||||
max_new_tokens = 500,
|
||||
decode_special_tokens = True,
|
||||
stop_conditions = [tokenizer.eos_token_id],
|
||||
gen_settings = ExLlamaV2Sampler.Settings.greedy() if greedy else None,
|
||||
embeddings = image_embeddings,
|
||||
)
|
||||
|
||||
generator.enqueue(job)
|
||||
|
||||
print()
|
||||
print(prompt, end = ""); sys.stdout.flush()
|
||||
|
||||
eos = False
|
||||
while generator.num_remaining_jobs():
|
||||
results = generator.iterate()
|
||||
for result in results:
|
||||
text = result.get("text", "")
|
||||
print(text, end = ""); sys.stdout.flush()
|
||||
|
||||
print()
|
||||
|
||||
else:
|
||||
|
||||
output = generator.generate(
|
||||
prompt = prompt,
|
||||
max_new_tokens = 500,
|
||||
add_bos = True,
|
||||
encode_special_tokens = True,
|
||||
decode_special_tokens = True,
|
||||
stop_conditions = [tokenizer.eos_token_id],
|
||||
gen_settings = ExLlamaV2Sampler.Settings.greedy() if greedy else None,
|
||||
embeddings = image_embeddings,
|
||||
)
|
||||
|
||||
print(output)
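
# As noted above, the ExLlamaV2Embedding objects in image_embeddings stay valid for as long as they are
# referenced, so a follow-up generation can pass the same list again and the generator can reuse the cached
# image tokens. A minimal sketch of such a reuse; the follow-up prompt here is hypothetical, not part of the
# original example:

followup_prompt = "[INST]{{IMAGE_1}}\nDescribe the first image in one sentence.[/INST]"

followup_output = generator.generate(
    prompt = followup_prompt,
    max_new_tokens = 200,
    add_bos = True,
    encode_special_tokens = True,
    decode_special_tokens = True,
    stop_conditions = [tokenizer.eos_token_id],
    gen_settings = ExLlamaV2Sampler.Settings.greedy() if greedy else None,
    embeddings = image_embeddings,  # same objects as the first job, so their image tokens can be cache hits
)

print(followup_output)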
(binary image, 84 KiB, unchanged)
(binary image, 40 KiB, unchanged)
@@ -1,82 +0,0 @@
import sys, os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import torch

from exllamav2 import (
    ExLlamaV2,
    ExLlamaV2Config,
    ExLlamaV2Cache,
    ExLlamaV2Tokenizer,
    ExLlamaV2VisionTower,
)

from exllamav2.generator import (
    ExLlamaV2DynamicGenerator,
    ExLlamaV2Sampler,
)

from PIL import Image
import requests

# Unquantized model used for experiment:
#
# https://huggingface.co/mistral-community/pixtral-12b/

model_directory = "/mnt/str/models/pixtral-12b-exl2/5.0bpw"
config = ExLlamaV2Config(model_directory)
config.max_seq_len = 16384  # default is 1M

# Load vision model and multimodal projector and initialize preprocessor

vision_model = ExLlamaV2VisionTower(config)
vision_model.load(progress = True)

# Load EXL2 model

model = ExLlamaV2(config)
cache = ExLlamaV2Cache(model, lazy = True, max_seq_len = 16384)
model.load_autosplit(cache, progress = True)
tokenizer = ExLlamaV2Tokenizer(config)

# Create generator

generator = ExLlamaV2DynamicGenerator(
    model = model,
    cache = cache,
    tokenizer = tokenizer
)

# Create an MMEmbedding for the image features and a prompt containing the placeholder string

image_embeddings = [
    vision_model.get_image_embeddings(
        model = model,
        tokenizer = tokenizer,
        image = img,
        text_alias = alias,
    )
    for (alias, img) in [
        ("{{IMAGE_1}}", Image.open("test_image_1.jpg")),
        ("{{IMAGE_2}}", Image.open("test_image_2.jpg")),
    ]
]

prompt = "[INST]{{IMAGE_1}}{{IMAGE_2}}\n" + \
    "What are the similarities and differences between these two experiments?[/INST]"

# Run prompt through generator, with embeddings. The tokenizer will insert prepared image tokens in place
# of the aliases

output = generator.generate(
    prompt = prompt,
    max_new_tokens = 500,
    add_bos = True,
    encode_special_tokens = True,
    decode_special_tokens = True,
    stop_conditions = [tokenizer.eos_token_id],
    gen_settings = ExLlamaV2Sampler.Settings.greedy(),
    embeddings = image_embeddings,
)

print(output)