# Mirror of https://github.com/turboderp-org/exllamav3.git
# Synced 2026-03-15 00:07:24 +00:00
import sys, os

# Allow running this example straight from the repo checkout without installing the package.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from exllamav3 import Config, Model, Cache, Tokenizer

# Define model by loading the config from the model directory
config = Config.from_directory("/mnt/str/models/llama3.1-8b-instruct/exl3/4.0bpw/")

# Create and load tokenizer
tokenizer = Tokenizer.from_config(config)

# Create the model instance. Model isn't loaded until one of the load functions is called
model = Model.from_config(config)
# Load model without cache (then unload)
print("Loading")
model.load()
model.unload()

# Load model with progress bar
print("Loading with progress bar")
model.load(progressbar = True)
model.unload()
# Create a cache attached to the model. When the model is loaded, the cache tensors are also created
cache = Cache(model, max_num_tokens = 2048)

# Load again, this time with a cache
print("Loading with cache")
model.load(progressbar = True)
model.unload()

# Unloading the model also destroys the tensors of any attached cache(s), freeing up all the VRAM used by both.
# The cache is still attached to the model at this point, so loading the same model again would create the
# cache tensors anew. If desired, we can explicitly detach the cache to prevent this:
cache.detach_from_model(model)
# Load with callback function
def progress_callback(module: int, modules: int) -> None:
    """Report loading progress; passed to Model.load, which supplies the number of
    modules loaded so far and the total module count."""
    print(f"Callback: Loaded {module} of {modules} modules")
print("Loading with callback")
model.load(callback = progress_callback)
model.unload()
# Load model using generator function. In this mode, the loader presents as an iterator yielding the current
# progress on each iteration
print("Loading with generator function")
f = model.load_gen()
for module, modules in f:
    print(f"Generator: Loaded {module} of {modules} modules")
model.unload()
# Load model using generator function, callback and progress bar
print("Loading with generator function, callback and progress bar")
f = model.load_gen(progressbar = True, callback = progress_callback)
for module, modules in f:
    print(f"Generator: Loaded {module} of {modules} modules")
model.unload()