exllamav2/tests/test_autosplit.py
turboderp 4afe616aee Fix unhandled OoM condition when loading GPTQ model with auto split
Free minimum reserved VRAM on previous device when moving to next device
2023-10-28 20:08:39 +02:00
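The commit message above describes the loader behavior this test exercises: when a GPU fills up during auto split, the reserve held on that device is released before loading continues on the next one, instead of raising an unhandled out-of-memory error. A minimal sketch of that idea, using hypothetical helper names rather than the actual exllamav2 loader code:

import torch

def place_modules_autosplit(modules, num_devices):
    # Sketch only: walk modules in order, spilling to the next GPU on OoM.
    device = 0
    for module in modules:
        while True:
            try:
                module.load_on(device)  # hypothetical per-module loader
                break
            except torch.cuda.OutOfMemoryError:
                # Release cached allocations (including any minimum reserve)
                # on the full device before moving to the next one.
                torch.cuda.empty_cache()
                device += 1
                if device >= num_devices:
                    raise  # out of memory on every device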


import sys, os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from exllamav2 import (
    ExLlamaV2,
    ExLlamaV2Config,
    ExLlamaV2Cache,
    ExLlamaV2Cache_8bit,
    ExLlamaV2Tokenizer,
)
from exllamav2.generator import (
    ExLlamaV2BaseGenerator,
    ExLlamaV2Sampler
)
import time
# Initialize model and cache
model_directory = "/mnt/str/models/_gptq/TheBloke_Spicyboros-70B-2.2-GPTQ/"
config = ExLlamaV2Config()
config.model_dir = model_directory
config.prepare()
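# Cap the context length, the per-chunk input length and the attention matrix size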
config.max_seq_len = 4096
config.max_input_len = 1024
config.max_attn_size = 1024**2
model = ExLlamaV2(config)
print("Loading model: " + model_directory)
def progress_rep(module, num_modules):
    yield f"Progress: {100 * module / num_modules:.2f}%"
cache = ExLlamaV2Cache(model, lazy = True)
f = model.load_autosplit_gen(cache, last_id_only = True, callback_gen = progress_rep)
for item in f:
    print(item)
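
# model.load_autosplit(cache) performs the same split loading without streaming
# progress reports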
tokenizer = ExLlamaV2Tokenizer(config)
# Initialize generator
generator = ExLlamaV2BaseGenerator(model, cache, tokenizer)
# Generate some text
settings = ExLlamaV2Sampler.Settings()
settings.temperature = 0.85
settings.top_k = 50
settings.top_p = 0.8
settings.token_repetition_penalty = 1.15
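# Ban the EOS token so the benchmark always generates the full max_new_tokens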
settings.disallow_tokens(tokenizer, [tokenizer.eos_token_id])
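
# Build a long repeated prompt, then truncate it so prompt + max_new_tokens (+1)
# stays within 2048 positions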
prompt = "All work and no play makes Jack a dull boy. " * 1000
prompt_ids = tokenizer.encode(prompt)
prompt_ids = prompt_ids[:, :2048 - 50 - 1]
prompt = tokenizer.decode(prompt_ids)
max_new_tokens = 50
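
# Warm up once so one-time allocation/compilation doesn't skew the timing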
generator.warmup()
time_begin = time.time()
output = generator.generate_simple(prompt, settings, max_new_tokens, seed = 1234)
time_end = time.time()
time_total = time_end - time_begin
print(output)
print()
print(f"Response generated in {time_total:.2f} seconds, {max_new_tokens} tokens, {max_new_tokens / time_total:.2f} tokens/second")