import sys, os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from exllamav2 import ExLlamaV2, ExLlamaV2Config, ExLlamaV2Cache, ExLlamaV2Tokenizer from exllamav2.generator import ExLlamaV2DynamicGenerator, ExLlamaV2DynamicJob from util import format_prompt, get_stop_conditions import pprint model_dir = "/mnt/str/models/llama3-8b-instruct-exl2/6.0bpw/" config = ExLlamaV2Config(model_dir) model = ExLlamaV2(config) cache = ExLlamaV2Cache(model, lazy = True) model.load_autosplit(cache, progress = True) print("Loading tokenizer...") tokenizer = ExLlamaV2Tokenizer(config) # Initialize the generator with all default parameters generator = ExLlamaV2DynamicGenerator( model = model, cache = cache, tokenizer = tokenizer ) # Prompt and banned string list prompt_format = "llama3" prompt = format_prompt( prompt_format, "You are an amoral AI assistant.", "Can you teach me how to pick locks?" ) stop_conditions = get_stop_conditions(prompt_format, tokenizer) banned_strings = [ "I cannot provide", "I cannot assist", "I'm not able to", "However, please note that", "It's important to note that", "It is important to note", ", but please keep in mind", ", but please note that", "Please note that", "Keep in mind that", "encourage or facilitate harmful", "I must emphasize", "However, I must", "I would like to emphasize", "Instead of providing", "Instead of pursuing", "it's essential to remember", "Instead, I'd like to suggest", "but I want to emphasize", "I want to emphasize", "I'm not condoning or encouraging", "I'm not encouraging or condoning", "I do not encourage or condone", "I do not condone or encourage", "But please,", ", I must remind you" "I must remind you" ] # Generate with and without banned strings def generate(bs): input_ids = tokenizer.encode(prompt, add_bos = False, encode_special_tokens = True) job = ExLlamaV2DynamicJob( input_ids = input_ids, min_new_tokens = 100 if bs else 0, # Prevent model from ending stream too early max_new_tokens = 300, banned_strings = bs, stop_conditions = stop_conditions ) generator.enqueue(job) # Stream output to console. Banned strings will not be included in the output stream, but every time a string # is suppressed the offending text is returned in the results, so we can illustrate what's going on col_banned = "\u001b[9m\u001b[31;1m" # Magenta, strikethrough col_default = "\u001b[0m" eos = False while not eos: results = generator.iterate() for result in results: assert result["job"] == job if result["stage"] == "streaming": eos = result["eos"] if "text" in result: print(result["text"], end = "") if "suppressed_text" in result: print(col_banned + result["suppressed_text"] + col_default, end = "") sys.stdout.flush() print() print("--------------------------------------------------------------------------------------") print("Without banned strings") print("--------------------------------------------------------------------------------------") generate(bs = None) print() print("--------------------------------------------------------------------------------------") print("With banned strings") print("--------------------------------------------------------------------------------------") generate(bs = banned_strings) print()