from __future__ import annotations

import os, sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from exllamav3.util.file import disk_lru_cache
from exllamav3 import model_init, Generator, Job, ComboSampler
from exllamav3.util.progress import ProgressBar
import argparse
from pathlib import Path
import torch
import time, random, json
# from datasets import load_dataset
from urllib import request
from collections import deque

# ! Doesn't work, removes trailing whitespace from questions, which breaks the eval script
# @disk_lru_cache("fetch_ifbench_test_data")
# def fetch_ifbench_test_data() -> list[dict]:
#     ds = load_dataset("allenai/IFBench_test", split = "train")
#     data = [row for row in ds]
#     print(f"Loaded {len(data)} IFBench test prompts from allenai/IFBench_test")
#     return data

@disk_lru_cache("fetch_ifbench_test_data_git")
def fetch_ifbench_test_data_git() -> tuple[list[dict], str]:
    url = "https://raw.githubusercontent.com/allenai/IFBench/refs/heads/main/data/IFBench_test.jsonl"
    with request.urlopen(url) as response:
        raw = response.read().decode("utf-8")
    return [json.loads(line) for line in raw.splitlines() if line.strip()], raw


def strip_reasoning(s: str) -> str:
    # TODO: Respect other think tags
    l = s.find("<think>")
    r = s.rfind("</think>")
    if l >= 0 and r >= 0:
        s = s[:l] + s[r + 8:]
    elif r >= 0:
        s = s[r + 8:]
    return s.strip()


def write_jsonl(rows: list[dict], path: Path) -> None:
    with open(path, "w") as f:
        for row in rows:
            f.write(json.dumps(row, ensure_ascii=False) + "\n")


@torch.inference_mode()
def main(args):

    # Get the stuff
    ifbench, ifbench_raw = fetch_ifbench_test_data_git()
    if args.limit:
        ifbench = ifbench[:args.limit]

    # Initialize
    model, config, cache, tokenizer = model_init.init(args)
    generator = Generator(
        model = model,
        cache = cache,
        max_batch_size = args.max_batch_size,
        tokenizer = tokenizer,
        show_visualizer = args.visualize_cache,
    )
    sampler = model_init.get_arg_sampler(args)

    # Create prompts
    template_args = {}
    if args.think: template_args["enable_thinking"] = True
    if args.nothink: template_args["enable_thinking"] = False

    all_results: list[dict] = []
    with ProgressBar("Prompts", len(ifbench), transient = False) as progress:
        for idx, bp in enumerate(ifbench):
            prompt = bp["prompt"]
            all_results.append({ "prompt": prompt })
            input_ids = tokenizer.hf_chat_template(
                [{ "role": "user", "content": prompt }],
                add_generation_prompt = True,
                **template_args,
            )
            job = Job(
                input_ids = input_ids,
                max_new_tokens = args.max_tokens,
                stop_conditions = config.eos_token_id_list,
                sampler = sampler,
                identifier = idx,
                # max_rq_tokens = 512,
            )
            generator.enqueue(job)
            progress.update(idx + 1)

    # Generate
    tps_hist = deque()
    sampled_tokens = 0
    last_update = time.time()
    total_jobs = generator.num_remaining_jobs()
    done_jobs = 0
    with ProgressBar("Generating samples", total_jobs, transient = False) as progress:
        while generator.num_remaining_jobs():
            results = generator.iterate()

            # Some feedback
            now = time.time()
            if now > last_update + 1:
                tps = sampled_tokens / (now - last_update)
                sampled_tokens = 0
                tps_hist.append(tps)
                if len(tps_hist) > 3:
                    tps_hist.popleft()
                tps = round(sum(tps_hist) / len(tps_hist))
                num_pend = generator.num_pending_jobs()
                num_act = generator.num_active_jobs()
                print(f" -- pending: {num_pend:4} active {num_act:4} {tps:6} tokens/s", end = "")
                if num_act:
                    sjob = random.choice(generator.active_jobs)
                    snum = "#" + str(sjob.identifier)
                    ssamp = repr(sjob.full_completion)[-64:-1]
                    print(f" sample from {snum:>4}: {ssamp}")
                else:
                    print()
                last_update = now

            # Collect results
            for result in results:
                if "token_ids" in result:
                    sampled_tokens += result["token_ids"].shape[-1]
                if result.get("eos"):
                    idx = result["identifier"]
                    completion = result["full_completion"]
                    if result["eos_reason"] == "max_new_tokens":
                        print(f" !! Job #{idx} exceeded token limit, ends in: {repr(completion)[-100:-1]}")
                    all_results[idx]["response"] = strip_reasoning(completion)
                    done_jobs += 1
                    progress.update(done_jobs)

    # Save results
    write_jsonl(all_results, args.output)
    print(f" -- Responses written to {args.output}")

    # Evaluate
    if args.eval:
        absfile = Path(args.output).resolve()
        print(" !! Sorry, eval is not implemented since IFBench is not available as a Python package. "
              "To evaluate, please clone the IFBench repo and run:")
        print(f"python run_eval.py --input_data=data/IFBench_test.jsonl --input_response_data={absfile} --output_dir=./eval/")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description = "Run IFBench evaluation")
    model_init.add_args(
        parser,
        add_sampling_args = True,
        default_cache_size = 65536,
        default_sampling_args = {
            "temperature": 0.0,
            "repetition_penalty": 1.0,
            "presence_penalty": 0.0,
            "frequency_penalty": 0.0,
            "penalty_range": 1024,
            "min_p": 0.0,
            "top_k": 0,
            "top_p": 0.0,
            "adaptive_target": 1.0,
            "adaptive_decay": 0.9,
        }
    )
    parser.add_argument("-vis", "--visualize_cache", action = "store_true", help = "Show cache visualizer (slow)")
    parser.add_argument("-o", "--output", type = str, help = "Output .jsonl filename", required = True)
    parser.add_argument("-e", "--eval", action = "store_true", help = "Run evaluation script on output file after sampling")
    parser.add_argument("-mt", "--max_tokens", type = int, default = 16384, help = "Max number of tokens for each completion")
    parser.add_argument("-mbs", "--max_batch_size", type = int, default = 64, help = "Max batch size")
    parser.add_argument("-think", "--think", action = "store_true", help = "Set template_arg enable_thinking=True")
    parser.add_argument("-nothink", "--nothink", action = "store_true", help = "Set template_arg enable_thinking=False")
    parser.add_argument("-limit", "--limit", type = int, help = "Limit number of questions (creates incomplete results file)", default = 0)
    args = parser.parse_args()

    # Validate args
    if os.path.exists(args.output):
        print(" !! Warning: Output file exists and will be overwritten.")
    assert not (args.think and args.nothink), "Be nice"

    main(args)
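
# Usage sketch. The -o/--output, -mt/--max_tokens, and --nothink flags are defined
# above; the model flag is an assumption about what model_init.add_args registers
# and may differ between exllamav3 versions, so check `python ifbench.py -h` first:
#
#   python ifbench.py -m /path/to/exl3-model -o results.jsonl --nothink
#
# Scoring is external: clone https://github.com/allenai/IFBench and run the command
# this script prints when invoked with --eval, e.g.:
#
#   python run_eval.py --input_data=data/IFBench_test.jsonl \
#       --input_response_data=/abs/path/to/results.jsonl --output_dir=./eval/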