Add Q6 and Q8 cache options to eval scripts

2026-04-20 14:29:28 +00:00 · 2024-06-09 02:13:06 +02:00
parent f3596fc0d9
commit 675450d845
3 changed files with 18 additions and 2 deletions
--- a/doc/eval.md
+++ b/doc/eval.md
@@ -58,6 +58,10 @@ prefix for the response.
 performance.

 - **-cq4 / --cache_q4**: Use Q4 cache
+
+- **-cq6 / --cache_q6**: Use Q6 cache
+
+- **-cq8 / --cache_q8**: Use Q8 cache

 ## MMLU

@@ -83,3 +87,7 @@ the full list of subjects.
 performance.

 - **-cq4 / --cache_q4**: Use Q4 cache
+
+- **-cq6 / --cache_q6**: Use Q6 cache
+
+- **-cq8 / --cache_q8**: Use Q8 cache
--- a/eval/humaneval.py
+++ b/eval/humaneval.py
@@ -3,7 +3,7 @@ import sys, os
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from human_eval.data import write_jsonl, read_problems
 from exllamav2 import model_init
-from exllamav2 import ExLlamaV2Cache, ExLlamaV2Cache_Q4
+from exllamav2 import ExLlamaV2Cache, ExLlamaV2Cache_Q4, ExLlamaV2Cache_Q6, ExLlamaV2Cache_Q8
 from exllamav2.generator import ExLlamaV2DynamicGenerator, ExLlamaV2DynamicJob, ExLlamaV2Sampler
 import argparse, contextlib
 import util
@@ -15,6 +15,8 @@ parser.add_argument("-o", "--output", type = str, help = "Output .jsonl filename
 parser.add_argument("-cs", "--cache_size", type = int, default = None)
 parser.add_argument("-spt", "--samples_per_task", type = int, default = 200)
 parser.add_argument("-cq4", "--cache_q4", action = "store_true", help = "Use Q4 cache")
+parser.add_argument("-cq6", "--cache_q6", action = "store_true", help = "Use Q6 cache")
+parser.add_argument("-cq8", "--cache_q8", action = "store_true", help = "Use Q8 cache")
 parser.add_argument("--max_tokens", type = int, default = 768, help = "Max number of tokens for each completion")
 parser.add_argument("-pf", "--prompt_format", type = str, help = "Instruct format to apply. Default is raw completion (for base models) ")
 parser.add_argument("-v", "--verbose", action = "store_true", help = "Spam completions to console while generating")
@@ -75,6 +77,8 @@ model, tokenizer = model_init.init(
 )

 if args.cache_q4: cache_type = ExLlamaV2Cache_Q4
+elif args.cache_q6: cache_type = ExLlamaV2Cache_Q6
+elif args.cache_q8: cache_type = ExLlamaV2Cache_Q8
 else: cache_type = ExLlamaV2Cache
 cache = cache_type(
    model,
--- a/eval/mmlu.py
+++ b/eval/mmlu.py
@@ -2,7 +2,7 @@ from __future__ import annotations
 import sys, os
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from exllamav2 import model_init
-from exllamav2 import ExLlamaV2Cache, ExLlamaV2Cache_Q4
+from exllamav2 import ExLlamaV2Cache, ExLlamaV2Cache_Q4, ExLlamaV2Cache_Q6, ExLlamaV2Cache_Q8
 from exllamav2.generator import ExLlamaV2DynamicGenerator, ExLlamaV2DynamicJob, ExLlamaV2Sampler
 import argparse, contextlib
 import torch
@@ -14,6 +14,8 @@ import random
 parser = argparse.ArgumentParser(description = "Run MMLU evaluation on EXL2 model")
 parser.add_argument("-cs", "--cache_size", type = int, default = None)
 parser.add_argument("-cq4", "--cache_q4", action = "store_true", help = "Use Q4 cache")
+parser.add_argument("-cq6", "--cache_q6", action = "store_true", help = "Use Q6 cache")
+parser.add_argument("-cq8", "--cache_q8", action = "store_true", help = "Use Q8 cache")
 parser.add_argument("-sub", "--subjects", type = str, default = "all", help = "Comma-separated list of categories to test, or 'all'")
 parser.add_argument("-fs", "--fewshot_examples", type = int, default = 5, help = "Number of examples for fewshot examples, max 5")
 parser.add_argument("-shf", "--shuffle", action = "store_true", help = "Shuffle choices randomly")
@@ -33,6 +35,8 @@ model, tokenizer = model_init.init(
 )

 if args.cache_q4: cache_type = ExLlamaV2Cache_Q4
+elif args.cache_q6: cache_type = ExLlamaV2Cache_Q6
+elif args.cache_q8: cache_type = ExLlamaV2Cache_Q8
 else: cache_type = ExLlamaV2Cache
 cache = cache_type(
    model,