34B testing

2026-03-15 00:07:26 +00:00 · 2023-09-10 06:15:33 +02:00
parent 6046dcf39a
commit 918368b295
3 changed files with 41 additions and 7 deletions
--- a/exllamav2/exllamav2_ext/cuda/q_attn.cu
+++ b/exllamav2/exllamav2_ext/cuda/q_attn.cu
@@ -9,7 +9,7 @@ const int THREADS_X = 32;
 const int THREADS_Y = 1;
 const int THREADS_Z = 4;
 const int BLOCKSIZE_X = 2; // 2*half == 1*uint32_t
-const int BLOCKSIZE_Z = 4; // num_heads must be divisible by BLOCKSIZE_Z  TODO: Check that this is the case when Llama2-34b releases
+const int BLOCKSIZE_Z = 4; // num_heads must be divisible by BLOCKSIZE_Z

 __global__ void update_cache_kernel
 (
--- a/exllamav2/generator/streaming.py
+++ b/exllamav2/generator/streaming.py
@@ -85,7 +85,7 @@ class ExLlamaV2StreamingGenerator(ExLlamaV2BaseGenerator):

            position = self.held_text.find(ss)
            if position != -1:
-                return self.held_text[:position], True, self.no_tokens  # TODO: Decide if we want to tokenize a partial string here
+                return self.held_text[:position], True, self.no_tokens

            # Check for overlap between end of held_text and start of stop string

--- a/tests/test_alloc.py
+++ b/tests/test_alloc.py
@@ -1,4 +1,4 @@
-import sys, os
+import sys, os, math
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

 from exllamav2 import(
@@ -21,9 +21,10 @@ import torch

 # model_directory =  "/mnt/str/models/_exl2/openllama-3b-3.0bpw-h6-exl2/"
 # model_directory =  "/mnt/str/models/_exl2/llama-7b-3.0bpw-h6-exl2/"
-model_directory =  "/mnt/str/models/_exl2/llama2-70b-chat-4.0bpw-h6-exl2/"
+# model_directory =  "/mnt/str/models/_exl2/llama2-70b-chat-2.5bpw-h6-exl2/"
+model_directory = "/mnt/str/models/_exl2/codellama-34b-instruct-4.0bpw-h6-exl2/"

-allocation = [16, 24]
+allocation = [18, 24]


 # Prime CUDA and initialize mem measurement
@@ -39,6 +40,7 @@ torch.cuda.empty_cache()

 mem_base = {}
 for dev in torch_devices:
+    torch.cuda.reset_peak_memory_stats(dev)
    mem_base[dev] = torch.cuda.max_memory_allocated(dev)


@@ -47,6 +49,7 @@ for dev in torch_devices:
 config = ExLlamaV2Config()
 config.model_dir = model_directory
 config.prepare()
+config.max_seq_len = 8192

 model = ExLlamaV2(config)
 print("Loading model: " + model_directory)
@@ -91,9 +94,7 @@ max_new_tokens = 150

 generator.warmup()
 time_begin = time.time()
-
 output = generator.generate_simple(prompt, settings, max_new_tokens, seed = 1234)
-
 time_end = time.time()
 time_total = time_end - time_begin

@@ -102,6 +103,18 @@ print()
 print(f"Response generated in {time_total:.2f} seconds, {max_new_tokens} tokens, {max_new_tokens / time_total:.2f} tokens/second")
 print()

+print(f"Prompt processing, {model.config.max_seq_len - 1} tokens...")
+
+cache.current_seq_len = 0
+time_begin = time.time()
+input_ids = torch.randint(0, model.config.vocab_size - 1, (1, model.config.max_seq_len - 1))
+model.forward(input_ids, cache, preprocess_only = True)
+torch.cuda.synchronize()
+time_end = time.time()
+time_total = time_end - time_begin
+
+print(f"Prompt processed in {time_total:.2f} seconds, {(model.config.max_seq_len - 1) / time_total:.2f} tokens/second")
+print()

 # Report

@@ -109,21 +122,42 @@ res1 = f" ** VRAM reported by Torch     : "
 res2 = f" ** VRAM expected              : "
 res3 = f" ** VRAM expected (with cache) : "
 res4 = f" ** VRAM allocated (max)       : "
+res5 = f" ** Cache size                 : "
 first = True

+mem_total = 0
+mem_exp = 0
 for idx, device in enumerate(torch_devices):
    mem_this = torch.cuda.max_memory_allocated(device) - mem_base[device]
+    mem_total += mem_this
+    mem_exp += expected_with_cache[idx] * 1024 ** 3
    if not first: res1 += " - "
    if not first: res2 += " - "
    if not first: res3 += " - "
    if not first: res4 += " - "
+    if not first: res5 += " - "
    first = False
    res1 += f"[{device}] {mem_this / (1024 ** 2):,.2f} MB"
    res2 += f"[{device}] {expected[idx] * 1024:,.2f} MB"
    res3 += f"[{device}] {expected_with_cache[idx] * 1024:,.2f} MB"
    res4 += f"[{device}] {allocation[idx] * 1024:,.2f} MB"
+    res5 += f"[{device}] {cache_fp[idx] / (1024 ** 2) if idx < len(cache_fp) else 0:,.2f} MB"

 print(res4)
 print(res2)
+print(res5)
 print(res3)
 print(res1)
+
+print()
+print(f"Max sequence length:  {config.max_seq_len}")
+print(f"Hidden size:          {config.hidden_size}")
+print(f"Attention heads:      {config.num_attention_heads}")
+print(f"Key/value heads:      {config.num_key_value_heads}")
+print(f"Max attention size:   {math.sqrt(config.max_attention_size)} ** 2")
+print(f"Max input len:        {config.max_input_len}")
+# print(f"Correction amount:    {mem_total - mem_exp:,.2f} B")
+
+
+
+