mirror of https://github.com/turboderp-org/exllamav2.git
Add 4-bit GPTQ support
tests/test_gemv.py (new file, +156 lines)
@@ -0,0 +1,156 @@
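# Compares a quantized ExLlamaV2Linear layer against its full-precision
# counterpart (max abs error and MSE), benchmarks the quantized matmul with
# and without weight reconstruction across batch sizes, and checks that the
# two code paths agree when multiplying by an identity matrix.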
import sys, os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import time
import torch
import torch.nn.functional as F

from exllamav2.model import ExLlamaV2, ExLlamaV2Config, ExLlamaV2Linear
from exllamav2.tokenizer import ExLlamaV2Tokenizer
with torch.no_grad():

    # Full-precision model

    config_full = ExLlamaV2Config()
    config_full.model_dir = "/mnt/str/models/llama-7b"
    config_full.prepare()
    model_full = ExLlamaV2(config_full)
    model_full.load(lazy = True)

    tokenizer = ExLlamaV2Tokenizer(config_full)
    # Quantized model (exactly one model_dir must be set before prepare())

    config_quant = ExLlamaV2Config()
    config_quant.model_dir = "/mnt/str/models/llama-7b-4bit-128g/"
    # config_quant.model_dir = "/mnt/str/models/_exl2/llama-7b-4.0bpw-h6-exl2/"
    # config_quant.model_dir = "/mnt/str/models/_test_models/TheBloke_WizardLM-30B-Uncensored-GPTQ/"
    config_quant.prepare()
    model_quant = ExLlamaV2(config_quant)
    model_quant.load(lazy = True)
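    # (lazy = True appears to defer per-module weight loading; each module used
    # below is loaded explicitly with .load() before its first forward pass.)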
    # Create input state for layer 0 (should also be normalized, really)

    embed = model_full.modules_dict["model.embed_tokens"]
    embed.load()
    test_ids = tokenizer.encode("Hello there!")
    test_state = embed.forward(test_ids).cuda()
    # Forward through full and quant layers

    linear_full = model_full.modules_dict["model.layers.0.self_attn.v_proj"]
    linear_full.load()
    test_state_full = linear_full.forward(test_state)

    linear_quant = model_quant.modules_dict["model.layers.0.self_attn.v_proj"]
    linear_quant.load()
    test_state_quant = linear_quant.forward(test_state, force_cuda = True)
    test_state_recons = linear_quant.forward(test_state, force_recons = True)
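    # (Presumably force_cuda selects the custom quantized matmul kernel, while
    # force_recons dequantizes the weights to fp16 and uses a regular torch
    # matmul; cf. the "quant"/"dequant" printouts below.)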
    # Measure differences in output state

    print()

    diff_max = torch.max((test_state_quant - test_state_full).abs())
    diff_mse = F.mse_loss(test_state_quant, test_state_full)
    print(f"Quant vs. original (max, mse): {diff_max.item():.8f}, {diff_mse.item():.8f}")

    diff_max = torch.max((test_state_recons - test_state_full).abs())
    diff_mse = F.mse_loss(test_state_recons, test_state_full)
    print(f"Recons vs. original (max, mse): {diff_max.item():.8f}, {diff_mse.item():.8f}")

    diff_max = torch.max((test_state_quant - test_state_recons).abs())
    diff_mse = F.mse_loss(test_state_quant, test_state_recons)
    print(f"Recons vs. quant (max, mse): {diff_max.item():.8f}, {diff_mse.item():.8f}")
    # Print the raw output states for visual inspection

    print()
    print("full:    ", test_state_full)
    print("quant:   ", test_state_quant)
    print("dequant: ", test_state_recons)
    # Allocate some input states and initialize some more linear layers. Using multiple layers here makes for a
    # more realistic benchmark, since an individual quantized layer can be small enough to fit entirely in the
    # GPU's L2 cache

    for size_m in [1, 2, 3, 4, 8, 16, 32, 64, 96, 128, 256]:

        itr = 5000
        a_num = 113    # distinct input tensors to cycle through
        b_num = 27     # distinct q_proj layers to cycle through
        print()
        print(f"size_m: {size_m}, iter: {itr}")

        a = [torch.randn((size_m, linear_full.in_features), dtype = torch.half, device = "cuda:0") for i in range(a_num)]

        linear_full_arr = []
        linear_quant_arr = []
        for i in range(b_num):
            linear_full = model_full.modules_dict[f"model.layers.{i}.self_attn.q_proj"]
            linear_full.load()
            linear_full_arr.append(linear_full)
            linear_quant = model_quant.modules_dict[f"model.layers.{i}.self_attn.q_proj"]
            linear_quant.load()
            linear_quant_arr.append(linear_quant)
        # Time some forward passes through original and quantized layers (with and without reconstruction).
        # Synchronize before starting each timer so pending GPU work isn't billed to the wrong loop.

        torch.cuda.synchronize()
        begin = time.time()
        for i in range(itr):
            c = linear_full_arr[i % b_num].forward(a[i % a_num])
        torch.cuda.synchronize()
        end = time.time()
        print(f"Torch time: {(end - begin) / itr * 1000:.4f} ms")

        torch.cuda.synchronize()
        begin = time.time()
        for i in range(itr):
            c = linear_quant_arr[i % b_num].forward(a[i % a_num], force_cuda = True)
        torch.cuda.synchronize()
        end = time.time()
        print(f"Quant time: {(end - begin) / itr * 1000:.4f} ms")

        torch.cuda.synchronize()
        begin = time.time()
        for i in range(itr):
            c = linear_quant_arr[i % b_num].forward(a[i % a_num], force_recons = True)
        torch.cuda.synchronize()
        end = time.time()
        print(f"Recons time: {(end - begin) / itr * 1000:.4f} ms")
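        # The three timing loops above could share one helper; a minimal sketch
        # under the same assumptions (itr, a_num, b_num and the forward() kwargs
        # used above):
        #
        #   def bench(layers, inputs, **kwargs):
        #       torch.cuda.synchronize()
        #       begin = time.time()
        #       for i in range(itr):
        #           layers[i % b_num].forward(inputs[i % a_num], **kwargs)
        #       torch.cuda.synchronize()
        #       return (time.time() - begin) / itr * 1000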
    # Load all matrices in a full layer of the quant model

    target_layer = 4
    prefix = f"layers.{target_layer}."

    for k in model_quant.modules_dict.keys():
        if prefix not in k: continue
        module_quant = model_quant.modules_dict[k]
        module_quant.load()
    # Test that multiplying by an identity matrix gives the same result with and without reconstruction

    print()

    for k in model_quant.modules_dict.keys():

        if prefix not in k: continue

        module_quant = model_quant.modules_dict[k]
        module_quant.load()
        if isinstance(module_quant, ExLlamaV2Linear):

            ident = torch.eye(module_quant.in_features, dtype = torch.half).cuda()

            test1 = module_quant.forward(ident, force_cuda = True)
            test2 = module_quant.forward(ident, force_recons = True)

            diff = torch.max((test1 - test2).abs())
            print(f"{k:40} {diff.item():.4f}")
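    # (Multiplying by the identity recovers the layer's effective weight matrix,
    # so a nonzero diff above pinpoints modules where the kernel's on-the-fly
    # dequantization disagrees with explicit reconstruction.)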