Mirror of https://github.com/turboderp-org/exllamav2.git, synced 2026-04-21 23:09:09 +00:00
Optimization, wider loads in EXL2 kernel (int4)
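The hunks below touch only the Python-side comparison/benchmark script; the int4 kernel change named in the commit title lives in the CUDA extension and is not shown in the excerpt here. For orientation, this is a condensed sketch of the comparison pass the script performs, assembled from the names visible in the diff (ExLlamaV2Config, modules_dict, lazy loading, force_cuda); the import paths, model paths, input shape and the helper function are assumptions, not code from the commit.

# Condensed sketch (not the script itself): compare one linear layer between a
# full-precision model and an EXL2-quantized model. Paths, the input shape and
# the helper below are placeholders; modules_dict keys and the force_cuda flag
# follow the diff.

import torch
import torch.nn.functional as F
from exllamav2 import ExLlamaV2, ExLlamaV2Config

torch.set_grad_enabled(False)

def load_linear(model_dir, key):
    config = ExLlamaV2Config()
    config.model_dir = model_dir
    config.prepare()
    model = ExLlamaV2(config)
    model.load(lazy = True)                 # defer loading, then load just one module
    linear = model.modules_dict[key]
    linear.load()
    return linear

key = "model.layers.0.mlp.gate_proj"
linear_full = load_linear("/path/to/llama2-7b", key)                # placeholder path
linear_quant = load_linear("/path/to/llama2-7b-5.0bpw-exl2", key)   # placeholder path

test_state = torch.randn((1, 8, linear_full.in_features), dtype = torch.half).cuda()  # shape assumed
test_state_full = linear_full.forward(test_state)
test_state_quant = linear_quant.forward(test_state, force_cuda = True)

print("quantization MSE:", F.mse_loss(test_state_quant, test_state_full).item())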
@@ -17,7 +17,8 @@ with torch.no_grad():
     # Full-precision model

     config_full = ExLlamaV2Config()
-    config_full.model_dir = "/mnt/str/models/llama-7b"
+    # config_full.model_dir = "/mnt/str/models/llama-7b"
+    config_full.model_dir = "/mnt/str/models/_exl2/llama2-7b"
     config_full.prepare()
     model_full = ExLlamaV2(config_full)
     model_full.load(lazy = True)
@@ -28,7 +29,8 @@ with torch.no_grad():

     config_quant = ExLlamaV2Config()
     # config_quant.model_dir = "/mnt/str/models/_exl2/llama-7b-4.0bpw-h6-exl2/"
-    config_quant.model_dir = "/mnt/str/models/llama-7b-4bit-128g/"
+    config_quant.model_dir = "/mnt/str/models/_exl2/llama2-7b-5.0bpw-h6-exl2/"
+    # config_quant.model_dir = "/mnt/str/models/llama-7b-4bit-128g/"
     # config_quant.model_dir = "/mnt/str/models/_test_models/TheBloke_WizardLM-30B-Uncensored-GPTQ/"
     config_quant.prepare()
     model_quant = ExLlamaV2(config_quant)
@@ -43,11 +45,13 @@ with torch.no_grad():

     # Forward through full and quant layers

-    linear_full = model_full.modules_dict["model.layers.0.self_attn.v_proj"]
+    linear_full = model_full.modules_dict["model.layers.0.mlp.gate_proj"]
+    # linear_full = model_full.modules_dict["model.layers.0.self_attn.q_proj"]
     linear_full.load()
     test_state_full = linear_full.forward(test_state)

-    linear_quant = model_quant.modules_dict["model.layers.0.self_attn.v_proj"]
+    linear_quant = model_quant.modules_dict["model.layers.0.mlp.gate_proj"]
+    # linear_quant = model_quant.modules_dict["model.layers.0.self_attn.q_proj"]
     linear_quant.load()
     test_state_quant = linear_quant.forward(test_state, force_cuda = True)

@@ -77,7 +81,7 @@ with torch.no_grad():
     # Allocate some input states and initialize some more linear layers. Using multiple layers here for a more
     # realistic benchmark, since individual quantized layers can be small enough to fit entirely in the GPU's L2 cache

-    for size_m in [1, 2, 3, 4, 8, 16, 32, 64, 96, 128, 256]:
+    for size_m in [1]: #, 2, 3, 4, 8, 16, 32, 64]:

         itr = 5000
         a_num = 113
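The comment in the hunk above spells out the reasoning behind the benchmark loop: a single quantized weight matrix can be small enough to stay resident in the GPU's L2 cache, so timing one layer over and over overstates the kernel's real throughput. A minimal sketch of that idea, cycling through several loaded layers so each call has to fetch its weights from VRAM again; the timing loop and layer list are assumptions, only the force_cuda path and the in_features attribute come from the script.

import time
import torch

def benchmark(layers, size_m, itr = 1000):
    # layers: a list of loaded ExLlamaV2Linear modules; rotating through them keeps
    # any single weight matrix from staying resident in L2 between iterations.
    inputs = [torch.randn((size_m, l.in_features), dtype = torch.half).cuda() for l in layers]
    torch.cuda.synchronize()
    t0 = time.time()
    for i in range(itr):
        j = i % len(layers)
        layers[j].forward(inputs[j], force_cuda = True)
    torch.cuda.synchronize()
    return (time.time() - t0) / itr          # seconds per forward call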
@@ -139,12 +143,14 @@ with torch.no_grad():

     for k in model_quant.modules_dict.keys():

-        if not "layers.4." in k: continue
+        if not prefix in k and not "head" in k: continue

         module_quant = model_quant.modules_dict[k]
         module_quant.load()
         if isinstance(module_quant, ExLlamaV2Linear):

+            gi = module_quant.dump_group_info()
+
             mat = torch.eye(module_quant.in_features, dtype = torch.half).cuda()
             test1 = module_quant.forward(mat, force_cuda = True)
             test2 = module_quant.forward(mat, force_recons = True)
@@ -155,6 +161,6 @@ with torch.no_grad():
             test2 = module_quant.forward(mat, force_recons = True)
             diff_r = F.mse_loss(test1, test2)

-            print (f"{k:40} ident: {diff_i.item():.6f} u: {diff_r.item():.6f}")
+            print (f"{k:40} {gi:30} ident: {diff_i.item():.6f} u: {diff_r.item():.6f}")

     xx = 0
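The last two hunks pass an identity matrix through each quantized layer along both code paths, force_cuda (the custom quantized-matmul kernel) and force_recons (which, going by the name, reconstructs the weights and uses a plain matmul), and print the MSE between the two results next to the group info returned by dump_group_info. Since forwarding torch.eye(in_features) through a linear layer (ignoring any bias) just returns its effective weight matrix, the MSE is a direct measure of how far the wider-load kernel's output drifts from the reconstruction path. A small sketch of the same check, assuming a loaded ExLlamaV2Linear module as in the script:

import torch
import torch.nn.functional as F

def kernel_vs_recons(module):
    # module: a loaded ExLlamaV2Linear. With x = I, the output of x @ W^T is just W^T,
    # so the two paths are compared on the layer's full effective weight matrix.
    mat = torch.eye(module.in_features, dtype = torch.half).cuda()
    test1 = module.forward(mat, force_cuda = True)      # custom quantized kernel
    test2 = module.forward(mat, force_recons = True)    # reconstructed-weight matmul
    return F.mse_loss(test1, test2).item()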