Mirror of https://github.com/turboderp-org/exllamav3.git, synced 2026-04-20 14:29:51 +00:00
compare_q.py: Fix llama.cpp bpw measurement for MoE models
@@ -23,26 +23,19 @@ def get_storage_info(model_dir):
    head_numel = 0
    for tensor_info in tensors:
        name = tensor_info.name
        if any(name.endswith(k) for k in [
            ".ffn_down.weight",
            ".ffn_gate.weight",
            ".ffn_up.weight",
            ".attn_q.weight",
            ".attn_k.weight",
            ".attn_v.weight",
            ".attn_output.weight",
        ]):
            sum_bits += tensor_info.n_bytes * 8
            sum_numel += tensor_info.n_elements
        if (name == "token_embd.weight" and head_bpw == 0) or \
                name == "output.weight":
            head_bpw = tensor_info.n_bytes * 8 / tensor_info.n_elements
            head_numel = tensor_info.n_elements
        elif name.endswith(".weight"):
            sum_bits += tensor_info.n_bytes * 8
            sum_numel += tensor_info.n_elements
    vram_bits = head_numel * head_bpw + sum_bits
    return sum_bits / sum_numel, head_bpw, vram_bits


def load_llamacpp(model_dir: str):
    init_backend()
    bpw_layer, bpw_head, vram_bits = get_storage_info(model_dir)
    model = Llama(
        model_path = model_dir,
        logits_all = True,
@@ -50,7 +43,6 @@ def load_llamacpp(model_dir: str):
        n_ctx = 2048,
        n_gpu_layers = 999
    )
    bpw_layer, bpw_head, vram_bits = get_storage_info(model_dir)
    return model, bpw_layer, bpw_head, vram_bits


def fwd_llamacpp(model_instance, input_ids: torch.Tensor):
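
For reference, the bits-per-weight figures in get_storage_info come directly from tensor storage sizes in the GGUF file: each tensor contributes n_bytes * 8 bits spread over n_elements logical weights. Below is a minimal standalone sketch of that measurement, assuming the gguf Python package (whose GGUFReader exposes name, n_bytes and n_elements per tensor); "model.gguf" is a placeholder path, not a file from the repo:

    from gguf import GGUFReader

    # Average bits per weight across all weight tensors in a GGUF file.
    reader = GGUFReader("model.gguf")
    sum_bits = 0
    sum_numel = 0
    for t in reader.tensors:
        if t.name.endswith(".weight"):
            sum_bits += int(t.n_bytes) * 8    # bytes on disk -> bits
            sum_numel += int(t.n_elements)    # logical weight count
    print(f"average bpw: {sum_bits / sum_numel:.3f}")

Because quantized GGUF tensors store blocks of packed weights plus scales, this storage-based ratio is the honest bpw figure, typically a non-integer like 4.56 rather than the nominal quant width.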
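As for why MoE models were mismeasured: in llama.cpp's GGUF naming, the experts of a MoE model are stored as fused tensors named like blk.N.ffn_down_exps.weight, so a suffix check against ".ffn_down.weight" never matches them, even though they hold most of the model's parameters. A small illustration of that mismatch (the tensor names follow llama.cpp conventions; the exact fix in compare_q.py may differ from this sketch):

    # Suffix test: dense FFN tensor vs. fused MoE expert tensor.
    dense = "blk.0.ffn_down.weight"         # dense model: matches
    moe   = "blk.0.ffn_down_exps.weight"    # MoE model: does not match
    print(dense.endswith(".ffn_down.weight"))   # True
    print(moe.endswith(".ffn_down.weight"))     # False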