mirror of
https://github.com/turboderp-org/exllamav3.git
synced 2026-04-20 14:29:51 +00:00
compare_q.py: Add more GPTQ layer types
This commit is contained in:
@@ -1,5 +1,7 @@
|
||||
import torch
|
||||
from gptqmodel.nn_modules.qlinear.marlin import MarlinQuantLinear
|
||||
from gptqmodel.nn_modules.qlinear.tritonv2 import TritonV2QuantLinear
|
||||
from gptqmodel.nn_modules.qlinear.exllamav2 import ExllamaV2QuantLinear
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
from aqlm import QuantizedLinear
|
||||
from awq.modules.linear import WQLinear_GEMM
|
||||
@@ -85,6 +87,17 @@ def get_storage_info(model):
|
||||
"scales": module.scales,
|
||||
})
|
||||
sum_numel += module.in_features * module.out_features
|
||||
elif any(isinstance(module, x) for x in [TritonV2QuantLinear]):
|
||||
sum_bits += get_tensors_size({
|
||||
"g_idx": module.g_idx,
|
||||
"qweight": module.qweight,
|
||||
"qzeros": module.qzeros,
|
||||
"scales": module.scales,
|
||||
})
|
||||
sum_numel += module.in_features * module.out_features
|
||||
elif any(isinstance(module, x) for x in [ExllamaV2QuantLinear]):
|
||||
sum_bits += get_tensors_size(module.q_tensors)
|
||||
sum_numel += module.in_features * module.out_features
|
||||
vram_bits = head_numel * head_bpw + sum_bits
|
||||
return sum_bits / sum_numel, head_bpw, vram_bits
|
||||
|
||||
|
||||
8
eval/spec/llama3.1-8b-instruct_autoround.json
Normal file
8
eval/spec/llama3.1-8b-instruct_autoround.json
Normal file
@@ -0,0 +1,8 @@
|
||||
[
|
||||
{
|
||||
"model_dir": "/mnt/str/models/llama3.1-8b-instruct/autoround/4bit-asym",
|
||||
"load_fn": "transformers",
|
||||
"fwd_fn": "transformers",
|
||||
"label": "AutoRound 4bit asym"
|
||||
}
|
||||
]
|
||||
Reference in New Issue
Block a user