Speed up quant model loading and inference ...

... based on three observations:
1. torch.Tensor.view on one big tensor is slightly faster than calling torch.Tensor.to on many small tensors.
2. But torch.Tensor.to with a dtype change is significantly slower than torch.Tensor.view.
3. "Baking" the model on the GPU is significantly faster than computing it on the CPU at load time.

This mainly affects inference for Q8_0 and Q4_0/1/K, and loading for all quant types (an illustrative micro-benchmark of the three observations follows below).
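
Below is a hedged micro-benchmark sketching the three observations. The tensor sizes, iteration counts, and the bench() helper are illustrative assumptions, not code or measurements from this commit.

import time
import torch

def bench(fn, warmup=3, iters=100):
    # average wall-clock seconds per call
    for _ in range(warmup):
        fn()
    t0 = time.perf_counter()
    for _ in range(iters):
        fn()
    return (time.perf_counter() - t0) / iters

big = torch.empty(1024 * 1024, dtype=torch.uint8)
smalls = [torch.empty(1024, dtype=torch.uint8) for _ in range(1024)]

# 1. one view() of a big tensor vs. many per-tensor .to() calls (copy forced)
t_view = bench(lambda: big.view(torch.int8))  # reinterprets bytes, no copy
t_to_many = bench(lambda: [s.to(torch.uint8, copy=True) for s in smalls])

# 2. .to() with a dtype change must convert every element, so it is much slower
t_to_dtype = bench(lambda: big.to(torch.float16))

print(f"view: {t_view:.6f}s  many .to(): {t_to_many:.6f}s  .to(dtype): {t_to_dtype:.6f}s")

# 3. dequantize-like "baking" math at load time on GPU vs. CPU
if torch.cuda.is_available():
    q = torch.randint(0, 255, (4096, 4096), dtype=torch.uint8)
    scale = torch.rand(4096, 1)
    t_cpu = bench(lambda: q.to(torch.float16) * scale.to(torch.float16), iters=10)
    qg, sg = q.cuda(), scale.cuda()
    torch.cuda.synchronize()
    def gpu_bake():
        qg.to(torch.float16) * sg.to(torch.float16)
        torch.cuda.synchronize()  # include kernel time in the measurement
    t_gpu = bench(gpu_bake, iters=10)
    print(f"bake on CPU: {t_cpu:.6f}s  on GPU: {t_gpu:.6f}s")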
layerdiffusion
2024-08-30 00:49:05 -07:00
parent 3d62fa9598
commit 4c9380c46a
7 changed files with 126 additions and 181 deletions


@@ -157,9 +157,9 @@ def beautiful_print_gguf_state_dict_statics(state_dict):
     from gguf.constants import GGMLQuantizationType
     type_counts = {}
     for k, v in state_dict.items():
-        gguf_type = getattr(v, 'gguf_type', None)
-        if gguf_type is not None:
-            type_name = GGMLQuantizationType(gguf_type).name
+        gguf_cls = getattr(v, 'gguf_cls', None)
+        if gguf_cls is not None:
+            type_name = gguf_cls.__name__
         if type_name in type_counts:
             type_counts[type_name] += 1
         else:
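
For reference, the new branch relies on each quantized tensor carrying a gguf_cls attribute that points at its quant class, so the type name comes from the class itself rather than a GGMLQuantizationType enum lookup. A minimal sketch of that pattern, with an illustrative stand-in Q8_0 class and tag_quant_tensor helper rather than the repository's real ones:

import torch

class Q8_0:
    pass  # stand-in for a per-quant dequantizer class

def tag_quant_tensor(t, cls):
    # torch.Tensor accepts arbitrary Python attributes, so the quant
    # class can ride along with the weight itself
    t.gguf_cls = cls
    return t

state_dict = {
    'blk.0.attn_q.weight': tag_quant_tensor(torch.empty(8, dtype=torch.uint8), Q8_0),
}
for k, v in state_dict.items():
    cls = getattr(v, 'gguf_cls', None)
    print(k, cls.__name__ if cls is not None else 'unquantized')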