mirror of
https://github.com/lllyasviel/stable-diffusion-webui-forge.git
synced 2026-04-29 18:51:31 +00:00
Support LoRAs for Q8/Q5/Q4 GGUF Models
what a crazy night of math
This commit is contained in:
@@ -2,34 +2,27 @@ import gguf
|
||||
import torch
|
||||
|
||||
|
||||
# Maps a GGML quantization type tag to the gguf helper class that knows how
# to dequantize tensors stored in that format. Only Q4_0 / Q5_0 / Q8_0 are
# supported here; dequantize_tensor raises NotImplementedError for others.
quants_mapping = {
    gguf.GGMLQuantizationType.Q4_0: gguf.Q4_0,
    gguf.GGMLQuantizationType.Q5_0: gguf.Q5_0,
    gguf.GGMLQuantizationType.Q8_0: gguf.Q8_0,
}
# Unfinished sketch of a re-quantization helper, kept for reference:
# def functional_quantize_gguf(weight):
#     gguf_cls = weight.gguf_cls
#     gguf_cls.en
def functional_linear_gguf(x, weight, bias=None):
    """Apply a linear layer whose weight/bias may be GGUF-quantized.

    The weight and bias are dequantized (if needed) to the dtype of the
    input activation `x` before calling ``torch.nn.functional.linear``,
    so mixed-precision inputs work without an extra cast at the call site.

    Args:
        x: input activation tensor; its dtype decides the compute dtype.
        weight: plain tensor or GGUF-quantized tensor (carrying
            ``gguf_type`` / ``gguf_cls`` / ``gguf_real_shape`` attributes —
            see dequantize_tensor).
        bias: optional tensor, same handling as ``weight``; may be None.

    Returns:
        The result of ``x @ weight.T + bias`` in ``x``'s dtype.
    """
    target_dtype = x.dtype
    # Dequantize directly into the target dtype in a single step, rather
    # than dequantizing to a default dtype and casting afterwards.
    weight = dequantize_tensor(weight, target_dtype)
    bias = dequantize_tensor(bias, target_dtype)
    return torch.nn.functional.linear(x, weight, bias)
||||
def dequantize_tensor(tensor, target_dtype=torch.float16):
    """Convert a (possibly GGUF-quantized) tensor to a plain torch tensor.

    Args:
        tensor: object exposing ``data`` plus GGUF metadata attributes
            ``gguf_type``, ``gguf_cls`` and ``gguf_real_shape``
            (the logical shape before quantization packing), or None.
        target_dtype: dtype of the returned tensor (default float16).

    Returns:
        A dequantized ``torch.Tensor`` in ``target_dtype``, the raw data
        unchanged when no quantization class is attached, or None when
        ``tensor`` is None.

    Raises:
        NotImplementedError: for quant types missing from quants_mapping.
    """
    # Bias-less layers pass bias=None straight through.
    if tensor is None:
        return None

    data = torch.tensor(tensor.data)
    gguf_type = tensor.gguf_type
    gguf_cls = tensor.gguf_cls
    gguf_real_shape = tensor.gguf_real_shape

    # Unquantized storage formats only need a dtype cast.
    if gguf_type in [gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16, gguf.GGMLQuantizationType.BF16]:
        return data.to(target_dtype)

    # No quantization class attached: return the raw payload as-is.
    if gguf_cls is None:
        return data

    if gguf_type not in quants_mapping:
        raise NotImplementedError(f'Quant type {gguf_type} not implemented!')

    quant_cls = quants_mapping.get(gguf_type)

    # The quant class unpacks the packed blocks back to gguf_real_shape;
    # cast once at the end so the caller gets the requested dtype.
    return quant_cls.dequantize_pytorch(data, gguf_real_shape).to(target_dtype)
Reference in New Issue
Block a user