stable-diffusion-webui-forge/backend/operations_gguf.py
layerdiffusion 4c9380c46a Speed up quant model loading and inference,
based on three observations:
1. torch.Tensor.view on one big tensor is slightly faster than calling torch.Tensor.to on multiple small tensors.
2. torch.Tensor.to with a dtype change, however, is significantly slower than torch.Tensor.view.
3. “baking” the model on the GPU is significantly faster than computing it on the CPU at model-load time.

This mainly affects inference for Q8_0 and Q4_0/1/K, and loading for all quant types (see the timing sketch below).
2024-08-30 00:49:05 -07:00
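
A minimal timing sketch, not part of the repository, illustrating the first two observations on CPU; buffer sizes are arbitrary and absolute numbers vary by hardware:

import time
import torch

raw = torch.empty(64 * 1024 * 1024, dtype=torch.uint8)  # one big byte buffer
parts = list(raw.chunk(256))                            # the same bytes as many small tensors

def avg_ms(fn, n=50):
    t0 = time.perf_counter()
    for _ in range(n):
        fn()
    return (time.perf_counter() - t0) / n * 1000.0

# (1) reinterpreting one big tensor with .view costs almost nothing ...
print('view(big):      %8.3f ms' % avg_ms(lambda: raw.view(torch.float16)))
# ... while .to on many small tensors pays per-call and per-copy overhead.
print('to(small) x256: %8.3f ms' % avg_ms(lambda: [p.to(torch.float16) for p in parts]))
# (2) .to with a dtype change copies and converts every element.
print('to(big, fp16):  %8.3f ms' % avg_ms(lambda: raw.to(torch.float16)))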


import gguf
import torch


# Map GGML quantization type IDs to the quant classes (provided by Forge's
# patched gguf package) that know how to bake and dequantize each format.
quants_mapping = {
    gguf.GGMLQuantizationType.Q2_K: gguf.Q2_K,
    gguf.GGMLQuantizationType.Q3_K: gguf.Q3_K,
    gguf.GGMLQuantizationType.Q4_0: gguf.Q4_0,
    gguf.GGMLQuantizationType.Q4_K: gguf.Q4_K,
    gguf.GGMLQuantizationType.Q4_1: gguf.Q4_1,
    gguf.GGMLQuantizationType.Q5_0: gguf.Q5_0,
    gguf.GGMLQuantizationType.Q5_1: gguf.Q5_1,
    gguf.GGMLQuantizationType.Q5_K: gguf.Q5_K,
    gguf.GGMLQuantizationType.Q6_K: gguf.Q6_K,
    gguf.GGMLQuantizationType.Q8_0: gguf.Q8_0,
}


class ParameterGGUF(torch.nn.Parameter):
    def __init__(self, tensor=None, requires_grad=False, no_init=False):
        super().__init__()

        if no_init:
            # Used by copy_with_data: the caller fills in the metadata.
            return

        self.gguf_cls = quants_mapping.get(tensor.tensor_type, None)
        # GGUF stores dimensions in reverse order relative to PyTorch.
        self.real_shape = torch.Size(reversed(list(tensor.shape)))
        self.computation_dtype = torch.float16
        self.baked = False
        return

    @property
    def shape(self):
        # Report the logical (dequantized) shape, not the raw byte layout.
        return self.real_shape

    def __new__(cls, tensor=None, requires_grad=False, no_init=False):
        return super().__new__(cls, torch.tensor(tensor.data), requires_grad=requires_grad)

    def dequantize_as_pytorch_parameter(self):
        if self.gguf_cls is not None:
            # Bake once (per the commit note, much faster on GPU) so that
            # dequantization afterwards is cheap.
            self.gguf_cls.bake(self)
        return torch.nn.Parameter(dequantize_tensor(self), requires_grad=False)

    def copy_with_data(self, data):
        # Build a new parameter around `data`, carrying the quant metadata over.
        new = ParameterGGUF(data, no_init=True)
        new.gguf_cls = self.gguf_cls
        new.real_shape = self.real_shape
        new.computation_dtype = self.computation_dtype
        new.baked = self.baked
        return new

    def to(self, *args, **kwargs):
        # Move/cast only the raw data; the quant metadata stays intact.
        return self.copy_with_data(self.data.to(*args, **kwargs))

    def pin_memory(self, device=None):
        return self.copy_with_data(torch.Tensor.pin_memory(self, device=device))


def dequantize_tensor(tensor):
    if tensor is None:
        return None

    if not hasattr(tensor, 'gguf_cls'):
        # A plain tensor/parameter: nothing to dequantize.
        return tensor

    gguf_cls = tensor.gguf_cls

    if gguf_cls is None:
        # Unknown or unsupported quant type; the data was stored as-is.
        return tensor

    return gguf_cls.dequantize_pytorch(tensor)
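
A hypothetical usage sketch, assuming Forge's patched gguf package (which adds the Q*_0/Q*_1/Q*_K classes with bake and dequantize_pytorch); the checkpoint path and tensor index are placeholders:

import gguf

reader = gguf.GGUFReader('/path/to/model.gguf')  # placeholder path
raw = reader.tensors[0]                          # a gguf ReaderTensor

p = ParameterGGUF(raw)                   # wraps the raw quantized bytes + metadata
p = p.to('cuda')                         # moves the bytes only; metadata survives
w = p.dequantize_as_pytorch_parameter()  # bakes on GPU, then dequantizes to floats
print(w.shape, w.dtype)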