Mirror of https://github.com/lllyasviel/stable-diffusion-webui-forge.git
... based on three observations: 1. torch.Tensor.view on one big tensor is slightly faster than calling torch.Tensor.to on multiple small tensors; 2. torch.Tensor.to with a dtype change is significantly slower than torch.Tensor.view; 3. "baking" the model on the GPU is significantly faster than computing it on the CPU at model-load time. This mainly affects inference with Q8_0 and Q4_0/1/K, and the loading of all quant types.
71 lines
2.1 KiB
Python
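# A minimal timing sketch for the view-vs-to claim (point 2) in the commit
# message above. Illustrative only, not part of the original module; run it
# yourself before trusting the conclusion:
#
#     import time
#     import torch
#
#     x = torch.empty(16 * 1024 * 1024, dtype=torch.uint8)
#
#     t0 = time.perf_counter()
#     for _ in range(10):
#         x.view(torch.int8)       # reinterprets storage, never copies
#     t1 = time.perf_counter()
#     for _ in range(10):
#         x.to(torch.int8)         # dtype change allocates and copies
#     t2 = time.perf_counter()
#
#     print('view:', t1 - t0, 'to:', t2 - t1)  # view should win clearly
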
import gguf
import torch

# Map each GGML quantization type to the quant class that implements
# bake/dequantize for that format.
quants_mapping = {
    gguf.GGMLQuantizationType.Q2_K: gguf.Q2_K,
    gguf.GGMLQuantizationType.Q3_K: gguf.Q3_K,
    gguf.GGMLQuantizationType.Q4_0: gguf.Q4_0,
    gguf.GGMLQuantizationType.Q4_K: gguf.Q4_K,
    gguf.GGMLQuantizationType.Q4_1: gguf.Q4_1,
    gguf.GGMLQuantizationType.Q5_0: gguf.Q5_0,
    gguf.GGMLQuantizationType.Q5_1: gguf.Q5_1,
    gguf.GGMLQuantizationType.Q5_K: gguf.Q5_K,
    gguf.GGMLQuantizationType.Q6_K: gguf.Q6_K,
    gguf.GGMLQuantizationType.Q8_0: gguf.Q8_0,
}

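# Hedged usage sketch: resolving the quant class for tensors read from a
# .gguf file. This assumes the GGUFReader API of the upstream gguf-py
# package; the gguf module imported here may be a vendored variant:
#
#     reader = gguf.GGUFReader('model.gguf')  # path is illustrative
#     for t in reader.tensors:
#         quant_cls = quants_mapping.get(t.tensor_type, None)
#         # quant_cls is None for types without an entry (e.g. F16/F32)
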
class ParameterGGUF(torch.nn.Parameter):
    # A torch.nn.Parameter subclass that carries the raw GGUF-quantized bytes
    # plus the metadata needed to dequantize them later.

    def __init__(self, tensor=None, requires_grad=False, no_init=False):
        super().__init__()

        if no_init:
            return

        self.gguf_cls = quants_mapping.get(tensor.tensor_type, None)
        # GGUF stores dimensions in reverse order relative to PyTorch.
        self.real_shape = torch.Size(reversed(list(tensor.shape)))
        self.computation_dtype = torch.float16
        self.baked = False
        return

    @property
    def shape(self):
        # Report the logical (dequantized) shape, not the shape of the
        # raw quantized byte buffer.
        return self.real_shape

    def __new__(cls, tensor=None, requires_grad=False, no_init=False):
        return super().__new__(cls, torch.tensor(tensor.data), requires_grad=requires_grad)

    def dequantize_as_pytorch_parameter(self):
        if self.gguf_cls is not None:
            # Per the commit message: baking on the GPU at load time is
            # significantly faster than doing this work on the CPU.
            self.gguf_cls.bake(self)
        return torch.nn.Parameter(dequantize_tensor(self), requires_grad=False)

    def copy_with_data(self, data):
        # Rebuild a ParameterGGUF around new raw data while keeping the
        # quantization metadata intact.
        new = ParameterGGUF(data, no_init=True)
        new.gguf_cls = self.gguf_cls
        new.real_shape = self.real_shape
        new.computation_dtype = self.computation_dtype
        new.baked = self.baked
        return new

    def to(self, *args, **kwargs):
        # Route device/dtype moves through copy_with_data so the GGUF
        # metadata survives the move.
        return self.copy_with_data(self.data.to(*args, **kwargs))

    def pin_memory(self, device=None):
        return self.copy_with_data(torch.Tensor.pin_memory(self, device=device))

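# Hedged usage sketch: wrapping a reader tensor and moving it across devices.
# 'reader_tensor' and 'linear' are illustrative names, not part of this file:
#
#     p = ParameterGGUF(reader_tensor)                 # still quantized bytes
#     p = p.to('cuda')                                 # metadata survives .to()
#     linear.weight = p.dequantize_as_pytorch_parameter()  # bake + dequantize
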
def dequantize_tensor(tensor):
    if tensor is None:
        return None

    # Plain tensors (no GGUF metadata) pass through untouched.
    if not hasattr(tensor, 'gguf_cls'):
        return tensor

    gguf_cls = tensor.gguf_cls

    # Tensor types with no entry in quants_mapping are returned as-is.
    if gguf_cls is None:
        return tensor

    return gguf_cls.dequantize_pytorch(tensor)
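
# Hedged sketch of dequantize_tensor's pass-through behavior (names are
# illustrative):
#
#     plain = torch.zeros(4)                 # no gguf_cls attribute at all
#     assert dequantize_tensor(plain) is plain
#
#     q = ParameterGGUF(reader_tensor)       # quantized -> gguf_cls is set
#     full = dequantize_tensor(q)            # gguf_cls.dequantize_pytorch(q)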