mirror of
https://github.com/lllyasviel/stable-diffusion-webui-forge.git
synced 2026-05-01 03:31:30 +00:00
reimplement q8/q5/q4, review, and match official gguf
This commit is contained in:
@@ -2,6 +2,13 @@ import gguf
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
# Lookup table from a GGML quantization type to the gguf quant class that is
# used to dequantize tensors of that type. Types missing from this table are
# reported as not implemented by the caller.
quants_mapping = {
    gguf.GGMLQuantizationType.Q4_0: gguf.Q4_0,
    gguf.GGMLQuantizationType.Q5_0: gguf.Q5_0,
    gguf.GGMLQuantizationType.Q8_0: gguf.Q8_0,
}
|
||||||
|
|
||||||
|
|
||||||
def functional_linear_gguf(x, weight, bias=None):
|
def functional_linear_gguf(x, weight, bias=None):
|
||||||
target_dtype = x.dtype
|
target_dtype = x.dtype
|
||||||
weight = dequantize_tensor(weight, target_dtype)
|
weight = dequantize_tensor(weight, target_dtype)
|
||||||
@@ -20,80 +27,9 @@ def dequantize_tensor(tensor, target_dtype=torch.float16):
|
|||||||
if gguf_type in [gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16, gguf.GGMLQuantizationType.BF16]:
|
if gguf_type in [gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16, gguf.GGMLQuantizationType.BF16]:
|
||||||
return data.to(target_dtype)
|
return data.to(target_dtype)
|
||||||
|
|
||||||
if gguf_type not in dequantize_functions:
|
if gguf_type not in quants_mapping:
|
||||||
raise NotImplementedError(f'Quant type {gguf_type} not implemented!')
|
raise NotImplementedError(f'Quant type {gguf_type} not implemented!')
|
||||||
|
|
||||||
return dequantize(data, gguf_type, gguf_real_shape).to(target_dtype)
|
quant_cls = quants_mapping.get(gguf_type)
|
||||||
|
|
||||||
|
return quant_cls.dequantize_pytorch(data, gguf_real_shape).to(target_dtype)
|
||||||
def dequantize(data, qtype, oshape):
    # (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0)
    """
    Unpack quantized bytes with the decoder registered for ``qtype`` and
    reshape the result to ``oshape``.

    ``data`` is flattened to rows along its last dimension, reinterpreted as
    uint8, and cut into chunks of ``type_size`` bytes before being handed to
    the per-type block decoder.
    """
    elements_per_block, bytes_per_block = gguf.GGML_QUANT_SIZES[qtype]
    block_decoder = dequantize_functions[qtype]

    raw_bytes = data.reshape((-1, data.shape[-1])).view(torch.uint8)
    block_count = raw_bytes.numel() // bytes_per_block
    packed = raw_bytes.reshape((block_count, bytes_per_block))

    decoded = block_decoder(packed, elements_per_block, bytes_per_block)
    return decoded.reshape(oshape)
|
|
||||||
|
|
||||||
|
|
||||||
def dequantize_blocks_Q8_0(blocks, block_size, type_size):
    # (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0)
    """
    Decode Q8_0 blocks.

    The first two bytes of every row are reinterpreted as one float16
    multiplier; the remaining bytes are reinterpreted as int8 values, cast to
    float16 and multiplied by it. ``block_size`` and ``type_size`` are
    accepted for a uniform decoder signature but are not used here.
    """
    multiplier = blocks[:, :2].view(torch.float16)
    quants = blocks[:, 2:].view(torch.int8).to(torch.float16)
    return quants * multiplier
|
|
||||||
|
|
||||||
|
|
||||||
def dequantize_blocks_Q5_0(blocks, block_size, type_size):
    # (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0)
    """
    Decode Q5_0 blocks.

    Row layout as sliced below: bytes 0-1 are a float16 multiplier, bytes 2-5
    are combined into one 32-bit word that supplies one extra bit per value,
    and the remaining bytes carry two 4-bit values each. The low nibbles of
    all bytes come first in the output, followed by the high nibbles.
    ``type_size`` is accepted for a uniform decoder signature but is unused.
    """
    count = blocks.shape[0]

    multiplier = blocks[:, :2].view(torch.float16).to(torch.float32)
    device = multiplier.device

    # Assemble the four bytes into a single little-endian 32-bit word per row
    # (int32 is used for the arithmetic, as in the original helper).
    word_bytes = blocks[:, 2:6].view(torch.uint8).to(torch.int32)
    high_word = (word_bytes[:, 0] | word_bytes[:, 1] << 8 | word_bytes[:, 2] << 16 | word_bytes[:, 3] << 24).unsqueeze(1)

    # Bit i of the word is the fifth bit of output element i.
    bit_index = torch.arange(32, device=device, dtype=torch.int32).reshape(1, 32)
    fifth_bit = ((high_word.reshape(count, 1) >> bit_index) & 1).to(torch.uint8)

    # Shift by 0 and by 4 to expose the low and the high nibble of each byte.
    nibble_shift = torch.tensor([0, 4], device=device, dtype=torch.uint8).reshape(1, 1, 2, 1)
    nibbles = blocks[:, 6:].reshape(count, -1, 1, block_size // 2) >> nibble_shift
    low_four = (nibbles & 0x0F).reshape(count, -1)

    # Combine into a 5-bit value and re-centre it by subtracting 16.
    quants = (low_four | (fifth_bit << 4)).to(torch.int8) - 16
    return multiplier * quants
|
|
||||||
|
|
||||||
|
|
||||||
def dequantize_blocks_Q4_0(blocks, block_size, type_size):
    # (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0)
    """
    Decode Q4_0 blocks.

    The first two bytes of every row are reinterpreted as a float16
    multiplier; each remaining byte holds two 4-bit values. The low nibbles
    of all bytes come first in the output, followed by the high nibbles, and
    every value is re-centred by subtracting 8 before scaling.
    ``type_size`` is accepted for a uniform decoder signature but is unused.
    """
    count = blocks.shape[0]
    multiplier = blocks[:, :2].view(torch.float16)

    # Shift by 0 and by 4 to expose the low and the high nibble of each byte.
    nibble_shift = torch.tensor([0, 4], device=multiplier.device, dtype=torch.uint8).reshape((1, 1, 2, 1))
    nibbles = blocks[:, 2:].reshape((count, -1, 1, block_size // 2)) >> nibble_shift

    quants = (nibbles & 0x0F).reshape((count, -1)).to(torch.int8) - 8
    return multiplier * quants
|
|
||||||
|
|
||||||
|
|
||||||
# Dispatch table from a GGML quantization type to the function that decodes
# its packed blocks; consulted by dequantize().
dequantize_functions = {
    gguf.GGMLQuantizationType.Q8_0: dequantize_blocks_Q8_0,
    gguf.GGMLQuantizationType.Q5_0: dequantize_blocks_Q5_0,
    gguf.GGMLQuantizationType.Q4_0: dequantize_blocks_Q4_0,
}
|
|
||||||
|
|||||||
2
packages_3rdparty/gguf/README.md
vendored
Normal file
2
packages_3rdparty/gguf/README.md
vendored
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
This is Forge's implementation of GGUF - the difference is that it supports PyTorch quantization/dequantization
|
||||||
|
The code is based on llama.cpp's GGUF - the difference is that it supports quantization
|
||||||
58
packages_3rdparty/gguf/quants.py
vendored
58
packages_3rdparty/gguf/quants.py
vendored
@@ -3,6 +3,7 @@ from abc import ABC, abstractmethod
|
|||||||
from typing import Any, Callable, Sequence
|
from typing import Any, Callable, Sequence
|
||||||
from math import log2, ceil
|
from math import log2, ceil
|
||||||
|
|
||||||
|
import torch
|
||||||
from numpy.typing import DTypeLike
|
from numpy.typing import DTypeLike
|
||||||
|
|
||||||
from .constants import GGML_QUANT_SIZES, GGMLQuantizationType, QK_K
|
from .constants import GGML_QUANT_SIZES, GGMLQuantizationType, QK_K
|
||||||
@@ -123,6 +124,21 @@ class __Quant(ABC):
|
|||||||
grid = np.take_along_axis(grid_map, grid, axis=-1)
|
grid = np.take_along_axis(grid_map, grid, axis=-1)
|
||||||
cls.grid = grid.reshape((1, 1, *cls.grid_shape))
|
cls.grid = grid.reshape((1, 1, *cls.grid_shape))
|
||||||
|
|
||||||
|
@classmethod
def dequantize_pytorch(cls, data: torch.Tensor, original_shape=None) -> torch.Tensor:
    # (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0)
    """
    Dequantize a packed tensor of this quant type using PyTorch ops.

    Args:
        data: Tensor holding the quantized bytes. It is flattened to rows
            along its last dimension and reinterpreted as uint8.
        original_shape: Shape to give the result. When omitted, the shape is
            derived from ``data``: leading dimensions are kept and the last
            one becomes the number of elements encoded by each row's bytes.

    Returns:
        The output of ``cls.dequantize_blocks_pytorch`` reshaped to the
        requested (or derived) shape.
    """
    block_size, type_size = GGML_QUANT_SIZES[cls.qtype]
    rows = data.reshape((-1, data.shape[-1])).view(torch.uint8)

    if original_shape is None:
        # The former default was a dtype (torch.float16), which is not a valid
        # shape and made the final reshape fail. Derive the logical shape from
        # the packed byte layout instead: every `type_size` bytes in a row
        # decode to `block_size` elements.
        original_shape = (*data.shape[:-1], rows.shape[-1] // type_size * block_size)

    n_blocks = rows.numel() // type_size
    blocks = rows.reshape((n_blocks, type_size))
    blocks = cls.dequantize_blocks_pytorch(blocks, block_size, type_size)
    return blocks.reshape(original_shape)
|
||||||
|
|
||||||
|
@classmethod
@abstractmethod
def dequantize_blocks_pytorch(cls, blocks, block_size, type_size) -> torch.Tensor:
    """
    Decode packed blocks into a tensor using PyTorch ops.

    Abstract hook: this base implementation only raises
    ``NotImplementedError``. It is called by ``dequantize_pytorch`` with the
    uint8 block tensor together with the quant type's block size and type
    size.
    """
    raise NotImplementedError
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
|
def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
|
||||||
@@ -251,6 +267,17 @@ class Q4_0(__Quant, qtype=GGMLQuantizationType.Q4_0):
|
|||||||
|
|
||||||
return (d * qs.astype(np.float32))
|
return (d * qs.astype(np.float32))
|
||||||
|
|
||||||
|
@classmethod
def dequantize_blocks_pytorch(cls, blocks, block_size, type_size) -> torch.Tensor:
    """
    Decode Q4_0 blocks with PyTorch ops.

    The first two bytes of every row are reinterpreted as a float16
    multiplier; each remaining byte holds two 4-bit values. The low nibbles
    of all bytes come first in the output, followed by the high nibbles, and
    every value is re-centred by subtracting 8 before scaling.
    ``type_size`` is not used by this decoder.
    """
    count = blocks.shape[0]
    multiplier = blocks[:, :2].view(torch.float16)

    # Shift by 0 and by 4 to expose the low and the high nibble of each byte.
    nibble_shift = torch.tensor([0, 4], device=multiplier.device, dtype=torch.uint8).reshape((1, 1, 2, 1))
    nibbles = blocks[:, 2:].reshape((count, -1, 1, block_size // 2)) >> nibble_shift

    quants = (nibbles & 0x0F).reshape((count, -1)).to(torch.int8) - 8
    return multiplier * quants
|
||||||
|
|
||||||
|
|
||||||
class Q4_1(__Quant, qtype=GGMLQuantizationType.Q4_1):
|
class Q4_1(__Quant, qtype=GGMLQuantizationType.Q4_1):
|
||||||
@classmethod
|
@classmethod
|
||||||
@@ -331,6 +358,31 @@ class Q5_0(__Quant, qtype=GGMLQuantizationType.Q5_0):
|
|||||||
|
|
||||||
return (d * qs.astype(np.float32))
|
return (d * qs.astype(np.float32))
|
||||||
|
|
||||||
|
@classmethod
def dequantize_blocks_pytorch(cls, blocks, block_size, type_size) -> torch.Tensor:
    """
    Decode Q5_0 blocks with PyTorch ops.

    Row layout as sliced below: bytes 0-1 are a float16 multiplier, bytes 2-5
    are combined into one 32-bit word that supplies one extra bit per value,
    and the remaining bytes carry two 4-bit values each. The low nibbles of
    all bytes come first in the output, followed by the high nibbles.
    ``type_size`` is not used by this decoder.
    """
    def combine_bytes(raw):
        # pytorch uint32 by City96 - Apache-2.0
        # Little-endian combination of four bytes, computed in int32.
        as_int = raw.view(torch.uint8).to(torch.int32)
        return (as_int[:, 0] | as_int[:, 1] << 8 | as_int[:, 2] << 16 | as_int[:, 3] << 24).unsqueeze(1)

    count = blocks.shape[0]

    multiplier = blocks[:, :2].view(torch.float16).to(torch.float32)
    device = multiplier.device
    high_word = combine_bytes(blocks[:, 2:6])
    packed_low = blocks[:, 6:]

    # Bit i of the word is the fifth bit of output element i.
    bit_index = torch.arange(32, device=device, dtype=torch.int32).reshape(1, 32)
    fifth_bit = ((high_word.reshape(count, 1) >> bit_index) & 1).to(torch.uint8)

    # Shift by 0 and by 4 to expose the low and the high nibble of each byte.
    nibble_shift = torch.tensor([0, 4], device=device, dtype=torch.uint8).reshape(1, 1, 2, 1)
    low_four = ((packed_low.reshape(count, -1, 1, block_size // 2) >> nibble_shift) & 0x0F).reshape(count, -1)

    # Combine into a 5-bit value and re-centre it by subtracting 16.
    quants = (low_four | (fifth_bit << 4)).to(torch.int8) - 16
    return multiplier * quants
|
||||||
|
|
||||||
|
|
||||||
class Q5_1(__Quant, qtype=GGMLQuantizationType.Q5_1):
|
class Q5_1(__Quant, qtype=GGMLQuantizationType.Q5_1):
|
||||||
@classmethod
|
@classmethod
|
||||||
@@ -402,6 +454,12 @@ class Q8_0(__Quant, qtype=GGMLQuantizationType.Q8_0):
|
|||||||
|
|
||||||
return (x * d)
|
return (x * d)
|
||||||
|
|
||||||
|
@classmethod
def dequantize_blocks_pytorch(cls, blocks, block_size, type_size) -> torch.Tensor:
    """
    Decode Q8_0 blocks with PyTorch ops.

    The first two bytes of every row are reinterpreted as one float16
    multiplier; the remaining bytes are reinterpreted as int8 values, cast to
    float16 and multiplied by it. ``block_size`` and ``type_size`` are not
    used by this decoder.
    """
    multiplier = blocks[:, :2].view(torch.float16)
    quants = blocks[:, 2:].view(torch.int8).to(torch.float16)
    return quants * multiplier
|
||||||
|
|
||||||
|
|
||||||
class Q2_K(__Quant, qtype=GGMLQuantizationType.Q2_K):
|
class Q2_K(__Quant, qtype=GGMLQuantizationType.Q2_K):
|
||||||
@classmethod
|
@classmethod
|
||||||
|
|||||||
Reference in New Issue
Block a user