From 424baa110de0e5e140b614b20070cb6703936e7f Mon Sep 17 00:00:00 2001 From: DenOfEquity <166248528+DenOfEquity@users.noreply.github.com> Date: Sun, 25 Aug 2024 10:14:45 +0100 Subject: [PATCH 1/9] Update dragdrop.js remove unnecessary check for PNGInfo element enables repeated drag-drop into PNG Info fixes https://github.com/lllyasviel/stable-diffusion-webui-forge/issues/1485 --- javascript/dragdrop.js | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/javascript/dragdrop.js b/javascript/dragdrop.js index 882562d7..2b7a1456 100644 --- a/javascript/dragdrop.js +++ b/javascript/dragdrop.js @@ -26,26 +26,7 @@ function dropReplaceImage(imgWrap, files) { } }; - if (imgWrap.closest('#pnginfo_image')) { - // special treatment for PNG Info tab, wait for fetch request to finish - const oldFetch = window.fetch; - window.fetch = async(input, options) => { - const response = await oldFetch(input, options); - if ('api/predict/' === input) { - const content = await response.text(); - window.fetch = oldFetch; - window.requestAnimationFrame(() => callback()); - return new Response(content, { - status: response.status, - statusText: response.statusText, - headers: response.headers - }); - } - return response; - }; - } else { - window.requestAnimationFrame(() => callback()); - } + window.requestAnimationFrame(() => callback()); } function eventHasFiles(e) { From 13d6f8ed900b0857e9872e67befb02f7ed54da35 Mon Sep 17 00:00:00 2001 From: layerdiffusion <19834515+lllyasviel@users.noreply.github.com> Date: Sun, 25 Aug 2024 14:26:46 -0700 Subject: [PATCH 2/9] revise GGUF by precomputing some parameters rather than computing them in each diffusion iteration --- backend/loader.py | 4 + backend/operations.py | 12 +++ backend/operations_gguf.py | 27 +++++- backend/patcher/lora.py | 3 +- packages_3rdparty/gguf/quants.py | 139 +++++++++++++++++++++---------- 5 files changed, 137 insertions(+), 48 deletions(-) diff --git a/backend/loader.py b/backend/loader.py index 24dc4b40..9154c92e 100644 --- a/backend/loader.py +++ b/backend/loader.py @@ -162,6 +162,10 @@ def load_huggingface_component(guess, component_name, lib_name, cls_name, repo_p model.initial_device = initial_device model.offload_device = offload_device + if storage_dtype in ['gguf']: + from backend.operations_gguf import bake_gguf_model + model = bake_gguf_model(model) + return model print(f'Skipped: {component_name} = {lib_name}.{cls_name}') diff --git a/backend/operations.py b/backend/operations.py index 72cbfc0d..b3d34b82 100644 --- a/backend/operations.py +++ b/backend/operations.py @@ -405,12 +405,24 @@ class ForgeOperationsGGUF(ForgeOperations): self.weight = state_dict[prefix + 'weight'] if prefix + 'bias' in state_dict: self.bias = state_dict[prefix + 'bias'] + if self.weight is not None and hasattr(self.weight, 'parent'): + self.weight.parent = self + if self.bias is not None and hasattr(self.bias, 'parent'): + self.bias.parent = self + return def _apply(self, fn, recurse=True): if self.weight is not None: self.weight = utils.tensor2parameter(fn(self.weight)) if self.bias is not None: self.bias = utils.tensor2parameter(fn(self.bias)) + for i in range(5): + quant_state_name = f'quant_state_{i}' + quant_state = getattr(self, quant_state_name, None) + if quant_state is not None: + quant_state = fn(quant_state) + quant_state = utils.tensor2parameter(quant_state) + setattr(self, quant_state_name, quant_state) return self def forward(self, x): diff --git a/backend/operations_gguf.py b/backend/operations_gguf.py index 
72da4604..5e190b40 100644 --- a/backend/operations_gguf.py +++ b/backend/operations_gguf.py @@ -27,6 +27,7 @@ class ParameterGGUF(torch.nn.Parameter): self.gguf_type = tensor.tensor_type self.gguf_real_shape = torch.Size(reversed(list(tensor.shape))) self.gguf_cls = quants_mapping.get(self.gguf_type, None) + self.parent = None @property def shape(self): @@ -43,6 +44,7 @@ class ParameterGGUF(torch.nn.Parameter): new.gguf_type = self.gguf_type new.gguf_real_shape = self.gguf_real_shape new.gguf_cls = self.gguf_cls + new.parent = self.parent return new def pin_memory(self, device=None): @@ -50,17 +52,38 @@ class ParameterGGUF(torch.nn.Parameter): new.gguf_type = self.gguf_type new.gguf_real_shape = self.gguf_real_shape new.gguf_cls = self.gguf_cls + new.parent = self.parent return new @classmethod - def make(cls, data, gguf_type, gguf_cls, gguf_real_shape): + def make(cls, data, gguf_type, gguf_cls, gguf_real_shape, parent): new = ParameterGGUF(data, no_init=True) new.gguf_type = gguf_type new.gguf_real_shape = gguf_real_shape new.gguf_cls = gguf_cls + new.parent = parent return new +def bake_gguf_model(model): + computation_dtype = model.computation_dtype + backed_layer_counter = 0 + + for m in model.modules(): + if hasattr(m, 'weight'): + weight = m.weight + if hasattr(weight, 'gguf_cls'): + gguf_cls = weight.gguf_cls + if gguf_cls is not None: + backed_layer_counter += 1 + gguf_cls.bake_layer(m, weight, computation_dtype) + + if backed_layer_counter > 0: + print(f'GGUF backed {backed_layer_counter} layers.') + + return model + + def dequantize_tensor(tensor): if tensor is None: return None @@ -68,7 +91,7 @@ def dequantize_tensor(tensor): if not hasattr(tensor, 'gguf_cls'): return tensor - data = torch.tensor(tensor.data) + data = tensor gguf_cls = tensor.gguf_cls gguf_real_shape = tensor.gguf_real_shape diff --git a/backend/patcher/lora.py b/backend/patcher/lora.py index 83da8fe5..fdd6f67d 100644 --- a/backend/patcher/lora.py +++ b/backend/patcher/lora.py @@ -425,7 +425,8 @@ class LoraLoader: data=weight, gguf_type=gguf_type, gguf_cls=gguf_cls, - gguf_real_shape=gguf_real_shape + gguf_real_shape=gguf_real_shape, + parent=parent_layer )) continue diff --git a/packages_3rdparty/gguf/quants.py b/packages_3rdparty/gguf/quants.py index c6c80c91..84083da9 100644 --- a/packages_3rdparty/gguf/quants.py +++ b/packages_3rdparty/gguf/quants.py @@ -89,6 +89,8 @@ class __Quant(ABC): grid_map: tuple[int | float, ...] 
= () grid_hex: bytes | None = None + computation_dtype: torch.dtype = torch.bfloat16 + def __init__(self): return TypeError("Quant conversion classes can't have instances") @@ -141,18 +143,29 @@ class __Quant(ABC): return blocks.reshape(original_shape) @classmethod - def dequantize_pytorch(cls, data: torch.Tensor, original_shape) -> torch.Tensor: - # (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0) - block_size, type_size = GGML_QUANT_SIZES[cls.qtype] + def bake_layer(cls, layer, weight, computation_dtype): + data = weight.data + cls.computation_dtype = computation_dtype + cls.block_size, cls.type_size = GGML_QUANT_SIZES[cls.qtype] rows = data.reshape((-1, data.shape[-1])).view(torch.uint8) - n_blocks = rows.numel() // type_size - blocks = rows.reshape((n_blocks, type_size)) - blocks = cls.dequantize_blocks_pytorch(blocks, block_size, type_size) + n_blocks = rows.numel() // cls.type_size + blocks = rows.reshape((n_blocks, cls.type_size)) + weight.data = blocks + cls.bake_layer_weight(layer, weight) + return + + @classmethod + def bake_layer_weight(cls, layer, weight): + pass + + @classmethod + def dequantize_pytorch(cls, x, original_shape) -> torch.Tensor: + blocks = cls.dequantize_blocks_pytorch(x.data, cls.block_size, cls.type_size, x.parent) return blocks.reshape(original_shape) @classmethod @abstractmethod - def dequantize_blocks_pytorch(cls, blocks, block_size, type_size) -> torch.Tensor: + def dequantize_blocks_pytorch(cls, blocks, block_size, type_size, parent) -> torch.Tensor: raise NotImplementedError @classmethod @@ -289,15 +302,26 @@ class Q4_0(__Quant, qtype=GGMLQuantizationType.Q4_0): return (d * qs.astype(np.float32)) @classmethod - def dequantize_blocks_pytorch(cls, blocks, block_size, type_size) -> torch.Tensor: + def bake_layer_weight(cls, layer, weight): + blocks = weight.data + d, x = quick_split(blocks, [2]) + d = d.view(torch.float16).to(cls.computation_dtype) + weight.data = x + layer.quant_state_0 = torch.nn.Parameter(d, requires_grad=False) + return + + @classmethod + def dequantize_blocks_pytorch(cls, blocks, block_size, type_size, parent) -> torch.Tensor: n_blocks = blocks.shape[0] - d = blocks[:, :2].view(torch.float16) - qs = blocks[:, 2:] + d, qs = parent.quant_state_0, blocks + + if d.device != qs.device: + d = d.to(device=qs.device) qs = qs.reshape((n_blocks, -1, 1, block_size // 2)) >> torch.tensor([0, 4], device=d.device, dtype=torch.uint8).reshape((1, 1, 2, 1)) qs = (qs & 0x0F).reshape((n_blocks, -1)).to(torch.int8) - 8 - return d * qs + return (d * qs) @classmethod def quantize_blocks_pytorch(cls, blocks, block_size, type_size) -> torch.Tensor: @@ -358,12 +382,29 @@ class Q4_1(__Quant, qtype=GGMLQuantizationType.Q4_1): return (d * qs) + m @classmethod - def dequantize_blocks_pytorch(cls, blocks, block_size, type_size) -> torch.Tensor: + def bake_layer_weight(cls, layer, weight): + blocks = weight.data + + d, m, qs = quick_split(blocks, [2, 2]) + d = d.view(torch.float16).to(cls.computation_dtype) + m = m.view(torch.float16).to(cls.computation_dtype) + + weight.data = qs + layer.quant_state_0 = torch.nn.Parameter(d, requires_grad=False) + layer.quant_state_1 = torch.nn.Parameter(m, requires_grad=False) + return + + @classmethod + def dequantize_blocks_pytorch(cls, blocks, block_size, type_size, parent) -> torch.Tensor: n_blocks = blocks.shape[0] - d = blocks[:, :2].view(torch.float16) - m = blocks[:, 2:4].view(torch.float16) - qs = blocks[:, 4:] + d, m, qs = parent.quant_state_0, parent.quant_state_1, blocks + + if d.device != qs.device: + d = 
d.to(device=qs.device) + + if m.device != qs.device: + m = m.to(device=qs.device) qs = qs.reshape((n_blocks, -1, 1, block_size // 2)) >> torch.tensor([0, 4], device=d.device, dtype=torch.uint8).reshape(1, 1, 2, 1) qs = (qs & 0x0F).reshape(n_blocks, -1) @@ -414,7 +455,7 @@ class Q5_0(__Quant, qtype=GGMLQuantizationType.Q5_0): return (d * qs.astype(np.float32)) @classmethod - def dequantize_blocks_pytorch(cls, blocks, block_size, type_size) -> torch.Tensor: + def dequantize_blocks_pytorch(cls, blocks, block_size, type_size, parent) -> torch.Tensor: def to_uint32(x): # pytorch uint32 by City96 - Apache-2.0 x = x.view(torch.uint8).to(torch.int32) @@ -422,11 +463,8 @@ class Q5_0(__Quant, qtype=GGMLQuantizationType.Q5_0): n_blocks = blocks.shape[0] - d = blocks[:, :2] - qh = blocks[:, 2:6] - qs = blocks[:, 6:] - - d = d.view(torch.float16).to(torch.float32) + d, qh, qs = quick_split(blocks, [2, 4]) + d = d.view(torch.float16).to(cls.computation_dtype) qh = to_uint32(qh) qh = qh.reshape(n_blocks, 1) >> torch.arange(32, device=d.device, dtype=torch.int32).reshape(1, 32) @@ -436,7 +474,7 @@ class Q5_0(__Quant, qtype=GGMLQuantizationType.Q5_0): ql = (ql & 0x0F).reshape(n_blocks, -1) qs = (ql | (qh << 4)).to(torch.int8) - 16 - return d * qs + return (d * qs) @classmethod def quantize_blocks_pytorch(cls, blocks, block_size, type_size) -> torch.Tensor: @@ -520,7 +558,7 @@ class Q5_1(__Quant, qtype=GGMLQuantizationType.Q5_1): return (d * qs) + m @classmethod - def dequantize_blocks_pytorch(cls, blocks, block_size, type_size) -> torch.Tensor: + def dequantize_blocks_pytorch(cls, blocks, block_size, type_size, parent) -> torch.Tensor: def to_uint32(x): # pytorch uint32 by City96 - Apache-2.0 x = x.view(torch.uint8).to(torch.int32) @@ -528,11 +566,9 @@ class Q5_1(__Quant, qtype=GGMLQuantizationType.Q5_1): n_blocks = blocks.shape[0] - d = blocks[:, :2].view(torch.float16) - m = blocks[:, 2:4].view(torch.float16) - qh = blocks[:, 4:8] - qs = blocks[:, 8:] - + d, m, qh, qs = quick_split(blocks, [2, 2, 4]) + d = d.view(torch.float16).to(cls.computation_dtype) + m = m.view(torch.float16).to(cls.computation_dtype) qh = to_uint32(qh) qh = qh.reshape((n_blocks, 1)) >> torch.arange(32, device=d.device, dtype=torch.int32).reshape(1, 32) @@ -570,9 +606,22 @@ class Q8_0(__Quant, qtype=GGMLQuantizationType.Q8_0): return (x * d) @classmethod - def dequantize_blocks_pytorch(cls, blocks, block_size, type_size) -> torch.Tensor: - d = blocks[:, :2].view(torch.float16) - x = blocks[:, 2:].view(torch.int8).to(torch.float16) + def bake_layer_weight(cls, layer, weight): + blocks = weight.data + d, x = quick_split(blocks, [2]) + d = d.view(torch.float16).to(cls.computation_dtype) + weight.data = x + layer.quant_state_0 = torch.nn.Parameter(d, requires_grad=False) + return + + @classmethod + def dequantize_blocks_pytorch(cls, blocks, block_size, type_size, parent) -> torch.Tensor: + x = blocks + d = parent.quant_state_0 + + if d.device != x.device: + d = d.to(device=x.device) + return x * d @classmethod @@ -613,12 +662,12 @@ class Q2_K(__Quant, qtype=GGMLQuantizationType.Q2_K): return qs.reshape((n_blocks, -1)) @classmethod - def dequantize_blocks_pytorch(cls, blocks, block_size, type_size) -> torch.Tensor: + def dequantize_blocks_pytorch(cls, blocks, block_size, type_size, parent) -> torch.Tensor: # (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0) n_blocks = blocks.shape[0] scales, qs, d, dmin = quick_split(blocks, [QK_K // 16, QK_K // 4, 2]) - d = d.view(torch.float16) - dmin = dmin.view(torch.float16) + d = 
d.view(torch.float16).to(cls.computation_dtype) + dmin = dmin.view(torch.float16).to(cls.computation_dtype) # (n_blocks, 16, 1) dl = (d * (scales & 0xF)).reshape((n_blocks, QK_K // 16, 1)) ml = (dmin * (scales >> 4)).reshape((n_blocks, QK_K // 16, 1)) @@ -673,11 +722,11 @@ class Q3_K(__Quant, qtype=GGMLQuantizationType.Q3_K): return (dl * q).reshape((n_blocks, QK_K)) @classmethod - def dequantize_blocks_pytorch(cls, blocks, block_size, type_size) -> torch.Tensor: + def dequantize_blocks_pytorch(cls, blocks, block_size, type_size, parent) -> torch.Tensor: # (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0) n_blocks = blocks.shape[0] hmask, qs, scales, d = quick_split(blocks, [QK_K // 8, QK_K // 4, 12]) - d = d.view(torch.float16) + d = d.view(torch.float16).to(cls.computation_dtype) lscales, hscales = scales[:, :8], scales[:, 8:] lscales = lscales.reshape((n_blocks, 1, 8)) >> torch.tensor([0, 4], device=d.device, dtype=torch.uint8).reshape((1, 2, 1)) lscales = lscales.reshape((n_blocks, 16)) @@ -754,14 +803,14 @@ class Q4_K(__Quant, qtype=GGMLQuantizationType.Q4_K): return (d * qs - dm).reshape((n_blocks, QK_K)) @classmethod - def dequantize_blocks_pytorch(cls, blocks, block_size, type_size) -> torch.Tensor: + def dequantize_blocks_pytorch(cls, blocks, block_size, type_size, parent) -> torch.Tensor: # (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0) QK_K = 256 K_SCALE_SIZE = 12 n_blocks = blocks.shape[0] d, dmin, scales, qs = quick_split(blocks, [2, 2, K_SCALE_SIZE]) - d = d.view(torch.float16) - dmin = dmin.view(torch.float16) + d = d.view(torch.float16).to(cls.computation_dtype) + dmin = dmin.view(torch.float16).to(cls.computation_dtype) sc, m = Q4_K.get_scale_min_pytorch(scales) d = (d * sc).reshape((n_blocks, -1, 1)) dm = (dmin * m).reshape((n_blocks, -1, 1)) @@ -797,14 +846,14 @@ class Q5_K(__Quant, qtype=GGMLQuantizationType.Q5_K): return (d * q - dm).reshape((n_blocks, QK_K)) @classmethod - def dequantize_blocks_pytorch(cls, blocks, block_size, type_size) -> torch.Tensor: + def dequantize_blocks_pytorch(cls, blocks, block_size, type_size, parent) -> torch.Tensor: # (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0) QK_K = 256 K_SCALE_SIZE = 12 n_blocks = blocks.shape[0] d, dmin, scales, qh, qs = quick_split(blocks, [2, 2, K_SCALE_SIZE, QK_K // 8]) - d = d.view(torch.float16) - dmin = dmin.view(torch.float16) + d = d.view(torch.float16).to(cls.computation_dtype) + dmin = dmin.view(torch.float16).to(cls.computation_dtype) sc, m = Q4_K.get_scale_min_pytorch(scales) d = (d * sc).reshape((n_blocks, -1, 1)) dm = (dmin * m).reshape((n_blocks, -1, 1)) @@ -839,12 +888,12 @@ class Q6_K(__Quant, qtype=GGMLQuantizationType.Q6_K): return (d * q).reshape((n_blocks, QK_K)) @classmethod - def dequantize_blocks_pytorch(cls, blocks, block_size, type_size) -> torch.Tensor: + def dequantize_blocks_pytorch(cls, blocks, block_size, type_size, parent) -> torch.Tensor: # Written by ChatGPT n_blocks = blocks.shape[0] ql, qh, scales, d, = quick_split(blocks, [QK_K // 2, QK_K // 4, QK_K // 16]) - scales = scales.view(torch.int8) - d = d.view(torch.float16) + scales = scales.view(torch.int8).to(cls.computation_dtype) + d = d.view(torch.float16).to(cls.computation_dtype) d = (d * scales).reshape((n_blocks, QK_K // 16, 1)) ql = ql.reshape((n_blocks, -1, 1, 64)) >> torch.tensor([0, 4], device=d.device, dtype=torch.uint8).reshape((1, 1, 2, 1)) ql = (ql & 0x0F).reshape((n_blocks, -1, 32)) From 868f662eb69add3931e36d679133421683f59c3f Mon Sep 17 00:00:00 2001 From: layerdiffusion 
<19834515+lllyasviel@users.noreply.github.com> Date: Sun, 25 Aug 2024 14:44:01 -0700 Subject: [PATCH 3/9] fix --- packages_3rdparty/gguf/quants.py | 1 + 1 file changed, 1 insertion(+) diff --git a/packages_3rdparty/gguf/quants.py b/packages_3rdparty/gguf/quants.py index 84083da9..dcea8f5a 100644 --- a/packages_3rdparty/gguf/quants.py +++ b/packages_3rdparty/gguf/quants.py @@ -609,6 +609,7 @@ class Q8_0(__Quant, qtype=GGMLQuantizationType.Q8_0): def bake_layer_weight(cls, layer, weight): blocks = weight.data d, x = quick_split(blocks, [2]) + x = x.view(torch.int8) d = d.view(torch.float16).to(cls.computation_dtype) weight.data = x layer.quant_state_0 = torch.nn.Parameter(d, requires_grad=False) From e60bb1c96fbcc257a4dbfc8d212df24a363cf379 Mon Sep 17 00:00:00 2001 From: layerdiffusion <19834515+lllyasviel@users.noreply.github.com> Date: Sun, 25 Aug 2024 15:02:54 -0700 Subject: [PATCH 4/9] Make Q4_K_S as fast as Q4_0 by baking the layer when the model loads --- packages_3rdparty/gguf/quants.py | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/packages_3rdparty/gguf/quants.py b/packages_3rdparty/gguf/quants.py index dcea8f5a..98cd0ff5 100644 --- a/packages_3rdparty/gguf/quants.py +++ b/packages_3rdparty/gguf/quants.py @@ -804,9 +804,10 @@ class Q4_K(__Quant, qtype=GGMLQuantizationType.Q4_K): return (d * qs - dm).reshape((n_blocks, QK_K)) @classmethod - def dequantize_blocks_pytorch(cls, blocks, block_size, type_size, parent) -> torch.Tensor: - # (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0) - QK_K = 256 + def bake_layer_weight(cls, layer, weight): # Only compute one time when model load + # Copyright Forge 2024 + + blocks = weight.data K_SCALE_SIZE = 12 n_blocks = blocks.shape[0] d, dmin, scales, qs = quick_split(blocks, [2, 2, K_SCALE_SIZE]) @@ -814,7 +815,27 @@ class Q4_K(__Quant, qtype=GGMLQuantizationType.Q4_K): dmin = dmin.view(torch.float16).to(cls.computation_dtype) sc, m = Q4_K.get_scale_min_pytorch(scales) d = (d * sc).reshape((n_blocks, -1, 1)) - dm = (dmin * m).reshape((n_blocks, -1, 1)) + dm = (dmin * m).reshape((n_blocks, -1, 1)).to(cls.computation_dtype) + + weight.data = qs + layer.quant_state_0 = torch.nn.Parameter(d, requires_grad=False) + layer.quant_state_1 = torch.nn.Parameter(dm, requires_grad=False) + return + + @classmethod + def dequantize_blocks_pytorch(cls, blocks, block_size, type_size, parent) -> torch.Tensor: + # Compute in each diffusion iteration + + n_blocks = blocks.shape[0] + + d, dm, qs = parent.quant_state_0, parent.quant_state_1, blocks + + if d.device != qs.device: + d = d.to(device=qs.device) + + if dm.device != qs.device: + dm = dm.to(device=qs.device) + qs = qs.reshape((n_blocks, -1, 1, 32)) >> torch.tensor([0, 4], device=d.device, dtype=torch.uint8).reshape((1, 1, 2, 1)) qs = (qs & 0x0F).reshape((n_blocks, -1, 32)) return (d * qs - dm).reshape((n_blocks, QK_K)) From 82dfc2b15be168c43e4d65585343b3911f561297 Mon Sep 17 00:00:00 2001 From: layerdiffusion <19834515+lllyasviel@users.noreply.github.com> Date: Sun, 25 Aug 2024 16:49:23 -0700 Subject: [PATCH 5/9] Significantly speed up Q4_0, Q4_1, Q4_K by precomputing all possible 4bit dequant into a lookup table and using pytorch indexing to get dequant, rather than really computing the bit operations. 
This should give very similar performance to native CUDA kernels, while being LoRA friendly and more flexible --- packages_3rdparty/gguf/quants.py | 23 ++++----- packages_3rdparty/gguf/quick_4bits_ops.py | 61 +++++++++++++++++++++++ 2 files changed, 72 insertions(+), 12 deletions(-) create mode 100644 packages_3rdparty/gguf/quick_4bits_ops.py diff --git a/packages_3rdparty/gguf/quants.py b/packages_3rdparty/gguf/quants.py index 98cd0ff5..c0d144d5 100644 --- a/packages_3rdparty/gguf/quants.py +++ b/packages_3rdparty/gguf/quants.py @@ -8,6 +8,7 @@ from numpy.typing import DTypeLike from .constants import GGML_QUANT_SIZES, GGMLQuantizationType, QK_K from .lazy import LazyNumpyTensor +from .quick_4bits_ops import change_4bits_order, quick_unpack_4bits, quick_unpack_4bits_u import numpy as np @@ -306,22 +307,20 @@ class Q4_0(__Quant, qtype=GGMLQuantizationType.Q4_0): blocks = weight.data d, x = quick_split(blocks, [2]) d = d.view(torch.float16).to(cls.computation_dtype) + x = change_4bits_order(x) weight.data = x layer.quant_state_0 = torch.nn.Parameter(d, requires_grad=False) return @classmethod def dequantize_blocks_pytorch(cls, blocks, block_size, type_size, parent) -> torch.Tensor: - n_blocks = blocks.shape[0] - d, qs = parent.quant_state_0, blocks if d.device != qs.device: d = d.to(device=qs.device) - qs = qs.reshape((n_blocks, -1, 1, block_size // 2)) >> torch.tensor([0, 4], device=d.device, dtype=torch.uint8).reshape((1, 1, 2, 1)) - qs = (qs & 0x0F).reshape((n_blocks, -1)).to(torch.int8) - 8 - return (d * qs) + qs = quick_unpack_4bits(qs) + return d * qs @classmethod def quantize_blocks_pytorch(cls, blocks, block_size, type_size) -> torch.Tensor: @@ -389,6 +388,8 @@ class Q4_1(__Quant, qtype=GGMLQuantizationType.Q4_1): d = d.view(torch.float16).to(cls.computation_dtype) m = m.view(torch.float16).to(cls.computation_dtype) + qs = change_4bits_order(qs) + weight.data = qs layer.quant_state_0 = torch.nn.Parameter(d, requires_grad=False) layer.quant_state_1 = torch.nn.Parameter(m, requires_grad=False) @@ -396,8 +397,6 @@ class Q4_1(__Quant, qtype=GGMLQuantizationType.Q4_1): @classmethod def dequantize_blocks_pytorch(cls, blocks, block_size, type_size, parent) -> torch.Tensor: - n_blocks = blocks.shape[0] - d, m, qs = parent.quant_state_0, parent.quant_state_1, blocks if d.device != qs.device: @@ -406,9 +405,7 @@ class Q4_1(__Quant, qtype=GGMLQuantizationType.Q4_1): if m.device != qs.device: m = m.to(device=qs.device) - qs = qs.reshape((n_blocks, -1, 1, block_size // 2)) >> torch.tensor([0, 4], device=d.device, dtype=torch.uint8).reshape(1, 1, 2, 1) - qs = (qs & 0x0F).reshape(n_blocks, -1) - + qs = quick_unpack_4bits_u(qs) return (d * qs) + m @@ -817,6 +814,9 @@ class Q4_K(__Quant, qtype=GGMLQuantizationType.Q4_K): d = (d * sc).reshape((n_blocks, -1, 1)) dm = (dmin * m).reshape((n_blocks, -1, 1)).to(cls.computation_dtype) + qs = qs.reshape((n_blocks, -1, 1, 32)) + qs = change_4bits_order(qs) + weight.data = qs layer.quant_state_0 = torch.nn.Parameter(d, requires_grad=False) layer.quant_state_1 = torch.nn.Parameter(dm, requires_grad=False) @@ -836,8 +836,7 @@ class Q4_K(__Quant, qtype=GGMLQuantizationType.Q4_K): if dm.device != qs.device: dm = dm.to(device=qs.device) - qs = qs.reshape((n_blocks, -1, 1, 32)) >> torch.tensor([0, 4], device=d.device, dtype=torch.uint8).reshape((1, 1, 2, 1)) - qs = (qs & 0x0F).reshape((n_blocks, -1, 32)) + qs = quick_unpack_4bits_u(qs).reshape((n_blocks, -1, 32)) return (d * qs - dm).reshape((n_blocks, QK_K)) diff --git a/packages_3rdparty/gguf/quick_4bits_ops.py 
b/packages_3rdparty/gguf/quick_4bits_ops.py new file mode 100644 index 00000000..97404bbc --- /dev/null +++ b/packages_3rdparty/gguf/quick_4bits_ops.py @@ -0,0 +1,61 @@ +# By Forge + + +import torch + + +def native_unpack_4x4bits_in_1x16bits_to_4x8bits_in_1x32bits(x): + x = x.view(torch.uint8).view(x.size(0), -1) + unpacked = torch.stack([x & 15, x >> 4], dim=-1) + reshaped = unpacked.view(x.size(0), -1) + reshaped = reshaped.to(torch.int8) - 8 + return reshaped.view(torch.int32) + + +def native_unpack_4x4bits_in_1x16bits_to_4x8bits_in_1x32bits_u(x): + x = x.view(torch.uint8).view(x.size(0), -1) + unpacked = torch.stack([x & 15, x >> 4], dim=-1) + reshaped = unpacked.view(x.size(0), -1) + return reshaped.view(torch.int32) + + +native_4bits_lookup_table = native_unpack_4x4bits_in_1x16bits_to_4x8bits_in_1x32bits(torch.arange(start=0, end=256*256, dtype=torch.long).to(torch.uint16))[:, 0] +native_4bits_lookup_table_u = native_unpack_4x4bits_in_1x16bits_to_4x8bits_in_1x32bits_u(torch.arange(start=0, end=256*256, dtype=torch.long).to(torch.uint16))[:, 0] + + +def quick_unpack_4bits(x): + global native_4bits_lookup_table + + s0 = x.size(0) + x = x.view(torch.uint16) + + if native_4bits_lookup_table.device != x.device: + native_4bits_lookup_table = native_4bits_lookup_table.to(device=x.device) + + y = torch.index_select(input=native_4bits_lookup_table, dim=0, index=x.to(dtype=torch.int32).flatten()) + y = y.view(torch.int8) + y = y.view(s0, -1) + + return y + + +def quick_unpack_4bits_u(x): + global native_4bits_lookup_table_u + + s0 = x.size(0) + x = x.view(torch.uint16) + + if native_4bits_lookup_table_u.device != x.device: + native_4bits_lookup_table_u = native_4bits_lookup_table_u.to(device=x.device) + + y = torch.index_select(input=native_4bits_lookup_table_u, dim=0, index=x.to(dtype=torch.int32).flatten()) + y = y.view(torch.uint8) + y = y.view(s0, -1) + + return y + + +def change_4bits_order(x): + y = torch.stack([x & 15, x >> 4], dim=-2).view(x.size(0), -1) + z = y[:, ::2] | (y[:, 1::2] << 4) + return z From cae37a2725227949d08fa77df7001a235913a611 Mon Sep 17 00:00:00 2001 From: layerdiffusion <19834515+lllyasviel@users.noreply.github.com> Date: Sun, 25 Aug 2024 17:24:31 -0700 Subject: [PATCH 6/9] fix dequant of unbaked parameters --- backend/operations_gguf.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/backend/operations_gguf.py b/backend/operations_gguf.py index 5e190b40..d2c43c8c 100644 --- a/backend/operations_gguf.py +++ b/backend/operations_gguf.py @@ -37,6 +37,9 @@ class ParameterGGUF(torch.nn.Parameter): return super().__new__(cls, torch.tensor(tensor.data), requires_grad=requires_grad) def dequantize_as_pytorch_parameter(self): + if self.parent is None: + self.parent = torch.nn.Module() + self.gguf_cls.bake_layer(self.parent, self, computation_dtype=torch.float16) return torch.nn.Parameter(dequantize_tensor(self), requires_grad=False) def to(self, *args, **kwargs): From b25b62da96bdf8518fdc815c946730c545854764 Mon Sep 17 00:00:00 2001 From: layerdiffusion <19834515+lllyasviel@users.noreply.github.com> Date: Sun, 25 Aug 2024 17:31:50 -0700 Subject: [PATCH 7/9] fix T5 not baked --- backend/loader.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/backend/loader.py b/backend/loader.py index 9154c92e..734f8855 100644 --- a/backend/loader.py +++ b/backend/loader.py @@ -104,6 +104,11 @@ def load_huggingface_component(guess, component_name, lib_name, cls_name, repo_p load_state_dict(model, state_dict, log_name=cls_name, 
ignore_errors=['transformer.encoder.embed_tokens.weight', 'logit_scale']) + if storage_dtype in ['gguf']: + from backend.operations_gguf import bake_gguf_model + model.computation_dtype = torch.float16 + model = bake_gguf_model(model) + return model if cls_name in ['UNet2DConditionModel', 'FluxTransformer2DModel']: assert isinstance(state_dict, dict) and len(state_dict) > 16, 'You do not have model state dict!' From 891e355fc8030503f1fb04304b869f175189b03e Mon Sep 17 00:00:00 2001 From: lllyasviel <19834515+lllyasviel@users.noreply.github.com> Date: Sun, 25 Aug 2024 17:38:54 -0700 Subject: [PATCH 8/9] Update README.md --- README.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/README.md b/README.md index 6eec83b4..a806b485 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,23 @@ The name "Forge" is inspired from "Minecraft Forge". This project is aimed at be Forge is currently based on SD-WebUI 1.10.1 at [this commit](https://github.com/AUTOMATIC1111/stable-diffusion-webui/commit/82a973c04367123ae98bd9abdf80d9eda9b910e2). (Because original SD-WebUI is almost static now, Forge will sync with original WebUI every 90 days, or when important fixes.) +### Forge Issue&Discussion is Under Attack Now + +Today, a group of attackers attacked Forge Repo questions/discussions by sending spam files with viruses to all questions/discussions. + +As a protection, issue and discussion is in temp outage now. We will resume issues and discussions soon. + +Screenshots: + +(DO NOT download any file from those attackers!) + +![image](https://github.com/user-attachments/assets/45fa406f-bdc3-4df4-aaa7-1a7544aac342) + +![image](https://github.com/user-attachments/assets/c73ecefd-bcb5-42bb-a39e-d0070645b484) + +![image](https://github.com/user-attachments/assets/734d47cd-05d8-4ce3-ab21-97f5e3d364ff) + + # Quick List [Gradio 4 UI Must Read (TLDR: You need to use RIGHT MOUSE BUTTON to move canvas!)](https://github.com/lllyasviel/stable-diffusion-webui-forge/discussions/853) From 388b70134b10a71d8666a6c1c2749d7ad896467f Mon Sep 17 00:00:00 2001 From: layerdiffusion <19834515+lllyasviel@users.noreply.github.com> Date: Sun, 25 Aug 2024 20:28:40 -0700 Subject: [PATCH 9/9] fix offline loras --- backend/patcher/lora.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/backend/patcher/lora.py b/backend/patcher/lora.py index fdd6f67d..cb18c871 100644 --- a/backend/patcher/lora.py +++ b/backend/patcher/lora.py @@ -421,13 +421,15 @@ class LoraLoader: if gguf_cls is not None: from backend.operations_gguf import ParameterGGUF weight = gguf_cls.quantize_pytorch(weight, gguf_real_shape) - utils.set_attr_raw(self.model, key, ParameterGGUF.make( + weight = ParameterGGUF.make( data=weight, gguf_type=gguf_type, gguf_cls=gguf_cls, gguf_real_shape=gguf_real_shape, parent=parent_layer - )) + ) + gguf_cls.bake_layer(parent_layer, weight, gguf_cls.computation_dtype) + utils.set_attr_raw(self.model, key, weight) continue utils.set_attr_raw(self.model, key, torch.nn.Parameter(weight, requires_grad=False))
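
For reference, here is a minimal standalone sketch (not part of the patch series) of the lookup-table idea that PATCH 5/9 applies to Q4_0, Q4_1 and Q4_K. It assumes a PyTorch build with torch.uint16 support, as the patched quick_4bits_ops.py itself does; the helper names below (build_signed_nibble_table, unpack_with_table, unpack_with_bit_ops) are illustrative only and do not exist in the repository. Every possible packed byte pair (one uint16, i.e. four 4-bit values) is dequantized once into a 65536-entry table, so the per-iteration unpack becomes a single index_select instead of shift-and-mask arithmetic:

import torch

def build_signed_nibble_table():
    # Hypothetical helper: dequantize all 65536 possible uint16 keys up front.
    idx = torch.arange(65536, dtype=torch.int32).to(torch.uint16)
    pair = idx.view(torch.uint8).reshape(65536, 2)             # little-endian byte pair per key
    nibbles = torch.stack([pair & 15, pair >> 4], dim=-1).reshape(65536, 4)
    signed = nibbles.to(torch.int8) - 8                        # Q4_0-style signed 4-bit values
    return signed.view(torch.int32).reshape(65536)             # four int8 results packed per entry

TABLE = build_signed_nibble_table()

def unpack_with_table(packed):
    # packed: (rows, n_bytes) uint8 with n_bytes even; adjacent byte pairs form uint16 keys.
    keys = packed.contiguous().view(torch.uint16).to(torch.int32).reshape(-1)
    out = torch.index_select(TABLE.to(packed.device), 0, keys)  # one gather, no bit math
    return out.view(torch.int8).reshape(packed.size(0), -1)

def unpack_with_bit_ops(packed):
    # Reference unpack with plain shifting and masking, low nibble first within each byte.
    lo = (packed & 15).to(torch.int8) - 8
    hi = (packed >> 4).to(torch.int8) - 8
    return torch.stack([lo, hi], dim=-1).reshape(packed.size(0), -1)

packed = torch.randint(0, 256, (4, 16), dtype=torch.uint8)
assert torch.equal(unpack_with_table(packed), unpack_with_bit_ops(packed))

The sketch only checks that the gather-based unpack matches the bit-operation unpack for a byte-pairwise nibble order; the actual patch additionally re-packs the stored nibbles at bake time (change_4bits_order) so that this lookup order reproduces the canonical GGML block layout.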