From 424baa110de0e5e140b614b20070cb6703936e7f Mon Sep 17 00:00:00 2001 From: DenOfEquity <166248528+DenOfEquity@users.noreply.github.com> Date: Sun, 25 Aug 2024 10:14:45 +0100 Subject: [PATCH 1/9] Update dragdrop.js remove unnecessary check for PNGInfo element enables repeated drag-drop into PNG Info fixes https://github.com/lllyasviel/stable-diffusion-webui-forge/issues/1485 --- javascript/dragdrop.js | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/javascript/dragdrop.js b/javascript/dragdrop.js index 882562d7..2b7a1456 100644 --- a/javascript/dragdrop.js +++ b/javascript/dragdrop.js @@ -26,26 +26,7 @@ function dropReplaceImage(imgWrap, files) { } }; - if (imgWrap.closest('#pnginfo_image')) { - // special treatment for PNG Info tab, wait for fetch request to finish - const oldFetch = window.fetch; - window.fetch = async(input, options) => { - const response = await oldFetch(input, options); - if ('api/predict/' === input) { - const content = await response.text(); - window.fetch = oldFetch; - window.requestAnimationFrame(() => callback()); - return new Response(content, { - status: response.status, - statusText: response.statusText, - headers: response.headers - }); - } - return response; - }; - } else { - window.requestAnimationFrame(() => callback()); - } + window.requestAnimationFrame(() => callback()); } function eventHasFiles(e) { From 13d6f8ed900b0857e9872e67befb02f7ed54da35 Mon Sep 17 00:00:00 2001 From: layerdiffusion <19834515+lllyasviel@users.noreply.github.com> Date: Sun, 25 Aug 2024 14:26:46 -0700 Subject: [PATCH 2/9] revise GGUF by precomputing some parameters rather than computing them in each diffusion iteration --- backend/loader.py | 4 + backend/operations.py | 12 +++ backend/operations_gguf.py | 27 +++++- backend/patcher/lora.py | 3 +- packages_3rdparty/gguf/quants.py | 139 +++++++++++++++++++++---------- 5 files changed, 137 insertions(+), 48 deletions(-) diff --git a/backend/loader.py b/backend/loader.py index 24dc4b40..9154c92e 100644 --- a/backend/loader.py +++ b/backend/loader.py @@ -162,6 +162,10 @@ def load_huggingface_component(guess, component_name, lib_name, cls_name, repo_p model.initial_device = initial_device model.offload_device = offload_device + if storage_dtype in ['gguf']: + from backend.operations_gguf import bake_gguf_model + model = bake_gguf_model(model) + return model print(f'Skipped: {component_name} = {lib_name}.{cls_name}') diff --git a/backend/operations.py b/backend/operations.py index 72cbfc0d..b3d34b82 100644 --- a/backend/operations.py +++ b/backend/operations.py @@ -405,12 +405,24 @@ class ForgeOperationsGGUF(ForgeOperations): self.weight = state_dict[prefix + 'weight'] if prefix + 'bias' in state_dict: self.bias = state_dict[prefix + 'bias'] + if self.weight is not None and hasattr(self.weight, 'parent'): + self.weight.parent = self + if self.bias is not None and hasattr(self.bias, 'parent'): + self.bias.parent = self + return def _apply(self, fn, recurse=True): if self.weight is not None: self.weight = utils.tensor2parameter(fn(self.weight)) if self.bias is not None: self.bias = utils.tensor2parameter(fn(self.bias)) + for i in range(5): + quant_state_name = f'quant_state_{i}' + quant_state = getattr(self, quant_state_name, None) + if quant_state is not None: + quant_state = fn(quant_state) + quant_state = utils.tensor2parameter(quant_state) + setattr(self, quant_state_name, quant_state) return self def forward(self, x): diff --git a/backend/operations_gguf.py b/backend/operations_gguf.py index 
72da4604..5e190b40 100644 --- a/backend/operations_gguf.py +++ b/backend/operations_gguf.py @@ -27,6 +27,7 @@ class ParameterGGUF(torch.nn.Parameter): self.gguf_type = tensor.tensor_type self.gguf_real_shape = torch.Size(reversed(list(tensor.shape))) self.gguf_cls = quants_mapping.get(self.gguf_type, None) + self.parent = None @property def shape(self): @@ -43,6 +44,7 @@ class ParameterGGUF(torch.nn.Parameter): new.gguf_type = self.gguf_type new.gguf_real_shape = self.gguf_real_shape new.gguf_cls = self.gguf_cls + new.parent = self.parent return new def pin_memory(self, device=None): @@ -50,17 +52,38 @@ class ParameterGGUF(torch.nn.Parameter): new.gguf_type = self.gguf_type new.gguf_real_shape = self.gguf_real_shape new.gguf_cls = self.gguf_cls + new.parent = self.parent return new @classmethod - def make(cls, data, gguf_type, gguf_cls, gguf_real_shape): + def make(cls, data, gguf_type, gguf_cls, gguf_real_shape, parent): new = ParameterGGUF(data, no_init=True) new.gguf_type = gguf_type new.gguf_real_shape = gguf_real_shape new.gguf_cls = gguf_cls + new.parent = parent return new +def bake_gguf_model(model): + computation_dtype = model.computation_dtype + backed_layer_counter = 0 + + for m in model.modules(): + if hasattr(m, 'weight'): + weight = m.weight + if hasattr(weight, 'gguf_cls'): + gguf_cls = weight.gguf_cls + if gguf_cls is not None: + backed_layer_counter += 1 + gguf_cls.bake_layer(m, weight, computation_dtype) + + if backed_layer_counter > 0: + print(f'GGUF backed {backed_layer_counter} layers.') + + return model + + def dequantize_tensor(tensor): if tensor is None: return None @@ -68,7 +91,7 @@ def dequantize_tensor(tensor): if not hasattr(tensor, 'gguf_cls'): return tensor - data = torch.tensor(tensor.data) + data = tensor gguf_cls = tensor.gguf_cls gguf_real_shape = tensor.gguf_real_shape diff --git a/backend/patcher/lora.py b/backend/patcher/lora.py index 83da8fe5..fdd6f67d 100644 --- a/backend/patcher/lora.py +++ b/backend/patcher/lora.py @@ -425,7 +425,8 @@ class LoraLoader: data=weight, gguf_type=gguf_type, gguf_cls=gguf_cls, - gguf_real_shape=gguf_real_shape + gguf_real_shape=gguf_real_shape, + parent=parent_layer )) continue diff --git a/packages_3rdparty/gguf/quants.py b/packages_3rdparty/gguf/quants.py index c6c80c91..84083da9 100644 --- a/packages_3rdparty/gguf/quants.py +++ b/packages_3rdparty/gguf/quants.py @@ -89,6 +89,8 @@ class __Quant(ABC): grid_map: tuple[int | float, ...] 
= () grid_hex: bytes | None = None + computation_dtype: torch.dtype = torch.bfloat16 + def __init__(self): return TypeError("Quant conversion classes can't have instances") @@ -141,18 +143,29 @@ class __Quant(ABC): return blocks.reshape(original_shape) @classmethod - def dequantize_pytorch(cls, data: torch.Tensor, original_shape) -> torch.Tensor: - # (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0) - block_size, type_size = GGML_QUANT_SIZES[cls.qtype] + def bake_layer(cls, layer, weight, computation_dtype): + data = weight.data + cls.computation_dtype = computation_dtype + cls.block_size, cls.type_size = GGML_QUANT_SIZES[cls.qtype] rows = data.reshape((-1, data.shape[-1])).view(torch.uint8) - n_blocks = rows.numel() // type_size - blocks = rows.reshape((n_blocks, type_size)) - blocks = cls.dequantize_blocks_pytorch(blocks, block_size, type_size) + n_blocks = rows.numel() // cls.type_size + blocks = rows.reshape((n_blocks, cls.type_size)) + weight.data = blocks + cls.bake_layer_weight(layer, weight) + return + + @classmethod + def bake_layer_weight(cls, layer, weight): + pass + + @classmethod + def dequantize_pytorch(cls, x, original_shape) -> torch.Tensor: + blocks = cls.dequantize_blocks_pytorch(x.data, cls.block_size, cls.type_size, x.parent) return blocks.reshape(original_shape) @classmethod @abstractmethod - def dequantize_blocks_pytorch(cls, blocks, block_size, type_size) -> torch.Tensor: + def dequantize_blocks_pytorch(cls, blocks, block_size, type_size, parent) -> torch.Tensor: raise NotImplementedError @classmethod @@ -289,15 +302,26 @@ class Q4_0(__Quant, qtype=GGMLQuantizationType.Q4_0): return (d * qs.astype(np.float32)) @classmethod - def dequantize_blocks_pytorch(cls, blocks, block_size, type_size) -> torch.Tensor: + def bake_layer_weight(cls, layer, weight): + blocks = weight.data + d, x = quick_split(blocks, [2]) + d = d.view(torch.float16).to(cls.computation_dtype) + weight.data = x + layer.quant_state_0 = torch.nn.Parameter(d, requires_grad=False) + return + + @classmethod + def dequantize_blocks_pytorch(cls, blocks, block_size, type_size, parent) -> torch.Tensor: n_blocks = blocks.shape[0] - d = blocks[:, :2].view(torch.float16) - qs = blocks[:, 2:] + d, qs = parent.quant_state_0, blocks + + if d.device != qs.device: + d = d.to(device=qs.device) qs = qs.reshape((n_blocks, -1, 1, block_size // 2)) >> torch.tensor([0, 4], device=d.device, dtype=torch.uint8).reshape((1, 1, 2, 1)) qs = (qs & 0x0F).reshape((n_blocks, -1)).to(torch.int8) - 8 - return d * qs + return (d * qs) @classmethod def quantize_blocks_pytorch(cls, blocks, block_size, type_size) -> torch.Tensor: @@ -358,12 +382,29 @@ class Q4_1(__Quant, qtype=GGMLQuantizationType.Q4_1): return (d * qs) + m @classmethod - def dequantize_blocks_pytorch(cls, blocks, block_size, type_size) -> torch.Tensor: + def bake_layer_weight(cls, layer, weight): + blocks = weight.data + + d, m, qs = quick_split(blocks, [2, 2]) + d = d.view(torch.float16).to(cls.computation_dtype) + m = m.view(torch.float16).to(cls.computation_dtype) + + weight.data = qs + layer.quant_state_0 = torch.nn.Parameter(d, requires_grad=False) + layer.quant_state_1 = torch.nn.Parameter(m, requires_grad=False) + return + + @classmethod + def dequantize_blocks_pytorch(cls, blocks, block_size, type_size, parent) -> torch.Tensor: n_blocks = blocks.shape[0] - d = blocks[:, :2].view(torch.float16) - m = blocks[:, 2:4].view(torch.float16) - qs = blocks[:, 4:] + d, m, qs = parent.quant_state_0, parent.quant_state_1, blocks + + if d.device != qs.device: + d = 
d.to(device=qs.device) + + if m.device != qs.device: + m = m.to(device=qs.device) qs = qs.reshape((n_blocks, -1, 1, block_size // 2)) >> torch.tensor([0, 4], device=d.device, dtype=torch.uint8).reshape(1, 1, 2, 1) qs = (qs & 0x0F).reshape(n_blocks, -1) @@ -414,7 +455,7 @@ class Q5_0(__Quant, qtype=GGMLQuantizationType.Q5_0): return (d * qs.astype(np.float32)) @classmethod - def dequantize_blocks_pytorch(cls, blocks, block_size, type_size) -> torch.Tensor: + def dequantize_blocks_pytorch(cls, blocks, block_size, type_size, parent) -> torch.Tensor: def to_uint32(x): # pytorch uint32 by City96 - Apache-2.0 x = x.view(torch.uint8).to(torch.int32) @@ -422,11 +463,8 @@ class Q5_0(__Quant, qtype=GGMLQuantizationType.Q5_0): n_blocks = blocks.shape[0] - d = blocks[:, :2] - qh = blocks[:, 2:6] - qs = blocks[:, 6:] - - d = d.view(torch.float16).to(torch.float32) + d, qh, qs = quick_split(blocks, [2, 4]) + d = d.view(torch.float16).to(cls.computation_dtype) qh = to_uint32(qh) qh = qh.reshape(n_blocks, 1) >> torch.arange(32, device=d.device, dtype=torch.int32).reshape(1, 32) @@ -436,7 +474,7 @@ class Q5_0(__Quant, qtype=GGMLQuantizationType.Q5_0): ql = (ql & 0x0F).reshape(n_blocks, -1) qs = (ql | (qh << 4)).to(torch.int8) - 16 - return d * qs + return (d * qs) @classmethod def quantize_blocks_pytorch(cls, blocks, block_size, type_size) -> torch.Tensor: @@ -520,7 +558,7 @@ class Q5_1(__Quant, qtype=GGMLQuantizationType.Q5_1): return (d * qs) + m @classmethod - def dequantize_blocks_pytorch(cls, blocks, block_size, type_size) -> torch.Tensor: + def dequantize_blocks_pytorch(cls, blocks, block_size, type_size, parent) -> torch.Tensor: def to_uint32(x): # pytorch uint32 by City96 - Apache-2.0 x = x.view(torch.uint8).to(torch.int32) @@ -528,11 +566,9 @@ class Q5_1(__Quant, qtype=GGMLQuantizationType.Q5_1): n_blocks = blocks.shape[0] - d = blocks[:, :2].view(torch.float16) - m = blocks[:, 2:4].view(torch.float16) - qh = blocks[:, 4:8] - qs = blocks[:, 8:] - + d, m, qh, qs = quick_split(blocks, [2, 2, 4]) + d = d.view(torch.float16).to(cls.computation_dtype) + m = m.view(torch.float16).to(cls.computation_dtype) qh = to_uint32(qh) qh = qh.reshape((n_blocks, 1)) >> torch.arange(32, device=d.device, dtype=torch.int32).reshape(1, 32) @@ -570,9 +606,22 @@ class Q8_0(__Quant, qtype=GGMLQuantizationType.Q8_0): return (x * d) @classmethod - def dequantize_blocks_pytorch(cls, blocks, block_size, type_size) -> torch.Tensor: - d = blocks[:, :2].view(torch.float16) - x = blocks[:, 2:].view(torch.int8).to(torch.float16) + def bake_layer_weight(cls, layer, weight): + blocks = weight.data + d, x = quick_split(blocks, [2]) + d = d.view(torch.float16).to(cls.computation_dtype) + weight.data = x + layer.quant_state_0 = torch.nn.Parameter(d, requires_grad=False) + return + + @classmethod + def dequantize_blocks_pytorch(cls, blocks, block_size, type_size, parent) -> torch.Tensor: + x = blocks + d = parent.quant_state_0 + + if d.device != x.device: + d = d.to(device=x.device) + return x * d @classmethod @@ -613,12 +662,12 @@ class Q2_K(__Quant, qtype=GGMLQuantizationType.Q2_K): return qs.reshape((n_blocks, -1)) @classmethod - def dequantize_blocks_pytorch(cls, blocks, block_size, type_size) -> torch.Tensor: + def dequantize_blocks_pytorch(cls, blocks, block_size, type_size, parent) -> torch.Tensor: # (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0) n_blocks = blocks.shape[0] scales, qs, d, dmin = quick_split(blocks, [QK_K // 16, QK_K // 4, 2]) - d = d.view(torch.float16) - dmin = dmin.view(torch.float16) + d = 
d.view(torch.float16).to(cls.computation_dtype) + dmin = dmin.view(torch.float16).to(cls.computation_dtype) # (n_blocks, 16, 1) dl = (d * (scales & 0xF)).reshape((n_blocks, QK_K // 16, 1)) ml = (dmin * (scales >> 4)).reshape((n_blocks, QK_K // 16, 1)) @@ -673,11 +722,11 @@ class Q3_K(__Quant, qtype=GGMLQuantizationType.Q3_K): return (dl * q).reshape((n_blocks, QK_K)) @classmethod - def dequantize_blocks_pytorch(cls, blocks, block_size, type_size) -> torch.Tensor: + def dequantize_blocks_pytorch(cls, blocks, block_size, type_size, parent) -> torch.Tensor: # (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0) n_blocks = blocks.shape[0] hmask, qs, scales, d = quick_split(blocks, [QK_K // 8, QK_K // 4, 12]) - d = d.view(torch.float16) + d = d.view(torch.float16).to(cls.computation_dtype) lscales, hscales = scales[:, :8], scales[:, 8:] lscales = lscales.reshape((n_blocks, 1, 8)) >> torch.tensor([0, 4], device=d.device, dtype=torch.uint8).reshape((1, 2, 1)) lscales = lscales.reshape((n_blocks, 16)) @@ -754,14 +803,14 @@ class Q4_K(__Quant, qtype=GGMLQuantizationType.Q4_K): return (d * qs - dm).reshape((n_blocks, QK_K)) @classmethod - def dequantize_blocks_pytorch(cls, blocks, block_size, type_size) -> torch.Tensor: + def dequantize_blocks_pytorch(cls, blocks, block_size, type_size, parent) -> torch.Tensor: # (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0) QK_K = 256 K_SCALE_SIZE = 12 n_blocks = blocks.shape[0] d, dmin, scales, qs = quick_split(blocks, [2, 2, K_SCALE_SIZE]) - d = d.view(torch.float16) - dmin = dmin.view(torch.float16) + d = d.view(torch.float16).to(cls.computation_dtype) + dmin = dmin.view(torch.float16).to(cls.computation_dtype) sc, m = Q4_K.get_scale_min_pytorch(scales) d = (d * sc).reshape((n_blocks, -1, 1)) dm = (dmin * m).reshape((n_blocks, -1, 1)) @@ -797,14 +846,14 @@ class Q5_K(__Quant, qtype=GGMLQuantizationType.Q5_K): return (d * q - dm).reshape((n_blocks, QK_K)) @classmethod - def dequantize_blocks_pytorch(cls, blocks, block_size, type_size) -> torch.Tensor: + def dequantize_blocks_pytorch(cls, blocks, block_size, type_size, parent) -> torch.Tensor: # (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0) QK_K = 256 K_SCALE_SIZE = 12 n_blocks = blocks.shape[0] d, dmin, scales, qh, qs = quick_split(blocks, [2, 2, K_SCALE_SIZE, QK_K // 8]) - d = d.view(torch.float16) - dmin = dmin.view(torch.float16) + d = d.view(torch.float16).to(cls.computation_dtype) + dmin = dmin.view(torch.float16).to(cls.computation_dtype) sc, m = Q4_K.get_scale_min_pytorch(scales) d = (d * sc).reshape((n_blocks, -1, 1)) dm = (dmin * m).reshape((n_blocks, -1, 1)) @@ -839,12 +888,12 @@ class Q6_K(__Quant, qtype=GGMLQuantizationType.Q6_K): return (d * q).reshape((n_blocks, QK_K)) @classmethod - def dequantize_blocks_pytorch(cls, blocks, block_size, type_size) -> torch.Tensor: + def dequantize_blocks_pytorch(cls, blocks, block_size, type_size, parent) -> torch.Tensor: # Written by ChatGPT n_blocks = blocks.shape[0] ql, qh, scales, d, = quick_split(blocks, [QK_K // 2, QK_K // 4, QK_K // 16]) - scales = scales.view(torch.int8) - d = d.view(torch.float16) + scales = scales.view(torch.int8).to(cls.computation_dtype) + d = d.view(torch.float16).to(cls.computation_dtype) d = (d * scales).reshape((n_blocks, QK_K // 16, 1)) ql = ql.reshape((n_blocks, -1, 1, 64)) >> torch.tensor([0, 4], device=d.device, dtype=torch.uint8).reshape((1, 1, 2, 1)) ql = (ql & 0x0F).reshape((n_blocks, -1, 32)) From 868f662eb69add3931e36d679133421683f59c3f Mon Sep 17 00:00:00 2001 From: layerdiffusion 
<19834515+lllyasviel@users.noreply.github.com> Date: Sun, 25 Aug 2024 14:44:01 -0700 Subject: [PATCH 3/9] fix --- packages_3rdparty/gguf/quants.py | 1 + 1 file changed, 1 insertion(+) diff --git a/packages_3rdparty/gguf/quants.py b/packages_3rdparty/gguf/quants.py index 84083da9..dcea8f5a 100644 --- a/packages_3rdparty/gguf/quants.py +++ b/packages_3rdparty/gguf/quants.py @@ -609,6 +609,7 @@ class Q8_0(__Quant, qtype=GGMLQuantizationType.Q8_0): def bake_layer_weight(cls, layer, weight): blocks = weight.data d, x = quick_split(blocks, [2]) + x = x.view(torch.int8) d = d.view(torch.float16).to(cls.computation_dtype) weight.data = x layer.quant_state_0 = torch.nn.Parameter(d, requires_grad=False) From e60bb1c96fbcc257a4dbfc8d212df24a363cf379 Mon Sep 17 00:00:00 2001 From: layerdiffusion <19834515+lllyasviel@users.noreply.github.com> Date: Sun, 25 Aug 2024 15:02:54 -0700 Subject: [PATCH 4/9] Make Q4_K_S as fast as Q4_0 by baking the layer when the model loads --- packages_3rdparty/gguf/quants.py | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/packages_3rdparty/gguf/quants.py b/packages_3rdparty/gguf/quants.py index dcea8f5a..98cd0ff5 100644 --- a/packages_3rdparty/gguf/quants.py +++ b/packages_3rdparty/gguf/quants.py @@ -804,9 +804,10 @@ class Q4_K(__Quant, qtype=GGMLQuantizationType.Q4_K): return (d * qs - dm).reshape((n_blocks, QK_K)) @classmethod - def dequantize_blocks_pytorch(cls, blocks, block_size, type_size, parent) -> torch.Tensor: - # (c) City96 || Apache-2.0 (apache.org/licenses/LICENSE-2.0) - QK_K = 256 + def bake_layer_weight(cls, layer, weight): # Only compute one time when model load + # Copyright Forge 2024 + + blocks = weight.data K_SCALE_SIZE = 12 n_blocks = blocks.shape[0] d, dmin, scales, qs = quick_split(blocks, [2, 2, K_SCALE_SIZE]) @@ -814,7 +815,27 @@ class Q4_K(__Quant, qtype=GGMLQuantizationType.Q4_K): dmin = dmin.view(torch.float16).to(cls.computation_dtype) sc, m = Q4_K.get_scale_min_pytorch(scales) d = (d * sc).reshape((n_blocks, -1, 1)) - dm = (dmin * m).reshape((n_blocks, -1, 1)) + dm = (dmin * m).reshape((n_blocks, -1, 1)).to(cls.computation_dtype) + + weight.data = qs + layer.quant_state_0 = torch.nn.Parameter(d, requires_grad=False) + layer.quant_state_1 = torch.nn.Parameter(dm, requires_grad=False) + return + + @classmethod + def dequantize_blocks_pytorch(cls, blocks, block_size, type_size, parent) -> torch.Tensor: + # Compute in each diffusion iteration + + n_blocks = blocks.shape[0] + + d, dm, qs = parent.quant_state_0, parent.quant_state_1, blocks + + if d.device != qs.device: + d = d.to(device=qs.device) + + if dm.device != qs.device: + dm = dm.to(device=qs.device) + qs = qs.reshape((n_blocks, -1, 1, 32)) >> torch.tensor([0, 4], device=d.device, dtype=torch.uint8).reshape((1, 1, 2, 1)) qs = (qs & 0x0F).reshape((n_blocks, -1, 32)) return (d * qs - dm).reshape((n_blocks, QK_K)) From 82dfc2b15be168c43e4d65585343b3911f561297 Mon Sep 17 00:00:00 2001 From: layerdiffusion <19834515+lllyasviel@users.noreply.github.com> Date: Sun, 25 Aug 2024 16:49:23 -0700 Subject: [PATCH 5/9] Significantly speed up Q4_0, Q4_1, Q4_K by precomputing all possible 4bit dequant into a lookup table and using pytorch indexing to get dequant, rather than really computing the bit operations. 
This should give very similar performance to native CUDA kernels, while being LoRA friendly and more flexible --- packages_3rdparty/gguf/quants.py | 23 ++++----- packages_3rdparty/gguf/quick_4bits_ops.py | 61 +++++++++++++++++++++++ 2 files changed, 72 insertions(+), 12 deletions(-) create mode 100644 packages_3rdparty/gguf/quick_4bits_ops.py diff --git a/packages_3rdparty/gguf/quants.py b/packages_3rdparty/gguf/quants.py index 98cd0ff5..c0d144d5 100644 --- a/packages_3rdparty/gguf/quants.py +++ b/packages_3rdparty/gguf/quants.py @@ -8,6 +8,7 @@ from numpy.typing import DTypeLike from .constants import GGML_QUANT_SIZES, GGMLQuantizationType, QK_K from .lazy import LazyNumpyTensor +from .quick_4bits_ops import change_4bits_order, quick_unpack_4bits, quick_unpack_4bits_u import numpy as np @@ -306,22 +307,20 @@ class Q4_0(__Quant, qtype=GGMLQuantizationType.Q4_0): blocks = weight.data d, x = quick_split(blocks, [2]) d = d.view(torch.float16).to(cls.computation_dtype) + x = change_4bits_order(x) weight.data = x layer.quant_state_0 = torch.nn.Parameter(d, requires_grad=False) return @classmethod def dequantize_blocks_pytorch(cls, blocks, block_size, type_size, parent) -> torch.Tensor: - n_blocks = blocks.shape[0] - d, qs = parent.quant_state_0, blocks if d.device != qs.device: d = d.to(device=qs.device) - qs = qs.reshape((n_blocks, -1, 1, block_size // 2)) >> torch.tensor([0, 4], device=d.device, dtype=torch.uint8).reshape((1, 1, 2, 1)) - qs = (qs & 0x0F).reshape((n_blocks, -1)).to(torch.int8) - 8 - return (d * qs) + qs = quick_unpack_4bits(qs) + return d * qs @classmethod def quantize_blocks_pytorch(cls, blocks, block_size, type_size) -> torch.Tensor: @@ -389,6 +388,8 @@ class Q4_1(__Quant, qtype=GGMLQuantizationType.Q4_1): d = d.view(torch.float16).to(cls.computation_dtype) m = m.view(torch.float16).to(cls.computation_dtype) + qs = change_4bits_order(qs) + weight.data = qs layer.quant_state_0 = torch.nn.Parameter(d, requires_grad=False) layer.quant_state_1 = torch.nn.Parameter(m, requires_grad=False) @@ -396,8 +397,6 @@ class Q4_1(__Quant, qtype=GGMLQuantizationType.Q4_1): @classmethod def dequantize_blocks_pytorch(cls, blocks, block_size, type_size, parent) -> torch.Tensor: - n_blocks = blocks.shape[0] - d, m, qs = parent.quant_state_0, parent.quant_state_1, blocks if d.device != qs.device: @@ -406,9 +405,7 @@ class Q4_1(__Quant, qtype=GGMLQuantizationType.Q4_1): if m.device != qs.device: m = m.to(device=qs.device) - qs = qs.reshape((n_blocks, -1, 1, block_size // 2)) >> torch.tensor([0, 4], device=d.device, dtype=torch.uint8).reshape(1, 1, 2, 1) - qs = (qs & 0x0F).reshape(n_blocks, -1) - + qs = quick_unpack_4bits_u(qs) return (d * qs) + m @@ -817,6 +814,9 @@ class Q4_K(__Quant, qtype=GGMLQuantizationType.Q4_K): d = (d * sc).reshape((n_blocks, -1, 1)) dm = (dmin * m).reshape((n_blocks, -1, 1)).to(cls.computation_dtype) + qs = qs.reshape((n_blocks, -1, 1, 32)) + qs = change_4bits_order(qs) + weight.data = qs layer.quant_state_0 = torch.nn.Parameter(d, requires_grad=False) layer.quant_state_1 = torch.nn.Parameter(dm, requires_grad=False) @@ -836,8 +836,7 @@ class Q4_K(__Quant, qtype=GGMLQuantizationType.Q4_K): if dm.device != qs.device: dm = dm.to(device=qs.device) - qs = qs.reshape((n_blocks, -1, 1, 32)) >> torch.tensor([0, 4], device=d.device, dtype=torch.uint8).reshape((1, 1, 2, 1)) - qs = (qs & 0x0F).reshape((n_blocks, -1, 32)) + qs = quick_unpack_4bits_u(qs).reshape((n_blocks, -1, 32)) return (d * qs - dm).reshape((n_blocks, QK_K)) diff --git a/packages_3rdparty/gguf/quick_4bits_ops.py 
b/packages_3rdparty/gguf/quick_4bits_ops.py new file mode 100644 index 00000000..97404bbc --- /dev/null +++ b/packages_3rdparty/gguf/quick_4bits_ops.py @@ -0,0 +1,61 @@ +# By Forge + + +import torch + + +def native_unpack_4x4bits_in_1x16bits_to_4x8bits_in_1x32bits(x): + x = x.view(torch.uint8).view(x.size(0), -1) + unpacked = torch.stack([x & 15, x >> 4], dim=-1) + reshaped = unpacked.view(x.size(0), -1) + reshaped = reshaped.to(torch.int8) - 8 + return reshaped.view(torch.int32) + + +def native_unpack_4x4bits_in_1x16bits_to_4x8bits_in_1x32bits_u(x): + x = x.view(torch.uint8).view(x.size(0), -1) + unpacked = torch.stack([x & 15, x >> 4], dim=-1) + reshaped = unpacked.view(x.size(0), -1) + return reshaped.view(torch.int32) + + +native_4bits_lookup_table = native_unpack_4x4bits_in_1x16bits_to_4x8bits_in_1x32bits(torch.arange(start=0, end=256*256, dtype=torch.long).to(torch.uint16))[:, 0] +native_4bits_lookup_table_u = native_unpack_4x4bits_in_1x16bits_to_4x8bits_in_1x32bits_u(torch.arange(start=0, end=256*256, dtype=torch.long).to(torch.uint16))[:, 0] + + +def quick_unpack_4bits(x): + global native_4bits_lookup_table + + s0 = x.size(0) + x = x.view(torch.uint16) + + if native_4bits_lookup_table.device != x.device: + native_4bits_lookup_table = native_4bits_lookup_table.to(device=x.device) + + y = torch.index_select(input=native_4bits_lookup_table, dim=0, index=x.to(dtype=torch.int32).flatten()) + y = y.view(torch.int8) + y = y.view(s0, -1) + + return y + + +def quick_unpack_4bits_u(x): + global native_4bits_lookup_table_u + + s0 = x.size(0) + x = x.view(torch.uint16) + + if native_4bits_lookup_table_u.device != x.device: + native_4bits_lookup_table_u = native_4bits_lookup_table_u.to(device=x.device) + + y = torch.index_select(input=native_4bits_lookup_table_u, dim=0, index=x.to(dtype=torch.int32).flatten()) + y = y.view(torch.uint8) + y = y.view(s0, -1) + + return y + + +def change_4bits_order(x): + y = torch.stack([x & 15, x >> 4], dim=-2).view(x.size(0), -1) + z = y[:, ::2] | (y[:, 1::2] << 4) + return z From cae37a2725227949d08fa77df7001a235913a611 Mon Sep 17 00:00:00 2001 From: layerdiffusion <19834515+lllyasviel@users.noreply.github.com> Date: Sun, 25 Aug 2024 17:24:31 -0700 Subject: [PATCH 6/9] fix dequant of unbaked parameters --- backend/operations_gguf.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/backend/operations_gguf.py b/backend/operations_gguf.py index 5e190b40..d2c43c8c 100644 --- a/backend/operations_gguf.py +++ b/backend/operations_gguf.py @@ -37,6 +37,9 @@ class ParameterGGUF(torch.nn.Parameter): return super().__new__(cls, torch.tensor(tensor.data), requires_grad=requires_grad) def dequantize_as_pytorch_parameter(self): + if self.parent is None: + self.parent = torch.nn.Module() + self.gguf_cls.bake_layer(self.parent, self, computation_dtype=torch.float16) return torch.nn.Parameter(dequantize_tensor(self), requires_grad=False) def to(self, *args, **kwargs): From b25b62da96bdf8518fdc815c946730c545854764 Mon Sep 17 00:00:00 2001 From: layerdiffusion <19834515+lllyasviel@users.noreply.github.com> Date: Sun, 25 Aug 2024 17:31:50 -0700 Subject: [PATCH 7/9] fix T5 not baked --- backend/loader.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/backend/loader.py b/backend/loader.py index 9154c92e..734f8855 100644 --- a/backend/loader.py +++ b/backend/loader.py @@ -104,6 +104,11 @@ def load_huggingface_component(guess, component_name, lib_name, cls_name, repo_p load_state_dict(model, state_dict, log_name=cls_name, 
ignore_errors=['transformer.encoder.embed_tokens.weight', 'logit_scale']) + if storage_dtype in ['gguf']: + from backend.operations_gguf import bake_gguf_model + model.computation_dtype = torch.float16 + model = bake_gguf_model(model) + return model if cls_name in ['UNet2DConditionModel', 'FluxTransformer2DModel']: assert isinstance(state_dict, dict) and len(state_dict) > 16, 'You do not have model state dict!' From 891e355fc8030503f1fb04304b869f175189b03e Mon Sep 17 00:00:00 2001 From: lllyasviel <19834515+lllyasviel@users.noreply.github.com> Date: Sun, 25 Aug 2024 17:38:54 -0700 Subject: [PATCH 8/9] Update README.md --- README.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/README.md b/README.md index 6eec83b4..a806b485 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,23 @@ The name "Forge" is inspired from "Minecraft Forge". This project is aimed at be Forge is currently based on SD-WebUI 1.10.1 at [this commit](https://github.com/AUTOMATIC1111/stable-diffusion-webui/commit/82a973c04367123ae98bd9abdf80d9eda9b910e2). (Because original SD-WebUI is almost static now, Forge will sync with original WebUI every 90 days, or when important fixes.) +### Forge Issue&Discussion is Under Attack Now + +Today, a group of attackers attacked Forge Repo questions/discussions by sending spam files with viruses to all questions/discussions. + +As a protection, issue and discussion is in temp outage now. We will resume issues and discussions soon. + +Screenshots: + +(DO NOT download any file from those attackers!) + +![image](https://github.com/user-attachments/assets/45fa406f-bdc3-4df4-aaa7-1a7544aac342) + +![image](https://github.com/user-attachments/assets/c73ecefd-bcb5-42bb-a39e-d0070645b484) + +![image](https://github.com/user-attachments/assets/734d47cd-05d8-4ce3-ab21-97f5e3d364ff) + + # Quick List [Gradio 4 UI Must Read (TLDR: You need to use RIGHT MOUSE BUTTON to move canvas!)](https://github.com/lllyasviel/stable-diffusion-webui-forge/discussions/853) From 388b70134b10a71d8666a6c1c2749d7ad896467f Mon Sep 17 00:00:00 2001 From: layerdiffusion <19834515+lllyasviel@users.noreply.github.com> Date: Sun, 25 Aug 2024 20:28:40 -0700 Subject: [PATCH 9/9] fix offline loras --- backend/patcher/lora.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/backend/patcher/lora.py b/backend/patcher/lora.py index fdd6f67d..cb18c871 100644 --- a/backend/patcher/lora.py +++ b/backend/patcher/lora.py @@ -421,13 +421,15 @@ class LoraLoader: if gguf_cls is not None: from backend.operations_gguf import ParameterGGUF weight = gguf_cls.quantize_pytorch(weight, gguf_real_shape) - utils.set_attr_raw(self.model, key, ParameterGGUF.make( + weight = ParameterGGUF.make( data=weight, gguf_type=gguf_type, gguf_cls=gguf_cls, gguf_real_shape=gguf_real_shape, parent=parent_layer - )) + ) + gguf_cls.bake_layer(parent_layer, weight, gguf_cls.computation_dtype) + utils.set_attr_raw(self.model, key, weight) continue utils.set_attr_raw(self.model, key, torch.nn.Parameter(weight, requires_grad=False))
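
For reference, here is a minimal standalone sketch (not part of the patch series) of the lookup-table idea that PATCH 5/9 applies to Q4_0, Q4_1 and Q4_K. It assumes a PyTorch build with torch.uint16 support, as the patched quick_4bits_ops.py itself does; the helper names below (build_signed_nibble_table, unpack_with_table, unpack_with_bit_ops) are illustrative only and do not exist in the repository. Every possible packed byte pair (one uint16, i.e. four 4-bit values) is dequantized once into a 65536-entry table, so the per-iteration unpack becomes a single index_select instead of shift-and-mask arithmetic:

import torch

def build_signed_nibble_table():
    # Hypothetical helper: dequantize all 65536 possible uint16 keys up front.
    idx = torch.arange(65536, dtype=torch.int32).to(torch.uint16)
    pair = idx.view(torch.uint8).reshape(65536, 2)             # little-endian byte pair per key
    nibbles = torch.stack([pair & 15, pair >> 4], dim=-1).reshape(65536, 4)
    signed = nibbles.to(torch.int8) - 8                        # Q4_0-style signed 4-bit values
    return signed.view(torch.int32).reshape(65536)             # four int8 results packed per entry

TABLE = build_signed_nibble_table()

def unpack_with_table(packed):
    # packed: (rows, n_bytes) uint8 with n_bytes even; adjacent byte pairs form uint16 keys.
    keys = packed.contiguous().view(torch.uint16).to(torch.int32).reshape(-1)
    out = torch.index_select(TABLE.to(packed.device), 0, keys)  # one gather, no bit math
    return out.view(torch.int8).reshape(packed.size(0), -1)

def unpack_with_bit_ops(packed):
    # Reference unpack with plain shifting and masking, low nibble first within each byte.
    lo = (packed & 15).to(torch.int8) - 8
    hi = (packed >> 4).to(torch.int8) - 8
    return torch.stack([lo, hi], dim=-1).reshape(packed.size(0), -1)

packed = torch.randint(0, 256, (4, 16), dtype=torch.uint8)
assert torch.equal(unpack_with_table(packed), unpack_with_bit_ops(packed))

The sketch only checks that the gather-based unpack matches the bit-operation unpack for a byte-pairwise nibble order; the actual patch additionally re-packs the stored nibbles at bake time (change_4bits_order) so that this lookup order reproduces the canonical GGML block layout.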