From ea01a1c7d0e2384f0a872e22a8df2d96b475229d Mon Sep 17 00:00:00 2001
From: Jaret Burkett
Date: Mon, 25 Aug 2025 09:21:40 -0600
Subject: [PATCH] Fixed a bug where samples would fail when merging in a LoRA
 during sampling for unquantized models. Quantize non-ARA modules as uint8
 when using an ARA

---
 toolkit/stable_diffusion_model.py |  2 ++
 toolkit/util/quantize.py          | 13 ++++++++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/toolkit/stable_diffusion_model.py b/toolkit/stable_diffusion_model.py
index b042d699..53bd689a 100644
--- a/toolkit/stable_diffusion_model.py
+++ b/toolkit/stable_diffusion_model.py
@@ -1145,6 +1145,8 @@ class StableDiffusion:
             # the network to drastically speed up inference
             unique_network_weights = set([x.network_multiplier for x in image_configs])
             if len(unique_network_weights) == 1 and network.can_merge_in:
+                # make sure it is on device before merging.
+                self.unet.to(self.device_torch)
                 can_merge_in = True
                 merge_multiplier = unique_network_weights.pop()
                 network.merge_in(merge_weight=merge_multiplier)
diff --git a/toolkit/util/quantize.py b/toolkit/util/quantize.py
index f421190c..31a96bd1 100644
--- a/toolkit/util/quantize.py
+++ b/toolkit/util/quantize.py
@@ -261,6 +261,7 @@ def quantize_model(
         base_model.accuracy_recovery_adapter = network
 
         # quantize it
+        lora_exclude_modules = []
         quantization_type = get_qtype(base_model.model_config.qtype)
         for lora_module in tqdm(network.unet_loras, desc="Attaching quantization"):
             # the lora has already hijacked the original module
@@ -271,10 +272,20 @@
                 param.requires_grad = False
             quantize(orig_module, weights=quantization_type)
             freeze(orig_module)
+            module_name = lora_module.lora_name.replace('$$', '.').replace('transformer.', '')
+            lora_exclude_modules.append(module_name)
             if base_model.model_config.low_vram:
                 # move it back to cpu
                 orig_module.to("cpu")
-
+            pass
+        # quantize additional layers
+        print_acc(" - quantizing additional layers")
+        quantization_type = get_qtype('uint8')
+        quantize(
+            model_to_quantize,
+            weights=quantization_type,
+            exclude=lora_exclude_modules
+        )
     else:
         # quantize model the original way without an accuracy recovery adapter
         # move and quantize only certain pieces at a time.
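
For reference, a minimal sketch of the two-pass quantization this change sets up,
written against plain optimum-quanto's quantize()/freeze(), which
toolkit/util/quantize.py appears to wrap. ToyBlock and ara_module_names are
hypothetical stand-ins for the real transformer and for lora_exclude_modules,
and qint8/qint4 stand in for the configured qtype and the toolkit's 'uint8'
type (plain optimum-quanto exposes no uint8 weight qtype):

    import torch.nn as nn
    from optimum.quanto import quantize, freeze, qint8, qint4

    class ToyBlock(nn.Module):
        """Stand-in for a transformer block; only the module names matter here."""
        def __init__(self):
            super().__init__()
            self.attn = nn.Linear(64, 64)
            self.mlp = nn.Linear(64, 64)
            self.proj_out = nn.Linear(64, 64)

    model = ToyBlock()

    # Module names covered by the accuracy recovery adapter; in the patch these
    # come from lora_module.lora_name.replace('$$', '.').replace('transformer.', '').
    ara_module_names = ["attn", "mlp"]

    # Pass 1: ARA-covered modules keep the configured (higher-fidelity) qtype.
    quantize(model, weights=qint8, include=ara_module_names)

    # Pass 2: every remaining module is quantized more aggressively; the exclude
    # list keeps pass 1's modules from being re-quantized. (The patch uses the
    # toolkit's 'uint8' qtype here; qint4 is only a stand-in.)
    quantize(model, weights=qint4, exclude=ara_module_names)

    freeze(model)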