From ea01a1c7d0e2384f0a872e22a8df2d96b475229d Mon Sep 17 00:00:00 2001
From: Jaret Burkett
Date: Mon, 25 Aug 2025 09:21:40 -0600
Subject: [PATCH] Fixed a bug where samples would fail when merging in a LoRA
 during sampling for unquantized models. Quantize non-ARA modules as uint8
 when using an ARA

---
 toolkit/stable_diffusion_model.py |  2 ++
 toolkit/util/quantize.py          | 13 ++++++++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/toolkit/stable_diffusion_model.py b/toolkit/stable_diffusion_model.py
index b042d699..53bd689a 100644
--- a/toolkit/stable_diffusion_model.py
+++ b/toolkit/stable_diffusion_model.py
@@ -1145,6 +1145,8 @@ class StableDiffusion:
             # the network to drastically speed up inference
             unique_network_weights = set([x.network_multiplier for x in image_configs])
             if len(unique_network_weights) == 1 and network.can_merge_in:
+                # make sure it is on device before merging.
+                self.unet.to(self.device_torch)
                 can_merge_in = True
                 merge_multiplier = unique_network_weights.pop()
                 network.merge_in(merge_weight=merge_multiplier)
diff --git a/toolkit/util/quantize.py b/toolkit/util/quantize.py
index f421190c..31a96bd1 100644
--- a/toolkit/util/quantize.py
+++ b/toolkit/util/quantize.py
@@ -261,6 +261,7 @@ def quantize_model(
         base_model.accuracy_recovery_adapter = network
 
         # quantize it
+        lora_exclude_modules = []
         quantization_type = get_qtype(base_model.model_config.qtype)
         for lora_module in tqdm(network.unet_loras, desc="Attaching quantization"):
             # the lora has already hijacked the original module
@@ -271,10 +272,20 @@
                 param.requires_grad = False
             quantize(orig_module, weights=quantization_type)
             freeze(orig_module)
+            module_name = lora_module.lora_name.replace('$$', '.').replace('transformer.', '')
+            lora_exclude_modules.append(module_name)
             if base_model.model_config.low_vram:
                 # move it back to cpu
                 orig_module.to("cpu")
-
+            pass
+        # quantize additional layers
+        print_acc(" - quantizing additional layers")
+        quantization_type = get_qtype('uint8')
+        quantize(
+            model_to_quantize,
+            weights=quantization_type,
+            exclude=lora_exclude_modules
+        )
     else:
         # quantize model the original way without an accuracy recovery adapter
         # move and quantize only certain pieces at a time.
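
For reference, a minimal sketch of the two-pass quantization this change sets up,
written against plain optimum-quanto's quantize()/freeze(), which
toolkit/util/quantize.py appears to wrap. ToyBlock and ara_module_names are
hypothetical stand-ins for the real transformer and for lora_exclude_modules,
and qint8/qint4 stand in for the configured qtype and the toolkit's 'uint8'
type (plain optimum-quanto exposes no uint8 weight qtype):

    import torch.nn as nn
    from optimum.quanto import quantize, freeze, qint8, qint4

    class ToyBlock(nn.Module):
        """Stand-in for a transformer block; only the module names matter here."""
        def __init__(self):
            super().__init__()
            self.attn = nn.Linear(64, 64)
            self.mlp = nn.Linear(64, 64)
            self.proj_out = nn.Linear(64, 64)

    model = ToyBlock()

    # Module names covered by the accuracy recovery adapter; in the patch these
    # come from lora_module.lora_name.replace('$$', '.').replace('transformer.', '').
    ara_module_names = ["attn", "mlp"]

    # Pass 1: ARA-covered modules keep the configured (higher-fidelity) qtype.
    quantize(model, weights=qint8, include=ara_module_names)

    # Pass 2: every remaining module is quantized more aggressively; the exclude
    # list keeps pass 1's modules from being re-quantized. (The patch uses the
    # toolkit's 'uint8' qtype here; qint4 is only a stand-in.)
    quantize(model, weights=qint4, exclude=ara_module_names)

    freeze(model)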