From 68bf7f85aa61da07eb0b44cdcfc6101d431dce75 Mon Sep 17 00:00:00 2001
From: layerdiffusion <19834515+lllyasviel@users.noreply.github.com>
Date: Thu, 22 Aug 2024 10:35:11 -0700
Subject: [PATCH] speed up nf4 lora in offline patching mode

---
 backend/memory_management.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/backend/memory_management.py b/backend/memory_management.py
index bad32392..a43268a2 100644
--- a/backend/memory_management.py
+++ b/backend/memory_management.py
@@ -405,7 +405,12 @@ class LoadedModel:
                 mem_counter += module_mem
             else:
                 memory_in_swap += module_mem
+
+                if hasattr(m, 'weight') and hasattr(m.weight, 'bnb_quantized') and not m.weight.bnb_quantized and self.device.type == 'cuda':
+                    m.to(self.device) # Quantize happens here
+                    m.to(self.model.offload_device)
+
                 if PIN_SHARED_MEMORY and is_device_cpu(self.model.offload_device):
                     m._apply(lambda x: x.pin_memory())
         elif hasattr(m, "weight"):
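
Not part of the patch: a minimal standalone sketch of the bitsandbytes behavior this change exploits, assuming bitsandbytes >= 0.43 and an available CUDA device. Linear4bit and the bnb_quantized flag are real bitsandbytes APIs, but the lazy-quantization details can vary between versions.

    import bitsandbytes as bnb

    # An NF4-quantized linear layer; sizes here are arbitrary.
    layer = bnb.nn.Linear4bit(1024, 1024, quant_type='nf4')

    # Fresh NF4 weights start out unquantized; bitsandbytes quantizes
    # them lazily on the first transfer to a CUDA device.
    print(layer.weight.bnb_quantized)  # False

    layer.to('cuda')  # quantization happens during this transfer
    print(layer.weight.bnb_quantized)  # True

    # Moving back to an offload device keeps the quantized form, which is
    # what the patch's CUDA round-trip relies on: later passes over
    # offloaded modules no longer trigger quantization on first use.
    layer.to('cpu')
    print(layer.weight.bnb_quantized)  # True

The round-trip in the patch (m.to(self.device) followed by m.to(self.model.offload_device)) forces that one-time quantization eagerly at model-load time instead of letting it stall the offline LoRA patching pass later, and the hasattr/bnb_quantized guards make the pre-pass a no-op for non-NF4 modules and for weights that are already quantized.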