From 68bf7f85aa61da07eb0b44cdcfc6101d431dce75 Mon Sep 17 00:00:00 2001
From: layerdiffusion <19834515+lllyasviel@users.noreply.github.com>
Date: Thu, 22 Aug 2024 10:35:11 -0700
Subject: [PATCH] speed up nf4 lora in offline patching mode

---
 backend/memory_management.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/backend/memory_management.py b/backend/memory_management.py
index bad32392..a43268a2 100644
--- a/backend/memory_management.py
+++ b/backend/memory_management.py
@@ -405,7 +405,12 @@ class LoadedModel:
                 mem_counter += module_mem
             else:
                 memory_in_swap += module_mem
+
+                if hasattr(m, 'weight') and hasattr(m.weight, 'bnb_quantized') and not m.weight.bnb_quantized and self.device.type == 'cuda':
+                    m.to(self.device) # Quantize happens here
+                    m.to(self.model.offload_device)
+
                 if PIN_SHARED_MEMORY and is_device_cpu(self.model.offload_device):
                     m._apply(lambda x: x.pin_memory())
         elif hasattr(m, "weight"):
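
Not part of the patch: a minimal standalone sketch of the bitsandbytes behavior this change exploits, assuming bitsandbytes >= 0.43 and an available CUDA device. Linear4bit and the bnb_quantized flag are real bitsandbytes APIs, but the lazy-quantization details can vary between versions.

    import bitsandbytes as bnb

    # An NF4-quantized linear layer; sizes here are arbitrary.
    layer = bnb.nn.Linear4bit(1024, 1024, quant_type='nf4')

    # Fresh NF4 weights start out unquantized; bitsandbytes quantizes
    # them lazily on the first transfer to a CUDA device.
    print(layer.weight.bnb_quantized)  # False

    layer.to('cuda')  # quantization happens during this transfer
    print(layer.weight.bnb_quantized)  # True

    # Moving back to an offload device keeps the quantized form, which is
    # what the patch's CUDA round-trip relies on: later passes over
    # offloaded modules no longer trigger quantization on first use.
    layer.to('cpu')
    print(layer.weight.bnb_quantized)  # True

The round-trip in the patch (m.to(self.device) followed by m.to(self.model.offload_device)) forces that one-time quantization eagerly at model-load time instead of letting it stall the offline LoRA patching pass later, and the hasattr/bnb_quantized guards make the pre-pass a no-op for non-NF4 modules and for weights that are already quantized.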