speed up nf4 lora in offline patching mode

layerdiffusion
2024-08-22 10:35:11 -07:00
parent 95d04e5c8f
commit 68bf7f85aa

@@ -405,7 +405,12 @@ class LoadedModel:
                     mem_counter += module_mem
                 else:
                     memory_in_swap += module_mem
+                    if hasattr(m, 'weight') and hasattr(m.weight, 'bnb_quantized') and not m.weight.bnb_quantized and self.device.type == 'cuda':
+                        m.to(self.device)  # Quantize happens here
+                        m.to(self.model.offload_device)
+                    if PIN_SHARED_MEMORY and is_device_cpu(self.model.offload_device):
+                        m._apply(lambda x: x.pin_memory())
             elif hasattr(m, "weight"):
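
The two added blocks address separate costs. bitsandbytes quantizes an NF4 weight lazily, on its first move to a CUDA device, so the m.to(self.device) / m.to(self.model.offload_device) round trip bakes in the quantized representation while the model is still being loaded, instead of paying that cost later during LoRA patching or sampling. Pinning the offloaded CPU tensors then speeds up the later host-to-device swaps. Below is a minimal sketch of the same pattern, assuming bitsandbytes is installed and a CUDA device is available; Linear4bit, compute_device, and offload_device are illustrative stand-ins for the module and devices used in the diff, not names from this codebase.

import torch
from bitsandbytes.nn import Linear4bit  # assumption: a bitsandbytes version whose Params4bit exposes bnb_quantized

offload_device = torch.device('cpu')   # stand-in for self.model.offload_device
compute_device = torch.device('cuda')  # stand-in for self.device

m = Linear4bit(64, 64, quant_type='nf4')  # hypothetical NF4 module

# bnb 4-bit weights are quantized lazily on their first move to CUDA, so a
# round trip through the GPU pre-bakes the NF4 representation up front.
if hasattr(m.weight, 'bnb_quantized') and not m.weight.bnb_quantized:
    m.to(compute_device)   # quantization happens here
    m.to(offload_device)   # park the already-quantized weights back in RAM

# Pinned (page-locked) CPU memory allows faster, asynchronous copies when
# the module is later swapped onto the GPU.
if offload_device.type == 'cpu':
    m._apply(lambda x: x.pin_memory())

Note that the pinning step only makes sense for modules that ended up on a CPU offload device, which is why both added blocks sit inside the swap branch of the diff rather than on the GPU-resident path.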