Speed up NF4 LoRA in offline patching mode

2026-04-26 17:29:09 +00:00 · 2024-08-22 10:35:11 -07:00
parent 95d04e5c8f
commit 68bf7f85aa
1 changed file with 5 additions and 0 deletions
--- a/backend/memory_management.py
+++ b/backend/memory_management.py
@@ -405,7 +405,12 @@ class LoadedModel:
                        mem_counter += module_mem
                    else:
                        memory_in_swap += module_mem
                        if hasattr(m, 'weight') and hasattr(m.weight, 'bnb_quantized') and not m.weight.bnb_quantized and self.device.type == 'cuda':
                            m.to(self.device)  # Quantize happens here
                        m.to(self.model.offload_device)
                        if PIN_SHARED_MEMORY and is_device_cpu(self.model.offload_device):
                            m._apply(lambda x: x.pin_memory())
                elif hasattr(m, "weight"):