speed up nf4 lora in offline patching mode

layerdiffusion
2024-08-22 10:35:11 -07:00
parent 95d04e5c8f
commit 68bf7f85aa

@@ -405,7 +405,12 @@ class LoadedModel:
                     mem_counter += module_mem
                 else:
                     memory_in_swap += module_mem
+                    if hasattr(m, 'weight') and hasattr(m.weight, 'bnb_quantized') and not m.weight.bnb_quantized and self.device.type == 'cuda':
+                        m.to(self.device)  # Quantize happens here
+                        m.to(self.model.offload_device)
+                    if PIN_SHARED_MEMORY and is_device_cpu(self.model.offload_device):
+                        m._apply(lambda x: x.pin_memory())
             elif hasattr(m, "weight"):
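
The two added blocks address separate costs. bitsandbytes quantizes an NF4 weight lazily, on its first move to a CUDA device, so the m.to(self.device) / m.to(self.model.offload_device) round trip bakes in the quantized representation while the model is still being loaded, instead of paying that cost later during LoRA patching or sampling. Pinning the offloaded CPU tensors then speeds up the later host-to-device swaps. Below is a minimal sketch of the same pattern, assuming bitsandbytes is installed and a CUDA device is available; Linear4bit, compute_device, and offload_device are illustrative stand-ins for the module and devices used in the diff, not names from this codebase.

import torch
from bitsandbytes.nn import Linear4bit  # assumption: a bitsandbytes version whose Params4bit exposes bnb_quantized

offload_device = torch.device('cpu')   # stand-in for self.model.offload_device
compute_device = torch.device('cuda')  # stand-in for self.device

m = Linear4bit(64, 64, quant_type='nf4')  # hypothetical NF4 module

# bnb 4-bit weights are quantized lazily on their first move to CUDA, so a
# round trip through the GPU pre-bakes the NF4 representation up front.
if hasattr(m.weight, 'bnb_quantized') and not m.weight.bnb_quantized:
    m.to(compute_device)   # quantization happens here
    m.to(offload_device)   # park the already-quantized weights back in RAM

# Pinned (page-locked) CPU memory allows faster, asynchronous copies when
# the module is later swapped onto the GPU.
if offload_device.type == 'cpu':
    m._apply(lambda x: x.pin_memory())

Note that the pinning step only makes sense for modules that ended up on a CPU offload device, which is why both added blocks sit inside the swap branch of the diff rather than on the GPU-resident path.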