Mirror of https://github.com/lllyasviel/stable-diffusion-webui-forge.git
Speed up quant model loading and inference ...
... based on three observations: 1. torch.Tensor.view on one big tensor is slightly faster than calling torch.Tensor.to on multiple small tensors; 2. torch.Tensor.to with a dtype change, however, is significantly slower than torch.Tensor.view; 3. "baking" the model on the GPU at load time is significantly faster than computing it on the CPU. This mainly affects inference for Q8_0 and Q4_0/1/K quants, and loading for all quants.
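For illustration, below is a minimal CPU/GPU micro-benchmark sketch of the three observations; the sizes, iteration count, and the bench helper are made up for this example and are not taken from the commit.

import time

import torch

def bench(fn, iters=100):
    # warm up once, then time; synchronize if CUDA is involved
    fn()
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    t0 = time.perf_counter()
    for _ in range(iters):
        fn()
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return (time.perf_counter() - t0) / iters

# Hypothetical sizes: one big byte buffer vs. the same data in many blocks.
n_blocks, block_bytes = 4096, 1024
big = torch.empty(n_blocks * block_bytes, dtype=torch.uint8)
blocks = list(big.chunk(n_blocks))

# 1. one view over the big tensor is nearly free (metadata only, no copy) ...
t_view = bench(lambda: big.view(torch.int8))
# ... while per-block .to calls pay Python and dispatcher overhead each time
t_small = bench(lambda: [b.to(torch.int8) for b in blocks])

# 2. .to with a dtype change allocates and converts every element
t_cast = bench(lambda: big.to(torch.float16))

print(f"view big:    {t_view * 1e6:9.1f} us")
print(f"to() blocks: {t_small * 1e6:9.1f} us")
print(f"cast big:    {t_cast * 1e6:9.1f} us")

# 3. "bake" (cast) on the GPU: move the raw bytes once, convert there
if torch.cuda.is_available():
    big_gpu = big.cuda()
    t_gpu = bench(lambda: big_gpu.to(torch.float16))
    print(f"gpu cast:    {t_gpu * 1e6:9.1f} us")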
@@ -395,20 +395,22 @@ class ForgeOperationsGGUF(ForgeOperations):
    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
        if hasattr(self, 'dummy'):
            computation_dtype = self.dummy.dtype
            if computation_dtype not in [torch.float16, torch.bfloat16]:
                # GGUF cast only supports 16bits otherwise super slow
                computation_dtype = torch.float16
            if prefix + 'weight' in state_dict:
                self.weight = state_dict[prefix + 'weight'].to(device=self.dummy.device)
                self.weight.computation_dtype = computation_dtype
            if prefix + 'bias' in state_dict:
                self.bias = state_dict[prefix + 'bias'].to(device=self.dummy.device)
                self.bias.computation_dtype = computation_dtype
            del self.dummy
        else:
            if prefix + 'weight' in state_dict:
                self.weight = state_dict[prefix + 'weight']
            if prefix + 'bias' in state_dict:
                self.bias = state_dict[prefix + 'bias']

        if self.weight is not None and hasattr(self.weight, 'parent'):
            self.weight.parent = self
        if self.bias is not None and hasattr(self.bias, 'parent'):
            self.bias.parent = self

        return

    def _apply(self, fn, recurse=True):
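As context for the computation_dtype assignments above: PyTorch tensors accept ad-hoc Python attributes, so the loader can tag the still-quantized weight with the dtype the dequantization path should target later. A minimal sketch of the pattern, assuming a hypothetical dequantize helper (not a name from this diff):

import torch

raw = torch.empty(256, dtype=torch.uint8)  # stand-in for raw GGUF quant bytes
raw.computation_dtype = torch.float16      # tag set by _load_from_state_dict

def dequantize(t):
    # real code would decode the quant blocks; here we only honor the tag
    target = getattr(t, 'computation_dtype', torch.float16)
    return t.to(target)

print(dequantize(raw).dtype)  # torch.float16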