Speed up quant model loading and inference ...

... based on 3 observations (a rough timing sketch follows the list):
1. torch.Tensor.view on one big tensor is slightly faster than calling torch.Tensor.to on multiple small tensors.
2. However, torch.Tensor.to with a dtype change is significantly slower than torch.Tensor.view.
3. "Baking" the model on the GPU is significantly faster than computing it on the CPU at model-load time.
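As a rough illustration of observations 1 and 2, here is a minimal microbenchmark sketch. The tensor sizes, iteration count, and the time_it helper are assumptions for demonstration only; they are not the benchmark behind this commit.

import time
import torch

# Illustrative microbenchmark: compare one big view, many small .to(device)
# calls, and a dtype-changing .to on the big tensor.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

big = torch.zeros(64 * 1024 * 1024, dtype=torch.uint8, device=device)
smalls = [torch.zeros(1024 * 1024, dtype=torch.uint8, device=device) for _ in range(64)]

def time_it(fn, iters=100):
    if device == 'cuda':
        torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in range(iters):
        fn()
    if device == 'cuda':
        torch.cuda.synchronize()
    return (time.perf_counter() - start) / iters

# 1. view() on one big tensor: reinterprets the bytes, no copy.
t_view = time_it(lambda: big.view(torch.float16))
# 1. .to(device) on many small tensors: per-tensor call overhead adds up.
t_to = time_it(lambda: [t.to(device=device) for t in smalls])
# 2. .to(dtype) on the big tensor: allocates a new tensor and converts every element.
t_cast = time_it(lambda: big.to(torch.float16))

print(f"view: {t_view * 1e3:.3f} ms | to(device) x64: {t_to * 1e3:.3f} ms | to(dtype): {t_cast * 1e3:.3f} ms")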

This mainly affects inference for Q8_0 and Q4_0/1/K, and loading for all quant formats.
layerdiffusion
2024-08-30 00:49:05 -07:00
parent 3d62fa9598
commit 4c9380c46a
7 changed files with 126 additions and 181 deletions


@@ -395,20 +395,22 @@ class ForgeOperationsGGUF(ForgeOperations):
    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
        if hasattr(self, 'dummy'):
            # Move quantized tensors to the target device at load time and record
            # the dtype that dequantization should compute in.
            computation_dtype = self.dummy.dtype
            if computation_dtype not in [torch.float16, torch.bfloat16]:
                # GGUF cast only supports 16-bit dtypes; anything else is super slow
                computation_dtype = torch.float16
            if prefix + 'weight' in state_dict:
                self.weight = state_dict[prefix + 'weight'].to(device=self.dummy.device)
                self.weight.computation_dtype = computation_dtype
            if prefix + 'bias' in state_dict:
                self.bias = state_dict[prefix + 'bias'].to(device=self.dummy.device)
                self.bias.computation_dtype = computation_dtype
            del self.dummy
        else:
            if prefix + 'weight' in state_dict:
                self.weight = state_dict[prefix + 'weight']
            if prefix + 'bias' in state_dict:
                self.bias = state_dict[prefix + 'bias']

        if self.weight is not None and hasattr(self.weight, 'parent'):
            self.weight.parent = self

        if self.bias is not None and hasattr(self.bias, 'parent'):
            self.bias.parent = self

        return

    def _apply(self, fn, recurse=True):