Drop device tensors for head layer during conversion

This commit is contained in:
turboderp
2024-02-16 17:31:19 +01:00
parent 5967a29eb4
commit 7af6494afa
2 changed files with 19 additions and 1 deletion

View File

@@ -274,7 +274,6 @@ def quant(job, save_fn, model):
elif isinstance(module, ExLlamaV2RMSNorm) or isinstance(module, ExLlamaV2LayerNorm):
mode = "norm"
# Reference forward pass
cache = None
@@ -338,6 +337,10 @@ def quant(job, save_fn, model):
quant_moe_mlp(job, module, hidden_states, target_states, quantizers, cache, attn_params, strat)
if mode == "linear":
model.drop_device_tensors()
gc.collect() # shruge
torch.cuda.empty_cache()
quant_lm_head(job, module, hidden_states, quantizers, cache, attn_params)
quantizers.clear()
@@ -362,6 +365,7 @@ def quant(job, save_fn, model):
x = hidden_states[i].to("cuda:0")
output = module.forward(x, cache, attn_params)
x = None
q_states.append(output.to("cpu"))
output = output[0].float()

View File

@@ -75,6 +75,14 @@ class ExLlamaV2DeviceTensors:
self.ready = True
def drop(self):
    """Release this device's held tensors and mark the collection not ready.

    Clears the scratch buffer and the RoPE sin/cos tables by dropping the
    references; the underlying storage is freed once nothing else holds it.
    """
    for attr_name in ("scratch", "sin", "cos"):
        setattr(self, attr_name, None)
    self.ready = False
def begin_scratch_alloc(self):
self.scratch_idx = 0
@@ -459,6 +467,12 @@ class ExLlamaV2:
self.device_tensors.append(tensors)
def drop_device_tensors(self):
    """Tell every registered per-device tensor collection to release its buffers."""
    for device_tensors in self.device_tensors:
        device_tensors.drop()
def get_device_tensors(self, device_idx, scratch = True):
tensors = self.device_tensors[device_idx]