mirror of
https://github.com/turboderp-org/exllamav2.git
synced 2026-03-15 00:07:26 +00:00
Drop device tensors for head layer during conversion
This commit is contained in:
@@ -274,7 +274,6 @@ def quant(job, save_fn, model):
|
||||
elif isinstance(module, ExLlamaV2RMSNorm) or isinstance(module, ExLlamaV2LayerNorm):
|
||||
mode = "norm"
|
||||
|
||||
|
||||
# Reference forward pass
|
||||
|
||||
cache = None
|
||||
@@ -338,6 +337,10 @@ def quant(job, save_fn, model):
|
||||
quant_moe_mlp(job, module, hidden_states, target_states, quantizers, cache, attn_params, strat)
|
||||
|
||||
if mode == "linear":
|
||||
|
||||
model.drop_device_tensors()
|
||||
gc.collect() # shruge
|
||||
torch.cuda.empty_cache()
|
||||
quant_lm_head(job, module, hidden_states, quantizers, cache, attn_params)
|
||||
|
||||
quantizers.clear()
|
||||
@@ -362,6 +365,7 @@ def quant(job, save_fn, model):
|
||||
|
||||
x = hidden_states[i].to("cuda:0")
|
||||
output = module.forward(x, cache, attn_params)
|
||||
x = None
|
||||
q_states.append(output.to("cpu"))
|
||||
|
||||
output = output[0].float()
|
||||
|
||||
@@ -75,6 +75,14 @@ class ExLlamaV2DeviceTensors:
|
||||
self.ready = True
|
||||
|
||||
|
||||
def drop(self):
    """Release this device's scratch and RoPE (sin/cos) buffers.

    Clears the tensor references so the allocations can be reclaimed,
    and flags the tensor set as not ready so it must be re-prepared
    before the next use.
    """
    # Drop every held buffer reference in one pass.
    for attr in ("scratch", "sin", "cos"):
        setattr(self, attr, None)
    self.ready = False
|
||||
|
||||
|
||||
def begin_scratch_alloc(self):
    """Rewind the scratch-buffer allocation cursor to the start."""
    self.scratch_idx = 0
|
||||
@@ -459,6 +467,12 @@ class ExLlamaV2:
|
||||
self.device_tensors.append(tensors)
|
||||
|
||||
|
||||
def drop_device_tensors(self):
    """Tell every per-device tensor set to release its buffers.

    Delegates to each set's ``drop()``; used during conversion to free
    device memory (e.g. before quantizing the head layer — see commit
    context).
    """
    for tensors in self.device_tensors:
        tensors.drop()
|
||||
|
||||
|
||||
def get_device_tensors(self, device_idx, scratch = True):
|
||||
|
||||
tensors = self.device_tensors[device_idx]
|
||||
|
||||
Reference in New Issue
Block a user