From 7af6494afa0b097b70734a5a8bcbfd6c6b63c34c Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> Date: Fri, 16 Feb 2024 17:31:19 +0100 Subject: [PATCH] Drop device tensors for head layer during conversion --- conversion/quantize.py | 6 +++++- exllamav2/model.py | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/conversion/quantize.py b/conversion/quantize.py index e1cac47..433269f 100644 --- a/conversion/quantize.py +++ b/conversion/quantize.py @@ -274,7 +274,6 @@ def quant(job, save_fn, model): elif isinstance(module, ExLlamaV2RMSNorm) or isinstance(module, ExLlamaV2LayerNorm): mode = "norm" - # Reference forward pass cache = None @@ -338,6 +337,10 @@ def quant(job, save_fn, model): quant_moe_mlp(job, module, hidden_states, target_states, quantizers, cache, attn_params, strat) if mode == "linear": + + model.drop_device_tensors() + gc.collect() # shrug + torch.cuda.empty_cache() quant_lm_head(job, module, hidden_states, quantizers, cache, attn_params) quantizers.clear() @@ -362,6 +365,7 @@ def quant(job, save_fn, model): x = hidden_states[i].to("cuda:0") output = module.forward(x, cache, attn_params) + x = None q_states.append(output.to("cpu")) output = output[0].float() diff --git a/exllamav2/model.py b/exllamav2/model.py index 5c07a79..b608628 100644 --- a/exllamav2/model.py +++ b/exllamav2/model.py @@ -75,6 +75,14 @@ class ExLlamaV2DeviceTensors: self.ready = True + def drop(self): + + self.scratch = None + self.sin = None + self.cos = None + self.ready = False + + def begin_scratch_alloc(self): self.scratch_idx = 0 @@ -459,6 +467,12 @@ class ExLlamaV2: self.device_tensors.append(tensors) + def drop_device_tensors(self): + + for dt in self.device_tensors: + dt.drop() + + def get_device_tensors(self, device_idx, scratch = True): tensors = self.device_tensors[device_idx]