From 7af6494afa0b097b70734a5a8bcbfd6c6b63c34c Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> Date: Fri, 16 Feb 2024 17:31:19 +0100 Subject: [PATCH] Drop device tensors for head layer during conversion --- conversion/quantize.py | 6 +++++- exllamav2/model.py | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/conversion/quantize.py b/conversion/quantize.py index e1cac47..433269f 100644 --- a/conversion/quantize.py +++ b/conversion/quantize.py @@ -274,7 +274,6 @@ def quant(job, save_fn, model): elif isinstance(module, ExLlamaV2RMSNorm) or isinstance(module, ExLlamaV2LayerNorm): mode = "norm" - # Reference forward pass cache = None @@ -338,6 +337,10 @@ def quant(job, save_fn, model): quant_moe_mlp(job, module, hidden_states, target_states, quantizers, cache, attn_params, strat) if mode == "linear": + + model.drop_device_tensors() + gc.collect() # shrug + torch.cuda.empty_cache() quant_lm_head(job, module, hidden_states, quantizers, cache, attn_params) quantizers.clear() @@ -362,6 +365,7 @@ def quant(job, save_fn, model): x = hidden_states[i].to("cuda:0") output = module.forward(x, cache, attn_params) + x = None q_states.append(output.to("cpu")) output = output[0].float() diff --git a/exllamav2/model.py b/exllamav2/model.py index 5c07a79..b608628 100644 --- a/exllamav2/model.py +++ b/exllamav2/model.py @@ -75,6 +75,14 @@ class ExLlamaV2DeviceTensors: self.ready = True + def drop(self): + + self.scratch = None + self.sin = None + self.cos = None + self.ready = False + + def begin_scratch_alloc(self): self.scratch_idx = 0 @@ -459,6 +467,12 @@ class ExLlamaV2: self.device_tensors.append(tensors) + def drop_device_tensors(self): + + for dt in self.device_tensors: + dt.drop() + + def get_device_tensors(self, device_idx, scratch = True): tensors = self.device_tensors[device_idx]