mirror of
https://github.com/turboderp-org/exllamav2.git
synced 2026-03-15 00:07:26 +00:00
Drop device tensors for head layer during conversion
This commit is contained in:
@@ -274,7 +274,6 @@ def quant(job, save_fn, model):
|
||||
elif isinstance(module, ExLlamaV2RMSNorm) or isinstance(module, ExLlamaV2LayerNorm):
|
||||
mode = "norm"
|
||||
|
||||
|
||||
# Reference forward pass
|
||||
|
||||
cache = None
|
||||
@@ -338,6 +337,10 @@ def quant(job, save_fn, model):
|
||||
quant_moe_mlp(job, module, hidden_states, target_states, quantizers, cache, attn_params, strat)
|
||||
|
||||
if mode == "linear":
|
||||
|
||||
model.drop_device_tensors()
|
||||
gc.collect() # shruge
|
||||
torch.cuda.empty_cache()
|
||||
quant_lm_head(job, module, hidden_states, quantizers, cache, attn_params)
|
||||
|
||||
quantizers.clear()
|
||||
@@ -362,6 +365,7 @@ def quant(job, save_fn, model):
|
||||
|
||||
x = hidden_states[i].to("cuda:0")
|
||||
output = module.forward(x, cache, attn_params)
|
||||
x = None
|
||||
q_states.append(output.to("cpu"))
|
||||
|
||||
output = output[0].float()
|
||||
|
||||
@@ -75,6 +75,14 @@ class ExLlamaV2DeviceTensors:
|
||||
self.ready = True
|
||||
|
||||
|
||||
def drop(self):
    """Release this device's scratch and RoPE (sin/cos) buffers.

    Clears the tensor references so the allocations can be reclaimed,
    and flags the tensor set as not ready so it must be re-prepared
    before the next use.
    """
    # Drop every held buffer reference in one pass.
    for attr in ("scratch", "sin", "cos"):
        setattr(self, attr, None)
    self.ready = False
|
||||
|
||||
|
||||
def begin_scratch_alloc(self):
    """Rewind the scratch-buffer allocation cursor to the start."""
    self.scratch_idx = 0
|
||||
@@ -459,6 +467,12 @@ class ExLlamaV2:
|
||||
self.device_tensors.append(tensors)
|
||||
|
||||
|
||||
def drop_device_tensors(self):
    """Tell every per-device tensor set to release its buffers.

    Delegates to each set's ``drop()``; used during conversion to free
    device memory (e.g. before quantizing the head layer — see commit
    context).
    """
    for tensors in self.device_tensors:
        tensors.drop()
|
||||
|
||||
|
||||
def get_device_tensors(self, device_idx, scratch = True):
|
||||
|
||||
tensors = self.device_tensors[device_idx]
|
||||
|
||||
Reference in New Issue
Block a user