Mirror of https://github.com/turboderp-org/exllamav2.git, synced 2026-04-20 14:29:28 +00:00.
Add generator versions of model.load() and model.load_autosplit()
This commit is contained in:
@@ -234,7 +234,11 @@ class ExLlamaV2:
|
||||
return [(ab - rb - rba) / 1024**3 for (ab, rb, rba) in zip(allocation_bytes, reserve_bytes, reserve_bytes_attn)]
|
||||
|
||||
|
||||
def load(self, gpu_split = None, lazy = False, stats = False, callback = None):
|
||||
def load(self, gpu_split = None, lazy = False, stats = False, callback = None, callback_gen = None):
|
||||
f = self.load_gen(gpu_split, lazy, stats, callback, callback_gen)
|
||||
for item in f: return item
|
||||
|
||||
def load_gen(self, gpu_split = None, lazy = False, stats = False, callback = None, callback_gen = None):
|
||||
|
||||
assert not self.config.qkv_embed or not lazy, "Lazy initialization is unsupported when config.qkv_embed = True"
|
||||
|
||||
@@ -247,21 +251,29 @@ class ExLlamaV2:
|
||||
if not lazy:
|
||||
|
||||
for idx, module in enumerate(self.modules):
|
||||
|
||||
if callback is not None: callback(idx, len(self.modules))
|
||||
if callback_gen is not None: yield from callback_gen(idx, len(self.modules))
|
||||
|
||||
module.load()
|
||||
|
||||
if callback is not None: callback(len(self.modules), len(self.modules))
|
||||
if callback_gen is not None: yield from callback_gen(len(self.modules), len(self.modules))
|
||||
|
||||
# Cache map
|
||||
|
||||
self.set_cache_map()
|
||||
|
||||
self.loaded = True
|
||||
if stats: return gpu_split, stats_
|
||||
else: return gpu_split
|
||||
if stats: yield gpu_split, stats_
|
||||
else: yield gpu_split
|
||||
|
||||
|
||||
def load_autosplit(self, cache, reserve_vram = None, last_id_only = False, callback = None):
|
||||
def load_autosplit(self, cache, reserve_vram = None, last_id_only = False, callback = None, callback_gen = None):
|
||||
f = self.load_autosplit_gen(cache, reserve_vram, last_id_only, callback, callback_gen)
|
||||
for item in f: x = item
|
||||
|
||||
def load_autosplit_gen(self, cache, reserve_vram = None, last_id_only = False, callback = None, callback_gen = None):
|
||||
|
||||
assert not self.config.qkv_embed, "Auto GPU split is unsupported when config.qkv_embed = True"
|
||||
|
||||
@@ -301,6 +313,7 @@ class ExLlamaV2:
|
||||
for idx, module in enumerate(self.modules):
|
||||
|
||||
if callback is not None: callback(idx, len(self.modules))
|
||||
if callback_gen is not None: yield from callback_gen(idx, len(self.modules))
|
||||
|
||||
# Embedding layer on CPU
|
||||
|
||||
@@ -375,6 +388,7 @@ class ExLlamaV2:
|
||||
break
|
||||
|
||||
if callback is not None: callback(len(self.modules), len(self.modules))
|
||||
if callback_gen is not None: yield from callback_gen(len(self.modules), len(self.modules))
|
||||
|
||||
hidden_state = None
|
||||
attn_mask = None
|
||||
@@ -384,6 +398,9 @@ class ExLlamaV2:
|
||||
torch.cuda.empty_cache()
|
||||
self.loaded = True
|
||||
|
||||
if 'yield' in locals():
|
||||
yield
|
||||
|
||||
|
||||
def unload(self):
|
||||
|
||||
|
||||
@@ -32,10 +32,13 @@ model = ExLlamaV2(config)
|
||||
print("Loading model: " + model_directory)
|
||||
|
||||
def progress_rep(module, num_modules):
|
||||
print(f"Progress: {100 * module / num_modules:.2f}%")
|
||||
yield f"Progress: {100 * module / num_modules:.2f}%"
|
||||
|
||||
cache = ExLlamaV2Cache_8bit(model, lazy = True)
|
||||
model.load_autosplit(cache, last_id_only = True, callback = progress_rep)
|
||||
|
||||
f = model.load_autosplit_gen(cache, last_id_only = True, callback_gen = progress_rep)
|
||||
for item in f:
|
||||
print(item)
|
||||
|
||||
tokenizer = ExLlamaV2Tokenizer(config)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user