Add generator versions of model.load() and model.load_autosplit()

This commit is contained in:
turboderp
2023-10-23 01:17:10 +02:00
parent a2a1743b3d
commit 093b89d38c
2 changed files with 26 additions and 6 deletions

View File

@@ -234,7 +234,11 @@ class ExLlamaV2:
return [(ab - rb - rba) / 1024**3 for (ab, rb, rba) in zip(allocation_bytes, reserve_bytes, reserve_bytes_attn)]
def load(self, gpu_split = None, lazy = False, stats = False, callback = None):
def load(self, gpu_split = None, lazy = False, stats = False, callback = None, callback_gen = None):
    """
    Load all model modules by driving load_gen to completion.

    :param gpu_split: optional per-GPU allocation spec, forwarded to load_gen — presumably GB per device; confirm against load_gen
    :param lazy: defer actual weight loading, forwarded unchanged
    :param stats: if True, load_gen's final yield is (gpu_split, stats_) instead of gpu_split
    :param callback: optional plain callable(idx, total) progress hook
    :param callback_gen: optional generator-function progress hook whose yields are consumed here
    :return: the FINAL value yielded by load_gen (gpu_split, or (gpu_split, stats_) when stats is True)
    """
    # The original `for item in f: return item` returned after the FIRST yield,
    # which abandons the load midway whenever callback_gen emits progress items.
    # Exhausting the generator and keeping the last yield fixes that, and is
    # identical to the old behavior when callback_gen is None (single yield).
    result = None
    for result in self.load_gen(gpu_split, lazy, stats, callback, callback_gen):
        pass
    return result
def load_gen(self, gpu_split = None, lazy = False, stats = False, callback = None, callback_gen = None):
assert not self.config.qkv_embed or not lazy, "Lazy initialization is unsupported when config.qkv_embed = True"
@@ -247,21 +251,29 @@ class ExLlamaV2:
if not lazy:
for idx, module in enumerate(self.modules):
if callback is not None: callback(idx, len(self.modules))
if callback_gen is not None: yield from callback_gen(idx, len(self.modules))
module.load()
if callback is not None: callback(len(self.modules), len(self.modules))
if callback_gen is not None: yield from callback_gen(len(self.modules), len(self.modules))
# Cache map
self.set_cache_map()
self.loaded = True
if stats: return gpu_split, stats_
else: return gpu_split
if stats: yield gpu_split, stats_
else: yield gpu_split
def load_autosplit(self, cache, reserve_vram = None, last_id_only = False, callback = None):
def load_autosplit(self, cache, reserve_vram = None, last_id_only = False, callback = None, callback_gen = None):
    """
    Load the model with an automatic GPU split by exhausting load_autosplit_gen.

    :param cache: cache object forwarded to load_autosplit_gen — not inspected here
    :param reserve_vram: optional VRAM reservation, forwarded unchanged
    :param last_id_only: forwarded unchanged
    :param callback: optional plain callable(idx, total) progress hook
    :param callback_gen: optional generator-function progress hook
    :return: None
    """
    # Consume the generator fully so every module actually loads; anything it
    # yields (progress output from callback_gen) is discarded. The original
    # bound each item to an unused local `x` — `for _ in ...: pass` is the
    # idiomatic form with identical behavior.
    for _ in self.load_autosplit_gen(cache, reserve_vram, last_id_only, callback, callback_gen):
        pass
def load_autosplit_gen(self, cache, reserve_vram = None, last_id_only = False, callback = None, callback_gen = None):
assert not self.config.qkv_embed, "Auto GPU split is unsupported when config.qkv_embed = True"
@@ -301,6 +313,7 @@ class ExLlamaV2:
for idx, module in enumerate(self.modules):
if callback is not None: callback(idx, len(self.modules))
if callback_gen is not None: yield from callback_gen(idx, len(self.modules))
# Embedding layer on CPU
@@ -375,6 +388,7 @@ class ExLlamaV2:
break
if callback is not None: callback(len(self.modules), len(self.modules))
if callback_gen is not None: yield from callback_gen(len(self.modules), len(self.modules))
hidden_state = None
attn_mask = None
@@ -384,6 +398,9 @@ class ExLlamaV2:
torch.cuda.empty_cache()
self.loaded = True
if 'yield' in locals():
yield
def unload(self):

View File

@@ -32,10 +32,13 @@ model = ExLlamaV2(config)
print("Loading model: " + model_directory)
# Example progress hook for the loader.
# NOTE(review): this is a merged diff view — the `print` line is the removed
# (old callback) form and the `yield` line is the added (generator callback)
# form; the commit replaced the former with the latter. Both are kept here to
# preserve the diff record.
def progress_rep(module, num_modules):
    # module: index of the module just processed; num_modules: total module count
    print(f"Progress: {100 * module / num_modules:.2f}%")
    yield f"Progress: {100 * module / num_modules:.2f}%"
cache = ExLlamaV2Cache_8bit(model, lazy = True)
model.load_autosplit(cache, last_id_only = True, callback = progress_rep)
f = model.load_autosplit_gen(cache, last_id_only = True, callback_gen = progress_rep)
for item in f:
print(item)
tokenizer = ExLlamaV2Tokenizer(config)