Mirror of https://github.com/turboderp-org/exllamav2.git, synced 2026-04-20 14:29:28 +00:00.
Add generator versions of model.load() and model.load_autosplit()
This commit is contained in:
@@ -234,7 +234,11 @@ class ExLlamaV2:
|
||||
return [(ab - rb - rba) / 1024**3 for (ab, rb, rba) in zip(allocation_bytes, reserve_bytes, reserve_bytes_attn)]
|
||||
|
||||
|
||||
def load(self, gpu_split = None, lazy = False, stats = False, callback = None):
|
||||
def load(self, gpu_split = None, lazy = False, stats = False, callback = None, callback_gen = None):
|
||||
f = self.load_gen(gpu_split, lazy, stats, callback, callback_gen)
|
||||
for item in f: return item
|
||||
|
||||
def load_gen(self, gpu_split = None, lazy = False, stats = False, callback = None, callback_gen = None):
|
||||
|
||||
assert not self.config.qkv_embed or not lazy, "Lazy initialization is unsupported when config.qkv_embed = True"
|
||||
|
||||
@@ -247,21 +251,29 @@ class ExLlamaV2:
|
||||
if not lazy:
|
||||
|
||||
for idx, module in enumerate(self.modules):
|
||||
|
||||
if callback is not None: callback(idx, len(self.modules))
|
||||
if callback_gen is not None: yield from callback_gen(idx, len(self.modules))
|
||||
|
||||
module.load()
|
||||
|
||||
if callback is not None: callback(len(self.modules), len(self.modules))
|
||||
if callback_gen is not None: yield from callback_gen(len(self.modules), len(self.modules))
|
||||
|
||||
# Cache map
|
||||
|
||||
self.set_cache_map()
|
||||
|
||||
self.loaded = True
|
||||
if stats: return gpu_split, stats_
|
||||
else: return gpu_split
|
||||
if stats: yield gpu_split, stats_
|
||||
else: yield gpu_split
|
||||
|
||||
|
||||
def load_autosplit(self, cache, reserve_vram = None, last_id_only = False, callback = None):
|
||||
def load_autosplit(self, cache, reserve_vram = None, last_id_only = False, callback = None, callback_gen = None):
|
||||
f = self.load_autosplit_gen(cache, reserve_vram, last_id_only, callback, callback_gen)
|
||||
for item in f: x = item
|
||||
|
||||
def load_autosplit_gen(self, cache, reserve_vram = None, last_id_only = False, callback = None, callback_gen = None):
|
||||
|
||||
assert not self.config.qkv_embed, "Auto GPU split is unsupported when config.qkv_embed = True"
|
||||
|
||||
@@ -301,6 +313,7 @@ class ExLlamaV2:
|
||||
for idx, module in enumerate(self.modules):
|
||||
|
||||
if callback is not None: callback(idx, len(self.modules))
|
||||
if callback_gen is not None: yield from callback_gen(idx, len(self.modules))
|
||||
|
||||
# Embedding layer on CPU
|
||||
|
||||
@@ -375,6 +388,7 @@ class ExLlamaV2:
|
||||
break
|
||||
|
||||
if callback is not None: callback(len(self.modules), len(self.modules))
|
||||
if callback_gen is not None: yield from callback_gen(len(self.modules), len(self.modules))
|
||||
|
||||
hidden_state = None
|
||||
attn_mask = None
|
||||
@@ -384,6 +398,9 @@ class ExLlamaV2:
|
||||
torch.cuda.empty_cache()
|
||||
self.loaded = True
|
||||
|
||||
if 'yield' in locals():
|
||||
yield
|
||||
|
||||
|
||||
def unload(self):
|
||||
|
||||
|
||||
@@ -32,10 +32,13 @@ model = ExLlamaV2(config)
|
||||
print("Loading model: " + model_directory)
|
||||
|
||||
def progress_rep(module, num_modules):
|
||||
print(f"Progress: {100 * module / num_modules:.2f}%")
|
||||
yield f"Progress: {100 * module / num_modules:.2f}%"
|
||||
|
||||
cache = ExLlamaV2Cache_8bit(model, lazy = True)
|
||||
model.load_autosplit(cache, last_id_only = True, callback = progress_rep)
|
||||
|
||||
f = model.load_autosplit_gen(cache, last_id_only = True, callback_gen = progress_rep)
|
||||
for item in f:
|
||||
print(item)
|
||||
|
||||
tokenizer = ExLlamaV2Tokenizer(config)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user