Mirror of https://github.com/theroyallab/tabbyAPI.git (synced 2026-05-11 08:20:08 +00:00)
ExLlamaV3: Respect device split when loading draft model
This commit is contained in:
@@ -500,6 +500,7 @@ class ExllamaV3Container(BaseModelContainer):
         if self.use_vision:
             for value in self.vision_model.load_gen(
                 reserve_per_device=self.autosplit_reserve,
+                use_per_device=self.gpu_split,
                 callback=progress_callback,
             ):
                 if value:
@@ -508,6 +509,7 @@ class ExllamaV3Container(BaseModelContainer):
         if self.use_draft_model:
             for value in self.draft_model.load_gen(
                 reserve_per_device=self.autosplit_reserve,
+                use_per_device=self.gpu_split,
                 callback=progress_callback,
             ):
                 if value:
Reference in New Issue
Block a user