ExLlamaV3: Respect device split when loading draft and vision models

This commit is contained in:
turboderp
2026-04-25 01:51:46 +02:00
parent 6aa842a1b2
commit e909f7ecdb

View File

@@ -500,6 +500,7 @@ class ExllamaV3Container(BaseModelContainer):
if self.use_vision:
for value in self.vision_model.load_gen(
reserve_per_device=self.autosplit_reserve,
use_per_device=self.gpu_split,
callback=progress_callback,
):
if value:
@@ -508,6 +509,7 @@ class ExllamaV3Container(BaseModelContainer):
if self.use_draft_model:
for value in self.draft_model.load_gen(
reserve_per_device=self.autosplit_reserve,
use_per_device=self.gpu_split,
callback=progress_callback,
):
if value: