From e909f7ecdb4682dc2836996c9c813c8c01c5e2e4 Mon Sep 17 00:00:00 2001
From: turboderp <11859846+turboderp@users.noreply.github.com>
Date: Sat, 25 Apr 2026 01:51:46 +0200
Subject: [PATCH] ExLlamaV3: Respect device split when loading vision and
 draft models

---
 backends/exllamav3/model.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py
index 0582201..abb61a6 100644
--- a/backends/exllamav3/model.py
+++ b/backends/exllamav3/model.py
@@ -500,6 +500,7 @@ class ExllamaV3Container(BaseModelContainer):
         if self.use_vision:
             for value in self.vision_model.load_gen(
                 reserve_per_device=self.autosplit_reserve,
+                use_per_device=self.gpu_split,
                 callback=progress_callback,
             ):
                 if value:
@@ -508,6 +509,7 @@ class ExllamaV3Container(BaseModelContainer):
         if self.use_draft_model:
             for value in self.draft_model.load_gen(
                 reserve_per_device=self.autosplit_reserve,
+                use_per_device=self.gpu_split,
                 callback=progress_callback,
             ):
                 if value: