Model: Add template fetching to Exl3

Use the same functionality as exl2's loader.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
kingbri
2025-05-02 00:22:34 -04:00
parent e8f00412f6
commit c89bea030e

@@ -126,11 +126,6 @@ class ExllamaV3Container(BaseModelContainer):
         # Fallback to 4096 since exl3 can't fetch from HF's config.json
         self.max_seq_len = unwrap(kwargs.get("max_seq_len"), 4096)
 
-        # Try to set prompt template
-        self.prompt_template = await find_prompt_template(
-            kwargs.get("prompt_template"), model_directory
-        )
-
         # Turn off GPU split if the user is using 1 GPU
         gpu_count = torch.cuda.device_count()
         gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
@@ -191,6 +186,22 @@ class ExllamaV3Container(BaseModelContainer):
 
         user_chunk_size = unwrap(kwargs.get("chunk_size"), 2048)
         self.chunk_size = self.adjust_chunk_size(user_chunk_size)
 
+        # Template setup
+        self.prompt_template = await find_prompt_template(
+            kwargs.get("prompt_template"), model_directory
+        )
+
+        # Catch all for template lookup errors
+        if self.prompt_template:
+            logger.info(
+                f'Using template "{self.prompt_template.name}" for chat completions.'
+            )
+        else:
+            logger.warning(
+                "Chat completions are disabled because a prompt "
+                "template wasn't provided or auto-detected."
+            )
+
         # TODO: speculative decoding
         return self
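
For reference, the diff calls find_prompt_template but does not include its definition, which lives elsewhere in the repository. The sketch below is a hypothetical reconstruction of what such a helper could look like, assuming Jinja2 template files and a tokenizer_config.json fallback; the PromptTemplate class, the templates/ directory, and the lookup order are illustrative assumptions, not the project's actual code.

# Hypothetical sketch of the helper used above -- not the project's code.
# Assumed behavior: prefer an explicitly named template file, then fall
# back to the chat_template embedded in the model's tokenizer config.
import json
from pathlib import Path
from typing import Optional


class PromptTemplate:
    """Minimal stand-in for the real template class (assumed shape)."""

    def __init__(self, name: str, raw_template: str):
        self.name = name
        self.raw_template = raw_template


async def find_prompt_template(
    template_name: Optional[str], model_directory: Path
) -> Optional[PromptTemplate]:
    """Resolve a prompt template by name, else try auto-detection."""

    # 1. An explicitly configured template name wins.
    if template_name:
        template_path = Path("templates") / f"{template_name}.jinja"
        if template_path.exists():
            return PromptTemplate(template_name, template_path.read_text())

    # 2. Auto-detect: many HF models ship a chat_template string
    #    inside tokenizer_config.json.
    tokenizer_config = model_directory / "tokenizer_config.json"
    if tokenizer_config.exists():
        config = json.loads(tokenizer_config.read_text())
        chat_template = config.get("chat_template")
        if isinstance(chat_template, str):
            return PromptTemplate("from_tokenizer_config", chat_template)

    # 3. Nothing found; the caller decides how to handle it.
    return None

As the diff above shows, a None result is not treated as an error: the caller logs a warning and disables chat completions instead of aborting the model load.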