Model: Enable max_rq_tokens (output chunking)

Author: turboderp
Date:   2025-10-05 18:54:45 +02:00
Parent: e09a61969f
Commit: 52e093ae6c
4 changed files with 250 additions and 230 deletions


@@ -85,6 +85,7 @@ class ExllamaV3Container(BaseModelContainer):
     cache_mode: str = "FP16"
     draft_cache_mode: str = "FP16"
     chunk_size: int = 2048
+    max_rq_tokens: Optional[int] = 2048
     max_batch_size: Optional[int] = None
 
     # Required methods
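
For context on the new attribute: "output chunking" caps how many tokens a generation job hands back in any single result, so a long completion is surfaced as a series of smaller responses. A minimal, library-agnostic sketch of the idea (the function name and shape are illustrative, not exllamav3's API):

    from typing import Iterator, List

    def chunk_output(token_ids: List[int], max_rq_tokens: int) -> Iterator[List[int]]:
        # Yield the completion in slices of at most max_rq_tokens tokens,
        # so no single returned result grows beyond the configured bound.
        for start in range(0, len(token_ids), max_rq_tokens):
            yield token_ids[start:start + max_rq_tokens]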
@@ -250,6 +251,10 @@ class ExllamaV3Container(BaseModelContainer):
         user_chunk_size = unwrap(kwargs.get("chunk_size"), 2048)
         self.chunk_size = self.adjust_chunk_size(user_chunk_size)
 
+        # Output chunking
+        disable_output_chunking = unwrap(kwargs.get("disable_output_chunking"), False)
+        self.max_rq_tokens = None if disable_output_chunking else self.chunk_size
+
         # Template setup
         self.prompt_template = await find_prompt_template(
             kwargs.get("prompt_template"), model_directory
@@ -982,6 +987,7 @@ class ExllamaV3Container(BaseModelContainer):
             banned_strings=params.banned_strings,
             embeddings=mm_embeddings_content,
             return_top_tokens=params.logprobs,
+            max_rq_tokens=self.max_rq_tokens,
         )
 
         generated_tokens = 0
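
One consequence worth noting downstream: with max_rq_tokens set, a single job may deliver a completion as several partial results, so the consuming loop should accumulate per-result counts rather than assume one result per request. A hypothetical consumer (the dict-shaped results are an assumption, not exllamav3's documented output format):

    from typing import AsyncIterator

    async def count_generated(results: AsyncIterator[dict]) -> int:
        # Each chunked result carries at most max_rq_tokens tokens;
        # add them up instead of overwriting the running total.
        generated_tokens = 0
        async for result in results:
            generated_tokens += len(result.get("token_ids", []))
        return generated_tokens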