mirror of
https://github.com/theroyallab/tabbyAPI.git
synced 2026-03-15 00:07:28 +00:00
Config: replace disable_output_chunking flag with output_chunking
This commit is contained in:
@@ -250,8 +250,8 @@ class ExllamaV3Container(BaseModelContainer):
|
||||
self.chunk_size = self.adjust_chunk_size(user_chunk_size)
|
||||
|
||||
# Output chunking
|
||||
disable_output_chunking = unwrap(kwargs.get("disable_output_chunking"), False)
|
||||
self.max_rq_tokens = None if disable_output_chunking else self.chunk_size
|
||||
output_chunking = unwrap(kwargs.get("output_chunking"), True)
|
||||
self.max_rq_tokens = self.chunk_size if output_chunking else None
|
||||
|
||||
# Template setup
|
||||
self.prompt_template = await find_prompt_template(
|
||||
|
||||
@@ -265,12 +265,13 @@ class ModelConfig(BaseConfigModel):
|
||||
),
|
||||
gt=0,
|
||||
)
|
||||
disable_output_chunking: Optional[bool] = Field(
|
||||
False,
|
||||
output_chunking: Optional[bool] = Field(
|
||||
True,
|
||||
description=(
|
||||
"Disable output chunking (default: false).\n"
|
||||
"Use output chunking (default: True)\n"
|
||||
"Instead of allocating cache space for the entire completion at once, "
|
||||
"allocate in chunks as needed.\n"
|
||||
"Used by EXL3 models only.\n"
|
||||
"True, allocate space in the cache for the entire response with each request..\n"
|
||||
),
|
||||
)
|
||||
max_batch_size: Optional[int] = Field(
|
||||
|
||||
@@ -133,10 +133,10 @@ model:
|
||||
# An ideal value is between 512 and 4096.
|
||||
chunk_size: 2048
|
||||
|
||||
# Disable output chunking (default: false)
|
||||
# Use output chunking (default: True)
|
||||
# Instead of allocating cache space for the entire completion at once, allocate in chunks as needed.
|
||||
# Used by EXL3 models only.
|
||||
# If False, allocate space in the cache for the entire response with each request.
|
||||
disable_output_chunking: false
|
||||
output_chunking: true
|
||||
|
||||
# Set the maximum number of prompts to process at one time (default: None/Automatic).
|
||||
# Automatically calculated if left blank.
|
||||
|
||||
@@ -109,7 +109,7 @@ class ModelLoadRequest(BaseModel):
|
||||
)
|
||||
cache_mode: Optional[str] = None
|
||||
chunk_size: Optional[int] = None
|
||||
disable_output_chunking: Optional[bool] = False
|
||||
output_chunking: Optional[bool] = True
|
||||
prompt_template: Optional[str] = None
|
||||
vision: Optional[bool] = None
|
||||
|
||||
|
||||
Reference in New Issue
Block a user