Model: Enable max_rq_tokens (output chunking)

This commit is contained in:
turboderp
2025-10-05 18:54:45 +02:00
parent e09a61969f
commit 52e093ae6c
4 changed files with 250 additions and 230 deletions

View File

@@ -265,6 +265,14 @@ class ModelConfig(BaseConfigModel):
),
gt=0,
)
# Opt-out switch for output chunking (EXL3 backend only). When enabled,
# the cache reserves space for the entire response up front per request
# instead of allocating it in chunks as tokens are generated.
disable_output_chunking: Optional[bool] = Field(
    False,
    description=(
        "Disable output chunking (default: false).\n"
        "Used by EXL3 models only.\n"
        "If true, allocate space in the cache for the entire response "
        "with each request.\n"
    ),
)
max_batch_size: Optional[int] = Field(
None,
description=(