Model: Enable max_rq_tokens (output chunking)

This commit is contained in:
turboderp
2025-10-05 18:54:45 +02:00
parent e09a61969f
commit 52e093ae6c
4 changed files with 250 additions and 230 deletions

View File

@@ -265,6 +265,14 @@ class ModelConfig(BaseConfigModel):
),
gt=0,
)
# Opt-out switch for output chunking (EXL3 backend only). When enabled,
# the cache reserves space for the entire response up front per request
# instead of allocating it in chunks as tokens are generated.
disable_output_chunking: Optional[bool] = Field(
    False,
    description=(
        "Disable output chunking (default: false).\n"
        "Used by EXL3 models only.\n"
        "If true, allocate space in the cache for the entire response "
        "with each request.\n"
    ),
)
max_batch_size: Optional[int] = Field(
None,
description=(