mirror of
https://github.com/theroyallab/tabbyAPI.git
synced 2026-04-27 09:41:54 +00:00
Config: replace disable_output_chunking flag with output_chunking
This commit is contained in:
@@ -250,8 +250,8 @@ class ExllamaV3Container(BaseModelContainer):
|
|||||||
self.chunk_size = self.adjust_chunk_size(user_chunk_size)
|
self.chunk_size = self.adjust_chunk_size(user_chunk_size)
|
||||||
|
|
||||||
# Output chunking
|
# Output chunking
|
||||||
disable_output_chunking = unwrap(kwargs.get("disable_output_chunking"), False)
|
output_chunking = unwrap(kwargs.get("output_chunking"), True)
|
||||||
self.max_rq_tokens = None if disable_output_chunking else self.chunk_size
|
self.max_rq_tokens = self.chunk_size if output_chunking else None
|
||||||
|
|
||||||
# Template setup
|
# Template setup
|
||||||
self.prompt_template = await find_prompt_template(
|
self.prompt_template = await find_prompt_template(
|
||||||
|
|||||||
@@ -265,12 +265,13 @@ class ModelConfig(BaseConfigModel):
|
|||||||
),
|
),
|
||||||
gt=0,
|
gt=0,
|
||||||
)
|
)
|
||||||
disable_output_chunking: Optional[bool] = Field(
|
output_chunking: Optional[bool] = Field(
|
||||||
False,
|
True,
|
||||||
description=(
|
description=(
|
||||||
"Disable output chunking (default: false).\n"
|
"Use output chunking (default: True)\n"
|
||||||
|
"Instead of allocating cache space for the entire completion at once, "
|
||||||
|
"allocate in chunks as needed.\n"
|
||||||
"Used by EXL3 models only.\n"
|
"Used by EXL3 models only.\n"
|
||||||
"If False, allocate space in the cache for the entire response with each request.\n"
|
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
max_batch_size: Optional[int] = Field(
|
max_batch_size: Optional[int] = Field(
|
||||||
|
|||||||
@@ -133,10 +133,10 @@ model:
|
|||||||
# An ideal value is between 512 and 4096.
|
# An ideal value is between 512 and 4096.
|
||||||
chunk_size: 2048
|
chunk_size: 2048
|
||||||
|
|
||||||
# Disable output chunking (default: false)
|
# Use output chunking (default: True)
|
||||||
|
# Instead of allocating cache space for the entire completion at once, allocate in chunks as needed.
|
||||||
# Used by EXL3 models only.
|
# Used by EXL3 models only.
|
||||||
# If False, allocate space in the cache for the entire response with each request.
|
output_chunking: true
|
||||||
disable_output_chunking: false
|
|
||||||
|
|
||||||
# Set the maximum number of prompts to process at one time (default: None/Automatic).
|
# Set the maximum number of prompts to process at one time (default: None/Automatic).
|
||||||
# Automatically calculated if left blank.
|
# Automatically calculated if left blank.
|
||||||
|
|||||||
@@ -109,7 +109,7 @@ class ModelLoadRequest(BaseModel):
|
|||||||
)
|
)
|
||||||
cache_mode: Optional[str] = None
|
cache_mode: Optional[str] = None
|
||||||
chunk_size: Optional[int] = None
|
chunk_size: Optional[int] = None
|
||||||
disable_output_chunking: Optional[bool] = False
|
output_chunking: Optional[bool] = True
|
||||||
prompt_template: Optional[str] = None
|
prompt_template: Optional[str] = None
|
||||||
vision: Optional[bool] = None
|
vision: Optional[bool] = None
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user