Model: Enable max_rq_tokens (output chunking)

Author: turboderp
Date:   2025-10-05 18:54:45 +02:00
Parent: e09a61969f
Commit: 52e093ae6c
4 changed files with 250 additions and 230 deletions


@@ -85,6 +85,7 @@ class ExllamaV3Container(BaseModelContainer):
     cache_mode: str = "FP16"
     draft_cache_mode: str = "FP16"
     chunk_size: int = 2048
+    max_rq_tokens: Optional[int] = 2048
     max_batch_size: Optional[int] = None
 
     # Required methods
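
For context on the new attribute: "output chunking" caps how many tokens a generation job hands back in any single result, so a long completion is surfaced as a series of smaller responses. A minimal, library-agnostic sketch of the idea (the function name and shape are illustrative, not exllamav3's API):

    from typing import Iterator, List

    def chunk_output(token_ids: List[int], max_rq_tokens: int) -> Iterator[List[int]]:
        # Yield the completion in slices of at most max_rq_tokens tokens,
        # so no single returned result grows beyond the configured bound.
        for start in range(0, len(token_ids), max_rq_tokens):
            yield token_ids[start:start + max_rq_tokens]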
@@ -250,6 +251,10 @@ class ExllamaV3Container(BaseModelContainer):
         user_chunk_size = unwrap(kwargs.get("chunk_size"), 2048)
         self.chunk_size = self.adjust_chunk_size(user_chunk_size)
 
+        # Output chunking
+        disable_output_chunking = unwrap(kwargs.get("disable_output_chunking"), False)
+        self.max_rq_tokens = None if disable_output_chunking else self.chunk_size
+
         # Template setup
         self.prompt_template = await find_prompt_template(
             kwargs.get("prompt_template"), model_directory
@@ -982,6 +987,7 @@ class ExllamaV3Container(BaseModelContainer):
             banned_strings=params.banned_strings,
             embeddings=mm_embeddings_content,
             return_top_tokens=params.logprobs,
+            max_rq_tokens=self.max_rq_tokens,
         )
 
         generated_tokens = 0
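
One consequence worth noting downstream: with max_rq_tokens set, a single job may deliver a completion as several partial results, so the consuming loop should accumulate per-result counts rather than assume one result per request. A hypothetical consumer (the dict-shaped results are an assumption, not exllamav3's documented output format):

    from typing import AsyncIterator

    async def count_generated(results: AsyncIterator[dict]) -> int:
        # Each chunked result carries at most max_rq_tokens tokens;
        # add them up instead of overwriting the running total.
        generated_tokens = 0
        async for result in results:
            generated_tokens += len(result.get("token_ids", []))
        return generated_tokens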