From 0af29d957a57b436c5348363befcbb58755f05d8 Mon Sep 17 00:00:00 2001
From: turboderp <11859846+turboderp@users.noreply.github.com>
Date: Wed, 15 Oct 2025 10:40:19 +0200
Subject: [PATCH] Fix #390

---
 backends/exllamav2/model.py | 11 +++--------
 backends/exllamav3/model.py | 13 +++++--------
 2 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index 4394697..dead5f5 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -1336,14 +1336,9 @@ class ExllamaV2Container(BaseModelContainer):
         negative_context_len = input_ids[1].size(dim=-1) if negative_prompt else 0
 
         # Automatically set max_tokens to fill up the context
-        # This should be an OK default, but may be changed in the future
-        max_tokens = unwrap(
-            params.max_tokens,
-            self.config.max_seq_len - max(context_len, negative_context_len),
-        )
-        if max_tokens < 1:
-            logger.warning("max_tokens must be a positive integer, setting to 1.")
-            max_tokens = 1
+        max_tokens = unwrap(params.max_tokens, 0)
+        if max_tokens <= 0:
+            max_tokens = self.config.max_seq_len - max(context_len, negative_context_len)
 
         # Determine if the negative context or the context length is bigger
         context_to_check = max(negative_context_len, context_len)
diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py
index c1277d0..d385de7 100644
--- a/backends/exllamav3/model.py
+++ b/backends/exllamav3/model.py
@@ -951,14 +951,11 @@ class ExllamaV3Container(BaseModelContainer):
         # The first index will always be the positive prompt
         context_len = input_ids[0].size(dim=-1)
 
-        # Automatically set max_tokens to fill up the context
-        max_tokens = unwrap(
-            params.max_tokens if params.max_tokens > 0 else None,
-            self.max_seq_len - context_len - 1,
-        )
-        if max_tokens < 1:
-            logger.warning("max_tokens must be a positive integer, setting to 1.")
-            max_tokens = 1
+        # Unless specified in the request, automatically set max_tokens to fill up
+        # the context
+        max_tokens = unwrap(params.max_tokens, 0)
+        if max_tokens <= 0:
+            max_tokens = self.max_seq_len - context_len - 1
 
         # Check total length of prompt against max context length
         if context_len > self.max_seq_len: