Merge pull request #264 from DocShotgun/robust-length-checking

Robust request length checking in generator
2026-04-30 11:11:35 +00:00 · 2024-12-26 23:37:53 -05:00
parent 7878d351a7 b994aae995
commit 709493837b
1 changed file with 38 additions and 6 deletions
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -1307,17 +1307,49 @@ class ExllamaV2Container:
        # The first index will always be the positive prompt
        context_len = input_ids[0].size(dim=-1)
-        if context_len > self.config.max_seq_len:
-            raise ValueError(
-                f"Context length {context_len} is greater than max_seq_len "
-                f"{self.config.max_seq_len}"
-            )
+
+        # The second index will be the negative prompt if CFG is enabled
+        negative_context_len = input_ids[1].size(dim=-1) if negative_prompt else 0
        # Automatically set max_tokens to fill up the context
        # This should be an OK default, but may be changed in the future
        max_tokens = unwrap(
-            kwargs.get("max_tokens"), self.config.max_seq_len - context_len
+            kwargs.get("max_tokens"),
+            self.config.max_seq_len - max(context_len, negative_context_len),
        )
        if max_tokens < 1:
            logger.warning("max_tokens must be a positive integer, setting to 1.")
            max_tokens = 1
        # Determine if the negative context or the context length is bigger
        context_to_check = max(negative_context_len, context_len)
        # Check highest possible total length of request
        if context_to_check + max_tokens > self.config.max_seq_len:
            preamble = (
                "Negative prompt request"
                if negative_context_len > context_len
                else "Request"
            )
            raise ValueError(
                f"{preamble} length {context_to_check} + {max_tokens} is greater than "
                f"max_seq_len {self.config.max_seq_len}"
            )
        # Check total required pages for CFG request to avoid overallocation
        if negative_prompt and (
            sum(
                256 * math.ceil((context + max_tokens) / 256)
                for context in (context_len, negative_context_len)
            )
            > self.cache_size
        ):
            raise ValueError(
                f"Total required page size for request "
                f"{context_len} + {negative_context_len} + {max_tokens} * 2 "
                f"is greater than cache_size {self.cache_size}"
            )
        # Set min_tokens to generate while keeping EOS banned
        min_tokens = unwrap(kwargs.get("min_tokens"), 0)