Merge branch 'main' into draft-split

kingbri
2025-02-08 15:10:44 -05:00
14 changed files with 287 additions and 226 deletions


@@ -498,16 +498,18 @@ class ExllamaV2Container:
"rope_scale": self.config.scale_pos_emb,
"rope_alpha": self.config.scale_alpha_value,
"max_seq_len": self.config.max_seq_len,
"max_batch_size": self.max_batch_size,
"cache_size": self.cache_size,
"cache_mode": self.cache_mode,
"chunk_size": self.config.max_input_len,
"num_experts_per_token": self.config.num_experts_per_token,
"prompt_template": self.prompt_template.name
if self.prompt_template
else None,
"use_vision": self.use_vision,
}
if self.prompt_template:
model_params["prompt_template"] = self.prompt_template.name
model_params["prompt_template_content"] = self.prompt_template.raw_template
if self.draft_config:
draft_model_params = {
"name": self.draft_model_dir.name,
@@ -787,6 +789,10 @@ class ExllamaV2Container:
max_batch_size=self.max_batch_size,
paged=self.paged,
)
# Update the state of the container var
if self.max_batch_size is None:
self.max_batch_size = self.generator.generator.max_batch_size
finally:
# This means the generator is being recreated
# The load lock is already released in the load function
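The added lines here backfill max_batch_size on the container once the generator has been built, adopting whatever value the backend resolved when none was configured. A rough illustration of that lazily-adopted default, using placeholder classes rather than the real ExLlamaV2 generator API:

    class FakeGenerator:
        # Placeholder for the wrapped dynamic generator; assume the backend
        # derives a batch size from its cache and paging math when unset.
        def __init__(self, max_batch_size=None):
            self.max_batch_size = max_batch_size or 8

    class Container:
        def __init__(self, max_batch_size=None):
            self.max_batch_size = max_batch_size
            self.generator = None

        def create_generator(self):
            self.generator = FakeGenerator(max_batch_size=self.max_batch_size)
            # Keep the container's copy in sync with what the backend chose
            if self.max_batch_size is None:
                self.max_batch_size = self.generator.max_batch_size

    container = Container()
    container.create_generator()
    print(container.max_batch_size)  # 8, adopted from the generator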
@@ -1222,7 +1228,7 @@ class ExllamaV2Container:
# Add EBNF filter if it exists
grammar_string = unwrap(kwargs.get("grammar_string"))
if grammar_string:
grammar_handler.add_ebnf_filter(grammar_string, self.model, self.tokenizer)
grammar_handler.add_kbnf_filter(grammar_string, self.model, self.tokenizer)
# Set banned strings
banned_strings: List[str] = unwrap(kwargs.get("banned_strings"), [])
@@ -1329,17 +1335,49 @@ class ExllamaV2Container:
# The first index will always be the positive prompt
context_len = input_ids[0].size(dim=-1)
if context_len > self.config.max_seq_len:
raise ValueError(
f"Context length {context_len} is greater than max_seq_len "
f"{self.config.max_seq_len}"
)
# The second index will be the negative prompt if CFG is enabled
negative_context_len = input_ids[1].size(dim=-1) if negative_prompt else 0
# Automatically set max_tokens to fill up the context
# This should be an OK default, but may be changed in the future
max_tokens = unwrap(
kwargs.get("max_tokens"), self.config.max_seq_len - context_len
kwargs.get("max_tokens"),
self.config.max_seq_len - max(context_len, negative_context_len),
)
if max_tokens < 1:
logger.warning("max_tokens must be a positive integer, setting to 1.")
max_tokens = 1
# Determine if the negative context or the context length is bigger
context_to_check = max(negative_context_len, context_len)
# Check highest possible total length of request
if context_to_check + max_tokens > self.config.max_seq_len:
preamble = (
"Negative prompt request"
if negative_context_len > context_len
else "Request"
)
raise ValueError(
f"{preamble} length {context_to_check} + {max_tokens} is greater than "
f"max_seq_len {self.config.max_seq_len}"
)
# Check total required pages for CFG request to avoid overallocation
if negative_prompt and (
sum(
256 * math.ceil((context + max_tokens) / 256)
for context in (context_len, negative_context_len)
)
> self.cache_size
):
raise ValueError(
f"Total required page size for request "
f"{context_len} + {negative_context_len} + {max_tokens} * 2 "
f"is greater than cache_size {self.cache_size}"
)
# Set min_tokens to generate while keeping EOS banned
min_tokens = unwrap(kwargs.get("min_tokens"), 0)