Mirror of https://github.com/theroyallab/tabbyAPI.git (synced 2026-04-30 03:01:44 +00:00)
Merge branch 'main' into draft-split
This commit is contained in:
@@ -498,16 +498,18 @@ class ExllamaV2Container:
|
||||
"rope_scale": self.config.scale_pos_emb,
|
||||
"rope_alpha": self.config.scale_alpha_value,
|
||||
"max_seq_len": self.config.max_seq_len,
|
||||
"max_batch_size": self.max_batch_size,
|
||||
"cache_size": self.cache_size,
|
||||
"cache_mode": self.cache_mode,
|
||||
"chunk_size": self.config.max_input_len,
|
||||
"num_experts_per_token": self.config.num_experts_per_token,
|
||||
"prompt_template": self.prompt_template.name
|
||||
if self.prompt_template
|
||||
else None,
|
||||
"use_vision": self.use_vision,
|
||||
}
|
||||
|
||||
if self.prompt_template:
|
||||
model_params["prompt_template"] = self.prompt_template.name
|
||||
model_params["prompt_template_content"] = self.prompt_template.raw_template
|
||||
|
||||
if self.draft_config:
|
||||
draft_model_params = {
|
||||
"name": self.draft_model_dir.name,
|
||||
@@ -787,6 +789,10 @@ class ExllamaV2Container:
|
||||
max_batch_size=self.max_batch_size,
|
||||
paged=self.paged,
|
||||
)
|
||||
|
||||
# Update the state of the container var
|
||||
if self.max_batch_size is None:
|
||||
self.max_batch_size = self.generator.generator.max_batch_size
|
||||
finally:
|
||||
# This means the generator is being recreated
|
||||
# The load lock is already released in the load function
|
||||
@@ -1222,7 +1228,7 @@ class ExllamaV2Container:
|
||||
# Add EBNF filter if it exists
|
||||
grammar_string = unwrap(kwargs.get("grammar_string"))
|
||||
if grammar_string:
|
||||
grammar_handler.add_ebnf_filter(grammar_string, self.model, self.tokenizer)
|
||||
grammar_handler.add_kbnf_filter(grammar_string, self.model, self.tokenizer)
|
||||
|
||||
# Set banned strings
|
||||
banned_strings: List[str] = unwrap(kwargs.get("banned_strings"), [])
|
||||
@@ -1329,17 +1335,49 @@ class ExllamaV2Container:
|
||||
|
||||
# The first index will always be the positive prompt
|
||||
context_len = input_ids[0].size(dim=-1)
|
||||
if context_len > self.config.max_seq_len:
|
||||
raise ValueError(
|
||||
f"Context length {context_len} is greater than max_seq_len "
|
||||
f"{self.config.max_seq_len}"
|
||||
)
|
||||
|
||||
# The second index will be the negative prompt if CFG is enabled
|
||||
negative_context_len = input_ids[1].size(dim=-1) if negative_prompt else 0
|
||||
|
||||
# Automatically set max_tokens to fill up the context
|
||||
# This should be an OK default, but may be changed in the future
|
||||
max_tokens = unwrap(
|
||||
kwargs.get("max_tokens"), self.config.max_seq_len - context_len
|
||||
kwargs.get("max_tokens"),
|
||||
self.config.max_seq_len - max(context_len, negative_context_len),
|
||||
)
|
||||
if max_tokens < 1:
|
||||
logger.warning("max_tokens must be a positive integer, setting to 1.")
|
||||
max_tokens = 1
|
||||
|
||||
# Determine if the negative context or the context length is bigger
|
||||
context_to_check = max(negative_context_len, context_len)
|
||||
|
||||
# Check highest possible total length of request
|
||||
if context_to_check + max_tokens > self.config.max_seq_len:
|
||||
preamble = (
|
||||
"Negative prompt request"
|
||||
if negative_context_len > context_len
|
||||
else "Request"
|
||||
)
|
||||
|
||||
raise ValueError(
|
||||
f"{preamble} length {context_to_check} + {max_tokens} is greater than "
|
||||
f"max_seq_len {self.config.max_seq_len}"
|
||||
)
|
||||
|
||||
# Check total required pages for CFG request to avoid overallocation
|
||||
if negative_prompt and (
|
||||
sum(
|
||||
256 * math.ceil((context + max_tokens) / 256)
|
||||
for context in (context_len, negative_context_len)
|
||||
)
|
||||
> self.cache_size
|
||||
):
|
||||
raise ValueError(
|
||||
f"Total required page size for request "
|
||||
f"{context_len} + {negative_context_len} + {max_tokens} * 2 "
|
||||
f"is greater than cache_size {self.cache_size}"
|
||||
)
|
||||
|
||||
# Set min_tokens to generate while keeping EOS banned
|
||||
min_tokens = unwrap(kwargs.get("min_tokens"), 0)
|
||||
|
||||
Reference in New Issue
Block a user