mirror of
https://github.com/theroyallab/tabbyAPI.git
synced 2026-06-29 10:47:09 +00:00
35 lines
1.1 KiB
Python
35 lines
1.1 KiB
Python
class ContextLengthExceededError(ValueError):
    """Error for a prompt that does not fit within the model's context limit.

    Subclasses ``ValueError`` so handlers that already catch ``ValueError``
    for invalid request parameters also catch this error.
    """
|
|
|
|
|
|
def validate_context_requirements(
|
|
context_len: int,
|
|
max_seq_len: int,
|
|
max_tokens: int,
|
|
cache_capacity: int,
|
|
max_rq_tokens: int | None = None,
|
|
allocation_boundary: int = 256,
|
|
):
|
|
"""Validate the initial cache allocation required by an ExLlamaV3 job."""
|
|
|
|
if context_len > max_seq_len:
|
|
raise ContextLengthExceededError(
|
|
f"Prompt length {context_len} is greater than max_seq_len {max_seq_len}"
|
|
)
|
|
|
|
if max_tokens <= 0:
|
|
max_tokens = max_seq_len - context_len - 1
|
|
|
|
if max_rq_tokens is not None:
|
|
required_tokens = (
|
|
(context_len - 1 + max_rq_tokens + allocation_boundary - 1) // allocation_boundary
|
|
) * allocation_boundary
|
|
else:
|
|
required_tokens = context_len + max_tokens
|
|
|
|
if required_tokens > cache_capacity:
|
|
raise ContextLengthExceededError(
|
|
f"Initial job allocation requires {required_tokens} cache tokens, "
|
|
f"which exceeds the available context size of {cache_capacity} tokens"
|
|
)
|