Files
tabbyAPI/common/errors.py

71 lines
2.1 KiB
Python

from fastapi import HTTPException, Request
from fastapi.responses import JSONResponse
class ContextLengthExceededError(ValueError):
    """Signal that a tokenized prompt does not fit the loaded model's context.

    Subclasses ``ValueError``, so handlers written for ``ValueError`` also
    catch this exception.
    """
class ContextLengthHTTPException(HTTPException):
    """HTTP 400 exception used for OpenAI-compatible context overflow replies."""

    def __init__(self, message: str):
        """Create the exception with ``message`` stored as the HTTP detail."""
        # The status code is always 400; only the detail text varies.
        super().__init__(detail=message, status_code=400)
def context_length_error_content(message: str) -> dict:
    """Wrap ``message`` in an OpenAI-compatible context overflow error body.

    Args:
        message: Human-readable description placed under ``error.message``.

    Returns:
        A dict with a single ``"error"`` key whose value holds the message,
        the fixed type ``"invalid_request_error"``, a ``None`` param and the
        fixed code ``"context_length_exceeded"``.
    """
    error_body = {
        "message": message,
        "type": "invalid_request_error",
        "param": None,
        "code": "context_length_exceeded",
    }
    return {"error": error_body}
async def context_length_exception_handler(
    request: Request, exc: ContextLengthHTTPException
) -> JSONResponse:
    """Convert a context overflow exception into an OpenAI-shaped JSON reply.

    Args:
        request: The incoming request; not used by this handler.
        exc: The raised exception supplying the status code and detail text.

    Returns:
        A ``JSONResponse`` carrying the exception's status code and the error
        body built from its detail.
    """
    payload = context_length_error_content(exc.detail)
    return JSONResponse(content=payload, status_code=exc.status_code)
def validate_context_requirements(
context_len: int,
max_seq_len: int,
max_tokens: int,
cache_capacity: int,
max_rq_tokens: int | None = None,
allocation_boundary: int = 256,
):
"""Validate the initial cache allocation required by an ExLlamaV3 job."""
if context_len > max_seq_len:
raise ContextLengthExceededError(
f"Prompt length {context_len} exceeds the available context size "
f"of {max_seq_len} tokens"
)
if max_tokens <= 0:
max_tokens = max_seq_len - context_len - 1
if max_rq_tokens is not None:
required_tokens = (
(context_len - 1 + max_rq_tokens + allocation_boundary - 1) // allocation_boundary
) * allocation_boundary
else:
required_tokens = context_len + max_tokens
if required_tokens > cache_capacity:
raise ContextLengthExceededError(
f"Initial job allocation requires {required_tokens} cache tokens, "
f"which exceeds the available context size of {cache_capacity} tokens"
)