mirror of
https://github.com/theroyallab/tabbyAPI.git
synced 2026-06-28 18:27:12 +00:00
71 lines
2.1 KiB
Python
71 lines
2.1 KiB
Python
from fastapi import HTTPException, Request
|
|
from fastapi.responses import JSONResponse
|
|
|
|
|
|
class ContextLengthExceededError(ValueError):
    """Signal that a tokenized prompt does not fit in the loaded model's context.

    Derives from ``ValueError``, so code that handles generic value errors
    also catches this exception.
    """
|
|
|
|
|
|
class ContextLengthHTTPException(HTTPException):
    """HTTP 400 error carrying a context-overflow message.

    Intended for OpenAI-compatible context overflow responses.
    """

    def __init__(self, message: str) -> None:
        """Create the exception with status 400 and ``message`` as its detail."""
        super().__init__(detail=message, status_code=400)
|
|
|
|
|
|
def context_length_error_content(message: str) -> dict:
    """Wrap ``message`` in an OpenAI-compatible context overflow error body.

    Returns:
        A dict with a single ``"error"`` key whose value holds the message,
        the ``invalid_request_error`` type, a null ``param`` and the
        ``context_length_exceeded`` code.
    """

    error_body = {
        "message": message,
        "type": "invalid_request_error",
        "param": None,
        "code": "context_length_exceeded",
    }
    return {"error": error_body}
|
|
|
|
|
|
async def context_length_exception_handler(
    request: Request, exc: ContextLengthHTTPException
) -> JSONResponse:
    """Render ``exc`` as the OpenAI error shape expected by compatible clients.

    The response reuses the exception's status code and places its detail
    message inside the body built by ``context_length_error_content``.
    """

    # ``request`` is not read here; presumably it is part of the handler
    # signature the framework expects.
    payload = context_length_error_content(exc.detail)
    return JSONResponse(content=payload, status_code=exc.status_code)
|
|
|
|
|
|
def validate_context_requirements(
|
|
context_len: int,
|
|
max_seq_len: int,
|
|
max_tokens: int,
|
|
cache_capacity: int,
|
|
max_rq_tokens: int | None = None,
|
|
allocation_boundary: int = 256,
|
|
):
|
|
"""Validate the initial cache allocation required by an ExLlamaV3 job."""
|
|
|
|
if context_len > max_seq_len:
|
|
raise ContextLengthExceededError(
|
|
f"Prompt length {context_len} exceeds the available context size "
|
|
f"of {max_seq_len} tokens"
|
|
)
|
|
|
|
if max_tokens <= 0:
|
|
max_tokens = max_seq_len - context_len - 1
|
|
|
|
if max_rq_tokens is not None:
|
|
required_tokens = (
|
|
(context_len - 1 + max_rq_tokens + allocation_boundary - 1) // allocation_boundary
|
|
) * allocation_boundary
|
|
else:
|
|
required_tokens = context_len + max_tokens
|
|
|
|
if required_tokens > cache_capacity:
|
|
raise ContextLengthExceededError(
|
|
f"Initial job allocation requires {required_tokens} cache tokens, "
|
|
f"which exceeds the available context size of {cache_capacity} tokens"
|
|
)
|