Merge remote-tracking branch 'upstream/main' into HEAD

TerminalMan
2024-09-11 15:57:18 +01:00
28 changed files with 386 additions and 171 deletions

View File

@@ -22,6 +22,7 @@ from endpoints.OAI.utils.chat_completion import (
 )
 from endpoints.OAI.utils.completion import (
     generate_completion,
+    load_inline_model,
     stream_generate_completion,
 )
 from endpoints.OAI.utils.embeddings import get_embeddings
@@ -42,7 +43,7 @@ def setup():
 # Completions endpoint
 @router.post(
     "/v1/completions",
-    dependencies=[Depends(check_api_key), Depends(check_model_container)],
+    dependencies=[Depends(check_api_key)],
 )
 async def completion_request(
     request: Request, data: CompletionRequest
@@ -53,6 +54,18 @@ async def completion_request(
     If stream = true, this returns an SSE stream.
     """
 
+    if data.model:
+        inline_load_task = asyncio.create_task(load_inline_model(data.model, request))
+
+        await run_with_request_disconnect(
+            request,
+            inline_load_task,
+            disconnect_message=f"Model switch for generation {request.state.id} "
+            + "cancelled by user.",
+        )
+    else:
+        await check_model_container()
+
     model_path = model.container.model_dir
 
     if isinstance(data.prompt, list):
@@ -85,7 +98,7 @@ async def completion_request(
 # Chat completions endpoint
 @router.post(
     "/v1/chat/completions",
-    dependencies=[Depends(check_api_key), Depends(check_model_container)],
+    dependencies=[Depends(check_api_key)],
 )
 async def chat_completion_request(
     request: Request, data: ChatCompletionRequest
@@ -96,6 +109,11 @@ async def chat_completion_request(
     If stream = true, this returns an SSE stream.
     """
 
+    if data.model:
+        await load_inline_model(data.model, request)
+    else:
+        await check_model_container()
+
     if model.container.prompt_template is None:
         error_message = handle_request_error(
             "Chat completions are disabled because a prompt template is not set.",

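Taken together, the router changes above drop the check_model_container dependency from both endpoints and branch on data.model instead: when the field is present, the requested model is loaded inline (wrapped in run_with_request_disconnect on the completions route so a dropped client cancels the switch); when it is absent, the old container check still runs. A minimal client-side sketch of what this enables follows; the base URL, port, header name, key value, and model folder name are placeholders and assumptions, not part of this diff.

import requests  # placeholder HTTP client; any client works

resp = requests.post(
    "http://localhost:5000/v1/completions",     # assumed host/port
    headers={"x-admin-key": "YOUR_ADMIN_KEY"},  # switching models requires admin permission
    json={
        "model": "MyModel-exl2",                # folder name under model_dir (placeholder)
        "prompt": "Hello",
        "max_tokens": 32,
    },
    timeout=600,  # the first request may block while the new model loads
)
print(resp.json())
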
View File

@@ -56,6 +56,7 @@ class ChatCompletionRequest(CommonCompletionRequest):
     add_generation_prompt: Optional[bool] = True
     template_vars: Optional[dict] = {}
     response_prefix: Optional[str] = None
+    model: Optional[str] = None
 
     # tools is follows the format OAI schema, functions is more flexible
     # both are available in the chat template.
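Because the new model field defaults to None, the schema change is backward compatible: existing chat completion payloads validate unchanged and fall through to the check_model_container path. A sketch of the two request shapes (field name from this diff; all values are placeholders):

# Switches (or confirms) the served model before generating:
switch_payload = {
    "model": "MyModel-exl2",
    "messages": [{"role": "user", "content": "Hi"}],
}

# Omits "model" entirely, so the currently loaded container is used as before:
legacy_payload = {
    "messages": [{"role": "user", "content": "Hi"}],
}
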

View File

@@ -1,4 +1,8 @@
-"""Completion utilities for OAI server."""
+"""
+Completion utilities for OAI server.
+
+Also serves as a common module for completions and chat completions.
+"""
 
 import asyncio
 import pathlib
@@ -10,12 +14,14 @@ from typing import List, Union
 from loguru import logger
 
 from common import model
+from common.auth import get_key_permission
 from common.networking import (
     get_generator_error,
     handle_request_disconnect,
     handle_request_error,
     request_disconnect_loop,
 )
+from common.tabby_config import config
 from common.utils import unwrap
 from endpoints.OAI.types.completion import (
     CompletionRequest,
@@ -103,6 +109,50 @@ async def _stream_collector(
         await gen_queue.put(e)
 
 
+async def load_inline_model(model_name: str, request: Request):
+    """Load a model from the data.model parameter"""
+
+    # Return if the model container already exists and the model is fully loaded
+    if (
+        model.container
+        and model.container.model_dir.name == model_name
+        and model.container.model_loaded
+    ):
+        return
+
+    # Inline model loading isn't enabled or the user isn't an admin
+    if not get_key_permission(request) == "admin":
+        error_message = handle_request_error(
+            f"Unable to switch model to {model_name} because "
+            + "an admin key isn't provided",
+            exc_info=False,
+        ).error.message
+
+        raise HTTPException(401, error_message)
+
+    if not unwrap(config.model.get("inline_model_loading"), False):
+        logger.warning(
+            f"Unable to switch model to {model_name} because "
+            '"inline_model_loading" is not True in config.yml.'
+        )
+
+        return
+
+    model_path = pathlib.Path(unwrap(config.model.get("model_dir"), "models"))
+    model_path = model_path / model_name
+
+    # Model path doesn't exist
+    if not model_path.exists():
+        logger.warning(
+            f"Could not find model path {str(model_path)}. Skipping inline model load."
+        )
+
+        return
+
+    # Load the model
+    await model.load_model(model_path)
+
+
 async def stream_generate_completion(
     data: CompletionRequest, request: Request, model_path: pathlib.Path
 ):
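
For load_inline_model to actually switch models, the request must carry an admin key and the server config must opt in; otherwise the function either raises a 401 (non-admin key) or logs a warning and keeps the current container (flag disabled or model path missing). The sketch below restates the two config lookups the function performs and, in the comment, an assumed config.yml shape for those keys; only the key names come from the diff, the YAML layout is an assumption.

# Assumed config.yml fragment (key names from the diff, layout assumed):
#
#   model:
#     inline_model_loading: true   # must be truthy or the switch is skipped with a warning
#     model_dir: models            # model_name is resolved relative to this directory
#
from common.tabby_config import config
from common.utils import unwrap

inline_enabled = unwrap(config.model.get("inline_model_loading"), False)
model_root = unwrap(config.model.get("model_dir"), "models")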