mirror of
https://github.com/theroyallab/tabbyAPI.git
synced 2026-03-14 15:57:27 +00:00
API: Add inline exception for dummy models
If an API key sends a dummy model name, the server shouldn't error, because it is catering to clients that expect specific OAI model names. This is a problem with inline model loading, since those names would error by default. Therefore, add an exception when the provided name is one of the dummy model names (which also double as inline strict exceptions). However, the dummy model names weren't configurable, so add a new option to specify exception names; otherwise the default is gpt-3.5-turbo.

Signed-off-by: kingbri <bdashore3@proton.me>
This commit is contained in:
@@ -130,18 +130,33 @@ async def load_inline_model(model_name: str, request: Request):
|
||||
|
||||
return
|
||||
|
||||
# Error if an invalid key is passed
|
||||
if get_key_permission(request) != "admin":
|
||||
error_message = handle_request_error(
|
||||
f"Unable to switch model to {model_name} because "
|
||||
+ "an admin key isn't provided",
|
||||
exc_info=False,
|
||||
).error.message
|
||||
is_dummy_model = (
|
||||
config.model.use_dummy_models and model_name in config.model.dummy_model_names
|
||||
)
|
||||
|
||||
raise HTTPException(401, error_message)
|
||||
# Error if an invalid key is passed
|
||||
# If a dummy model is provided, don't error
|
||||
if get_key_permission(request) != "admin":
|
||||
if not is_dummy_model:
|
||||
error_message = handle_request_error(
|
||||
f"Unable to switch model to {model_name} because "
|
||||
+ "an admin key isn't provided",
|
||||
exc_info=False,
|
||||
).error.message
|
||||
|
||||
raise HTTPException(401, error_message)
|
||||
else:
|
||||
return
|
||||
|
||||
# Start inline loading
|
||||
# Past here, user is assumed to be admin
|
||||
|
||||
# Skip if the model is a dummy
|
||||
if is_dummy_model:
|
||||
logger.warning(f"Dummy model {model_name} provided. Skipping inline load.")
|
||||
|
||||
return
|
||||
|
||||
model_path = pathlib.Path(config.model.model_dir)
|
||||
model_path = model_path / model_name
|
||||
|
||||
|
||||
@@ -39,6 +39,7 @@ from endpoints.core.utils.lora import get_active_loras, get_lora_list
|
||||
from endpoints.core.utils.model import (
|
||||
get_current_model,
|
||||
get_current_model_list,
|
||||
get_dummy_models,
|
||||
get_model_list,
|
||||
stream_model_load,
|
||||
)
|
||||
@@ -82,7 +83,7 @@ async def list_models(request: Request) -> ModelList:
|
||||
models = await get_current_model_list()
|
||||
|
||||
if config.model.use_dummy_models:
|
||||
models.data.insert(0, ModelCard(id="gpt-3.5-turbo"))
|
||||
models.data[:0] = get_dummy_models()
|
||||
|
||||
return models
|
||||
|
||||
|
||||
@@ -92,6 +92,13 @@ def get_current_model():
|
||||
return model_card
|
||||
|
||||
|
||||
def get_dummy_models():
    """Build ModelCard placeholders for the configured dummy model names.

    Returns one ModelCard per name in ``config.model.dummy_model_names``;
    when no names are configured, falls back to a single card with the
    default id "gpt-3.5-turbo".
    """
    configured_names = config.model.dummy_model_names
    if not configured_names:
        # No custom names configured — use the historical default.
        configured_names = ["gpt-3.5-turbo"]
    return [ModelCard(id=name) for name in configured_names]
|
||||
|
||||
|
||||
async def stream_model_load(
|
||||
data: ModelLoadRequest,
|
||||
model_path: pathlib.Path,
|
||||
|
||||
Reference in New Issue
Block a user