mirror of
https://github.com/theroyallab/tabbyAPI.git
synced 2026-03-15 00:07:28 +00:00
Model: Use true async jobs and add logprobs
The new async dynamic job allows for native async support without the need for threading. Also adds logprobs and metrics back to responses. Signed-off-by: kingbri <bdashore3@proton.me>
This commit is contained in:
@@ -1,8 +1,7 @@
|
||||
"""Chat completion utilities for OAI server."""
|
||||
|
||||
from asyncio import CancelledError
|
||||
import pathlib
|
||||
import threading
|
||||
from asyncio import CancelledError
|
||||
from typing import Optional
|
||||
from uuid import uuid4
|
||||
|
||||
@@ -198,11 +197,8 @@ async def stream_generate_chat_completion(
|
||||
"""Generator for the generation process."""
|
||||
try:
|
||||
const_id = f"chatcmpl-{uuid4().hex}"
|
||||
abort_event = threading.Event()
|
||||
|
||||
new_generation = model.container.generate_gen(
|
||||
prompt, abort_event, **data.to_gen_params()
|
||||
)
|
||||
new_generation = model.container.generate_gen(prompt, **data.to_gen_params())
|
||||
async for generation in new_generation:
|
||||
response = _create_stream_chunk(const_id, generation, model_path.name)
|
||||
|
||||
@@ -214,7 +210,6 @@ async def stream_generate_chat_completion(
|
||||
except CancelledError:
|
||||
# Get out if the request gets disconnected
|
||||
|
||||
abort_event.set()
|
||||
handle_request_disconnect("Chat completion generation cancelled by user.")
|
||||
except Exception:
|
||||
yield get_generator_error(
|
||||
|
||||
Reference in New Issue
Block a user