Model: Add proper jobs cleanup and fix var calls

Jobs should be started and immediately cleaned up when calling the generation stream. Expose a stream_generate function and add it to the base class, since that name is more idiomatic than generate_gen. The exl2 container's generate_gen function is now internal.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
2026-04-23 15:59:14 +00:00 · 2025-04-24 21:30:55 -04:00
parent 7e007f0761
commit f070587e9f
6 changed files with 45 additions and 26 deletions
--- a/endpoints/OAI/utils/completion.py
+++ b/endpoints/OAI/utils/completion.py
@@ -95,7 +95,7 @@ async def _stream_collector(
    """Collects a stream and places results in a common queue"""

    try:
-        new_generation = model.container.generate_gen(
+        new_generation = model.container.stream_generate(
            request_id,
            prompt,
            params,
@@ -120,7 +120,7 @@ async def load_inline_model(model_name: str, request: Request):
    if (
        model.container
        and model.container.model_dir.name == model_name
-        and model.container.model_loaded
+        and model.container.loaded
    ):
        return