Model: Attempt to recreate generator on a fatal error

If a job causes the generator to error, tabby stops working until it is
relaunched. It's better to add a layer of redundancy and recreate the
generator in the event that it fails.

This may later be replaced with an exit signal on a fatal error instead,
but that hasn't been decided yet.

Signed-off-by: kingbri <bdashore3@proton.me>
This commit is contained in:
kingbri
2024-07-15 01:09:49 -04:00
parent 6019c93637
commit 9dae461142

View File

@@ -488,15 +488,7 @@ class ExllamaV2Container:
yield value
# Create async generator
self.generator = ExLlamaV2DynamicGeneratorAsync(
model=self.model,
cache=self.cache,
draft_model=self.draft_model,
draft_cache=self.draft_cache,
tokenizer=self.tokenizer,
max_batch_size=self.max_batch_size,
paged=self.paged,
)
await self.create_generator()
# Clean up any extra vram usage from torch and cuda
# (Helps reduce VRAM bottlenecking on Windows)
@@ -645,6 +637,34 @@ class ExllamaV2Container:
input_ids = torch.zeros((1, self.config.max_input_len), dtype=torch.long)
self.model.forward(input_ids, cache=self.cache, preprocess_only=True)
async def create_generator(self):
    """(Re)create the async dynamic generator, cancelling in-flight jobs first.

    Called both during the initial model load (when `self.model_loaded` is
    still False and the load function owns the lock) and to re-establish a
    working generator after a fatal generation error.
    """

    # Track whether *this call* acquired the lock. Re-checking
    # self.model_loaded in the finally block is racy: the flag could flip
    # while awaiting generator construction, leaking the lock or releasing
    # one we never acquired.
    acquired = False
    try:
        # Don't acquire locks unless a model is loaded
        if self.model_loaded:
            await self.load_lock.acquire()
            acquired = True

            # Immediately cancel all jobs
            await self.wait_for_jobs(skip_wait=True)

        # Create new generator
        self.generator = ExLlamaV2DynamicGeneratorAsync(
            model=self.model,
            cache=self.cache,
            draft_model=self.draft_model,
            draft_cache=self.draft_cache,
            tokenizer=self.tokenizer,
            max_batch_size=self.max_batch_size,
            paged=self.paged,
        )
    finally:
        # Release only if we acquired above. When called from the load
        # function, the load lock is managed (and released) there instead.
        if acquired:
            self.load_lock.release()

            async with self.load_condition:
                self.load_condition.notify_all()
def get_loras(self):
"""Convenience function to get all loras."""
@@ -1223,3 +1243,14 @@ class ExllamaV2Container:
break
except asyncio.CancelledError:
await job.cancel()
except Exception as ex:
# Create a new generator since the current state is broken
# No need to wait for this to finish
logger.error(
"FATAL ERROR with generation. "
"Attempting to recreate the generator. "
"If this fails, please restart the server.\n"
)
asyncio.ensure_future(self.create_generator())
raise ex