Mirror of https://github.com/theroyallab/tabbyAPI.git (synced 2026-03-14 15:57:27 +00:00)
Model: Attempt to recreate generator on a fatal error
If a job causes the generator to error, tabby stops working until the server is relaunched. It's better to build in some redundancy and recreate the generator when it fails. This may be replaced with an exit signal on fatal errors instead, but that's still undecided.

Signed-off-by: kingbri <bdashore3@proton.me>
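For readers skimming the diff below, the commit reduces to a small pattern: wrap generation in a try/except, schedule a rebuild of the broken generator as a background task, and re-raise the original error. Here is a minimal standalone sketch of that pattern; the Container, make_generator, job, and run_job names are illustrative stand-ins, not tabbyAPI's actual API:

import asyncio
import logging

logger = logging.getLogger(__name__)


class Container:
    """Minimal sketch of the recreate-on-fatal-error pattern."""

    def __init__(self, make_generator):
        # make_generator stands in for the real
        # ExLlamaV2DynamicGeneratorAsync construction
        self.make_generator = make_generator
        self.generator = make_generator()

    async def run_job(self, job):
        try:
            return await job(self.generator)
        except asyncio.CancelledError:
            # Cancellation is a normal shutdown path, not corruption
            raise
        except Exception:
            # Assume the generator's internal state is now broken:
            # schedule a replacement without blocking this coroutine,
            # then surface the original error to the caller
            logger.error("Fatal generation error; recreating generator")
            asyncio.ensure_future(self.recreate())
            raise

    async def recreate(self):
        self.generator = self.make_generator()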
@@ -488,15 +488,7 @@ class ExllamaV2Container:
             yield value
 
             # Create async generator
-            self.generator = ExLlamaV2DynamicGeneratorAsync(
-                model=self.model,
-                cache=self.cache,
-                draft_model=self.draft_model,
-                draft_cache=self.draft_cache,
-                tokenizer=self.tokenizer,
-                max_batch_size=self.max_batch_size,
-                paged=self.paged,
-            )
+            await self.create_generator()
 
         # Clean up any extra vram usage from torch and cuda
         # (Helps reduce VRAM bottlenecking on Windows)
@@ -645,6 +637,34 @@ class ExllamaV2Container:
         input_ids = torch.zeros((1, self.config.max_input_len), dtype=torch.long)
         self.model.forward(input_ids, cache=self.cache, preprocess_only=True)
 
+    async def create_generator(self):
+        try:
+            # Don't acquire locks unless a model is loaded
+            if self.model_loaded:
+                await self.load_lock.acquire()
+
+                # Immediately cancel all jobs
+                await self.wait_for_jobs(skip_wait=True)
+
+            # Create new generator
+            self.generator = ExLlamaV2DynamicGeneratorAsync(
+                model=self.model,
+                cache=self.cache,
+                draft_model=self.draft_model,
+                draft_cache=self.draft_cache,
+                tokenizer=self.tokenizer,
+                max_batch_size=self.max_batch_size,
+                paged=self.paged,
+            )
+        finally:
+            # This means the generator is being recreated
+            # The load lock is already released in the load function
+            if self.model_loaded:
+                self.load_lock.release()
+
+                async with self.load_condition:
+                    self.load_condition.notify_all()
+
     def get_loras(self):
         """Convenience function to get all loras."""
 
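The locking in create_generator above is worth spelling out: the load lock is only taken on the recreation path (when a model is already loaded), pending jobs are cancelled first, and the finally block releases the lock and wakes anything waiting on the load condition. Below is a minimal sketch of that asyncio.Lock/Condition discipline; rebuild_resource, build, and the module-level flags are hypothetical stand-ins rather than tabbyAPI's real internals:

import asyncio

load_lock = asyncio.Lock()
load_condition = asyncio.Condition()
model_loaded = True


async def rebuild_resource(build):
    """Rebuild a shared resource while excluding concurrent loaders."""
    try:
        if model_loaded:
            # Recreation path: take the same lock the load path uses
            # so a reload cannot race the rebuild
            await load_lock.acquire()

        return build()
    finally:
        if model_loaded:
            load_lock.release()

            # Wake any coroutines blocked on the load condition,
            # mirroring load_condition.notify_all() in the diff
            async with load_condition:
                load_condition.notify_all()

As in the diff itself, the finally block assumes the acquire succeeded; if acquisition could be cancelled midway, releasing an unheld lock would raise.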
@@ -1223,3 +1243,14 @@ class ExllamaV2Container:
                     break
         except asyncio.CancelledError:
             await job.cancel()
+        except Exception as ex:
+            # Create a new generator since the current state is broken
+            # No need to wait for this to finish
+            logger.error(
+                "FATAL ERROR with generation. "
+                "Attempting to recreate the generator. "
+                "If this fails, please restart the server.\n"
+            )
+            asyncio.ensure_future(self.create_generator())
+
+            raise ex
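Note the fire-and-forget recovery in this last hunk: asyncio.ensure_future schedules create_generator without awaiting it, so the failing request still errors out immediately while the rebuild proceeds on the event loop. A toy demonstration of that ordering, using hypothetical names (broken_call, recover, handle_request):

import asyncio


async def broken_call():
    raise RuntimeError("simulated fatal generator error")


async def recover():
    # Stand-in for create_generator(); runs after the exception
    # has already propagated to the caller
    print("generator recreated")


async def handle_request():
    try:
        await broken_call()
    except Exception:
        # Schedule recovery without awaiting it, then re-raise so
        # the caller sees the failure immediately
        asyncio.ensure_future(recover())
        raise


async def main():
    try:
        await handle_request()
    except RuntimeError as ex:
        print(f"request failed: {ex}")

    # Yield once so the background recovery task gets to run;
    # in a long-lived server the loop keeps spinning anyway
    await asyncio.sleep(0)


asyncio.run(main())

Running this prints the request failure first and the recovery message second, which is exactly the behavior the commit aims for: the client sees the error right away, and the server heals itself in the background.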