Model: Create generator on load

This commit is contained in:
turboderp
2025-05-03 18:32:51 +02:00
parent 0d949d00b9
commit 58c380b8ca

View File

@@ -345,6 +345,9 @@ class ExllamaV3Container(BaseModelContainer):
async for value in iterate_in_threadpool(generator):
yield value
# Create async generator
await self.create_generator()
# Clean up any extra VRAM usage from torch and CUDA
# (Helps reduce VRAM bottlenecking on Windows)
gc.collect()
@@ -774,12 +777,6 @@ class ExllamaV3Container(BaseModelContainer):
f"max_seq_len {self.max_seq_len}"
)
self.generator = AsyncGenerator(
model=self.model,
cache=self.cache,
tokenizer=self.tokenizer,
)
generation = {}
job = AsyncJob(
self.generator,