fixup: auto split
https://github.com/theroyallab/tabbyAPI.git
@@ -1,5 +1,6 @@
 import asyncio
 import gc
+import math
 import pathlib
 from loguru import logger
 from typing import (
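The math import is new in this commit; it backs the megabyte-to-byte conversion added further down. A minimal standalone sketch (not part of the commit) of that conversion, showing why math.ceil is used rather than bare int() truncation:

import math

# Convert a per-GPU reserve given in megabytes to bytes.
# math.ceil rounds up, so a fractional megabyte still reserves at
# least that much VRAM; int() alone would truncate the value down.
def megabytes_to_bytes(value: float) -> int:
    return int(math.ceil(value * 1024**2))

print(megabytes_to_bytes(96))   # 100663296
print(megabytes_to_bytes(0.5))  # 524288 (int() of 0.5 MB would also
                                # be 524288 here, but e.g. 0.3 MB
                                # truncates low without ceil)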
@@ -46,8 +47,11 @@ class ExllamaV3Container(BaseModelContainer):
     cache: Cache
     tokenizer: Tokenizer
     config: Config
-    gpu_split: List[float] = []
+    gpu_split: List[float] | None = None
+    gpu_split_auto: bool = True
+    autosplit_reserve: List[float] = [96 * 1024**2]
     max_seq_len: int
+    use_tp: bool = False

     # Required methods
     @classmethod
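Note the default change: gpu_split now starts as None rather than a class-level empty list, so "no manual split configured" is distinguishable from any real value and no mutable list is shared between instances. A standalone sketch of that sentinel behavior (the class and helper here are illustrative, not tabbyAPI's API):

from typing import List, Optional

class SplitSketch:
    # Mirrors the diff's new defaults; None is the "unset" sentinel
    gpu_split: Optional[List[float]] = None
    gpu_split_auto: bool = True

def wants_manual_split(gpu_split: Optional[List[float]]) -> bool:
    # Both None and [] are falsy, matching the diff's `if gpu_split:`
    return bool(gpu_split)

assert wants_manual_split([18.0, 24.0])
assert not wants_manual_split(None)
assert not wants_manual_split([])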
@@ -76,17 +80,43 @@ class ExllamaV3Container(BaseModelContainer):

         self.max_seq_len = kwargs.get("max_seq_len")
         self.cache = Cache(self.model, max_num_tokens=self.max_seq_len)
-        gpu_split = unwrap(kwargs.get("gpu_split"), [])

-        # Set GPU split options
-        # Enable manual GPU split if provided
-        if gpu_split:
-            self.gpu_split = gpu_split
         # Try to set prompt template
         self.prompt_template = await find_prompt_template(
             kwargs.get("prompt_template"), model_directory
         )

+        # Turn off GPU split if the user is using 1 GPU
+        gpu_count = torch.cuda.device_count()
+        gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
+        gpu_split = unwrap(kwargs.get("gpu_split"), None)
+
+        # Set GPU split options
+        if gpu_count == 1:
+            self.gpu_split_auto = False
+            logger.info("Disabling GPU split because one GPU is in use.")
+        else:
+            # TODO: Set tensor parallel
+
+            # Set GPU split options
+            # Enable manual GPU split if provided
+            if gpu_split:
+                self.gpu_split = gpu_split
+            elif gpu_split_auto and not self.use_tp:
+                # Otherwise fallback to autosplit settings
+                self.gpu_split_auto = gpu_split_auto
+
+                autosplit_reserve_megabytes = unwrap(
+                    kwargs.get("autosplit_reserve"), [96]
+                )
+
+                # Reserve VRAM for each GPU
+                self.autosplit_reserve = [
+                    int(math.ceil(value * 1024**2))
+                    for value in autosplit_reserve_megabytes
+                ]
+        # TODO: speculative decoding
+
         return self

     async def load(self, progress_callback=None, **kwargs):
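Taken together, the new create() logic resolves three settings: a manual gpu_split, the gpu_split_auto flag, and a per-GPU autosplit_reserve given in megabytes. A hedged standalone sketch of that decision flow as a pure function (the function name and return shape are invented for illustration; names otherwise mirror the diff):

import math
from typing import List, Optional, Tuple

def resolve_gpu_split(
    gpu_count: int,
    gpu_split: Optional[List[float]] = None,
    gpu_split_auto: bool = True,
    autosplit_reserve_mb: Optional[List[float]] = None,
    use_tp: bool = False,
) -> Tuple[Optional[List[float]], bool, List[int]]:
    """Returns (manual_split, auto_enabled, reserve_bytes_per_gpu)."""
    reserve_mb = autosplit_reserve_mb if autosplit_reserve_mb is not None else [96]

    # One GPU: nothing to split, regardless of the other settings
    if gpu_count == 1:
        return None, False, []

    # Manual split wins when provided
    # (simplification: this sketch also disables autosplit here,
    # while the commit leaves gpu_split_auto at its class default)
    if gpu_split:
        return gpu_split, False, []

    # Otherwise fall back to autosplit, reserving VRAM on each GPU
    if gpu_split_auto and not use_tp:
        reserve_bytes = [int(math.ceil(v * 1024**2)) for v in reserve_mb]
        return None, True, reserve_bytes

    return None, False, []

print(resolve_gpu_split(gpu_count=2))
# (None, True, [100663296]) -> autosplit with the default 96 MB reserve
print(resolve_gpu_split(gpu_count=2, gpu_split=[18.0, 24.0]))
# ([18.0, 24.0], False, []) -> manual split across two GPUs

The sketch makes the precedence explicit: single-GPU disables splitting entirely, a provided manual split takes priority, and autosplit is the multi-GPU fallback unless tensor parallelism is requested.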