mirror of
https://github.com/theroyallab/tabbyAPI.git
synced 2026-03-15 00:07:28 +00:00
Model: Add tensor_parallel_backend option
This allows users to choose nccl or native depending on the GPU setup. NCCL is only available with Linux-built wheels. Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
This commit is contained in:
@@ -25,6 +25,7 @@ from loguru import logger
 from backends.base_model_container import BaseModelContainer
 from backends.exllamav3.sampler import ExllamaV3SamplerBuilder
+from backends.exllamav3.utils import exllama_supports_nccl
 from backends.exllamav3.vision import clear_image_embedding_cache
 from common.concurrency import iterate_in_threadpool
 from common.gen_logging import (
@@ -78,6 +79,7 @@ class ExllamaV3Container(BaseModelContainer):
     gpu_split_auto: bool = True
     autosplit_reserve: Optional[List[float]] = [96 / 1024]
     use_tp: bool = False
+    tp_backend: str = "native"
     max_seq_len: int = 4096
     cache_size: int = 4096
     cache_mode: str = "FP16"
@@ -173,6 +175,12 @@ class ExllamaV3Container(BaseModelContainer):
         # Set tensor parallel
         if use_tp:
             self.use_tp = True
+            tp_backend = unwrap(kwargs.get("tensor_parallel_backend"), "native")
+
+            if not exllama_supports_nccl():
+                tp_backend = "native"
+
+            self.tp_backend = tp_backend

             # TP has its own autosplit loader
             self.gpu_split_auto = False
@@ -458,7 +466,7 @@ class ExllamaV3Container(BaseModelContainer):

         for value in self.model.load_gen(
             tensor_p=self.use_tp,
-            tp_backend="native",
+            tp_backend=self.tp_backend,
             reserve_per_device=self.autosplit_reserve,
             use_per_device=self.gpu_split,
             callback=progress_callback,
Reference in New Issue
Block a user