mirror of
https://github.com/theroyallab/tabbyAPI.git
synced 2026-03-14 15:57:27 +00:00
Model.py: tensor parallel (TP) changes
This commit is contained in:
@@ -163,13 +163,19 @@ class ExllamaV3Container(BaseModelContainer):
|
||||
gpu_split_auto = unwrap(kwargs.get("gpu_split_auto"), True)
|
||||
gpu_split = unwrap(kwargs.get("gpu_split"), None)
|
||||
gpu_device_list = list(range(0, gpu_count))
|
||||
use_tp = unwrap(kwargs.get("tensor_parallel"), False)
|
||||
|
||||
# Set GPU split options
|
||||
if gpu_count == 1:
|
||||
self.gpu_split_auto = False
|
||||
logger.info("Disabling GPU split because one GPU is in use.")
|
||||
else:
|
||||
# TODO: Set tensor parallel
|
||||
# Set tensor parallel
|
||||
if use_tp:
|
||||
self.use_tp = True
|
||||
|
||||
# TP has its own autosplit loader
|
||||
self.gpu_split_auto = False
|
||||
|
||||
# Set GPU split options
|
||||
# Enable manual GPU split if provided
|
||||
@@ -451,6 +457,8 @@ class ExllamaV3Container(BaseModelContainer):
|
||||
yield value
|
||||
|
||||
for value in self.model.load_gen(
|
||||
tensor_p=self.use_tp,
|
||||
tp_backend="native",
|
||||
reserve_per_device=self.autosplit_reserve,
|
||||
use_per_device=self.gpu_split,
|
||||
callback=progress_callback,
|
||||
|
||||
Reference in New Issue
Block a user