mirror of
https://github.com/theroyallab/tabbyAPI.git
synced 2026-04-29 18:51:53 +00:00
Model: Remove dev wheel setting checks
Removes the tensor parallel (TP) and DRY sampler availability checks, since both features are now available in the stable ExLlamaV2 release.

Signed-off-by: kingbri <bdashore3@proton.me>
This commit is contained in:
@@ -17,6 +17,7 @@ from exllamav2 import (
|
|||||||
ExLlamaV2Cache_Q4,
|
ExLlamaV2Cache_Q4,
|
||||||
ExLlamaV2Cache_Q6,
|
ExLlamaV2Cache_Q6,
|
||||||
ExLlamaV2Cache_Q8,
|
ExLlamaV2Cache_Q8,
|
||||||
|
ExLlamaV2Cache_TP,
|
||||||
ExLlamaV2Tokenizer,
|
ExLlamaV2Tokenizer,
|
||||||
ExLlamaV2Lora,
|
ExLlamaV2Lora,
|
||||||
)
|
)
|
||||||
@@ -55,14 +56,6 @@ from common.templating import (
|
|||||||
from common.transformers_utils import GenerationConfig, HuggingFaceConfig
|
from common.transformers_utils import GenerationConfig, HuggingFaceConfig
|
||||||
from common.utils import coalesce, unwrap
|
from common.utils import coalesce, unwrap
|
||||||
|
|
||||||
# Dynamic imports
|
|
||||||
try:
|
|
||||||
from exllamav2 import ExLlamaV2Cache_TP
|
|
||||||
|
|
||||||
has_tp = True
|
|
||||||
except ImportError:
|
|
||||||
has_tp = False
|
|
||||||
|
|
||||||
|
|
||||||
class ExllamaV2Container:
|
class ExllamaV2Container:
|
||||||
"""The model container class for ExLlamaV2 models."""
|
"""The model container class for ExLlamaV2 models."""
|
||||||
@@ -197,17 +190,10 @@ class ExllamaV2Container:
|
|||||||
else:
|
else:
|
||||||
# Set tensor parallel
|
# Set tensor parallel
|
||||||
if use_tp:
|
if use_tp:
|
||||||
if has_tp:
|
self.use_tp = True
|
||||||
self.use_tp = True
|
|
||||||
|
|
||||||
# TP has its own autosplit loader
|
# TP has its own autosplit loader
|
||||||
self.gpu_split_auto = False
|
self.gpu_split_auto = False
|
||||||
else:
|
|
||||||
# TODO: Remove conditional with exl2 v0.1.9 release
|
|
||||||
logger.warning(
|
|
||||||
"Tensor parallelism is not supported in the "
|
|
||||||
"current ExllamaV2 version."
|
|
||||||
)
|
|
||||||
|
|
||||||
# Enable manual GPU split if provided
|
# Enable manual GPU split if provided
|
||||||
if gpu_split:
|
if gpu_split:
|
||||||
@@ -703,7 +689,7 @@ class ExllamaV2Container:
|
|||||||
):
|
):
|
||||||
"""Utility function to create a model cache."""
|
"""Utility function to create a model cache."""
|
||||||
|
|
||||||
if has_tp and use_tp:
|
if use_tp:
|
||||||
return ExLlamaV2Cache_TP(
|
return ExLlamaV2Cache_TP(
|
||||||
model,
|
model,
|
||||||
base=cache_class,
|
base=cache_class,
|
||||||
@@ -967,14 +953,6 @@ class ExllamaV2Container:
|
|||||||
Meant for dev wheels!
|
Meant for dev wheels!
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if unwrap(kwargs.get("dry_allowed_length"), 0) > 0 and not hasattr(
|
|
||||||
ExLlamaV2Sampler.Settings, "dry_multiplier"
|
|
||||||
):
|
|
||||||
logger.warning(
|
|
||||||
"DRY sampling is not supported by the currently "
|
|
||||||
"installed ExLlamaV2 version."
|
|
||||||
)
|
|
||||||
|
|
||||||
return kwargs
|
return kwargs
|
||||||
|
|
||||||
async def generate_gen(
|
async def generate_gen(
|
||||||
|
|||||||
Reference in New Issue
Block a user