mirror of
https://github.com/theroyallab/tabbyAPI.git
synced 2026-04-29 10:42:03 +00:00
Merge branch 'main' into vision
This commit is contained in:
@@ -170,7 +170,6 @@ class ExllamaV2Container:
|
|||||||
|
|
||||||
if enable_draft:
|
if enable_draft:
|
||||||
self.draft_config = ExLlamaV2Config()
|
self.draft_config = ExLlamaV2Config()
|
||||||
self.draft_config.no_flash_attn = self.config.no_flash_attn
|
|
||||||
draft_model_path = pathlib.Path(
|
draft_model_path = pathlib.Path(
|
||||||
unwrap(draft_args.get("draft_model_dir"), "models")
|
unwrap(draft_args.get("draft_model_dir"), "models")
|
||||||
)
|
)
|
||||||
@@ -264,6 +263,8 @@ class ExllamaV2Container:
|
|||||||
or not supports_paged_attn()
|
or not supports_paged_attn()
|
||||||
):
|
):
|
||||||
self.config.no_flash_attn = True
|
self.config.no_flash_attn = True
|
||||||
|
if self.draft_config:
|
||||||
|
self.draft_config.no_flash_attn = True
|
||||||
self.paged = False
|
self.paged = False
|
||||||
self.max_batch_size = 1
|
self.max_batch_size = 1
|
||||||
torch.backends.cuda.enable_flash_sdp(False)
|
torch.backends.cuda.enable_flash_sdp(False)
|
||||||
@@ -332,9 +333,20 @@ class ExllamaV2Container:
|
|||||||
if num_experts_override:
|
if num_experts_override:
|
||||||
self.config.num_experts_per_token = kwargs.get("num_experts_per_token")
|
self.config.num_experts_per_token = kwargs.get("num_experts_per_token")
|
||||||
|
|
||||||
# Make sure chunk size is >= 16 and <= max seq length
|
# Make sure chunk size is >= 256, keep near or below max seq len
|
||||||
user_chunk_size = unwrap(kwargs.get("chunk_size"), 2048)
|
user_chunk_size = unwrap(kwargs.get("chunk_size"), 2048)
|
||||||
chunk_size = sorted((16, user_chunk_size, self.config.max_seq_len))[1]
|
chunk_size = sorted((256, user_chunk_size, self.config.max_seq_len))[1]
|
||||||
|
chunk_remainder = chunk_size % 256
|
||||||
|
if chunk_remainder != 0:
|
||||||
|
rounded_chunk_size = int(256 * ((chunk_size - chunk_remainder) / 256 + 1))
|
||||||
|
|
||||||
|
logger.warning(
|
||||||
|
f"The given chunk size ({chunk_size}) is "
|
||||||
|
"not a multiple of 256.\n"
|
||||||
|
"Overriding chunk_size with an overestimated value of "
|
||||||
|
f"{rounded_chunk_size} tokens."
|
||||||
|
)
|
||||||
|
chunk_size = rounded_chunk_size
|
||||||
self.config.max_input_len = chunk_size
|
self.config.max_input_len = chunk_size
|
||||||
self.config.max_attention_size = chunk_size**2
|
self.config.max_attention_size = chunk_size**2
|
||||||
|
|
||||||
|
|||||||
@@ -148,14 +148,25 @@ class ModelConfig(BaseConfigModel):
|
|||||||
False,
|
False,
|
||||||
description=(
|
description=(
|
||||||
"Allow direct loading of models "
|
"Allow direct loading of models "
|
||||||
"from a completion or chat completion request (default: False)."
|
"from a completion or chat completion request (default: False).\n"
|
||||||
|
"This method of loading is strict by default.\n"
|
||||||
|
"Enable dummy models to add exceptions for invalid model names."
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
use_dummy_models: Optional[bool] = Field(
|
use_dummy_models: Optional[bool] = Field(
|
||||||
False,
|
False,
|
||||||
description=(
|
description=(
|
||||||
"Sends dummy model names when the models endpoint is queried.\n"
|
"Sends dummy model names when the models endpoint is queried. "
|
||||||
"Enable this if the client is looking for specific OAI models."
|
"(default: False)\n"
|
||||||
|
"Enable this if the client is looking for specific OAI models.\n"
|
||||||
|
),
|
||||||
|
)
|
||||||
|
dummy_model_names: List[str] = Field(
|
||||||
|
default=["gpt-3.5-turbo"],
|
||||||
|
description=(
|
||||||
|
"A list of fake model names that are sent via the /v1/models endpoint. "
|
||||||
|
'(default: ["gpt-3.5-turbo"])\n'
|
||||||
|
"Also used as bypasses for strict mode if inline_model_loading is true."
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
model_name: Optional[str] = Field(
|
model_name: Optional[str] = Field(
|
||||||
|
|||||||
@@ -52,12 +52,18 @@ model:
|
|||||||
model_dir: models
|
model_dir: models
|
||||||
|
|
||||||
# Allow direct loading of models from a completion or chat completion request (default: False).
|
# Allow direct loading of models from a completion or chat completion request (default: False).
|
||||||
|
# This method of loading is strict by default.
|
||||||
|
# Enable dummy models to add exceptions for invalid model names.
|
||||||
inline_model_loading: false
|
inline_model_loading: false
|
||||||
|
|
||||||
# Sends dummy model names when the models endpoint is queried.
|
# Sends dummy model names when the models endpoint is queried. (default: False)
|
||||||
# Enable this if the client is looking for specific OAI models.
|
# Enable this if the client is looking for specific OAI models.
|
||||||
use_dummy_models: false
|
use_dummy_models: false
|
||||||
|
|
||||||
|
# A list of fake model names that are sent via the /v1/models endpoint. (default: ["gpt-3.5-turbo"])
|
||||||
|
# Also used as bypasses for strict mode if inline_model_loading is true.
|
||||||
|
dummy_model_names: ["gpt-3.5-turbo"]
|
||||||
|
|
||||||
# An initial model to load.
|
# An initial model to load.
|
||||||
# Make sure the model is located in the model directory!
|
# Make sure the model is located in the model directory!
|
||||||
# REQUIRED: This must be filled out to load a model on startup.
|
# REQUIRED: This must be filled out to load a model on startup.
|
||||||
|
|||||||
@@ -19,8 +19,8 @@ WORKDIR /app
|
|||||||
# Get requirements
|
# Get requirements
|
||||||
COPY pyproject.toml .
|
COPY pyproject.toml .
|
||||||
|
|
||||||
# Install packages specified in pyproject.toml cu121
|
# Install packages specified in pyproject.toml cu121, extras
|
||||||
RUN pip3 install --no-cache-dir .[cu121]
|
RUN pip3 install --no-cache-dir .[cu121,extras]
|
||||||
|
|
||||||
RUN rm pyproject.toml
|
RUN rm pyproject.toml
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
from typing import List, Optional
|
from typing import List, Optional, Union
|
||||||
|
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
@@ -10,7 +10,7 @@ class UsageInfo(BaseModel):
|
|||||||
|
|
||||||
|
|
||||||
class EmbeddingsRequest(BaseModel):
|
class EmbeddingsRequest(BaseModel):
|
||||||
input: List[str] = Field(
|
input: Union[str, List[str]] = Field(
|
||||||
..., description="List of input texts to generate embeddings for."
|
..., description="List of input texts to generate embeddings for."
|
||||||
)
|
)
|
||||||
encoding_format: str = Field(
|
encoding_format: str = Field(
|
||||||
|
|||||||
@@ -121,21 +121,41 @@ async def load_inline_model(model_name: str, request: Request):
|
|||||||
):
|
):
|
||||||
return
|
return
|
||||||
|
|
||||||
# Inline model loading isn't enabled or the user isn't an admin
|
# Return if inline loading is disabled
|
||||||
if not get_key_permission(request) == "admin":
|
# Also warn if an admin key is used
|
||||||
error_message = handle_request_error(
|
|
||||||
f"Unable to switch model to {model_name} because "
|
|
||||||
+ "an admin key isn't provided",
|
|
||||||
exc_info=False,
|
|
||||||
).error.message
|
|
||||||
|
|
||||||
raise HTTPException(401, error_message)
|
|
||||||
|
|
||||||
if not config.model.inline_model_loading:
|
if not config.model.inline_model_loading:
|
||||||
logger.warning(
|
if get_key_permission(request) == "admin":
|
||||||
f"Unable to switch model to {model_name} because "
|
logger.warning(
|
||||||
'"inline_model_loading" is not True in config.yml.'
|
f"Unable to switch model to {model_name} because "
|
||||||
)
|
'"inline_model_loading" is not True in config.yml.'
|
||||||
|
)
|
||||||
|
|
||||||
|
return
|
||||||
|
|
||||||
|
is_dummy_model = (
|
||||||
|
config.model.use_dummy_models and model_name in config.model.dummy_model_names
|
||||||
|
)
|
||||||
|
|
||||||
|
# Error if an invalid key is passed
|
||||||
|
# If a dummy model is provided, don't error
|
||||||
|
if get_key_permission(request) != "admin":
|
||||||
|
if not is_dummy_model:
|
||||||
|
error_message = handle_request_error(
|
||||||
|
f"Unable to switch model to {model_name} because "
|
||||||
|
+ "an admin key isn't provided",
|
||||||
|
exc_info=False,
|
||||||
|
).error.message
|
||||||
|
|
||||||
|
raise HTTPException(401, error_message)
|
||||||
|
else:
|
||||||
|
return
|
||||||
|
|
||||||
|
# Start inline loading
|
||||||
|
# Past here, user is assumed to be admin
|
||||||
|
|
||||||
|
# Skip if the model is a dummy
|
||||||
|
if is_dummy_model:
|
||||||
|
logger.warning(f"Dummy model {model_name} provided. Skipping inline load.")
|
||||||
|
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|||||||
@@ -40,6 +40,7 @@ from endpoints.core.utils.lora import get_active_loras, get_lora_list
|
|||||||
from endpoints.core.utils.model import (
|
from endpoints.core.utils.model import (
|
||||||
get_current_model,
|
get_current_model,
|
||||||
get_current_model_list,
|
get_current_model_list,
|
||||||
|
get_dummy_models,
|
||||||
get_model_list,
|
get_model_list,
|
||||||
stream_model_load,
|
stream_model_load,
|
||||||
)
|
)
|
||||||
@@ -83,7 +84,7 @@ async def list_models(request: Request) -> ModelList:
|
|||||||
models = await get_current_model_list()
|
models = await get_current_model_list()
|
||||||
|
|
||||||
if config.model.use_dummy_models:
|
if config.model.use_dummy_models:
|
||||||
models.data.insert(0, ModelCard(id="gpt-3.5-turbo"))
|
models.data[:0] = get_dummy_models()
|
||||||
|
|
||||||
return models
|
return models
|
||||||
|
|
||||||
|
|||||||
@@ -92,6 +92,13 @@ def get_current_model():
|
|||||||
return model_card
|
return model_card
|
||||||
|
|
||||||
|
|
||||||
|
def get_dummy_models():
|
||||||
|
if config.model.dummy_model_names:
|
||||||
|
return [ModelCard(id=dummy_id) for dummy_id in config.model.dummy_model_names]
|
||||||
|
else:
|
||||||
|
return [ModelCard(id="gpt-3.5-turbo")]
|
||||||
|
|
||||||
|
|
||||||
async def stream_model_load(
|
async def stream_model_load(
|
||||||
data: ModelLoadRequest,
|
data: ModelLoadRequest,
|
||||||
model_path: pathlib.Path,
|
model_path: pathlib.Path,
|
||||||
|
|||||||
@@ -69,12 +69,12 @@ cu121 = [
|
|||||||
"torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
|
"torch @ https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
|
||||||
|
|
||||||
# Exl2
|
# Exl2
|
||||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
|
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
|
||||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
|
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
|
||||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
|
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
|
||||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
|
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
|
||||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
|
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
|
||||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu121.torch2.4.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
|
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu121.torch2.4.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
|
||||||
|
|
||||||
# Windows FA2 from https://github.com/bdashore3/flash-attention/releases
|
# Windows FA2 from https://github.com/bdashore3/flash-attention/releases
|
||||||
"flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4.0cxx11abiFALSE-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
|
"flash_attn @ https://github.com/bdashore3/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4.0cxx11abiFALSE-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
|
||||||
@@ -96,12 +96,12 @@ cu118 = [
|
|||||||
"torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
|
"torch @ https://download.pytorch.org/whl/cu118/torch-2.4.1%2Bcu118-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
|
||||||
|
|
||||||
# Exl2
|
# Exl2
|
||||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu118.torch2.4.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
|
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
|
||||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu118.torch2.4.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
|
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
|
||||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu118.torch2.4.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
|
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
|
||||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu118.torch2.4.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
|
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
|
||||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu118.torch2.4.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
|
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
|
||||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+cu118.torch2.4.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
|
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+cu118.torch2.4.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
|
||||||
|
|
||||||
# Linux FA2 from https://github.com/Dao-AILab/flash-attention/releases
|
# Linux FA2 from https://github.com/Dao-AILab/flash-attention/releases
|
||||||
"flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
|
"flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu118torch2.4cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
|
||||||
@@ -120,9 +120,9 @@ amd = [
|
|||||||
"torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.4.1%2Brocm6.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
|
"torch @ https://download.pytorch.org/whl/rocm6.0/torch-2.4.1%2Brocm6.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
|
||||||
|
|
||||||
# Exl2
|
# Exl2
|
||||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+rocm6.1.torch2.4.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
|
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+rocm6.1.torch2.4.0-cp312-cp312-linux_x86_64.whl ; python_version == '3.12'",
|
||||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+rocm6.1.torch2.4.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
|
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+rocm6.1.torch2.4.0-cp311-cp311-linux_x86_64.whl ; python_version == '3.11'",
|
||||||
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.3/exllamav2-0.2.3+rocm6.1.torch2.4.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
|
"exllamav2 @ https://github.com/turboderp/exllamav2/releases/download/v0.2.4/exllamav2-0.2.4+rocm6.1.torch2.4.0-cp310-cp310-linux_x86_64.whl ; python_version == '3.10'",
|
||||||
]
|
]
|
||||||
|
|
||||||
# MARK: Ruff options
|
# MARK: Ruff options
|
||||||
|
|||||||
Reference in New Issue
Block a user