mirror of
https://github.com/theroyallab/tabbyAPI.git
synced 2026-03-14 15:57:27 +00:00
Some models (such as mistral and mixtral) set their base sequence length to 32k due to assumptions of support for sliding window attention. Therefore, add this parameter to override the base sequence length of a model which helps with auto-calculation of rope alpha. If auto-calculation of rope alpha isn't being used, the max_seq_len parameter works fine as is. Signed-off-by: kingbri <bdashore3@proton.me>
59 lines
2.3 KiB
Python
from pydantic import BaseModel, Field, ConfigDict
|
|
from time import time
|
|
from typing import List, Optional
|
|
from gen_logging import LogConfig
|
|
|
|
class ModelCardParameters(BaseModel):
    """Per-model runtime parameters reported on a model card."""

    # Safe to leave unset: model_container is guaranteed to supply a max seq len.
    max_seq_len: Optional[int] = None
    rope_scale: Optional[float] = 1.0
    rope_alpha: Optional[float] = 1.0
    cache_mode: Optional[str] = "FP16"
    prompt_template: Optional[str] = None
    num_experts_per_token: Optional[int] = None
    # Forward reference — ModelCard is declared later in this module.
    draft: Optional["ModelCard"] = None
|
|
|
|
class ModelCard(BaseModel):
    """OpenAI-style model card entry returned by the models endpoint."""

    id: str = "test"
    object: str = "model"
    # Unix timestamp captured when the card is instantiated.
    created: int = Field(default_factory=lambda: int(time()))
    owned_by: str = "tabbyAPI"
    logging: Optional[LogConfig] = None
    parameters: Optional[ModelCardParameters] = None
|
|
|
|
class ModelList(BaseModel):
    """OpenAI-style list container for model cards."""

    object: str = "list"
    data: List[ModelCard] = Field(default_factory=list)
|
|
|
|
class DraftModelLoadRequest(BaseModel):
    """Options for loading a draft (speculative decoding) model."""

    draft_model_name: str
    draft_rope_scale: Optional[float] = 1.0
    draft_rope_alpha: Optional[float] = Field(
        default=None,
        description="Automatically calculated if not present",
    )
|
|
|
|
# TODO: Unify this with ModelCardParams
class ModelLoadRequest(BaseModel):
    """Request body for loading a model."""

    name: str

    # Max seq len is fetched from the model's config.json by default.
    max_seq_len: Optional[int] = Field(
        default=None,
        description="Leave this blank to use the model's base sequence length",
    )
    # Some models (e.g. mistral/mixtral) advertise a 32k base length for
    # sliding-window attention; overriding it helps rope alpha auto-calculation.
    override_base_seq_len: Optional[int] = Field(
        default=None,
        description="Overrides the model's base sequence length. Leave blank if unsure",
    )
    gpu_split_auto: Optional[bool] = True
    gpu_split: Optional[List[float]] = Field(default_factory=list)
    rope_scale: Optional[float] = 1.0
    rope_alpha: Optional[float] = Field(
        default=None,
        description="Automatically calculated if not present",
    )
    no_flash_attention: Optional[bool] = False
    # low_mem: Optional[bool] = False
    cache_mode: Optional[str] = "FP16"
    prompt_template: Optional[str] = None
    num_experts_per_token: Optional[int] = None
    draft: Optional[DraftModelLoadRequest] = None
|
|
|
|
class ModelLoadResponse(BaseModel):
    """Streamed progress/status message emitted while a model loads."""

    # Avoids pydantic's "model_" namespace warning on the model_type field.
    # protected_namespaces is documented as a tuple of prefixes, so pass ()
    # rather than a list.
    model_config = ConfigDict(protected_namespaces=())

    model_type: str = "model"
    # module/modules: progress counter — current module loaded out of total.
    module: int
    modules: int
    status: str