Files
tabbyAPI/OAI/types/model.py
kingbri ab10b263fd Model: Add override base seq len
Some models (such as mistral and mixtral) set their base sequence
length to 32k due to assumptions of support for sliding window
attention.

Therefore, add this parameter to override the base sequence length
of a model which helps with auto-calculation of rope alpha.

If auto-calculation of rope alpha isn't being used, the max_seq_len
parameter works fine as is.

Signed-off-by: kingbri <bdashore3@proton.me>
2023-12-20 00:45:39 -05:00

59 lines
2.3 KiB
Python

from pydantic import BaseModel, Field, ConfigDict
from time import time
from typing import List, Optional
from gen_logging import LogConfig
class ModelCardParameters(BaseModel):
    """Tunable runtime parameters reported on a model card."""

    # Safe to default to None: model_container is guaranteed to
    # supply a max seq len when the card is populated
    max_seq_len: Optional[int] = Field(default=None)
    rope_scale: Optional[float] = Field(default=1.0)
    rope_alpha: Optional[float] = Field(default=1.0)
    cache_mode: Optional[str] = Field(default="FP16")
    prompt_template: Optional[str] = Field(default=None)
    num_experts_per_token: Optional[int] = Field(default=None)
    # Forward reference: ModelCard is declared below in this module
    draft: Optional['ModelCard'] = Field(default=None)
class ModelCard(BaseModel):
    """A single model entry in the OpenAI-style model listing."""

    id: str = "test"
    object: str = "model"
    # Creation timestamp, captured as epoch seconds at instantiation
    created: int = Field(default_factory=lambda: int(time()))
    owned_by: str = "tabbyAPI"
    logging: Optional[LogConfig] = None
    parameters: Optional[ModelCardParameters] = None
class ModelList(BaseModel):
    """Container for the OpenAI-style ``/models`` list response."""

    object: str = "list"
    # default_factory avoids a shared mutable default across instances
    data: List[ModelCard] = Field(default_factory=list)
class DraftModelLoadRequest(BaseModel):
    """Request body for loading a draft model (speculative decoding)."""

    draft_model_name: str
    draft_rope_scale: Optional[float] = 1.0
    # Left as None so the backend can derive rope alpha itself
    draft_rope_alpha: Optional[float] = Field(
        default=None,
        description="Automatically calculated if not present",
    )
# TODO: Unify this with ModelCardParams
class ModelLoadRequest(BaseModel):
    """Request body for loading a model into the backend."""

    name: str
    # By default the max seq len comes from the model's config.json
    max_seq_len: Optional[int] = Field(
        default=None,
        description="Leave this blank to use the model's base sequence length",
    )
    # Some models (e.g. mistral/mixtral) publish 32k as their base length
    # because of sliding-window assumptions; overriding it helps the
    # rope alpha auto-calculation
    override_base_seq_len: Optional[int] = Field(
        default=None,
        description="Overrides the model's base sequence length. Leave blank if unsure",
    )
    gpu_split_auto: Optional[bool] = True
    # default_factory avoids a shared mutable default across instances
    gpu_split: Optional[List[float]] = Field(default_factory=list)
    rope_scale: Optional[float] = 1.0
    rope_alpha: Optional[float] = Field(
        default=None,
        description="Automatically calculated if not present",
    )
    no_flash_attention: Optional[bool] = False
    # low_mem: Optional[bool] = False
    cache_mode: Optional[str] = "FP16"
    prompt_template: Optional[str] = None
    num_experts_per_token: Optional[int] = None
    draft: Optional[DraftModelLoadRequest] = None
class ModelLoadResponse(BaseModel):
    """Progress/status payload emitted while a model is being loaded."""

    # Fields prefixed with "model_" collide with pydantic's protected
    # "model_" namespace and trigger a warning; clearing the namespaces
    # suppresses it. ConfigDict types protected_namespaces as a tuple,
    # so use () rather than the original [].
    model_config = ConfigDict(protected_namespaces=())

    model_type: str = "model"
    # NOTE(review): looks like a progress counter — module loaded so far
    # out of `modules` total; confirm against the loader that emits this
    module: int
    modules: int
    status: str