mirror of https://github.com/theroyallab/tabbyAPI.git, synced 2026-03-14 15:57:27 +00:00
Previously, the max sequence length was overridden by the user's config and never took the model's config.json into account. Now, set the default to 4096, but include config.prepare when selecting the max sequence length. The YAML config and API request now serve as overrides rather than parameters (see the sketch at the end of this file view).

Signed-off-by: kingbri <bdashore3@proton.me>
58 lines
2.0 KiB
Python
from pydantic import BaseModel, Field, ConfigDict
from time import time
from typing import List, Optional

from gen_logging import LogConfig


class ModelCardParameters(BaseModel):
    # Safe to default to None since a max seq len is guaranteed to be
    # fetched from model_container
    max_seq_len: Optional[int] = None
    rope_scale: Optional[float] = 1.0
    rope_alpha: Optional[float] = 1.0
    cache_mode: Optional[str] = "FP16"
    prompt_template: Optional[str] = None
    num_experts_per_token: Optional[int] = None

    # Forward reference: ModelCard is defined below
    draft: Optional["ModelCard"] = None


class ModelCard(BaseModel):
    id: str = "test"
    object: str = "model"
    created: int = Field(default_factory=lambda: int(time()))
    owned_by: str = "tabbyAPI"
    logging: Optional[LogConfig] = None
    parameters: Optional[ModelCardParameters] = None


class ModelList(BaseModel):
    object: str = "list"
    data: List[ModelCard] = Field(default_factory=list)
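
These classes resemble the shape of an OpenAI-style /v1/models response. As a quick usage sketch (the id and parameter values below are hypothetical, not from the repository):

# Hypothetical usage sketch: assembling a model list and serializing it
# with pydantic v2. All values here are made up for illustration.
card = ModelCard(
    id="my-local-model",
    parameters=ModelCardParameters(max_seq_len=4096),
)
model_list = ModelList(data=[card])
print(model_list.model_dump_json())
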
class DraftModelLoadRequest(BaseModel):
    draft_model_name: str
    draft_rope_scale: Optional[float] = 1.0
    draft_rope_alpha: Optional[float] = Field(
        description="Automatically calculated if not present", default=None
    )


# TODO: Unify this with ModelCardParameters
class ModelLoadRequest(BaseModel):
    name: str

    # Max seq len is defaulted when loading the model itself
    max_seq_len: Optional[int] = None

    gpu_split_auto: Optional[bool] = True
    gpu_split: Optional[List[float]] = Field(default_factory=list)
    rope_scale: Optional[float] = 1.0
    rope_alpha: Optional[float] = Field(
        description="Automatically calculated if not present", default=None
    )
    no_flash_attention: Optional[bool] = False
    # low_mem: Optional[bool] = False
    cache_mode: Optional[str] = "FP16"
    prompt_template: Optional[str] = None
    num_experts_per_token: Optional[int] = None
    draft: Optional[DraftModelLoadRequest] = None
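
Tying this to the commit message: omitting max_seq_len in a load request means the loader chooses the value itself, so the request field acts as an override rather than a required parameter. A hedged sketch (model names are illustrative):

# Illustrative only: the names below are made up.
request = ModelLoadRequest(
    name="my-model",
    # max_seq_len omitted: the loader falls back to the model's
    # config.json, or 4096 otherwise (per the commit message above)
    draft=DraftModelLoadRequest(draft_model_name="my-draft-model"),
)
print(request.model_dump())
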
class ModelLoadResponse(BaseModel):
    # Avoids pydantic's protected "model_" namespace warning
    model_config = ConfigDict(protected_namespaces=[])

    model_type: str = "model"
    module: int
    modules: int
    status: str