mirror of
https://github.com/theroyallab/tabbyAPI.git
synced 2026-04-26 09:18:53 +00:00
Model: Fix chunk size handling
The wrong class attribute name was used for max_attention_size; this commit corrects it and fixes the declaration of the draft model's chunk_size. It also exposes the parameter to the end user in both the config and the model load request. Signed-off-by: kingbri <bdashore3@proton.me>
This commit is contained in:
@@ -209,11 +209,11 @@ class ExllamaV2Container:
|
|||||||
if num_experts_override:
|
if num_experts_override:
|
||||||
self.config.num_experts_per_token = kwargs.get("num_experts_per_token")
|
self.config.num_experts_per_token = kwargs.get("num_experts_per_token")
|
||||||
|
|
||||||
chunk_size = min(
|
# Make sure chunk size is >= 16 and <= max seq length
|
||||||
unwrap(kwargs.get("chunk_size"), 2048), self.config.max_seq_len
|
user_chunk_size = unwrap(kwargs.get("chunk_size"), 2048)
|
||||||
)
|
chunk_size = sorted((16, user_chunk_size, self.config.max_seq_len))[1]
|
||||||
self.config.max_input_len = chunk_size
|
self.config.max_input_len = chunk_size
|
||||||
self.config.max_attn_size = chunk_size**2
|
self.config.max_attention_size = chunk_size**2
|
||||||
|
|
||||||
draft_args = unwrap(kwargs.get("draft"), {})
|
draft_args = unwrap(kwargs.get("draft"), {})
|
||||||
draft_model_name = draft_args.get("draft_model_name")
|
draft_model_name = draft_args.get("draft_model_name")
|
||||||
@@ -248,9 +248,9 @@ class ExllamaV2Container:
|
|||||||
)
|
)
|
||||||
self.draft_config.max_seq_len = self.config.max_seq_len
|
self.draft_config.max_seq_len = self.config.max_seq_len
|
||||||
|
|
||||||
if "chunk_size" in kwargs:
|
if chunk_size:
|
||||||
self.draft_config.max_input_len = kwargs["chunk_size"]
|
self.draft_config.max_input_len = chunk_size
|
||||||
self.draft_config.max_attn_size = kwargs["chunk_size"] ** 2
|
self.draft_config.max_attention_size = chunk_size**2
|
||||||
|
|
||||||
def find_prompt_template(self, prompt_template_name, model_directory):
|
def find_prompt_template(self, prompt_template_name, model_directory):
|
||||||
"""Tries to find a prompt template using various methods"""
|
"""Tries to find a prompt template using various methods"""
|
||||||
@@ -320,6 +320,7 @@ class ExllamaV2Container:
|
|||||||
"rope_alpha": self.config.scale_alpha_value,
|
"rope_alpha": self.config.scale_alpha_value,
|
||||||
"max_seq_len": self.config.max_seq_len,
|
"max_seq_len": self.config.max_seq_len,
|
||||||
"cache_mode": self.cache_mode,
|
"cache_mode": self.cache_mode,
|
||||||
|
"chunk_size": self.config.max_input_len,
|
||||||
"num_experts_per_token": self.config.num_experts_per_token,
|
"num_experts_per_token": self.config.num_experts_per_token,
|
||||||
"use_cfg": self.use_cfg,
|
"use_cfg": self.use_cfg,
|
||||||
"prompt_template": self.prompt_template.name
|
"prompt_template": self.prompt_template.name
|
||||||
|
|||||||
@@ -107,6 +107,10 @@ model:
|
|||||||
# Possible values FP16, FP8, Q4. (default: FP16)
|
# Possible values FP16, FP8, Q4. (default: FP16)
|
||||||
#cache_mode: FP16
|
#cache_mode: FP16
|
||||||
|
|
||||||
|
# Chunk size for prompt ingestion. A lower value reduces VRAM usage at the cost of ingestion speed (default: 2048)
|
||||||
|
# NOTE: Effects vary depending on the model. An ideal value is between 512 and 4096
|
||||||
|
#chunk_size: 2048
|
||||||
|
|
||||||
# Set the prompt template for this model. If empty, attempts to look for the model's chat template. (default: None)
|
# Set the prompt template for this model. If empty, attempts to look for the model's chat template. (default: None)
|
||||||
# If a model contains multiple templates in its tokenizer_config.json, set prompt_template to the name
|
# If a model contains multiple templates in its tokenizer_config.json, set prompt_template to the name
|
||||||
# of the template you want to use.
|
# of the template you want to use.
|
||||||
|
|||||||
@@ -16,6 +16,7 @@ class ModelCardParameters(BaseModel):
|
|||||||
rope_scale: Optional[float] = 1.0
|
rope_scale: Optional[float] = 1.0
|
||||||
rope_alpha: Optional[float] = 1.0
|
rope_alpha: Optional[float] = 1.0
|
||||||
cache_mode: Optional[str] = "FP16"
|
cache_mode: Optional[str] = "FP16"
|
||||||
|
chunk_size: Optional[int] = 2048
|
||||||
prompt_template: Optional[str] = None
|
prompt_template: Optional[str] = None
|
||||||
num_experts_per_token: Optional[int] = None
|
num_experts_per_token: Optional[int] = None
|
||||||
use_cfg: Optional[bool] = None
|
use_cfg: Optional[bool] = None
|
||||||
@@ -90,6 +91,7 @@ class ModelLoadRequest(BaseModel):
|
|||||||
no_flash_attention: Optional[bool] = False
|
no_flash_attention: Optional[bool] = False
|
||||||
# low_mem: Optional[bool] = False
|
# low_mem: Optional[bool] = False
|
||||||
cache_mode: Optional[str] = "FP16"
|
cache_mode: Optional[str] = "FP16"
|
||||||
|
chunk_size: Optional[int] = 2048
|
||||||
prompt_template: Optional[str] = None
|
prompt_template: Optional[str] = None
|
||||||
num_experts_per_token: Optional[int] = None
|
num_experts_per_token: Optional[int] = None
|
||||||
use_cfg: Optional[bool] = None
|
use_cfg: Optional[bool] = None
|
||||||
|
|||||||
Reference in New Issue
Block a user