mirror of
https://github.com/theroyallab/tabbyAPI.git
synced 2026-03-15 00:07:28 +00:00
Model: Default max_seq_len to 4096
A common problem in TabbyAPI is that users who want to get up and running with a model have always had issues with max_seq_len causing OOMs. This is because model devs set max context values in the millions, which requires a lot of VRAM. To idiot-proof first-time setup, make the fallback default 4096 so users can run their models. If a user still wants to use the model's max_seq_len, set it to -1. Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
This commit is contained in:
@@ -95,7 +95,7 @@ async def apply_inline_overrides(model_dir: pathlib.Path, **kwargs):
|
||||
else:
|
||||
logger.warning(
|
||||
"Cannot find inline model overrides. "
|
||||
"Make sure they are nested under a \"model:\" key"
|
||||
'Make sure they are nested under a "model:" key'
|
||||
)
|
||||
|
||||
# Merge draft overrides beforehand
|
||||
@@ -148,6 +148,13 @@ async def load_model_gen(model_path: pathlib.Path, **kwargs):
|
||||
# Fetch the extra HF configuration options
|
||||
hf_model = await HFModel.from_directory(model_path)
|
||||
|
||||
# Override the max sequence length based on user
|
||||
max_seq_len = kwargs.get("max_seq_len")
|
||||
if max_seq_len == -1:
|
||||
kwargs["max_seq_len"] = hf_model.hf_config.max_position_embeddings
|
||||
elif max_seq_len is None:
|
||||
kwargs["max_seq_len"] = 4096
|
||||
|
||||
# Create a new container and check if the right dependencies are installed
|
||||
backend = unwrap(kwargs.get("backend"), detect_backend(hf_model))
|
||||
container_class = _BACKEND_REGISTRY.get(backend)
|
||||
|
||||
Reference in New Issue
Block a user