From 3622710582a8c286b6df6e402dfd279114a668ef Mon Sep 17 00:00:00 2001
From: kingbri
Date: Thu, 28 Dec 2023 00:31:14 -0500
Subject: [PATCH] API: Fix num_experts_per_token reporting

This wasn't linked to the model config. This value can be 1 if a MoE model isn't loaded.

Signed-off-by: kingbri
---
 config_sample.yml | 2 +-
 main.py           | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/config_sample.yml b/config_sample.yml
index 435e91d..f3ca708 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -77,7 +77,7 @@ model:
   # NOTE: Only works with chat completion message lists!
   prompt_template:
 
-  # Number of experts to use per token. Loads from the model's config.json if not specified (default: None)
+  # Number of experts to use PER TOKEN. Fetched from the model's config.json if not specified (default: None)
   # WARNING: Don't set this unless you know what you're doing!
   # NOTE: For MoE models (ex. Mixtral) only!
   num_experts_per_token:
diff --git a/main.py b/main.py
index cd33a49..7f6822f 100644
--- a/main.py
+++ b/main.py
@@ -111,6 +111,7 @@ async def get_current_model():
             max_seq_len=MODEL_CONTAINER.config.max_seq_len,
             cache_mode="FP8" if MODEL_CONTAINER.cache_fp8 else "FP16",
             prompt_template=prompt_template.name if prompt_template else None,
+            num_experts_per_token=MODEL_CONTAINER.config.num_experts_per_token,
         ),
         logging=gen_logging.PREFERENCES,
     )