Model: Fix chunk size handling

Wrong class attribute name used for max_attention_size and fixes
declaration of the draft model's chunk_size.

Also expose the parameter to the end user in both config and model
load.

Signed-off-by: kingbri <bdashore3@proton.me>
This commit is contained in:
kingbri
2024-04-07 18:10:50 -04:00
parent 30c4554572
commit d759a15559
3 changed files with 14 additions and 7 deletions

View File

@@ -16,6 +16,7 @@ class ModelCardParameters(BaseModel):
rope_scale: Optional[float] = 1.0
rope_alpha: Optional[float] = 1.0
cache_mode: Optional[str] = "FP16"
chunk_size: Optional[int] = 2048
prompt_template: Optional[str] = None
num_experts_per_token: Optional[int] = None
use_cfg: Optional[bool] = None
@@ -90,6 +91,7 @@ class ModelLoadRequest(BaseModel):
no_flash_attention: Optional[bool] = False
# low_mem: Optional[bool] = False
cache_mode: Optional[str] = "FP16"
chunk_size: Optional[int] = 2048
prompt_template: Optional[str] = None
num_experts_per_token: Optional[int] = None
use_cfg: Optional[bool] = None