From ce2602df9af151124275e3f52e00243a1baccb20 Mon Sep 17 00:00:00 2001 From: kingbri Date: Tue, 19 Dec 2023 23:37:52 -0500 Subject: [PATCH] Model: Fix max seq len handling Previously, the max sequence length was overridden by the user's config and never took the model's config.json into account. Now, set the default to 4096, but include config.prepare when selecting the max sequence length. The yaml and API request now serve as overrides rather than parameters. Signed-off-by: kingbri --- OAI/types/model.py | 7 +++++-- config_sample.yml | 4 ++-- model.py | 12 ++++++++++-- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/OAI/types/model.py b/OAI/types/model.py index 08715ad..7072840 100644 --- a/OAI/types/model.py +++ b/OAI/types/model.py @@ -4,7 +4,8 @@ from typing import List, Optional from gen_logging import LogConfig class ModelCardParameters(BaseModel): - max_seq_len: Optional[int] = 4096 + # Safe to do this since it's guaranteed to fetch a max seq len from model_container + max_seq_len: Optional[int] = None rope_scale: Optional[float] = 1.0 rope_alpha: Optional[float] = 1.0 cache_mode: Optional[str] = "FP16" @@ -32,7 +33,9 @@ class DraftModelLoadRequest(BaseModel): # TODO: Unify this with ModelCardParams class ModelLoadRequest(BaseModel): name: str - max_seq_len: Optional[int] = 4096 + + # Max seq len is defaulted when loading the model itself + max_seq_len: Optional[int] = None gpu_split_auto: Optional[bool] = True gpu_split: Optional[List[float]] = Field(default_factory=list) rope_scale: Optional[float] = 1.0 diff --git a/config_sample.yml b/config_sample.yml index 15ce81b..fb17438 100644 --- a/config_sample.yml +++ b/config_sample.yml @@ -37,8 +37,8 @@ model: # The below parameters apply only if model_name is set - # Maximum model context length (default: 4096) - max_seq_len: 4096 + # Override maximum model context length (default: None) + max_seq_len: # Automatically allocate resources to GPUs (default: True) gpu_split_auto: True diff --git 
a/model.py b/model.py index e39a478..e2076a1 100644 --- a/model.py +++ b/model.py @@ -79,13 +79,21 @@ class ModelContainer: self.config = ExLlamaV2Config() self.config.model_dir = str(model_directory.resolve()) + + # Make the max seq len 4096 before preparing the config + # This is a better default than 2048 + self.config.max_seq_len = 4096 self.config.prepare() + # Then override the max_seq_len if present + override_max_seq_len = kwargs.get("max_seq_len") + if override_max_seq_len: + self.config.max_seq_len = kwargs.get("max_seq_len") + # Grab the base model's sequence length before overrides for rope calculations base_seq_len = self.config.max_seq_len - # Then override the max_seq_len if present - self.config.max_seq_len = unwrap(kwargs.get("max_seq_len"), 4096) + # Set the rope scale self.config.scale_pos_emb = unwrap(kwargs.get("rope_scale"), 1.0) # Automatically calculate rope alpha