diff --git a/common/config_models.py b/common/config_models.py index 340685e..0e71734 100644 --- a/common/config_models.py +++ b/common/config_models.py @@ -180,6 +180,25 @@ class ModelConfig(BaseConfigModel): ), ge=-1, ) + cache_size: Optional[int] = Field( + None, + description=( + "Size of the prompt cache to allocate (default: max_seq_len).\n" + "Must be a multiple of 256 and can't be less than max_seq_len.\n" + "For CFG, set this to 2 * max_seq_len." + ), + multiple_of=256, + gt=0, + ) + cache_mode: Optional[CACHE_TYPE] = Field( + "FP16", + description=( + "Enable different cache modes for VRAM savings (default: FP16).\n" + f"Possible values for exllamav2: {str(CACHE_SIZES)[15:-1]}.\n" + "For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits " + "are integers from 2-8 (i.e. 8,8)." + ), + ) tensor_parallel: Optional[bool] = Field( False, description=( @@ -236,25 +255,6 @@ class ModelConfig(BaseConfigModel): "or auto-calculate." ), ) - cache_mode: Optional[CACHE_TYPE] = Field( - "FP16", - description=( - "Enable different cache modes for VRAM savings (default: FP16).\n" - f"Possible values for exllamav2: {str(CACHE_SIZES)[15:-1]}.\n" - "For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits " - "are integers from 2-8 (i.e. 8,8)." - ), - ) - cache_size: Optional[int] = Field( - None, - description=( - "Size of the prompt cache to allocate (default: max_seq_len).\n" - "Must be a multiple of 256 and can't be less than max_seq_len.\n" - "For CFG, set this to 2 * max_seq_len." - ), - multiple_of=256, - gt=0, - ) chunk_size: Optional[int] = Field( 2048, description=( diff --git a/config_sample.yml b/config_sample.yml index 0d51719..1dbc7d5 100644 --- a/config_sample.yml +++ b/config_sample.yml @@ -81,6 +81,15 @@ model: # Max sequence length (default: fetch from the model's config.json). max_seq_len: + # Size of the key/value cache to allocate, in tokens (default: max_seq_len). + # Must be a multiple of 256 and can't be less than max_seq_len. 
+ cache_size: + + # Enable different cache modes for VRAM savings (default: FP16). + # Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'. + # For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (i.e. 8,8). + cache_mode: FP16 + # Load model with tensor parallelism. # Falls back to autosplit if GPU split isn't provided. # This ignores the gpu_split_auto value. @@ -118,15 +127,6 @@ model: # Leaving this value blank will either pull from the model or auto-calculate. rope_alpha: - # Enable different cache modes for VRAM savings (default: FP16). - # Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'. - # For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (i.e. 8,8). - cache_mode: FP16 - - # Size of the key/value cache to allocate, in tokens (default: 4096). - # Must be a multiple of 256. - cache_size: - # Chunk size for prompt ingestion (default: 2048). # A lower value reduces VRAM usage but decreases ingestion speed. # NOTE: Effects vary depending on the model. 
diff --git a/endpoints/core/types/model.py b/endpoints/core/types/model.py index 6e2e0c9..8422929 100644 --- a/endpoints/core/types/model.py +++ b/endpoints/core/types/model.py @@ -14,11 +14,11 @@ class ModelCardParameters(BaseModel): # Safe to do this since it's guaranteed to fetch a max seq len # from model_container max_seq_len: Optional[int] = None + cache_size: Optional[int] = None + cache_mode: Optional[str] = "FP16" rope_scale: Optional[float] = 1.0 rope_alpha: Optional[float] = 1.0 max_batch_size: Optional[int] = 1 - cache_size: Optional[int] = None - cache_mode: Optional[str] = "FP16" chunk_size: Optional[int] = 2048 prompt_template: Optional[str] = None prompt_template_content: Optional[str] = None @@ -89,6 +89,7 @@ class ModelLoadRequest(BaseModel): default=None, examples=[4096], ) + cache_mode: Optional[str] = None tensor_parallel: Optional[bool] = None tensor_parallel_backend: Optional[str] = "native" gpu_split_auto: Optional[bool] = None @@ -107,7 +108,6 @@ class ModelLoadRequest(BaseModel): default=None, examples=[1.0], ) - cache_mode: Optional[str] = None chunk_size: Optional[int] = None output_chunking: Optional[bool] = True prompt_template: Optional[str] = None