diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index 8381a8c..b610cfd 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -251,9 +251,6 @@ class ExllamaV2Container:
         else:
             self.config.scale_alpha_value = rope_alpha
 
-        # Enable fasttensors loading if present
-        self.config.fasttensors = unwrap(kwargs.get("fasttensors"), False)
-
         # Set max batch size to the config override
         self.max_batch_size = unwrap(kwargs.get("max_batch_size"))
 
diff --git a/common/config_models.py b/common/config_models.py
index 79d774f..a31d6a5 100644
--- a/common/config_models.py
+++ b/common/config_models.py
@@ -290,13 +290,6 @@ class ModelConfig(BaseConfigModel):
         ),
         ge=1,
     )
-    fasttensors: Optional[bool] = Field(
-        False,
-        description=(
-            "Enables fasttensors to possibly increase model loading speeds "
-            "(default: False)."
-        ),
-    )
 
     _metadata: Metadata = PrivateAttr(Metadata())
     model_config = ConfigDict(protected_namespaces=())
diff --git a/config_sample.yml b/config_sample.yml
index 507d7d5..771d7e6 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -135,9 +135,6 @@ model:
   # WARNING: Don't set this unless you know what you're doing!
   num_experts_per_token:
 
-  # Enables fasttensors to possibly increase model loading speeds (default: False).
-  fasttensors: false
-
 # Options for draft models (speculative decoding)
 # This will use more VRAM!
 draft_model:
diff --git a/endpoints/core/types/model.py b/endpoints/core/types/model.py
index b169162..25a4032 100644
--- a/endpoints/core/types/model.py
+++ b/endpoints/core/types/model.py
@@ -106,7 +106,6 @@ class ModelLoadRequest(BaseModel):
     chunk_size: Optional[int] = None
    prompt_template: Optional[str] = None
     num_experts_per_token: Optional[int] = None
-    fasttensors: Optional[bool] = None
 
     # Non-config arguments
     draft: Optional[DraftModelLoadRequest] = None