diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index de3173f..deadac7 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -270,7 +270,8 @@ class ExllamaV2Container(BaseModelContainer):
             self.config.max_seq_len = self.adjust_max_seq_len(user_max_seq_len)
         else:
             self.config.max_seq_len = unwrap(
-                user_max_seq_len, min(hf_model.hf_config.get_max_position_embeddings(), 4096)
+                user_max_seq_len,
+                min(hf_model.hf_config.get_max_position_embeddings(), 4096),
             )
 
         self.cache_size = self.config.max_seq_len
diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py
index 821cfab..1e938dc 100644
--- a/backends/exllamav3/model.py
+++ b/backends/exllamav3/model.py
@@ -225,27 +225,35 @@ class ExllamaV3Container(BaseModelContainer):
 
         # Determine max_seq_len and cache_size
         max_seq_len_user = kwargs.get("max_seq_len")
-        max_seq_len_model = self.hf_model.hf_config.get_max_position_embeddings(default = None)
+        max_seq_len_model = self.hf_model.hf_config.get_max_position_embeddings(
+            default=None
+        )
         max_seq_len_default = 8192
 
         if max_seq_len_model and not max_seq_len_user:
-            logger.info(f'Using default max_seq_len from model: {max_seq_len_model} tokens.')
+            logger.info(
+                f"Using default max_seq_len from model: {max_seq_len_model} tokens."
+            )
             max_seq_len = max_seq_len_model
         elif max_seq_len_user:
-            logger.info(f'Using configured max_seq_len: {max_seq_len_user} tokens.')
+            logger.info(f"Using configured max_seq_len: {max_seq_len_user} tokens.")
             max_seq_len = max_seq_len_user
         else:
-            logger.warning(f"max_seq_len is undefined. Defaulting to {max_seq_len_default} tokens.")
+            logger.warning(
+                f"max_seq_len is undefined. Defaulting to {max_seq_len_default} tokens."
+            )
             max_seq_len = max_seq_len_default
 
         cache_size_user = kwargs.get("cache_size")
         cache_size_default = 8192
 
         if cache_size_user:
-            logger.info(f'Using configured cache_size: {cache_size_user} tokens.')
+            logger.info(f"Using configured cache_size: {cache_size_user} tokens.")
             cache_size = cache_size_user
         else:
-            logger.warning(f"cache_size is undefined. Defaulting to {cache_size_default} tokens.")
+            logger.warning(
+                f"cache_size is undefined. Defaulting to {cache_size_default} tokens."
+            )
             cache_size = cache_size_default
 
         if max_seq_len < cache_size:
diff --git a/common/transformers_utils.py b/common/transformers_utils.py
index 8651684..1cd2017 100644
--- a/common/transformers_utils.py
+++ b/common/transformers_utils.py
@@ -83,13 +83,17 @@ class HuggingFaceConfig(BaseModel):
         return []
 
     def get_max_position_embeddings(self, default: int | None = 4096) -> int:
-        if self.text_config is not None and self.text_config.max_position_embeddings is not None:
+        if (
+            self.text_config is not None
+            and self.text_config.max_position_embeddings is not None
+        ):
             return self.text_config.max_position_embeddings
         elif self.max_position_embeddings is not None:
             return self.max_position_embeddings
         else:
             return default
 
+
 class TokenizerConfig(BaseModel):
     """
     An abridged version of HuggingFace's tokenizer config.
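For context, the reformatted `get_max_position_embeddings` resolves the context length in three steps: the nested `text_config` (present on multimodal configs) wins, then the top-level `max_position_embeddings`, then the caller-supplied default. Below is a minimal standalone sketch of that fallback chain; `Cfg` and `TextCfg` are simplified stand-ins for the real `HuggingFaceConfig` pydantic models, and the example values are made up.

    from pydantic import BaseModel

    class TextCfg(BaseModel):
        max_position_embeddings: int | None = None

    class Cfg(BaseModel):
        text_config: TextCfg | None = None
        max_position_embeddings: int | None = None

        def get_max_position_embeddings(self, default: int | None = 4096) -> int | None:
            # Nested text_config first, then the top-level field, then the default
            if self.text_config is not None and self.text_config.max_position_embeddings is not None:
                return self.text_config.max_position_embeddings
            elif self.max_position_embeddings is not None:
                return self.max_position_embeddings
            return default

    # Multimodal-style config: the nested value wins
    assert Cfg(text_config=TextCfg(max_position_embeddings=131072)).get_max_position_embeddings() == 131072
    # Plain config: the top-level value wins
    assert Cfg(max_position_embeddings=32768).get_max_position_embeddings() == 32768
    # Nothing set: exllamav2 clamps the result with min(value, 4096), while
    # exllamav3 passes default=None and falls back to its own 8192 on None
    assert Cfg().get_max_position_embeddings(default=None) is None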
diff --git a/endpoints/OAI/utils/chat_completion.py b/endpoints/OAI/utils/chat_completion.py index b87b4cf..3acd24d 100644 --- a/endpoints/OAI/utils/chat_completion.py +++ b/endpoints/OAI/utils/chat_completion.py @@ -43,7 +43,9 @@ def _extract_think_content(text: str) -> tuple[Optional[str], Optional[str]]: return None, text elif model.container.reasoning_start_token in text: start_reasoning = text.split(model.container.reasoning_start_token)[1] - reasoning_content = start_reasoning.split(model.container.reasoning_end_token)[0] + reasoning_content = start_reasoning.split(model.container.reasoning_end_token)[ + 0 + ] content = start_reasoning.split(model.container.reasoning_end_token)[1] return reasoning_content.strip(), content.strip() else:
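The reformatted branch in `_extract_think_content` splits the completion on the model's reasoning tokens: everything between the start and end token becomes the reasoning content, and the remainder becomes the visible content. A self-contained sketch of that split is shown below, using hypothetical `<think>`/`</think>` strings in place of `model.container.reasoning_start_token` and `reasoning_end_token`.

    def extract_think_content(
        text: str, start: str = "<think>", end: str = "</think>"
    ) -> tuple[str | None, str]:
        # Mirrors the elif branch: take everything after the start token, then
        # split on the end token into (reasoning, visible content)
        if start not in text:
            return None, text
        after_start = text.split(start)[1]
        reasoning = after_start.split(end)[0]
        content = after_start.split(end)[1]
        return reasoning.strip(), content.strip()

    print(extract_think_content("<think>check the cache size</think>Looks fine."))
    # ('check the cache size', 'Looks fine.')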