From 8cbb59d6e1e6775347ca1a7e18442b4a310b862b Mon Sep 17 00:00:00 2001 From: kingbri Date: Mon, 18 Mar 2024 22:20:45 -0400 Subject: [PATCH] Model: Cleanup some comments Signed-off-by: kingbri --- backends/exllamav2/model.py | 50 +++---------------------------------- 1 file changed, 3 insertions(+), 47 deletions(-) diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py index 4b99202..79bdbe2 100644 --- a/backends/exllamav2/model.py +++ b/backends/exllamav2/model.py @@ -141,7 +141,7 @@ class ExllamaV2Container: self.config.model_dir = str(model_directory.resolve()) # Make the max seq len 4096 before preparing the config - # This is a better default than 2038 + # This is a better default than 2048 self.config.max_seq_len = 4096 # Hardcode max output length to 16 @@ -185,13 +185,6 @@ class ExllamaV2Container: True if self.use_cfg else unwrap(kwargs.get("no_flash_attention"), False) ) - # low_mem is currently broken in exllamav2. Don't use it until it's - # fixed. - """ - if "low_mem" in kwargs and kwargs["low_mem"]: - self.config.set_low_mem() - """ - # Try to set prompt template self.prompt_template = self.find_prompt_template( kwargs.get("prompt_template"), model_directory @@ -639,46 +632,9 @@ class ExllamaV2Container: def generate_gen_sync(self, prompt: str, **kwargs): """ - Create generator function for prompt completion + Create generator function for prompt completion. - Args: - prompt (str): Input prompt - **kwargs: - 'token_healing' (bool): Use token healing (default: False) - 'temperature' (float): Sampling temperature (default: 1.0) - 'temperature_last' (bool): Apply temperature after all other - samplers (default: False) - 'top_k' (int): Sampling top-K (default: 0) - 'top_p' (float): Sampling top-P (default: 1.0) - 'min_p' (float): Sampling min-P (default: 0.0) - 'tfs' (float): Tail-free sampling (default: 0.0) - 'typical' (float): Sampling typical (default: 0.0) - 'mirostat' (bool): Use Mirostat (default: False) - 'mirostat_tau' (float) Mirostat tau parameter (default: 1.5) - 'mirostat_eta' (float) Mirostat eta parameter (default: 0.1) - 'frequency_penalty' (float): Token frequency penalty (default: 0.0) - 'presence_penalty' (float): Token presence penalty (default: 0.0) - 'repetition_penalty' (float): Token repetition penalty - (default: 1.15) - 'penalty_range' (int): Penalty range - (default: whole context) - 'repetition_decay' (int): Repetition penalty range - (default: same as range) - 'stop' (List[Union[str, int]]): List of stop strings/tokens to - end response (default: [EOS]) - 'max_tokens' (int): Max no. tokens in response (default: 150) - 'add_bos_token' (bool): Adds the BOS token to the start of the - prompt (default: True) - 'ban_eos_token' (bool): Bans the EOS token from generation - (default: False) - 'logit_bias' (Dict[int, float]): Biases specific tokens to - either show up more or less (default: None) - 'stream_interval' (float): Interval in seconds between each - output chunk (default: immediate) - 'generate_window' (int): Space to reserve at the end of the - model's context when generating. Rolls context window by - the same amount if context length is exceeded to allow - generating pastthe models max_seq_len. + for kwargs, check common/sampling.py """ token_healing = unwrap(kwargs.get("token_healing"), False)