From 52e093ae6c69f31b4c601969f747eb93a67b3ee7 Mon Sep 17 00:00:00 2001
From: turboderp <11859846+turboderp@users.noreply.github.com>
Date: Sun, 5 Oct 2025 18:54:45 +0200
Subject: [PATCH] Model: Enable max_rq_tokens (output chunking)

---
 backends/exllamav3/model.py   |   6 +
 common/config_models.py       |   8 +
 config_sample.yml             | 465 +++++++++++++++++-----------------
 endpoints/core/types/model.py |   1 +
 4 files changed, 250 insertions(+), 230 deletions(-)

diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py
index abe907e..4427df4 100644
--- a/backends/exllamav3/model.py
+++ b/backends/exllamav3/model.py
@@ -85,6 +85,7 @@ class ExllamaV3Container(BaseModelContainer):
     cache_mode: str = "FP16"
     draft_cache_mode: str = "FP16"
     chunk_size: int = 2048
+    max_rq_tokens: Optional[int] = 2048
     max_batch_size: Optional[int] = None
 
     # Required methods
@@ -250,6 +251,10 @@ class ExllamaV3Container(BaseModelContainer):
         user_chunk_size = unwrap(kwargs.get("chunk_size"), 2048)
         self.chunk_size = self.adjust_chunk_size(user_chunk_size)
 
+        # Output chunking
+        disable_output_chunking = unwrap(kwargs.get("disable_output_chunking"), False)
+        self.max_rq_tokens = None if disable_output_chunking else self.chunk_size
+
         # Template setup
         self.prompt_template = await find_prompt_template(
             kwargs.get("prompt_template"), model_directory
@@ -982,6 +987,7 @@ class ExllamaV3Container(BaseModelContainer):
             banned_strings=params.banned_strings,
             embeddings=mm_embeddings_content,
             return_top_tokens=params.logprobs,
+            max_rq_tokens=self.max_rq_tokens,
         )
 
         generated_tokens = 0
diff --git a/common/config_models.py b/common/config_models.py
index bd15592..8d07d79 100644
--- a/common/config_models.py
+++ b/common/config_models.py
@@ -265,6 +265,14 @@ class ModelConfig(BaseConfigModel):
         ),
         gt=0,
     )
+    disable_output_chunking: Optional[bool] = Field(
+        False,
+        description=(
+            "Disable output chunking (default: false).\n"
+            "Used by EXL3 models only.\n"
+            "If True, allocate space in the cache for the entire response with each request.\n"
+        ),
+    )
     max_batch_size: Optional[int] = Field(
         None,
         description=(
diff --git a/config_sample.yml b/config_sample.yml
index ad49224..a294cfc 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -1,230 +1,235 @@
-# Sample YAML file for configuration.
-# Comment and uncomment values as needed.
-# Every value has a default within the application.
-# This file serves to be a drop in for config.yml
-
-# Unless specified in the comments, DO NOT put these options in quotes!
-# You can use https://www.yamllint.com/ if you want to check your YAML formatting.
-
-# Options for networking
-network:
-  # The IP to host on (default: 127.0.0.1).
-  # Use 0.0.0.0 to expose on all network adapters.
-  host: 127.0.0.1
-
-  # The port to host on (default: 5000).
-  port: 5000
-
-  # Disable HTTP token authentication with requests.
-  # WARNING: This will make your instance vulnerable!
-  # Turn on this option if you are ONLY connecting from localhost.
-  disable_auth: false
-
-  # Disable fetching external content in response to requests,such as images from URLs.
-  disable_fetch_requests: false
-
-  # Send tracebacks over the API (default: False).
-  # NOTE: Only enable this for debug purposes.
-  send_tracebacks: false
-
-  # Select API servers to enable (default: ["OAI"]).
-  # Possible values: OAI, Kobold.
-  api_servers: ["OAI"]
-
-# Options for logging
-logging:
-  # Enable prompt logging (default: False).
-  log_prompt: false
-
-  # Enable generation parameter logging (default: False).
-  log_generation_params: false
-
-  # Enable request logging (default: False).
-  # NOTE: Only use this for debugging!
-  log_requests: false
-
-# Options for model overrides and loading
-# Please read the comments to understand how arguments are handled
-# between initial and API loads
-model:
-  # Directory to look for models (default: models).
-  # Windows users, do NOT put this path in quotes!
-  model_dir: models
-
-  # Allow direct loading of models from a completion or chat completion request (default: False).
-  # This method of loading is strict by default.
-  # Enable dummy models to add exceptions for invalid model names.
-  inline_model_loading: false
-
-  # Sends dummy model names when the models endpoint is queried. (default: False)
-  # Enable this if the client is looking for specific OAI models.
-  use_dummy_models: false
-
-  # A list of fake model names that are sent via the /v1/models endpoint. (default: ["gpt-3.5-turbo"])
-  # Also used as bypasses for strict mode if inline_model_loading is true.
-  dummy_model_names: ["gpt-3.5-turbo"]
-
-  # An initial model to load.
-  # Make sure the model is located in the model directory!
-  # REQUIRED: This must be filled out to load a model on startup.
-  model_name:
-
-  # Names of args to use as a fallback for API load requests (default: []).
-  # For example, if you always want cache_mode to be Q4 instead of on the inital model load, add "cache_mode" to this array.
-  # Example: ['max_seq_len', 'cache_mode'].
-  use_as_default: []
-
-  # Backend to use for this model (auto-detect if not specified)
-  # Options: exllamav2, exllamav3
-  backend:
-
-  # Max sequence length (default: 4096).
-  # Set to -1 to fetch from the model's config.json
-  max_seq_len:
-
-  # Load model with tensor parallelism.
-  # Falls back to autosplit if GPU split isn't provided.
-  # This ignores the gpu_split_auto value.
-  tensor_parallel: false
-
-  # Sets a backend type for tensor parallelism. (default: native).
-  # Options: native, nccl
-  # Native is recommended for PCIe GPUs
-  # NCCL is recommended for NVLink.
-  tensor_parallel_backend: native
-
-  # Automatically allocate resources to GPUs (default: True).
-  # Not parsed for single GPU users.
-  gpu_split_auto: true
-
-  # Reserve VRAM used for autosplit loading (default: 96 MB on GPU 0).
-  # Represented as an array of MB per GPU.
-  autosplit_reserve: [96]
-
-  # An integer array of GBs of VRAM to split between GPUs (default: []).
-  # Used with tensor parallelism.
-  gpu_split: []
-
-  # NOTE: If a model has YaRN rope scaling, it will automatically be enabled by ExLlama.
-  # rope_scale and rope_alpha settings won't apply in this case.
-
-  # Rope scale (default: 1.0).
-  # Same as compress_pos_emb.
-  # Use if the model was trained on long context with rope.
-  # Leave blank to pull the value from the model.
-  rope_scale: 1.0
-
-  # Rope alpha (default: None).
-  # Same as alpha_value. Set to "auto" to auto-calculate.
-  # Leaving this value blank will either pull from the model or auto-calculate.
-  rope_alpha:
-
-  # Enable different cache modes for VRAM savings (default: FP16).
-  # Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'.
-  # For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (i.e. 8,8).
-  cache_mode: FP16
-
-  # Size of the prompt cache to allocate (default: max_seq_len).
-  # Must be a multiple of 256 and can't be less than max_seq_len.
-  # For CFG, set this to 2 * max_seq_len.
-  cache_size:
-
-  # Chunk size for prompt ingestion (default: 2048).
-  # A lower value reduces VRAM usage but decreases ingestion speed.
-  # NOTE: Effects vary depending on the model.
-  # An ideal value is between 512 and 4096.
-  chunk_size: 2048
-
-  # Set the maximum number of prompts to process at one time (default: None/Automatic).
-  # Automatically calculated if left blank.
-  # NOTE: Only available for Nvidia ampere (30 series) and above GPUs.
-  max_batch_size:
-
-  # Set the prompt template for this model. (default: None)
-  # If empty, attempts to look for the model's chat template.
-  # If a model contains multiple templates in its tokenizer_config.json,
-  # set prompt_template to the name of the template you want to use.
-  # NOTE: Only works with chat completion message lists!
-  prompt_template:
-
-  # Enables vision support if the model supports it. (default: False)
-  vision: false
-
-# Options for draft models (speculative decoding)
-# This will use more VRAM!
-draft_model:
-  # Directory to look for draft models (default: models)
-  draft_model_dir: models
-
-  # An initial draft model to load.
-  # Ensure the model is in the model directory.
-  draft_model_name:
-
-  # Rope scale for draft models (default: 1.0).
-  # Same as compress_pos_emb.
-  # Use if the draft model was trained on long context with rope.
-  draft_rope_scale: 1.0
-
-  # Rope alpha for draft models (default: None).
-  # Same as alpha_value. Set to "auto" to auto-calculate.
-  # Leaving this value blank will either pull from the model or auto-calculate.
-  draft_rope_alpha:
-
-  # Cache mode for draft models to save VRAM (default: FP16).
-  # Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'.
-  # For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (i.e. 8,8).
-  draft_cache_mode: FP16
-
-  # An integer array of GBs of VRAM to split between GPUs (default: []).
-  # If this isn't filled in, the draft model is autosplit.
-  draft_gpu_split: []
-
-# Options for Sampling
-sampling:
-  # Select a sampler override preset (default: None).
-  # Find this in the sampler-overrides folder.
-  # This overrides default fallbacks for sampler values that are passed to the API.
-  # NOTE: safe_defaults is noob friendly and provides fallbacks for frontends that don't send sampling parameters.
-  # Remove this for any advanced usage.
-  override_preset: safe_defaults
-
-# Options for Loras
-lora:
-  # Directory to look for LoRAs (default: loras).
-  lora_dir: loras
-
-  # List of LoRAs to load and associated scaling factors (default scale: 1.0).
-  # For the YAML file, add each entry as a YAML list:
-  # - name: lora1
-  #   scaling: 1.0
-  loras:
-
-# Options for embedding models and loading.
-# NOTE: Embeddings requires the "extras" feature to be installed
-# Install it via "pip install .[extras]"
-embeddings:
-  # Directory to look for embedding models (default: models).
-  embedding_model_dir: models
-
-  # Device to load embedding models on (default: cpu).
-  # Possible values: cpu, auto, cuda.
-  # NOTE: It's recommended to load embedding models on the CPU.
-  # If using an AMD GPU, set this value to 'cuda'.
-  embeddings_device: cpu
-
-  # An initial embedding model to load on the infinity backend.
-  embedding_model_name:
-
-# Options for development and experimentation
-developer:
-  # Skip Exllamav2 version check (default: False).
-  # WARNING: It's highly recommended to update your dependencies rather than enabling this flag.
-  unsafe_launch: false
-
-  # Disable API request streaming (default: False).
-  disable_request_streaming: false
-
-  # Set process to use a higher priority.
-  # For realtime process priority, run as administrator or sudo.
-  # Otherwise, the priority will be set to high.
-  realtime_process_priority: false
+# Sample YAML file for configuration.
+# Comment and uncomment values as needed.
+# Every value has a default within the application.
+# This file serves as a drop-in for config.yml
+
+# Unless specified in the comments, DO NOT put these options in quotes!
+# You can use https://www.yamllint.com/ if you want to check your YAML formatting.
+
+# Options for networking
+network:
+  # The IP to host on (default: 127.0.0.1).
+  # Use 0.0.0.0 to expose on all network adapters.
+  host: 127.0.0.1
+
+  # The port to host on (default: 5000).
+  port: 5000
+
+  # Disable HTTP token authentication with requests.
+  # WARNING: This will make your instance vulnerable!
+  # Turn on this option if you are ONLY connecting from localhost.
+  disable_auth: false
+
+  # Disable fetching external content in response to requests, such as images from URLs.
+  disable_fetch_requests: false
+
+  # Send tracebacks over the API (default: False).
+  # NOTE: Only enable this for debug purposes.
+  send_tracebacks: false
+
+  # Select API servers to enable (default: ["OAI"]).
+  # Possible values: OAI, Kobold.
+  api_servers: ["OAI"]
+
+# Options for logging
+logging:
+  # Enable prompt logging (default: False).
+  log_prompt: false
+
+  # Enable generation parameter logging (default: False).
+  log_generation_params: false
+
+  # Enable request logging (default: False).
+  # NOTE: Only use this for debugging!
+  log_requests: false
+
+# Options for model overrides and loading
+# Please read the comments to understand how arguments are handled
+# between initial and API loads
+model:
+  # Directory to look for models (default: models).
+  # Windows users, do NOT put this path in quotes!
+  model_dir: models
+
+  # Allow direct loading of models from a completion or chat completion request (default: False).
+  # This method of loading is strict by default.
+  # Enable dummy models to add exceptions for invalid model names.
+  inline_model_loading: false
+
+  # Sends dummy model names when the models endpoint is queried. (default: False)
+  # Enable this if the client is looking for specific OAI models.
+  use_dummy_models: false
+
+  # A list of fake model names that are sent via the /v1/models endpoint. (default: ["gpt-3.5-turbo"])
+  # Also used as bypasses for strict mode if inline_model_loading is true.
+  dummy_model_names: ["gpt-3.5-turbo"]
+
+  # An initial model to load.
+  # Make sure the model is located in the model directory!
+  # REQUIRED: This must be filled out to load a model on startup.
+  model_name:
+
+  # Names of args to use as a fallback for API load requests (default: []).
+  # For example, if you always want cache_mode to be Q4 instead of on the initial model load, add "cache_mode" to this array.
+  # Example: ['max_seq_len', 'cache_mode'].
+  use_as_default: []
+
+  # Backend to use for this model (auto-detect if not specified)
+  # Options: exllamav2, exllamav3
+  backend:
+
+  # Max sequence length (default: 4096).
+  # Set to -1 to fetch from the model's config.json
+  max_seq_len:
+
+  # Load model with tensor parallelism.
+  # Falls back to autosplit if GPU split isn't provided.
+  # This ignores the gpu_split_auto value.
+  tensor_parallel: false
+
+  # Sets a backend type for tensor parallelism. (default: native).
+  # Options: native, nccl
+  # Native is recommended for PCIe GPUs.
+  # NCCL is recommended for NVLink.
+  tensor_parallel_backend: native
+
+  # Automatically allocate resources to GPUs (default: True).
+  # Not parsed for single GPU users.
+  gpu_split_auto: true
+
+  # Reserve VRAM used for autosplit loading (default: 96 MB on GPU 0).
+  # Represented as an array of MB per GPU.
+  autosplit_reserve: [96]
+
+  # An integer array of GBs of VRAM to split between GPUs (default: []).
+  # Used with tensor parallelism.
+  gpu_split: []
+
+  # NOTE: If a model has YaRN rope scaling, it will automatically be enabled by ExLlama.
+  # rope_scale and rope_alpha settings won't apply in this case.
+
+  # Rope scale (default: 1.0).
+  # Same as compress_pos_emb.
+  # Use if the model was trained on long context with rope.
+  # Leave blank to pull the value from the model.
+  rope_scale: 1.0
+
+  # Rope alpha (default: None).
+  # Same as alpha_value. Set to "auto" to auto-calculate.
+  # Leaving this value blank will either pull from the model or auto-calculate.
+  rope_alpha:
+
+  # Enable different cache modes for VRAM savings (default: FP16).
+  # Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'.
+  # For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (i.e. 8,8).
+  cache_mode: FP16
+
+  # Size of the prompt cache to allocate (default: max_seq_len).
+  # Must be a multiple of 256 and can't be less than max_seq_len.
+  # For CFG, set this to 2 * max_seq_len.
+  cache_size:
+
+  # Chunk size for prompt ingestion (default: 2048).
+  # A lower value reduces VRAM usage but decreases ingestion speed.
+  # NOTE: Effects vary depending on the model.
+  # An ideal value is between 512 and 4096.
+  chunk_size: 2048
+
+  # Disable output chunking (default: false).
+  # Used by EXL3 models only.
+  # If True, allocate space in the cache for the entire response with each request.
+  disable_output_chunking: false
+
+  # Set the maximum number of prompts to process at one time (default: None/Automatic).
+  # Automatically calculated if left blank.
+  # NOTE: Only available for Nvidia Ampere (30 series) and above GPUs.
+  max_batch_size:
+
+  # Set the prompt template for this model. (default: None)
+  # If empty, attempts to look for the model's chat template.
+  # If a model contains multiple templates in its tokenizer_config.json,
+  # set prompt_template to the name of the template you want to use.
+  # NOTE: Only works with chat completion message lists!
+  prompt_template:
+
+  # Enables vision support if the model supports it. (default: False)
+  vision: false
+
+# Options for draft models (speculative decoding)
+# This will use more VRAM!
+draft_model:
+  # Directory to look for draft models (default: models)
+  draft_model_dir: models
+
+  # An initial draft model to load.
+  # Ensure the model is in the model directory.
+  draft_model_name:
+
+  # Rope scale for draft models (default: 1.0).
+  # Same as compress_pos_emb.
+  # Use if the draft model was trained on long context with rope.
+  draft_rope_scale: 1.0
+
+  # Rope alpha for draft models (default: None).
+  # Same as alpha_value. Set to "auto" to auto-calculate.
+  # Leaving this value blank will either pull from the model or auto-calculate.
+  draft_rope_alpha:
+
+  # Cache mode for draft models to save VRAM (default: FP16).
+  # Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'.
+  # For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (i.e. 8,8).
+  draft_cache_mode: FP16
+
+  # An integer array of GBs of VRAM to split between GPUs (default: []).
+  # If this isn't filled in, the draft model is autosplit.
+  draft_gpu_split: []
+
+# Options for Sampling
+sampling:
+  # Select a sampler override preset (default: None).
+  # Find this in the sampler-overrides folder.
+  # This overrides default fallbacks for sampler values that are passed to the API.
+  # NOTE: safe_defaults is noob friendly and provides fallbacks for frontends that don't send sampling parameters.
+  # Remove this for any advanced usage.
+  override_preset: safe_defaults
+
+# Options for Loras
+lora:
+  # Directory to look for LoRAs (default: loras).
+  lora_dir: loras
+
+  # List of LoRAs to load and associated scaling factors (default scale: 1.0).
+  # For the YAML file, add each entry as a YAML list:
+  # - name: lora1
+  #   scaling: 1.0
+  loras:
+
+# Options for embedding models and loading.
+# NOTE: Embeddings requires the "extras" feature to be installed
+# Install it via "pip install .[extras]"
+embeddings:
+  # Directory to look for embedding models (default: models).
+  embedding_model_dir: models
+
+  # Device to load embedding models on (default: cpu).
+  # Possible values: cpu, auto, cuda.
+  # NOTE: It's recommended to load embedding models on the CPU.
+  # If using an AMD GPU, set this value to 'cuda'.
+  embeddings_device: cpu
+
+  # An initial embedding model to load on the infinity backend.
+  embedding_model_name:
+
+# Options for development and experimentation
+developer:
+  # Skip Exllamav2 version check (default: False).
+  # WARNING: It's highly recommended to update your dependencies rather than enabling this flag.
+  unsafe_launch: false
+
+  # Disable API request streaming (default: False).
+  disable_request_streaming: false
+
+  # Set process to use a higher priority.
+  # For realtime process priority, run as administrator or sudo.
+  # Otherwise, the priority will be set to high.
+  realtime_process_priority: false
diff --git a/endpoints/core/types/model.py b/endpoints/core/types/model.py
index 1e84336..202b74b 100644
--- a/endpoints/core/types/model.py
+++ b/endpoints/core/types/model.py
@@ -109,6 +109,7 @@ class ModelLoadRequest(BaseModel):
     )
     cache_mode: Optional[str] = None
    chunk_size: Optional[int] = None
+    disable_output_chunking: Optional[bool] = False
     prompt_template: Optional[str] = None
     vision: Optional[bool] = None
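
Usage note (illustrative sketch, not part of the patch): with output chunking enabled
(the default), the container passes max_rq_tokens equal to chunk_size to the generator;
setting disable_output_chunking makes it pass None, so the cache reserves room for the
entire response with each request. Below is a minimal client-side sketch of toggling
the new field per load request, assuming tabbyAPI's existing /v1/model/load admin
endpoint and x-admin-key header; the base URL, key, and model name are placeholders.

import requests

# Load an EXL3 model with output chunking disabled, so the cache allocates
# space for the full response up front instead of chunk_size-token slices.
response = requests.post(
    "http://127.0.0.1:5000/v1/model/load",
    headers={"x-admin-key": "YOUR_ADMIN_KEY"},  # placeholder admin key
    json={
        "model_name": "my-exl3-model",  # placeholder model directory name
        "disable_output_chunking": True,  # new field added by this patch
    },
)
response.raise_for_status()

Leaving the field at its default keeps max_rq_tokens at chunk_size (2048 unless
overridden), which caps how much cache a single request can claim for its output
at one time.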