From e09a61969f61d05c08c96630564f39ff1170f739 Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> Date: Sun, 5 Oct 2025 18:52:37 +0200 Subject: [PATCH 1/7] Model: Fix NCCL detection --- backends/exllamav3/model.py | 6 +++++- backends/exllamav3/utils.py | 16 +++++++--------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py index 217f5bf..abe907e 100644 --- a/backends/exllamav3/model.py +++ b/backends/exllamav3/model.py @@ -177,7 +177,11 @@ class ExllamaV3Container(BaseModelContainer): self.use_tp = True tp_backend = unwrap(kwargs.get("tensor_parallel_backend"), "native") - if not exllama_supports_nccl(): + if tp_backend == "nccl" and not exllama_supports_nccl(): + unsupported_message = ( + "NCCL is not available. Falling back to native backend." + ) + logger.warning(unsupported_message) tp_backend = "native" self.tp_backend = tp_backend diff --git a/backends/exllamav3/utils.py b/backends/exllamav3/utils.py index dbaffb5..0a90487 100644 --- a/backends/exllamav3/utils.py +++ b/backends/exllamav3/utils.py @@ -1,15 +1,13 @@ import platform from loguru import logger - def exllama_supports_nccl(): - if platform.system() != "Windows": + if platform.system() == "Windows": + unsupported_message = ( + "The NCCL tensor parallel backend is not supported on Windows." + ) + logger.warning(unsupported_message) return False - unsupported_message = ( - "The NCCL tensor parallel backend is not supported on Windows. \n" - "Switching to native backend." 
- ) - logger.warning(unsupported_message) - - return True + import torch + return torch.cuda.is_available() and torch.distributed.is_nccl_available() From 52e093ae6c69f31b4c601969f747eb93a67b3ee7 Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> Date: Sun, 5 Oct 2025 18:54:45 +0200 Subject: [PATCH 2/7] Model: Enable max_rq_tokens (output chunking) --- backends/exllamav3/model.py | 6 + common/config_models.py | 8 + config_sample.yml | 465 +++++++++++++++++----------------- endpoints/core/types/model.py | 1 + 4 files changed, 250 insertions(+), 230 deletions(-) diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py index abe907e..4427df4 100644 --- a/backends/exllamav3/model.py +++ b/backends/exllamav3/model.py @@ -85,6 +85,7 @@ class ExllamaV3Container(BaseModelContainer): cache_mode: str = "FP16" draft_cache_mode: str = "FP16" chunk_size: int = 2048 + max_rq_tokens: Optional[int] = 2048 max_batch_size: Optional[int] = None # Required methods @@ -250,6 +251,10 @@ class ExllamaV3Container(BaseModelContainer): user_chunk_size = unwrap(kwargs.get("chunk_size"), 2048) self.chunk_size = self.adjust_chunk_size(user_chunk_size) + # Output chunking + disable_output_chunking = unwrap(kwargs.get("disable_output_chunking"), False) + self.max_rq_tokens = None if disable_output_chunking else self.chunk_size + # Template setup self.prompt_template = await find_prompt_template( kwargs.get("prompt_template"), model_directory @@ -982,6 +987,7 @@ class ExllamaV3Container(BaseModelContainer): banned_strings=params.banned_strings, embeddings=mm_embeddings_content, return_top_tokens=params.logprobs, + max_rq_tokens=self.max_rq_tokens ) generated_tokens = 0 diff --git a/common/config_models.py b/common/config_models.py index bd15592..8d07d79 100644 --- a/common/config_models.py +++ b/common/config_models.py @@ -265,6 +265,14 @@ class ModelConfig(BaseConfigModel): ), gt=0, ) + disable_output_chunking: Optional[bool] = Field( + False, + 
description=( + "Disable output chunking (default: false).\n" + "Used by EXL3 models only.\n" + "True, allocate space in the cache for the entire response with each request..\n" + ), + ) max_batch_size: Optional[int] = Field( None, description=( diff --git a/config_sample.yml b/config_sample.yml index ad49224..a294cfc 100644 --- a/config_sample.yml +++ b/config_sample.yml @@ -1,230 +1,235 @@ -# Sample YAML file for configuration. -# Comment and uncomment values as needed. -# Every value has a default within the application. -# This file serves to be a drop in for config.yml - -# Unless specified in the comments, DO NOT put these options in quotes! -# You can use https://www.yamllint.com/ if you want to check your YAML formatting. - -# Options for networking -network: - # The IP to host on (default: 127.0.0.1). - # Use 0.0.0.0 to expose on all network adapters. - host: 127.0.0.1 - - # The port to host on (default: 5000). - port: 5000 - - # Disable HTTP token authentication with requests. - # WARNING: This will make your instance vulnerable! - # Turn on this option if you are ONLY connecting from localhost. - disable_auth: false - - # Disable fetching external content in response to requests,such as images from URLs. - disable_fetch_requests: false - - # Send tracebacks over the API (default: False). - # NOTE: Only enable this for debug purposes. - send_tracebacks: false - - # Select API servers to enable (default: ["OAI"]). - # Possible values: OAI, Kobold. - api_servers: ["OAI"] - -# Options for logging -logging: - # Enable prompt logging (default: False). - log_prompt: false - - # Enable generation parameter logging (default: False). - log_generation_params: false - - # Enable request logging (default: False). - # NOTE: Only use this for debugging! 
- log_requests: false - -# Options for model overrides and loading -# Please read the comments to understand how arguments are handled -# between initial and API loads -model: - # Directory to look for models (default: models). - # Windows users, do NOT put this path in quotes! - model_dir: models - - # Allow direct loading of models from a completion or chat completion request (default: False). - # This method of loading is strict by default. - # Enable dummy models to add exceptions for invalid model names. - inline_model_loading: false - - # Sends dummy model names when the models endpoint is queried. (default: False) - # Enable this if the client is looking for specific OAI models. - use_dummy_models: false - - # A list of fake model names that are sent via the /v1/models endpoint. (default: ["gpt-3.5-turbo"]) - # Also used as bypasses for strict mode if inline_model_loading is true. - dummy_model_names: ["gpt-3.5-turbo"] - - # An initial model to load. - # Make sure the model is located in the model directory! - # REQUIRED: This must be filled out to load a model on startup. - model_name: - - # Names of args to use as a fallback for API load requests (default: []). - # For example, if you always want cache_mode to be Q4 instead of on the inital model load, add "cache_mode" to this array. - # Example: ['max_seq_len', 'cache_mode']. - use_as_default: [] - - # Backend to use for this model (auto-detect if not specified) - # Options: exllamav2, exllamav3 - backend: - - # Max sequence length (default: 4096). - # Set to -1 to fetch from the model's config.json - max_seq_len: - - # Load model with tensor parallelism. - # Falls back to autosplit if GPU split isn't provided. - # This ignores the gpu_split_auto value. - tensor_parallel: false - - # Sets a backend type for tensor parallelism. (default: native). - # Options: native, nccl - # Native is recommended for PCIe GPUs - # NCCL is recommended for NVLink. 
- tensor_parallel_backend: native - - # Automatically allocate resources to GPUs (default: True). - # Not parsed for single GPU users. - gpu_split_auto: true - - # Reserve VRAM used for autosplit loading (default: 96 MB on GPU 0). - # Represented as an array of MB per GPU. - autosplit_reserve: [96] - - # An integer array of GBs of VRAM to split between GPUs (default: []). - # Used with tensor parallelism. - gpu_split: [] - - # NOTE: If a model has YaRN rope scaling, it will automatically be enabled by ExLlama. - # rope_scale and rope_alpha settings won't apply in this case. - - # Rope scale (default: 1.0). - # Same as compress_pos_emb. - # Use if the model was trained on long context with rope. - # Leave blank to pull the value from the model. - rope_scale: 1.0 - - # Rope alpha (default: None). - # Same as alpha_value. Set to "auto" to auto-calculate. - # Leaving this value blank will either pull from the model or auto-calculate. - rope_alpha: - - # Enable different cache modes for VRAM savings (default: FP16). - # Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'. - # For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (i.e. 8,8). - cache_mode: FP16 - - # Size of the prompt cache to allocate (default: max_seq_len). - # Must be a multiple of 256 and can't be less than max_seq_len. - # For CFG, set this to 2 * max_seq_len. - cache_size: - - # Chunk size for prompt ingestion (default: 2048). - # A lower value reduces VRAM usage but decreases ingestion speed. - # NOTE: Effects vary depending on the model. - # An ideal value is between 512 and 4096. - chunk_size: 2048 - - # Set the maximum number of prompts to process at one time (default: None/Automatic). - # Automatically calculated if left blank. - # NOTE: Only available for Nvidia ampere (30 series) and above GPUs. - max_batch_size: - - # Set the prompt template for this model. (default: None) - # If empty, attempts to look for the model's chat template. 
- # If a model contains multiple templates in its tokenizer_config.json, - # set prompt_template to the name of the template you want to use. - # NOTE: Only works with chat completion message lists! - prompt_template: - - # Enables vision support if the model supports it. (default: False) - vision: false - -# Options for draft models (speculative decoding) -# This will use more VRAM! -draft_model: - # Directory to look for draft models (default: models) - draft_model_dir: models - - # An initial draft model to load. - # Ensure the model is in the model directory. - draft_model_name: - - # Rope scale for draft models (default: 1.0). - # Same as compress_pos_emb. - # Use if the draft model was trained on long context with rope. - draft_rope_scale: 1.0 - - # Rope alpha for draft models (default: None). - # Same as alpha_value. Set to "auto" to auto-calculate. - # Leaving this value blank will either pull from the model or auto-calculate. - draft_rope_alpha: - - # Cache mode for draft models to save VRAM (default: FP16). - # Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'. - # For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (i.e. 8,8). - draft_cache_mode: FP16 - - # An integer array of GBs of VRAM to split between GPUs (default: []). - # If this isn't filled in, the draft model is autosplit. - draft_gpu_split: [] - -# Options for Sampling -sampling: - # Select a sampler override preset (default: None). - # Find this in the sampler-overrides folder. - # This overrides default fallbacks for sampler values that are passed to the API. - # NOTE: safe_defaults is noob friendly and provides fallbacks for frontends that don't send sampling parameters. - # Remove this for any advanced usage. - override_preset: safe_defaults - -# Options for Loras -lora: - # Directory to look for LoRAs (default: loras). - lora_dir: loras - - # List of LoRAs to load and associated scaling factors (default scale: 1.0). 
- # For the YAML file, add each entry as a YAML list: - # - name: lora1 - # scaling: 1.0 - loras: - -# Options for embedding models and loading. -# NOTE: Embeddings requires the "extras" feature to be installed -# Install it via "pip install .[extras]" -embeddings: - # Directory to look for embedding models (default: models). - embedding_model_dir: models - - # Device to load embedding models on (default: cpu). - # Possible values: cpu, auto, cuda. - # NOTE: It's recommended to load embedding models on the CPU. - # If using an AMD GPU, set this value to 'cuda'. - embeddings_device: cpu - - # An initial embedding model to load on the infinity backend. - embedding_model_name: - -# Options for development and experimentation -developer: - # Skip Exllamav2 version check (default: False). - # WARNING: It's highly recommended to update your dependencies rather than enabling this flag. - unsafe_launch: false - - # Disable API request streaming (default: False). - disable_request_streaming: false - - # Set process to use a higher priority. - # For realtime process priority, run as administrator or sudo. - # Otherwise, the priority will be set to high. - realtime_process_priority: false +# Sample YAML file for configuration. +# Comment and uncomment values as needed. +# Every value has a default within the application. +# This file serves to be a drop in for config.yml + +# Unless specified in the comments, DO NOT put these options in quotes! +# You can use https://www.yamllint.com/ if you want to check your YAML formatting. + +# Options for networking +network: + # The IP to host on (default: 127.0.0.1). + # Use 0.0.0.0 to expose on all network adapters. + host: 127.0.0.1 + + # The port to host on (default: 5000). + port: 5000 + + # Disable HTTP token authentication with requests. + # WARNING: This will make your instance vulnerable! + # Turn on this option if you are ONLY connecting from localhost. 
+ disable_auth: false + + # Disable fetching external content in response to requests, such as images from URLs. + disable_fetch_requests: false + + # Send tracebacks over the API (default: False). + # NOTE: Only enable this for debug purposes. + send_tracebacks: false + + # Select API servers to enable (default: ["OAI"]). + # Possible values: OAI, Kobold. + api_servers: ["OAI"] + +# Options for logging +logging: + # Enable prompt logging (default: False). + log_prompt: false + + # Enable generation parameter logging (default: False). + log_generation_params: false + + # Enable request logging (default: False). + # NOTE: Only use this for debugging! + log_requests: false + +# Options for model overrides and loading +# Please read the comments to understand how arguments are handled +# between initial and API loads +model: + # Directory to look for models (default: models). + # Windows users, do NOT put this path in quotes! + model_dir: models + + # Allow direct loading of models from a completion or chat completion request (default: False). + # This method of loading is strict by default. + # Enable dummy models to add exceptions for invalid model names. + inline_model_loading: false + + # Sends dummy model names when the models endpoint is queried. (default: False) + # Enable this if the client is looking for specific OAI models. + use_dummy_models: false + + # A list of fake model names that are sent via the /v1/models endpoint. (default: ["gpt-3.5-turbo"]) + # Also used as bypasses for strict mode if inline_model_loading is true. + dummy_model_names: ["gpt-3.5-turbo"] + + # An initial model to load. + # Make sure the model is located in the model directory! + # REQUIRED: This must be filled out to load a model on startup. + model_name: + + # Names of args to use as a fallback for API load requests (default: []). + # For example, if you always want cache_mode to be Q4 instead of on the initial model load, add "cache_mode" to this array.
+ # Example: ['max_seq_len', 'cache_mode']. + use_as_default: [] + + # Backend to use for this model (auto-detect if not specified) + # Options: exllamav2, exllamav3 + backend: + + # Max sequence length (default: 4096). + # Set to -1 to fetch from the model's config.json + max_seq_len: + + # Load model with tensor parallelism. + # Falls back to autosplit if GPU split isn't provided. + # This ignores the gpu_split_auto value. + tensor_parallel: false + + # Sets a backend type for tensor parallelism. (default: native). + # Options: native, nccl + # Native is recommended for PCIe GPUs + # NCCL is recommended for NVLink. + tensor_parallel_backend: native + + # Automatically allocate resources to GPUs (default: True). + # Not parsed for single GPU users. + gpu_split_auto: true + + # Reserve VRAM used for autosplit loading (default: 96 MB on GPU 0). + # Represented as an array of MB per GPU. + autosplit_reserve: [96] + + # An integer array of GBs of VRAM to split between GPUs (default: []). + # Used with tensor parallelism. + gpu_split: [] + + # NOTE: If a model has YaRN rope scaling, it will automatically be enabled by ExLlama. + # rope_scale and rope_alpha settings won't apply in this case. + + # Rope scale (default: 1.0). + # Same as compress_pos_emb. + # Use if the model was trained on long context with rope. + # Leave blank to pull the value from the model. + rope_scale: 1.0 + + # Rope alpha (default: None). + # Same as alpha_value. Set to "auto" to auto-calculate. + # Leaving this value blank will either pull from the model or auto-calculate. + rope_alpha: + + # Enable different cache modes for VRAM savings (default: FP16). + # Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'. + # For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (i.e. 8,8). + cache_mode: FP16 + + # Size of the prompt cache to allocate (default: max_seq_len). + # Must be a multiple of 256 and can't be less than max_seq_len. 
+ # For CFG, set this to 2 * max_seq_len. + cache_size: + + # Chunk size for prompt ingestion (default: 2048). + # A lower value reduces VRAM usage but decreases ingestion speed. + # NOTE: Effects vary depending on the model. + # An ideal value is between 512 and 4096. + chunk_size: 2048 + + # Disable output chunking (default: false) + # Used by EXL3 models only. + # If True, allocate space in the cache for the entire response with each request. + disable_output_chunking: false + + # Set the maximum number of prompts to process at one time (default: None/Automatic). + # Automatically calculated if left blank. + # NOTE: Only available for Nvidia ampere (30 series) and above GPUs. + max_batch_size: + + # Set the prompt template for this model. (default: None) + # If empty, attempts to look for the model's chat template. + # If a model contains multiple templates in its tokenizer_config.json, + # set prompt_template to the name of the template you want to use. + # NOTE: Only works with chat completion message lists! + prompt_template: + + # Enables vision support if the model supports it. (default: False) + vision: false + +# Options for draft models (speculative decoding) +# This will use more VRAM! +draft_model: + # Directory to look for draft models (default: models) + draft_model_dir: models + + # An initial draft model to load. + # Ensure the model is in the model directory. + draft_model_name: + + # Rope scale for draft models (default: 1.0). + # Same as compress_pos_emb. + # Use if the draft model was trained on long context with rope. + draft_rope_scale: 1.0 + + # Rope alpha for draft models (default: None). + # Same as alpha_value. Set to "auto" to auto-calculate. + # Leaving this value blank will either pull from the model or auto-calculate. + draft_rope_alpha: + + # Cache mode for draft models to save VRAM (default: FP16). + # Possible values for exllamav2: 'FP16', 'Q8', 'Q6', 'Q4'. 
+ # For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (i.e. 8,8). + draft_cache_mode: FP16 + + # An integer array of GBs of VRAM to split between GPUs (default: []). + # If this isn't filled in, the draft model is autosplit. + draft_gpu_split: [] + +# Options for Sampling +sampling: + # Select a sampler override preset (default: None). + # Find this in the sampler-overrides folder. + # This overrides default fallbacks for sampler values that are passed to the API. + # NOTE: safe_defaults is noob friendly and provides fallbacks for frontends that don't send sampling parameters. + # Remove this for any advanced usage. + override_preset: safe_defaults + +# Options for Loras +lora: + # Directory to look for LoRAs (default: loras). + lora_dir: loras + + # List of LoRAs to load and associated scaling factors (default scale: 1.0). + # For the YAML file, add each entry as a YAML list: + # - name: lora1 + # scaling: 1.0 + loras: + +# Options for embedding models and loading. +# NOTE: Embeddings requires the "extras" feature to be installed +# Install it via "pip install .[extras]" +embeddings: + # Directory to look for embedding models (default: models). + embedding_model_dir: models + + # Device to load embedding models on (default: cpu). + # Possible values: cpu, auto, cuda. + # NOTE: It's recommended to load embedding models on the CPU. + # If using an AMD GPU, set this value to 'cuda'. + embeddings_device: cpu + + # An initial embedding model to load on the infinity backend. + embedding_model_name: + +# Options for development and experimentation +developer: + # Skip Exllamav2 version check (default: False). + # WARNING: It's highly recommended to update your dependencies rather than enabling this flag. + unsafe_launch: false + + # Disable API request streaming (default: False). + disable_request_streaming: false + + # Set process to use a higher priority. + # For realtime process priority, run as administrator or sudo. 
+ # Otherwise, the priority will be set to high. + realtime_process_priority: false diff --git a/endpoints/core/types/model.py b/endpoints/core/types/model.py index 1e84336..202b74b 100644 --- a/endpoints/core/types/model.py +++ b/endpoints/core/types/model.py @@ -109,6 +109,7 @@ class ModelLoadRequest(BaseModel): ) cache_mode: Optional[str] = None chunk_size: Optional[int] = None + disable_output_chunking: Optional[bool] = False prompt_template: Optional[str] = None vision: Optional[bool] = None From d672dc2137247dfd9481295bd33594ddac31b69c Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> Date: Sun, 5 Oct 2025 21:23:02 +0200 Subject: [PATCH 3/7] API: Fix race condition when client disconnects --- endpoints/Kobold/utils/generation.py | 7 ++----- endpoints/OAI/utils/chat_completion.py | 7 ++----- endpoints/OAI/utils/completion.py | 7 ++----- 3 files changed, 6 insertions(+), 15 deletions(-) diff --git a/endpoints/Kobold/utils/generation.py b/endpoints/Kobold/utils/generation.py index f08e758..0d1c166 100644 --- a/endpoints/Kobold/utils/generation.py +++ b/endpoints/Kobold/utils/generation.py @@ -61,10 +61,7 @@ async def _stream_collector(data: GenerateRequest, request: Request): async for generation in generator: if disconnect_task.done(): - abort_event.set() - handle_request_disconnect( - f"Kobold generation {data.genkey} cancelled by user." - ) + raise CancelledError() text = generation.get("text") @@ -78,7 +75,7 @@ async def _stream_collector(data: GenerateRequest, request: Request): break except CancelledError: # If the request disconnects, break out - if not disconnect_task.done(): + if not abort_event.is_set(): abort_event.set() handle_request_disconnect( f"Kobold generation {data.genkey} cancelled by user." 
diff --git a/endpoints/OAI/utils/chat_completion.py b/endpoints/OAI/utils/chat_completion.py index 4a6c210..b559bb2 100644 --- a/endpoints/OAI/utils/chat_completion.py +++ b/endpoints/OAI/utils/chat_completion.py @@ -348,10 +348,7 @@ async def stream_generate_chat_completion( # Consumer loop while True: if disconnect_task.done(): - abort_event.set() - handle_request_disconnect( - f"Chat completion generation {request.state.id} cancelled by user." - ) + raise CancelledError() generation = await gen_queue.get() @@ -401,7 +398,7 @@ async def stream_generate_chat_completion( except CancelledError: # Get out if the request gets disconnected - if not disconnect_task.done(): + if not abort_event.is_set(): abort_event.set() handle_request_disconnect("Chat completion generation cancelled by user.") except Exception: diff --git a/endpoints/OAI/utils/completion.py b/endpoints/OAI/utils/completion.py index ca51c9c..f66d381 100644 --- a/endpoints/OAI/utils/completion.py +++ b/endpoints/OAI/utils/completion.py @@ -226,10 +226,7 @@ async def stream_generate_completion( # Consumer loop while True: if disconnect_task.done(): - abort_event.set() - handle_request_disconnect( - f"Completion generation {request.state.id} cancelled by user." - ) + raise CancelledError() generation = await gen_queue.get() @@ -248,7 +245,7 @@ async def stream_generate_completion( except CancelledError: # Get out if the request gets disconnected - if not disconnect_task.done(): + if not abort_event.is_set(): abort_event.set() handle_request_disconnect( f"Completion generation {request.state.id} cancelled by user." From 4235f98e830894bafd3f5544ae8d6fb998ca675b Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> Date: Sun, 5 Oct 2025 22:15:27 +0200 Subject: [PATCH 4/7] Model: Change cache_size/max_seq_len behavior - Cache size is now given only by the cache_size config option. 
Default is 4096 (user should always override to max out VRAM) - max_seq_len, if not overridden in the config, will default to the model's config.json - max_seq_len is reduced to be no larger than the cache --- backends/exllamav2/model.py | 20 +++++------ backends/exllamav3/model.py | 66 ++++++++++++----------------------- common/model.py | 4 +-- config_sample.yml | 8 ++--- endpoints/core/types/model.py | 2 +- 5 files changed, 37 insertions(+), 63 deletions(-) diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py index 4fc263e..447e9f4 100644 --- a/backends/exllamav2/model.py +++ b/backends/exllamav2/model.py @@ -238,7 +238,7 @@ class ExllamaV2Container(BaseModelContainer): base_seq_len = hf_model.hf_config.max_position_embeddings # Set the target seq len if present - target_seq_len = unwrap(kwargs.get("max_seq_len"), 4096) + target_seq_len = unwrap(kwargs.get("max_seq_len"), base_seq_len) # Set the rope scale self.config.scale_pos_emb = unwrap( @@ -289,16 +289,7 @@ class ExllamaV2Container(BaseModelContainer): # Set k/v cache size # cache_size is only relevant when paged mode is enabled if self.paged: - cache_size = unwrap(kwargs.get("cache_size"), self.config.max_seq_len) - - if cache_size < self.config.max_seq_len: - logger.warning( - f"The given cache_size ({cache_size}) is smaller than the " - "desired context length.\n" - "Overriding cache_size to max_seq_len. " - ) - - cache_size = self.config.max_seq_len + cache_size = unwrap(kwargs.get("cache_size"), 4096) # Enforce a multiple of 256 for cache size # Overestimate to ensure that the cache isn't below max_seq_len @@ -317,6 +308,13 @@ class ExllamaV2Container(BaseModelContainer): cache_size = rounded_cache_size + if self.config.max_seq_len > cache_size: + logger.warning( + f"The given max_seq_len ({self.config.max_seq_len}) is larger than the " + f"cache size and will be limited to {cache_size} tokens." 
+ ) + self.config.max_seq_len = cache_size + # Warn user if cache size may be inadequate for CFG if cache_size < 2 * self.config.max_seq_len: logger.warning( diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py index 4427df4..7e8402f 100644 --- a/backends/exllamav3/model.py +++ b/backends/exllamav3/model.py @@ -105,12 +105,7 @@ class ExllamaV3Container(BaseModelContainer): self = cls() # Make sure ExllamaV3 is up to date - check_package_version("exllamav3", "0.0.4") - - logger.warning( - "ExllamaV3 is currently in an alpha state. " - "Please note that all config options may not work." - ) + check_package_version("exllamav3", "0.0.7") self.model_dir = model_directory self.hf_model = hf_model @@ -131,9 +126,6 @@ class ExllamaV3Container(BaseModelContainer): self.vision_model = None self.use_vision = False - # Fallback to 4096 since exl3 can't fetch from HF's config.json - self.max_seq_len = unwrap(kwargs.get("max_seq_len"), 4096) - # Prepare the draft model config if necessary draft_args = unwrap(kwargs.get("draft_model"), {}) draft_model_name = draft_args.get("draft_model_name") @@ -231,11 +223,15 @@ class ExllamaV3Container(BaseModelContainer): raise RuntimeError(gpu_unsupported_message) # Cache - user_cache_size = unwrap(kwargs.get("cache_size"), self.max_seq_len) + user_cache_size = unwrap(kwargs.get("cache_size"), 4096) self.cache_size = self.adjust_cache_size(user_cache_size) self.cache_mode = unwrap(kwargs.get("cache_mode"), "FP16") self.cache = self.create_cache(self.cache_mode, self.model) + # Limit max_seq_len to prevent sequences larger than the cache + max_seq_len = unwrap(kwargs.get("max_seq_len"), hf_model.hf_config.max_position_embeddings) + self.max_seq_len = self.adjust_max_seq_len(max_seq_len) + # Draft cache if self.use_draft_model: # Set draft cache mode @@ -274,21 +270,11 @@ class ExllamaV3Container(BaseModelContainer): return self def adjust_cache_size(self, cache_size): - if cache_size < self.max_seq_len: - logger.warning( - 
f"The given cache_size ({cache_size}) is smaller than the " - "desired context length.\n" - "Overriding cache_size to max_seq_len. " - ) - - cache_size = self.max_seq_len - # Enforce a multiple of 256 for cache size # Overestimate to ensure that the cache isn't below max_seq_len cache_remainder = cache_size % 256 if cache_remainder != 0: rounded_cache_size = int(256 * ((cache_size - cache_remainder) / 256 + 1)) - logger.warning( f"The given cache size ({cache_size}) is " "not a multiple of 256.\n" @@ -298,22 +284,22 @@ class ExllamaV3Container(BaseModelContainer): cache_size = rounded_cache_size - # Warn user if cache size may be inadequate for CFG - if cache_size < 2 * self.max_seq_len: - logger.warning( - f"The given cache_size ({cache_size}) is less than 2 * max_seq_len " - "and may be too small for requests using CFG. \n" - "Ignore this warning if you do not plan on using CFG." - ) - return cache_size - def adjust_chunk_size(self, user_chunk_size: int): - chunk_size = sorted((256, user_chunk_size, self.max_seq_len))[1] - chunk_remainder = chunk_size % 256 - if chunk_remainder != 0: - rounded_chunk_size = int(256 * ((chunk_size - chunk_remainder) / 256 + 1)) + def adjust_max_seq_len(self, max_seq_len): + if max_seq_len > self.cache_size: + logger.warning( + f"The given max_seq_len ({max_seq_len}) is larger than the cache size " + f"and will be limited to {self.cache_size} tokens." 
+ ) + max_seq_len = self.cache_size + return max_seq_len + + def adjust_chunk_size(self, user_chunk_size: int): + chunk_size = max(256, user_chunk_size) + rounded_chunk_size = (chunk_size + 255) // 256 * 256 + if chunk_size != rounded_chunk_size: logger.warning( f"The given chunk size ({chunk_size}) is " "not a multiple of 256.\n" @@ -950,24 +936,18 @@ class ExllamaV3Container(BaseModelContainer): context_len = input_ids[0].size(dim=-1) # Automatically set max_tokens to fill up the context - # This should be an OK default, but may be changed in the future max_tokens = unwrap( - params.max_tokens, - self.max_seq_len - context_len, + params.max_tokens if params.max_tokens > 0 else None, + self.max_seq_len - context_len - 1, ) if max_tokens < 1: logger.warning("max_tokens must be a positive integer, setting to 1.") max_tokens = 1 - # Determine if the negative context or the context length is bigger - context_to_check = context_len - # Check total length of prompt against max context length - if context_to_check > self.max_seq_len: - preamble = "Prompt" - + if context_len > self.max_seq_len: raise ValueError( - f"{preamble} length {context_to_check} is greater than " + f"Prompt length {context_len} is greater than " f"max_seq_len {self.max_seq_len}" ) diff --git a/common/model.py b/common/model.py index 16138a3..1eea5eb 100644 --- a/common/model.py +++ b/common/model.py @@ -157,10 +157,8 @@ async def load_model_gen(model_path: pathlib.Path, **kwargs): # Override the max sequence length based on user max_seq_len = kwargs.get("max_seq_len") - if max_seq_len == -1: + if max_seq_len == -1 or max_seq_len is None: kwargs["max_seq_len"] = hf_model.hf_config.max_position_embeddings - elif max_seq_len is None: - kwargs["max_seq_len"] = 4096 # Create a new container and check if the right dependencies are installed backend = unwrap(kwargs.get("backend"), detect_backend(hf_model)) diff --git a/config_sample.yml b/config_sample.yml index a294cfc..7781083 100644 --- 
a/config_sample.yml +++ b/config_sample.yml @@ -78,8 +78,7 @@ model: # Options: exllamav2, exllamav3 backend: - # Max sequence length (default: 4096). - # Set to -1 to fetch from the model's config.json + # Max sequence length (default: fetch from the model's config.json). max_seq_len: # Load model with tensor parallelism. @@ -124,9 +123,8 @@ model: # For exllamav3, specify the pair k_bits,v_bits where k_bits and v_bits are integers from 2-8 (i.e. 8,8). cache_mode: FP16 - # Size of the prompt cache to allocate (default: max_seq_len). - # Must be a multiple of 256 and can't be less than max_seq_len. - # For CFG, set this to 2 * max_seq_len. + # Size of the key/value cache to allocate, in tokens (default: 4096). + # Must be a multiple of 256. cache_size: # Chunk size for prompt ingestion (default: 2048). diff --git a/endpoints/core/types/model.py b/endpoints/core/types/model.py index 202b74b..c9fbf1b 100644 --- a/endpoints/core/types/model.py +++ b/endpoints/core/types/model.py @@ -85,7 +85,7 @@ class ModelLoadRequest(BaseModel): examples=[4096], ) cache_size: Optional[int] = Field( - description=("Number in tokens, must be greater than or equal to max_seq_len"), + description="Number in tokens, must be multiple of 256", default=None, examples=[4096], ) From 85459ce6008782f1f6add5e9faaa98916f32221f Mon Sep 17 00:00:00 2001 From: kingbri <8082010+kingbri1@users.noreply.github.com> Date: Thu, 9 Oct 2025 22:33:53 -0400 Subject: [PATCH 5/7] Tree: Format Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com> --- backends/exllamav3/model.py | 8 +++++--- backends/exllamav3/utils.py | 3 ++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py index 7e8402f..f3fb26e 100644 --- a/backends/exllamav3/model.py +++ b/backends/exllamav3/model.py @@ -2,6 +2,7 @@ import asyncio import gc import pathlib import re +import torch from itertools import zip_longest from typing import ( Any, @@ -11,7 +12,6 @@ 
from typing import ( Optional, ) -import torch from exllamav3 import ( AsyncGenerator, AsyncJob, @@ -229,7 +229,9 @@ class ExllamaV3Container(BaseModelContainer): self.cache = self.create_cache(self.cache_mode, self.model) # Limit max_seq_len to prevent sequences larger than the cache - max_seq_len = unwrap(kwargs.get("max_seq_len"), hf_model.hf_config.max_position_embeddings) + max_seq_len = unwrap( + kwargs.get("max_seq_len"), hf_model.hf_config.max_position_embeddings + ) self.max_seq_len = self.adjust_max_seq_len(max_seq_len) # Draft cache @@ -967,7 +969,7 @@ class ExllamaV3Container(BaseModelContainer): banned_strings=params.banned_strings, embeddings=mm_embeddings_content, return_top_tokens=params.logprobs, - max_rq_tokens=self.max_rq_tokens + max_rq_tokens=self.max_rq_tokens, ) generated_tokens = 0 diff --git a/backends/exllamav3/utils.py b/backends/exllamav3/utils.py index 0a90487..5a3e68d 100644 --- a/backends/exllamav3/utils.py +++ b/backends/exllamav3/utils.py @@ -1,6 +1,8 @@ import platform +import torch from loguru import logger + def exllama_supports_nccl(): if platform.system() == "Windows": unsupported_message = ( @@ -9,5 +11,4 @@ def exllama_supports_nccl(): logger.warning(unsupported_message) return False - import torch return torch.cuda.is_available() and torch.distributed.is_nccl_available() From f73e88e9e9d07753850e2572f3c4d07154cf5370 Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> Date: Tue, 14 Oct 2025 00:58:14 +0200 Subject: [PATCH 6/7] Dependencies: update exllamav3 --- pyproject.toml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 45832d1..1971c24 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,14 +78,14 @@ cu12 = [ "exllamav2 @ https://github.com/turboderp-org/exllamav2/releases/download/v0.3.2/exllamav2-0.3.2+cu128.torch2.8.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and 
python_version == '3.10'", # Exl3 - "exllamav3 @ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.7/exllamav3-0.0.7+cu128.torch2.8.0-cp313-cp313-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.13'", - "exllamav3 @ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.7/exllamav3-0.0.7+cu128.torch2.8.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'", - "exllamav3 @ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.7/exllamav3-0.0.7+cu128.torch2.8.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'", - "exllamav3 @ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.7/exllamav3-0.0.7+cu128.torch2.8.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'", - "exllamav3 @ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.7/exllamav3-0.0.7+cu128.torch2.8.0-cp313-cp313-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.13'", - "exllamav3 @ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.7/exllamav3-0.0.7+cu128.torch2.8.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'", - "exllamav3 @ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.7/exllamav3-0.0.7+cu128.torch2.8.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'", - "exllamav3 @ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.7/exllamav3-0.0.7+cu128.torch2.8.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'", + "exllamav3 @ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.9/exllamav3-0.0.9+cu128.torch2.8.0-cp313-cp313-win_amd64.whl ; platform_system == 'Windows' and python_version == 
'3.13'", + "exllamav3 @ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.9/exllamav3-0.0.9+cu128.torch2.8.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'", + "exllamav3 @ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.9/exllamav3-0.0.9+cu128.torch2.8.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'", + "exllamav3 @ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.9/exllamav3-0.0.9+cu128.torch2.8.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'", + "exllamav3 @ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.9/exllamav3-0.0.9+cu128.torch2.8.0-cp313-cp313-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.13'", + "exllamav3 @ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.9/exllamav3-0.0.9+cu128.torch2.8.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'", + "exllamav3 @ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.9/exllamav3-0.0.9+cu128.torch2.8.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'", + "exllamav3 @ https://github.com/turboderp-org/exllamav3/releases/download/v0.0.9/exllamav3-0.0.9+cu128.torch2.8.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'", # Windows FA2 from https://github.com/kingbri1/flash-attention/releases "flash_attn @ https://github.com/kingbri1/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu128torch2.8.0cxx11abiFALSE-cp313-cp313-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.13'", From 8abdfe7b1356472ca44eafb3e1e6a8e24d42463c Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> 
Date: Tue, 14 Oct 2025 02:47:52 +0200 Subject: [PATCH 7/7] Config: replace disable_output_chunking flag with output_chunking --- backends/exllamav3/model.py | 4 ++-- common/config_models.py | 9 +++++---- config_sample.yml | 6 +++--- endpoints/core/types/model.py | 2 +- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/backends/exllamav3/model.py b/backends/exllamav3/model.py index f3fb26e..0fe09c0 100644 --- a/backends/exllamav3/model.py +++ b/backends/exllamav3/model.py @@ -250,8 +250,8 @@ class ExllamaV3Container(BaseModelContainer): self.chunk_size = self.adjust_chunk_size(user_chunk_size) # Output chunking - disable_output_chunking = unwrap(kwargs.get("disable_output_chunking"), False) - self.max_rq_tokens = None if disable_output_chunking else self.chunk_size + output_chunking = unwrap(kwargs.get("output_chunking"), True) + self.max_rq_tokens = self.chunk_size if output_chunking else None # Template setup self.prompt_template = await find_prompt_template( diff --git a/common/config_models.py b/common/config_models.py index 8d07d79..340685e 100644 --- a/common/config_models.py +++ b/common/config_models.py @@ -265,12 +265,13 @@ class ModelConfig(BaseConfigModel): ), gt=0, ) - disable_output_chunking: Optional[bool] = Field( - False, + output_chunking: Optional[bool] = Field( + True, description=( - "Disable output chunking (default: false).\n" + "Use output chunking (default: True)\n" + "Instead of allocating cache space for the entire completion at once, " + "allocate in chunks as needed.\n" "Used by EXL3 models only.\n" - "True, allocate space in the cache for the entire response with each request..\n" ), ) max_batch_size: Optional[int] = Field( diff --git a/config_sample.yml b/config_sample.yml index 7781083..0d51719 100644 --- a/config_sample.yml +++ b/config_sample.yml @@ -133,10 +133,10 @@ model: # An ideal value is between 512 and 4096. 
chunk_size: 2048 - # Disable output chunking (default: false) + # Use output chunking (default: True) + # Instead of allocating cache space for the entire completion at once, allocate in chunks as needed. # Used by EXL3 models only. - # If True, allocate space in the cache for the entire response with each request. - disable_output_chunking: false + output_chunking: true # Set the maximum number of prompts to process at one time (default: None/Automatic). # Automatically calculated if left blank. diff --git a/endpoints/core/types/model.py b/endpoints/core/types/model.py index c9fbf1b..6e2e0c9 100644 --- a/endpoints/core/types/model.py +++ b/endpoints/core/types/model.py @@ -109,7 +109,7 @@ class ModelLoadRequest(BaseModel): ) cache_mode: Optional[str] = None chunk_size: Optional[int] = None - disable_output_chunking: Optional[bool] = False + output_chunking: Optional[bool] = True prompt_template: Optional[str] = None vision: Optional[bool] = None