Files
YALS/config_sample.yml
kingbri a9af0c1554 Config: Clarify tensor split
Tensor split uses ratios rather than GBs of vram. This should solve
size inconsistencies.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
2025-09-10 22:10:50 -04:00

150 lines
5.3 KiB
YAML

# Options for networking
network:
# The IP to host on (default: 127.0.0.1).
# Use 0.0.0.0 to expose on all network adapters.
host: 127.0.0.1
# The port to host on (default: 5000).
# Note: Recommended to use 5001 on macOS because the AirPlay Receiver service listens on port 5000
port: 5000
# Disable HTTP token authentication with requests.
# WARNING: This will make your instance vulnerable!
# Turn on this option if you are ONLY connecting from localhost.
disable_auth: false
# Options for logging
logging:
# Enable prompt logging (default: False)
log_prompt: false
# Enable generation parameter logging (default: False)
log_generation_params: false
# Enable request logging (default: False).
# NOTE: Only use this for debugging!
log_requests: false
# Options for model overrides and loading
model:
# Directory to look for models (default: models).
# Windows users, do NOT put this path in quotes!
model_dir: models
# Allow direct loading of models from a completion or chat completion request (default: False).
# This method of loading is strict by default.
inline_model_loading: false
# Sends dummy model names when the models endpoint is queried. (default: False)
# Enable this if the client is looking for specific OAI models.
use_dummy_models: false
# A list of fake model names that are sent via the /v1/models endpoint. (default: ["gpt-3.5-turbo"])
# Also used as bypasses for strict mode if inline_model_loading is true.
dummy_model_names: ["gpt-3.5-turbo"]
# An initial model to load.
# Make sure the model is located in the model directory!
# REQUIRED: This must be filled out to load a model on startup.
model_name:
# Names of args to use as a fallback for API load requests (default: []).
# For example, if you want cache_mode_k to always be q4_0 rather than only on the initial model load, add "cache_mode_k" to this array.
# Example: ['max_seq_len', 'num_gpu_layers'].
use_as_default: []
# Max sequence length (default: 4096).
# Set to -1 to fetch from the model's config.json
max_seq_len:
# Number of slots for continuous batching (default: 1)
num_slots: 1
# Size (in tokens) of the KV cache (default: max_seq_len).
# Should be at most max_seq_len * num_slots.
cache_size:
# Chunk size for prompt ingestion (default: 512).
# A lower value reduces VRAM usage but decreases ingestion speed.
# NOTE: Effects vary depending on the model.
# An ideal value is between 512 and 4096.
chunk_size: 512
# Number of tokens to allocate onto a single GPU at a time during processing (default: chunk_size)
# Used for pipeline parallelism, meaning the overall batch job is split in parallel between all active GPUs
# Only set this when num_gpus > 1. An ideal value is chunk_size / num_gpus
physical_chunk_size:
# Number of model layers to offload on the GPU (default: 0)
# Set this to 999 to offload all layers to the GPU
num_gpu_layers: 0
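# An integer array of relative ratios (not GB of VRAM) used to split the model across GPUs (default: []).
# Example: [3, 1] places roughly three quarters of the model on the first GPU and one quarter on the second.
# Listing more entries than there are GPUs will crash when loading the model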
gpu_split: []
# GPU split mode (default: layer)
# Possible values - layer, row
# Row: Splits each layer's tensors by row across GPUs. Saves VRAM, but can hurt performance.
# Layer: Preferred. Assigns whole layers (and their KV cache) to each GPU.
gpu_split_mode: layer
# Number of CPU threads to use during processing/generation (default: -1)
# NOTE: Does not apply if model is fully offloaded to GPU
num_threads: -1
# Prompt template to use for chat completions (default: None)
prompt_template:
# Enable flash attention (default: true)
# Disable if problems arise with the model's architecture
flash_attention: true
# RoPE frequency base. 0 = model default (default: 0)
# Adjust this value for NTK scaling
rope_freq_base: 0
# Enable YaRN scaling. All other parameters are inherited from the model (default: false)
# Turning this on disables linear/NTK RoPE scaling
enable_yarn: false
# K cache quantization type (default: f16)
# Possible values - f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0
cache_mode_k: f16
# V cache quantization type (default: f16)
# Possible values - f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0
cache_mode_v: f16
# Offload KV cache to GPU if available (default: true)
# By default, llama.cpp decides KV device per-layer based on CPU or GPU
# Set to false to force all KV layers to CPU
kv_offload: true
# Override tensors to different devices (default: [])
# Takes in a regex string. Recommended to set num_threads
# Make a new array element for multiple overrides
override_tensor: []
# Keep MoE weights of n layers in CPU (default: 0)
# Provide "all" to put MoE weights for all layers in CPU
n_cpu_moe: 0
# Lazily load the model into virtual memory. This is fast and efficient (default: true)
# Turning mmap off will take longer to load, but will reduce the risk of pageouts
# WARNING: Do not adjust this parameter unless you know what you're doing!
mmap: true
# Options for Sampling
sampling:
# Select a sampler override preset (default: None).
# Find this in the sampler_overrides folder.
# This overrides default fallbacks for sampler values that are passed to the API.
override_preset: safe_defaults
developer:
# Set process to use a higher priority.
# For realtime process priority, run as administrator or sudo.
# Otherwise, the priority will be set to high.
realtime_process_priority: false