# Options for networking
network:
  # The IP to host on (default: 127.0.0.1).
  # Use 0.0.0.0 to expose on all network adapters.
  host: 127.0.0.1

  # The port to host on (default: 5000).
  # NOTE: Port 5001 is recommended on macOS because the AirPlay Receiver service runs on port 5000.
  port: 5000

  # Disable HTTP token authentication for requests.
  # WARNING: This will make your instance vulnerable!
  # Turn on this option only if you are connecting exclusively from localhost.
  disable_auth: false

# Options for logging
logging:
  # Enable prompt logging (default: false)
  log_prompt: false

  # Enable generation parameter logging (default: false)
  log_generation_params: false

  # Enable request logging (default: false).
  # NOTE: Only use this for debugging!
  log_requests: false

# Options for model overrides and loading
model:
  # Directory to look for models (default: models).
  # Windows users, do NOT put this path in quotes!
  model_dir: models

  # Allow direct loading of models from a completion or chat completion request (default: false).
  # This method of loading is strict by default.
  inline_model_loading: false

  # Send dummy model names when the models endpoint is queried (default: false).
  # Enable this if the client is looking for specific OAI models.
  use_dummy_models: false

  # A list of fake model names that are sent via the /v1/models endpoint (default: ["gpt-3.5-turbo"]).
  # Also used as bypasses for strict mode if inline_model_loading is true.
  dummy_model_names: ["gpt-3.5-turbo"]
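  # Example (illustrative): a client that probes for both GPT-3.5 and GPT-4
  # names could be satisfied with:
  # dummy_model_names: ["gpt-3.5-turbo", "gpt-4"]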

  # An initial model to load.
  # Make sure the model is located in the model directory!
  # REQUIRED: This must be filled out to load a model on startup.
  model_name:
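  # Example (illustrative; the filename is hypothetical and must exist in model_dir):
  # model_name: Llama-3.1-8B-Instruct-Q4_K_M.gguf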

  # Names of args to use as a fallback for API load requests (default: []).
  # For example, if you always want cache_mode to be Q4 for API loads as well, not just on the initial model load, add "cache_mode" to this array.
  # Example: ['max_seq_len', 'num_gpu_layers'].
  use_as_default: []

  # Max sequence length (default: 4096).
  # Set to -1 to fetch from the model's config.json.
  max_seq_len:

  # Number of slots for continuous batching (default: 1)
  num_slots: 1

  # Size (in tokens) of the KV cache (default: max_seq_len).
  # At maximum, this should be max_seq_len * num_slots.
  cache_size:
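  # Example (illustrative; assumes the cache is shared evenly between slots, as
  # in llama.cpp): with max_seq_len: 4096 and num_slots: 2, set cache_size: 8192
  # so each slot still gets a full 4096-token context.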

  # Chunk size for prompt ingestion (default: 512).
  # A lower value reduces VRAM usage but decreases ingestion speed.
  # NOTE: Effects vary depending on the model.
  # An ideal value is between 512 and 4096.
  chunk_size: 512

  # Number of tokens to allocate onto a single GPU at a time during processing (default: chunk_size)
  # Used for pipeline parallelism: the overall batch job is split in parallel across all active GPUs
  # Only set this when num_gpus > 1. An ideal value is chunk_size / num_gpus
  physical_chunk_size:
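  # Example (illustrative): with chunk_size: 2048 and two active GPUs,
  # chunk_size / num_gpus gives:
  # physical_chunk_size: 1024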

  # Number of model layers to offload to the GPU (default: 0)
  # Set this to 999 to offload all layers to the GPU
  num_gpu_layers: 0

  # An integer array defining the ratio of VRAM to split across each GPU (default: []).
  # Listing more entries than available GPUs will crash when loading the model
  gpu_split: []
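  # Example (illustrative): split in a 3:1 ratio, e.g. a 24 GB card paired with
  # an 8 GB card:
  # gpu_split: [3, 1]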

  # GPU split mode (default: layer)
  # Possible values - layer, row
  # Row: Splits each layer's tensors row-wise across GPUs. Saves VRAM, but can hurt performance.
  # Layer: Preferred. Fills each GPU with whole layers in turn.
  gpu_split_mode: layer

  # Number of CPU threads to use during processing/generation (default: -1)
  # NOTE: Does not apply if the model is fully offloaded to the GPU
  num_threads: -1

  # Prompt template to use for chat completions (default: None)
  prompt_template:
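  # Example (illustrative; assumes a "chatml" template exists in the templates folder):
  # prompt_template: chatml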

  # Enable flash attention (default: true)
  # Disable if problems arise with the model's architecture
  flash_attention: true

  # RoPE frequency base. 0 = model default (default: 0)
  # Adjust this value for NTK scaling
  rope_freq_base: 0
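  # Example (illustrative; good values are model-dependent): for a model with a
  # native base of 10000, raising the base stretches the usable context, e.g.:
  # rope_freq_base: 20000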

  # Enable YaRN scaling. All other YaRN parameters are inherited from the model (default: false)
  # Turning this on disables linear/NTK RoPE scaling
  enable_yarn: false

  # K cache quantization type (default: f16)
  # Possible values - f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0
  cache_mode_k: f16

  # V cache quantization type (default: f16)
  # Possible values - f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0
  cache_mode_v: f16
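  # Example (illustrative): quantizing both caches to q8_0 roughly halves KV
  # cache VRAM compared to f16, usually with minimal quality loss:
  # cache_mode_k: q8_0
  # cache_mode_v: q8_0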

  # Offload the KV cache to the GPU if available (default: true)
  # By default, llama.cpp decides the KV device per-layer, matching each layer's CPU or GPU placement
  # Set to false to force all KV layers onto the CPU
  kv_offload: true

  # Override tensors to different devices (default: [])
  # Takes in regex strings. It is recommended to also set num_threads
  # Add a new array element for each additional override
  override_tensor: []
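  # Example (illustrative; assumes this option follows llama.cpp's
  # "<tensor regex>=<device>" override syntax): keep MoE expert weights on the CPU:
  # override_tensor: ['\.ffn_(up|down|gate)_exps\.=CPU']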

  # Keep the MoE weights of n layers on the CPU (default: 0)
  # Provide "all" to keep the MoE weights of all layers on the CPU
  n_cpu_moe: 0
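  # Example (illustrative): a common MoE recipe is to offload every layer while
  # keeping all expert weights on the CPU:
  # num_gpu_layers: 999
  # n_cpu_moe: all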

  # Lazily load the model into virtual memory via mmap. This is fast and efficient (default: true)
  # Turning mmap off makes loading take longer, but reduces the risk of pageouts
  # WARNING: Do not adjust this parameter unless you know what you're doing!
  mmap: true

# Options for sampling
sampling:
  # Select a sampler override preset (default: None).
  # Find this in the sampler_overrides folder.
  # This overrides default fallbacks for sampler values that are passed to the API.
  override_preset: safe_defaults

# Options for development and experimentation
developer:
  # Set the process to use a higher priority (default: false).
  # For realtime process priority, run as administrator or sudo.
  # Otherwise, the priority will be set to high.
  realtime_process_priority: false