# Options for networking
network:
  # The IP to host on (default: 127.0.0.1).
  # Use 0.0.0.0 to expose on all network adapters.
  host: 127.0.0.1

  # The port to host on (default: 5000).
  # NOTE: Port 5001 is recommended on macOS because the AirPlay Receiver service runs on port 5000.
  port: 5000

  # Disable HTTP token authentication for requests.
  # WARNING: This will make your instance vulnerable!
  # Turn on this option only if you are connecting exclusively from localhost.
  disable_auth: false

# Options for logging
logging:
  # Enable prompt logging (default: false)
  log_prompt: false

  # Enable generation parameter logging (default: false)
  log_generation_params: false

  # Enable request logging (default: false).
  # NOTE: Only use this for debugging!
  log_requests: false

# Options for model overrides and loading
model:
  # Directory to look for models (default: models).
  # Windows users, do NOT put this path in quotes!
  model_dir: models

  # Allow direct loading of models from a completion or chat completion request (default: false).
  # This method of loading is strict by default.
  inline_model_loading: false

  # Send dummy model names when the models endpoint is queried (default: false).
  # Enable this if the client is looking for specific OAI models.
  use_dummy_models: false

  # A list of fake model names that are sent via the /v1/models endpoint (default: ["gpt-3.5-turbo"]).
  # Also used as bypasses for strict mode if inline_model_loading is true.
  dummy_model_names: ["gpt-3.5-turbo"]
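  # Example (illustrative): a client that probes for both GPT-3.5 and GPT-4
  # names could be satisfied with:
  # dummy_model_names: ["gpt-3.5-turbo", "gpt-4"]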

  # An initial model to load.
  # Make sure the model is located in the model directory!
  # REQUIRED: This must be filled out to load a model on startup.
  model_name:
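  # Example (illustrative; the filename is hypothetical and must exist in model_dir):
  # model_name: Llama-3.1-8B-Instruct-Q4_K_M.gguf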

  # Names of args to use as a fallback for API load requests (default: []).
  # For example, if you always want cache_mode to be Q4 for API loads as well, not just on the initial model load, add "cache_mode" to this array.
  # Example: ['max_seq_len', 'num_gpu_layers'].
  use_as_default: []

  # Max sequence length (default: 4096).
  # Set to -1 to fetch from the model's config.json.
  max_seq_len:

  # Number of slots for continuous batching (default: 1)
  num_slots: 1

  # Size (in tokens) of the KV cache (default: max_seq_len).
  # At maximum, this should be max_seq_len * num_slots.
  cache_size:
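  # Example (illustrative; assumes the cache is shared evenly between slots, as
  # in llama.cpp): with max_seq_len: 4096 and num_slots: 2, set cache_size: 8192
  # so each slot still gets a full 4096-token context.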

  # Chunk size for prompt ingestion (default: 512).
  # A lower value reduces VRAM usage but decreases ingestion speed.
  # NOTE: Effects vary depending on the model.
  # An ideal value is between 512 and 4096.
  chunk_size: 512

  # Number of tokens to allocate onto a single GPU at a time during processing (default: chunk_size)
  # Used for pipeline parallelism: the overall batch job is split in parallel across all active GPUs
  # Only set this when num_gpus > 1. An ideal value is chunk_size / num_gpus
  physical_chunk_size:
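  # Example (illustrative): with chunk_size: 2048 and two active GPUs,
  # chunk_size / num_gpus gives:
  # physical_chunk_size: 1024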

  # Number of model layers to offload to the GPU (default: 0)
  # Set this to 999 to offload all layers to the GPU
  num_gpu_layers: 0

  # An integer array defining the ratio of VRAM to split across each GPU (default: []).
  # Listing more entries than available GPUs will crash when loading the model
  gpu_split: []
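  # Example (illustrative): split in a 3:1 ratio, e.g. a 24 GB card paired with
  # an 8 GB card:
  # gpu_split: [3, 1]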

  # GPU split mode (default: layer)
  # Possible values - layer, row
  # Row: Splits each layer's tensors row-wise across GPUs. Saves VRAM, but can hurt performance.
  # Layer: Preferred. Fills each GPU with whole layers in turn.
  gpu_split_mode: layer

  # Number of CPU threads to use during processing/generation (default: -1)
  # NOTE: Does not apply if the model is fully offloaded to the GPU
  num_threads: -1

  # Prompt template to use for chat completions (default: None)
  prompt_template:
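  # Example (illustrative; assumes a "chatml" template exists in the templates folder):
  # prompt_template: chatml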

  # Enable flash attention (default: true)
  # Disable if problems arise with the model's architecture
  flash_attention: true

  # RoPE frequency base. 0 = model default (default: 0)
  # Adjust this value for NTK scaling
  rope_freq_base: 0
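  # Example (illustrative; good values are model-dependent): for a model with a
  # native base of 10000, raising the base stretches the usable context, e.g.:
  # rope_freq_base: 20000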

  # Enable YaRN scaling. All other YaRN parameters are inherited from the model (default: false)
  # Turning this on disables linear/NTK RoPE scaling
  enable_yarn: false

  # K cache quantization type (default: f16)
  # Possible values - f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0
  cache_mode_k: f16

  # V cache quantization type (default: f16)
  # Possible values - f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0
  cache_mode_v: f16
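  # Example (illustrative): quantizing both caches to q8_0 roughly halves KV
  # cache VRAM compared to f16, usually with minimal quality loss:
  # cache_mode_k: q8_0
  # cache_mode_v: q8_0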

  # Offload the KV cache to the GPU if available (default: true)
  # By default, llama.cpp decides the KV device per-layer, matching each layer's CPU or GPU placement
  # Set to false to force all KV layers onto the CPU
  kv_offload: true

  # Override tensors to different devices (default: [])
  # Takes in regex strings. It is recommended to also set num_threads
  # Add a new array element for each additional override
  override_tensor: []
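  # Example (illustrative; assumes this option follows llama.cpp's
  # "<tensor regex>=<device>" override syntax): keep MoE expert weights on the CPU:
  # override_tensor: ['\.ffn_(up|down|gate)_exps\.=CPU']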

  # Keep the MoE weights of n layers on the CPU (default: 0)
  # Provide "all" to keep the MoE weights of all layers on the CPU
  n_cpu_moe: 0
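  # Example (illustrative): a common MoE recipe is to offload every layer while
  # keeping all expert weights on the CPU:
  # num_gpu_layers: 999
  # n_cpu_moe: all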

  # Lazily load the model into virtual memory via mmap. This is fast and efficient (default: true)
  # Turning mmap off makes loading take longer, but reduces the risk of pageouts
  # WARNING: Do not adjust this parameter unless you know what you're doing!
  mmap: true

# Options for sampling
sampling:
  # Select a sampler override preset (default: None).
  # Find this in the sampler_overrides folder.
  # This overrides default fallbacks for sampler values that are passed to the API.
  override_preset: safe_defaults

# Options for development and experimentation
developer:
  # Set the process to use a higher priority (default: false).
  # For realtime process priority, run as administrator or sudo.
  # Otherwise, the priority will be set to high.
  realtime_process_priority: false