# crawl4ai/deploy/docker/config.yml

# Application Configuration
# Title/version metadata plus host, port and server options.
app:
  title: 'Crawl4AI API'
  # Quoted so the value stays a string.
  version: '1.0.0'
  host: '0.0.0.0'
  port: 11235
  reload: false
  workers: 1
  # NOTE(review): presumably seconds; confirm against the server startup code.
  timeout_keep_alive: 300

# Default LLM Configuration
llm:
  # Default provider/model string.
  provider: 'openai/gpt-4o-mini'
  # The API key can be passed directly here, but that is not recommended:
  # api_key: sk-...

# Redis Configuration
# task_ttl_seconds makes task data in Redis expire automatically, which
# prevents unbounded memory growth from accumulated task results.
# Default: 3600 (1 hour). Set to 0 to disable TTL (not recommended).
# Can be overridden with the REDIS_TASK_TTL environment variable.
redis:
  host: "localhost"
  port: 6379
  db: 0
  password: ""
  task_ttl_seconds: 3600  # TTL for task data (1 hour default)
  ssl: false
  # The ssl_* options below are intentionally unset. They must be written as
  # `null`: a bare `None` is not a YAML null and loads as the string "None".
  ssl_cert_reqs: null
  ssl_ca_certs: null
  ssl_certfile: null
  ssl_keyfile: null

# Rate Limiting Configuration
rate_limiting:
  enabled: true
  default_limit: '1000/minute'
  trusted_proxies: []
  # Use "redis://localhost:6379" for production.
  storage_uri: 'memory://'

# Security Configuration
# WARNING: Security is disabled by the defaults below. For production
# deployments, enable it and use a proper SECRET_KEY:
#   - Set jwt_enabled: true for authentication
#   - Set the SECRET_KEY environment variable to a secure random value
#   - Set CRAWL4AI_HOOKS_ENABLED=true only if you need hooks (RCE risk)
security:
  enabled: false
  jwt_enabled: false
  # When set, the /token endpoint requires this secret to issue JWTs.
  api_token: ''
  https_redirect: false
  trusted_hosts:
    - '*'
  headers:
    x_content_type_options: 'nosniff'
    x_frame_options: 'DENY'
    content_security_policy: "default-src 'self'"
    strict_transport_security: 'max-age=63072000; includeSubDomains'

# Crawler Configuration
crawler:
  base_config:
    simulate_user: true
  memory_threshold_percent: 95.0
  rate_limiter:
    enabled: true
    base_delay:
      - 1.0
      - 2.0
  timeouts:
    # Timeout for stream initialization
    stream_init: 30.0
    # Timeout for batch processing
    batch_process: 300.0
  pool:
    # GLOBAL_SEM permits
    max_pages: 40
    # Janitor cutoff: 300 s = 5 min.
    # NOTE(review): an earlier comment here said "30 min" (1800 s);
    # confirm which value is intended.
    idle_ttl_sec: 300
  browser:
    kwargs:
      headless: true
      text_mode: true
    extra_args:
      # - '--single-process'
      - '--no-sandbox'
      - '--disable-dev-shm-usage'
      - '--disable-gpu'
      - '--disable-software-rasterizer'
      # --disable-web-security removed for security (enables cross-origin access)
      - '--allow-insecure-localhost'
      - '--ignore-certificate-errors'

# Logging Configuration
logging:
  level: 'INFO'
  # printf-style format string with %(name)s placeholders.
  format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

# Observability Configuration
observability:
  prometheus:
    enabled: true
    endpoint: '/metrics'
  health_check:
    endpoint: '/health'

# Webhook Configuration
webhooks:
  enabled: true
  # Optional: default webhook URL for all jobs
  default_url: null
  # Optional: default behavior for including data
  data_in_payload: false
  retry:
    max_attempts: 5
    # 1s, 2s, 4s, 8s, 16s exponential backoff
    initial_delay_ms: 1000
    max_delay_ms: 32000
    # 30s timeout per webhook call
    timeout_ms: 30000
  # Optional: default headers to include
  headers:
    User-Agent: 'Crawl4AI-Webhook/1.0'