Files
crawl4ai/deploy/docker/config.yml
unclecode 7c0cc3ed88 fix: batch merge of community PRs (#1622, #1786, #1796, #1795, #1798, #1734, #1290, #1668)
Bug fixes:
- Verify redirect targets are alive before returning from URL seeder (#1622)
- Wire mean_delay/max_range from CrawlerRunConfig into dispatcher rate limiter (#1786)
- Use DOMParser instead of innerHTML in process_iframes to prevent XSS (#1796)

Security/Docker:
- Require api_token for /token endpoint when configured (#1795)
- Deep-crawl streaming now mirrors Python library behavior via arun() (#1798)

CI:
- Bump GitHub Actions to latest versions - checkout v6, setup-python v6,
  build-push-action v6, setup-buildx v4, login v4 (#1734)

Features:
- Support type-list pipeline in JsonCssExtractionStrategy for chained
  extraction like ["attribute", "regex"] (#1290)
- Add --json-ensure-ascii CLI flag and JSON_ENSURE_ASCII config setting
  for Unicode preservation in JSON output (#1668)
2026-03-07 08:45:11 +00:00

109 lines
3.1 KiB
YAML
Raw Permalink Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# Application Configuration
app:
title: "Crawl4AI API"
version: "1.0.0"
host: "0.0.0.0"
port: 11235
reload: False
workers: 1
timeout_keep_alive: 300
# Default LLM Configuration
llm:
provider: "openai/gpt-4o-mini"
# api_key: sk-... # If you pass the API key directly (not recommended)
# Redis Configuration
# Set task_ttl_seconds to automatically expire task data in Redis.
# This prevents unbounded memory growth from accumulated task results.
# Default: 3600 (1 hour). Set to 0 to disable TTL (not recommended).
# Can be overridden with REDIS_TASK_TTL environment variable.
redis:
host: "localhost"
port: 6379
db: 0
password: ""
task_ttl_seconds: 3600 # TTL for task data (1 hour default)
ssl: False
ssl_cert_reqs: None
ssl_ca_certs: None
ssl_certfile: None
ssl_keyfile: None
# Rate Limiting Configuration
rate_limiting:
enabled: True
default_limit: "1000/minute"
trusted_proxies: []
storage_uri: "memory://" # Use "redis://localhost:6379" for production
# Security Configuration
# WARNING: For production deployments, enable security and use proper SECRET_KEY:
# - Set jwt_enabled: true for authentication
# - Set SECRET_KEY environment variable to a secure random value
# - Set CRAWL4AI_HOOKS_ENABLED=true only if you need hooks (RCE risk)
security:
enabled: false
jwt_enabled: false
api_token: "" # When set, /token endpoint requires this secret to issue JWTs
https_redirect: false
trusted_hosts: ["*"]
headers:
x_content_type_options: "nosniff"
x_frame_options: "DENY"
content_security_policy: "default-src 'self'"
strict_transport_security: "max-age=63072000; includeSubDomains"
# Crawler Configuration
crawler:
base_config:
simulate_user: true
memory_threshold_percent: 95.0
rate_limiter:
enabled: true
base_delay: [1.0, 2.0]
timeouts:
stream_init: 30.0 # Timeout for stream initialization
batch_process: 300.0 # Timeout for batch processing
pool:
max_pages: 40 # ← GLOBAL_SEM permits
idle_ttl_sec: 300 # ← 30 min janitor cutoff
browser:
kwargs:
headless: true
text_mode: true
extra_args:
# - "--single-process"
- "--no-sandbox"
- "--disable-dev-shm-usage"
- "--disable-gpu"
- "--disable-software-rasterizer"
- "--disable-web-security"
- "--allow-insecure-localhost"
- "--ignore-certificate-errors"
# Logging Configuration
logging:
level: "INFO"
format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
# Observability Configuration
observability:
prometheus:
enabled: True
endpoint: "/metrics"
health_check:
endpoint: "/health"
# Webhook Configuration
webhooks:
enabled: true
default_url: null # Optional: default webhook URL for all jobs
data_in_payload: false # Optional: default behavior for including data
retry:
max_attempts: 5
initial_delay_ms: 1000 # 1s, 2s, 4s, 8s, 16s exponential backoff
max_delay_ms: 32000
timeout_ms: 30000 # 30s timeout per webhook call
headers: # Optional: default headers to include
User-Agent: "Crawl4AI-Webhook/1.0"