mirror of
https://github.com/unclecode/crawl4ai.git
synced 2026-06-10 15:58:15 +00:00
Fixes for 4 vulnerabilities reported by by111/August829 (2026-04-14):
1. Hardcoded JWT secret (CVSS 9.8): Removed "mysecret" default from
auth.py. Added weak secret validation (blocklist + min 32 chars).
Auto-generates ephemeral key when none set.
2. eval() in /config/dump (CVSS 9.1): Replaced eval-based config
parsing with JSON input {type, params} validated by Pydantic.
Added authentication. Deleted _safe_eval_config and all AST
allowlist code.
3. /execute_js endpoint (CVSS 8.1): Disabled by default via
CRAWL4AI_EXECUTE_JS_ENABLED env var. Added SSRF blocklist on
destination URL. Removed --disable-web-security from default
browser args.
4. Hook sandbox escape (CVSS 9.8): Strip __builtins__, __loader__,
__spec__ from injected module proxies. Removed type, hasattr,
__build_class__ from allowed builtins.
Also added SECURITY-CREDITS.md tracking all reporters.
30 adversarial tests added.
DO NOT PUSH until release day.
109 lines
3.1 KiB
YAML
109 lines
3.1 KiB
YAML
# Application Configuration
app:
  title: "Crawl4AI API"
  version: "1.0.0"  # quoted so it stays a string
  host: "0.0.0.0"
  port: 11235
  reload: false
  workers: 1
  timeout_keep_alive: 300

# Default LLM Configuration
llm:
  provider: "openai/gpt-4o-mini"
  # api_key: sk-...  # If you pass the API key directly (not recommended)

# Redis Configuration
# Set task_ttl_seconds to automatically expire task data in Redis.
# This prevents unbounded memory growth from accumulated task results.
# Default: 3600 (1 hour). Set to 0 to disable TTL (not recommended).
# Can be overridden with the REDIS_TASK_TTL environment variable.
redis:
  host: "localhost"
  port: 6379
  db: 0
  password: ""
  task_ttl_seconds: 3600  # TTL for task data (1 hour default)
  ssl: false
  # Unset TLS options are written as YAML null. A bare `None` is not a YAML
  # null; it is loaded as the string "None".
  # NOTE(review): confirm no consumer compares these values against "None".
  ssl_cert_reqs: null
  ssl_ca_certs: null
  ssl_certfile: null
  ssl_keyfile: null

# Rate Limiting Configuration
rate_limiting:
  enabled: true
  default_limit: "1000/minute"
  trusted_proxies: []
  storage_uri: "memory://"  # Use "redis://localhost:6379" for production

# Security Configuration
# WARNING: For production deployments, enable security and use a proper SECRET_KEY:
# - Set jwt_enabled: true for authentication
# - Set the SECRET_KEY environment variable to a secure random value
# - Set CRAWL4AI_HOOKS_ENABLED=true only if you need hooks (RCE risk)
security:
  enabled: false
  jwt_enabled: false
  api_token: ""  # When set, /token endpoint requires this secret to issue JWTs
  https_redirect: false
  trusted_hosts: ["*"]
  headers:
    x_content_type_options: "nosniff"
    x_frame_options: "DENY"
    content_security_policy: "default-src 'self'"
    strict_transport_security: "max-age=63072000; includeSubDomains"

# Crawler Configuration
# NOTE(review): nesting was reconstructed from a flattened listing; confirm
# that memory_threshold_percent sits directly under crawler and that
# extra_args is a sibling of browser.kwargs.
crawler:
  base_config:
    simulate_user: true
  memory_threshold_percent: 95.0
  rate_limiter:
    enabled: true
    base_delay: [1.0, 2.0]
  timeouts:
    stream_init: 30.0  # Timeout for stream initialization
    batch_process: 300.0  # Timeout for batch processing
  pool:
    max_pages: 40  # ← GLOBAL_SEM permits
    # 300 s is 5 minutes. The previous comment said "30 min", which would be
    # 1800. NOTE(review): confirm which of the two was intended.
    idle_ttl_sec: 300  # ← 5 min janitor cutoff
  browser:
    kwargs:
      headless: true
      text_mode: true
    extra_args:
      # - "--single-process"
      - "--no-sandbox"
      - "--disable-dev-shm-usage"
      - "--disable-gpu"
      - "--disable-software-rasterizer"
      # --disable-web-security removed for security (enables cross-origin access)
      - "--allow-insecure-localhost"
      - "--ignore-certificate-errors"

# Logging Configuration
logging:
  level: "INFO"
  # printf-style template with named fields: time, logger name, level, message.
  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"

# Observability Configuration
observability:
  prometheus:
    enabled: true
    endpoint: "/metrics"
  health_check:
    endpoint: "/health"

# Webhook Configuration
webhooks:
  enabled: true
  default_url: null  # Optional: default webhook URL for all jobs
  data_in_payload: false  # Optional: default behavior for including data
  # NOTE(review): nesting was reconstructed from a flattened listing; confirm
  # that timeout_ms belongs under retry rather than directly under webhooks.
  retry:
    max_attempts: 5
    initial_delay_ms: 1000  # 1s, 2s, 4s, 8s, 16s exponential backoff
    max_delay_ms: 32000
    timeout_ms: 30000  # 30s timeout per webhook call
  headers:  # Optional: default headers to include
    User-Agent: "Crawl4AI-Webhook/1.0"