Files
crawl4ai/deploy/docker/webhook.py
unclecode 7976b45817 fix(security): patch 4 vulns - file write, SSRF, monitor auth, XSS
Fixes for 4 vulnerabilities reported by Jeongbean Jeon (2026-04-13):

1. Arbitrary File Write (CVSS 9.1): /screenshot and /pdf output_path
   now validated via validate_output_path() restricting writes to
   CRAWL4AI_OUTPUT_DIR. Pydantic validator rejects '..' at schema level.

2. SSRF via Webhook (CVSS 8.6): validate_webhook_url() blocks private
   IPs (RFC 1918, loopback, link-local, cloud metadata), dangerous
   hostnames (localhost, metadata.google.internal, host.docker.internal).
   Validated at job submission + send time. follow_redirects=False set.

3. Monitor Auth Bypass (CVSS 6.5): monitor_router now mounted with
   dependencies=[Depends(token_dep)]. WebSocket /ws endpoint checks
   CRAWL4AI_API_TOKEN from query params.

4. Stored XSS (CVSS 6.1): Server-side html.escape() on URLs and errors
   in monitor.py. Client-side escapeHtml() wrapping all innerHTML
   template injections in index.html (active/completed/error lists +
   WebSocket updates).

33 adversarial security tests added.

DO NOT PUSH until release day. Merge to develop + tag + advisory together.
2026-04-13 11:29:54 +00:00

164 lines
5.9 KiB
Python

"""
Webhook delivery service for Crawl4AI.
This module provides webhook notification functionality with exponential backoff retry logic.
"""
import asyncio
import httpx
import logging
from typing import Dict, Optional
from datetime import datetime, timezone
logger = logging.getLogger(__name__)
class WebhookDeliveryService:
"""Handles webhook delivery with exponential backoff retry logic."""
def __init__(self, config: Dict):
"""
Initialize the webhook delivery service.
Args:
config: Application configuration dictionary containing webhook settings
"""
self.config = config.get("webhooks", {})
self.max_attempts = self.config.get("retry", {}).get("max_attempts", 5)
self.initial_delay = self.config.get("retry", {}).get("initial_delay_ms", 1000) / 1000
self.max_delay = self.config.get("retry", {}).get("max_delay_ms", 32000) / 1000
self.timeout = self.config.get("retry", {}).get("timeout_ms", 30000) / 1000
async def send_webhook(
self,
webhook_url: str,
payload: Dict,
headers: Optional[Dict[str, str]] = None
) -> bool:
"""
Send webhook with exponential backoff retry logic.
Args:
webhook_url: The URL to send the webhook to
payload: The JSON payload to send
headers: Optional custom headers
Returns:
bool: True if delivered successfully, False otherwise
"""
# SECURITY: Validate webhook URL at send time (defense-in-depth against SSRF)
from utils import validate_webhook_url
validate_webhook_url(webhook_url)
default_headers = self.config.get("headers", {})
merged_headers = {**default_headers, **(headers or {})}
merged_headers["Content-Type"] = "application/json"
async with httpx.AsyncClient(timeout=self.timeout, follow_redirects=False) as client:
for attempt in range(self.max_attempts):
try:
logger.info(
f"Sending webhook (attempt {attempt + 1}/{self.max_attempts}) to {webhook_url}"
)
response = await client.post(
webhook_url,
json=payload,
headers=merged_headers
)
# Success or client error (don't retry client errors)
if response.status_code < 500:
if 200 <= response.status_code < 300:
logger.info(f"Webhook delivered successfully to {webhook_url}")
return True
else:
logger.warning(
f"Webhook rejected with status {response.status_code}: {response.text[:200]}"
)
return False # Client error - don't retry
# Server error - retry with backoff
logger.warning(
f"Webhook failed with status {response.status_code}, will retry"
)
except httpx.TimeoutException as exc:
logger.error(f"Webhook timeout (attempt {attempt + 1}): {exc}")
except httpx.RequestError as exc:
logger.error(f"Webhook request error (attempt {attempt + 1}): {exc}")
except Exception as exc:
logger.error(f"Webhook delivery error (attempt {attempt + 1}): {exc}")
# Calculate exponential backoff delay
if attempt < self.max_attempts - 1:
delay = min(self.initial_delay * (2 ** attempt), self.max_delay)
logger.info(f"Retrying in {delay}s...")
await asyncio.sleep(delay)
logger.error(
f"Webhook delivery failed after {self.max_attempts} attempts to {webhook_url}"
)
return False
async def notify_job_completion(
self,
task_id: str,
task_type: str,
status: str,
urls: list,
webhook_config: Optional[Dict],
result: Optional[Dict] = None,
error: Optional[str] = None
):
"""
Notify webhook of job completion.
Args:
task_id: The task identifier
task_type: Type of task (e.g., "crawl", "llm_extraction")
status: Task status ("completed" or "failed")
urls: List of URLs that were crawled
webhook_config: Webhook configuration from the job request
result: Optional crawl result data
error: Optional error message if failed
"""
# Determine webhook URL
webhook_url = None
data_in_payload = self.config.get("data_in_payload", False)
custom_headers = None
if webhook_config:
webhook_url = webhook_config.get("webhook_url")
data_in_payload = webhook_config.get("webhook_data_in_payload", data_in_payload)
custom_headers = webhook_config.get("webhook_headers")
if not webhook_url:
webhook_url = self.config.get("default_url")
if not webhook_url:
logger.debug("No webhook URL configured, skipping notification")
return
# Check if webhooks are enabled
if not self.config.get("enabled", True):
logger.debug("Webhooks are disabled, skipping notification")
return
# Build payload
payload = {
"task_id": task_id,
"task_type": task_type,
"status": status,
"timestamp": datetime.now(timezone.utc).isoformat(),
"urls": urls
}
if error:
payload["error"] = error
if data_in_payload and result:
payload["data"] = result
# Send webhook (fire and forget - don't block on completion)
await self.send_webhook(webhook_url, payload, custom_headers)