Files
crawl4ai/deploy/docker/job.py
unclecode 7976b45817 fix(security): patch 4 vulns - file write, SSRF, monitor auth, XSS
Fixes for 4 vulnerabilities reported by Jeongbean Jeon (2026-04-13):

1. Arbitrary File Write (CVSS 9.1): /screenshot and /pdf output_path
   now validated via validate_output_path() restricting writes to
   CRAWL4AI_OUTPUT_DIR. Pydantic validator rejects '..' at schema level.

2. SSRF via Webhook (CVSS 8.6): validate_webhook_url() blocks private
   IPs (RFC 1918, loopback, link-local, cloud metadata), dangerous
   hostnames (localhost, metadata.google.internal, host.docker.internal).
   Validated at job submission + send time. follow_redirects=False set.

3. Monitor Auth Bypass (CVSS 6.5): monitor_router now mounted with
   dependencies=[Depends(token_dep)]. WebSocket /ws endpoint checks
   CRAWL4AI_API_TOKEN from query params.

4. Stored XSS (CVSS 6.1): Server-side html.escape() on URLs and errors
   in monitor.py. Client-side escapeHtml() wrapping all innerHTML
   template injections in index.html (active/completed/error lists +
   WebSocket updates).

33 adversarial security tests added.

DO NOT PUSH until release day. Merge to develop + tag + advisory together.
2026-04-13 11:29:54 +00:00

129 lines
3.9 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Job endpoints (enqueue + poll) for long-running LLM extraction and raw crawl jobs.
Relies on the existing Redis task helpers in api.py.
"""
from typing import Dict, Optional, Callable
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Request
from pydantic import BaseModel, HttpUrl
from api import (
handle_llm_request,
handle_crawl_job,
handle_task_status,
)
from schemas import WebhookConfig
# ------------- dependency placeholders -------------
_redis = None # placeholder; replaced by init_job_router() (called from server.py); presumably a Redis client
_config = None # placeholder; replaced by init_job_router() with the shared config object
_token_dep: Callable = lambda: None # dummy auth dependency until the real one is injected
# public router: every endpoint in this module is registered on it
router = APIRouter()
# === init hook called by server.py =========================================
def init_job_router(redis, config, token_dep) -> APIRouter:
    """Store the shared singletons in module globals and hand back the router.

    The three arguments overwrite the module-level placeholders ``_redis``,
    ``_config`` and ``_token_dep`` that the endpoint handlers read at request
    time. The returned object is the module-level ``router``, ready to mount.
    """
    global _redis, _config, _token_dep
    _redis = redis
    _config = config
    _token_dep = token_dep
    return router
# ---------- payload models --------------------------------------------------
class LlmJobPayload(BaseModel):
    # Request body for POST /llm/job.
    # Documented with comments rather than a docstring so the generated
    # JSON schema for this model is left exactly as it was.

    # Target page; forwarded to handle_llm_request as a plain string.
    url: HttpUrl
    # Forwarded as the ``query`` argument.
    q: str
    # Forwarded as the ``schema`` argument.
    schema: Optional[str] = None
    # Forwarded as the ``cache`` argument.
    cache: bool = False
    # Forwarded as the ``provider`` argument.
    provider: Optional[str] = None
    # Optional webhook settings; its URL is validated before the job is queued.
    webhook_config: Optional[WebhookConfig] = None
    # Forwarded as the ``temperature`` argument.
    temperature: Optional[float] = None
    # Forwarded as the ``api_base_url`` argument.
    base_url: Optional[str] = None
class CrawlJobPayload(BaseModel):
    # Request body for POST /crawl/job.
    # Documented with comments rather than a docstring so the generated
    # JSON schema for this model is left exactly as it was.

    # Pages to crawl; each one is converted to a string before being queued.
    urls: list[HttpUrl]
    # Passed through unchanged to handle_crawl_job.
    browser_config: Dict = {}
    # Passed through unchanged to handle_crawl_job.
    crawler_config: Dict = {}
    # Optional webhook settings; its URL is validated before the job is queued.
    webhook_config: Optional[WebhookConfig] = None
# ---------- LLM job ---------------------------------------------------------
@router.post("/llm/job", status_code=202)
async def llm_job_enqueue(
    payload: LlmJobPayload,
    background_tasks: BackgroundTasks,
    request: Request,
    _td: Dict = Depends(lambda: _token_dep()),  # late-bound dep
):
    # Enqueue an LLM extraction job and return whatever handle_llm_request
    # returns (the route itself answers with HTTP 202).
    #
    # Responds with HTTP 400 when a webhook is supplied and
    # validate_webhook_url() rejects its URL with ValueError.
    #
    # Kept as comments rather than a docstring: FastAPI publishes endpoint
    # docstrings as the OpenAPI description, which would change the API docs.
    #
    # NOTE(review): the lambda above calls the injected dependency with no
    # arguments, so FastAPI does not resolve any parameters that callable may
    # declare itself - confirm the injected token_dep works when called bare.
    webhook_config = None
    if payload.webhook_config:
        # Function-scope import kept from the original code.
        from utils import validate_webhook_url

        try:
            validate_webhook_url(str(payload.webhook_config.webhook_url))
        except ValueError as exc:
            # Chain explicitly so the validator error is recorded as the
            # direct cause of the 400 instead of as incidental context.
            raise HTTPException(status_code=400, detail=str(exc)) from exc
        # Only a validated config is serialised and handed to the job.
        webhook_config = payload.webhook_config.model_dump(mode='json')

    return await handle_llm_request(
        _redis,
        background_tasks,
        request,
        str(payload.url),
        query=payload.q,
        schema=payload.schema,
        cache=payload.cache,
        config=_config,
        provider=payload.provider,
        webhook_config=webhook_config,
        temperature=payload.temperature,
        api_base_url=payload.base_url,
    )
@router.get("/llm/job/{task_id}")
async def llm_job_status(
    request: Request,
    task_id: str,
    _td: Dict = Depends(lambda: _token_dep()),
):
    # Poll endpoint: delegates the lookup of ``task_id`` to handle_task_status,
    # passing the request's base URL along as a string.
    base_url = str(request.base_url)
    return await handle_task_status(_redis, task_id, base_url=base_url)
# ---------- CRAWL job -------------------------------------------------------
@router.post("/crawl/job", status_code=202)
async def crawl_job_enqueue(
    payload: CrawlJobPayload,
    background_tasks: BackgroundTasks,
    _td: Dict = Depends(lambda: _token_dep()),
):
    # Enqueue a crawl job and return whatever handle_crawl_job returns
    # (the route itself answers with HTTP 202).
    #
    # Responds with HTTP 400 when a webhook is supplied and
    # validate_webhook_url() rejects its URL with ValueError.
    #
    # Kept as comments rather than a docstring: FastAPI publishes endpoint
    # docstrings as the OpenAPI description, which would change the API docs.
    #
    # NOTE(review): the lambda above calls the injected dependency with no
    # arguments, so FastAPI does not resolve any parameters that callable may
    # declare itself - confirm the injected token_dep works when called bare.
    webhook_config = None
    if payload.webhook_config:
        # Function-scope import kept from the original code.
        from utils import validate_webhook_url

        try:
            validate_webhook_url(str(payload.webhook_config.webhook_url))
        except ValueError as exc:
            # Chain explicitly so the validator error is recorded as the
            # direct cause of the 400 instead of as incidental context.
            raise HTTPException(status_code=400, detail=str(exc)) from exc
        # Only a validated config is serialised and handed to the job.
        webhook_config = payload.webhook_config.model_dump(mode='json')

    return await handle_crawl_job(
        _redis,
        background_tasks,
        [str(u) for u in payload.urls],
        payload.browser_config,
        payload.crawler_config,
        config=_config,
        webhook_config=webhook_config,
    )
@router.get("/crawl/job/{task_id}")
async def crawl_job_status(
    request: Request,
    task_id: str,
    _td: Dict = Depends(lambda: _token_dep()),
):
    # Poll endpoint: delegates the lookup of ``task_id`` to handle_task_status,
    # passing the request's base URL along as a string.
    base_url = str(request.base_url)
    return await handle_task_status(_redis, task_id, base_url=base_url)