mirror of
https://github.com/unclecode/crawl4ai.git
synced 2026-06-10 07:48:50 +00:00
Backward-compatible fixes for the Docker server: features keep working; only the unsafe behavior is closed. (The secure-by-default redesign is deferred to the later major release.)
- SSRF: replace the explicit blocklist with a single rule (reject any resolved IP where `not ip.is_global`), evaluated on embedded IPv4 transition forms too. This closes the gaps for IPv6 unspecified `::`, NAT64 `64:ff9b::/96`, 6to4 `2002::/16`, and v4-mapped addresses. Error messages are now opaque (no resolved-IP leak).
- output_path arbitrary write: harden validate_output_path with realpath containment (defeats a symlinked path component) and write via O_NOFOLLOW (write_output_file). output_path stays supported.
- LLM base_url key exfil: ignore a request-supplied base_url in /md, /llm, /llm/job; the endpoint is always server-derived. The field is still accepted (no 4xx) for compatibility.
- env:SECRET_KEY exfil gadget: LLMConfig refuses env: resolution of protected names (SECRET/PASSWORD/PRIVATE substrings, CRAWL4AI*/AWS_SECRET* prefixes, SECRET_KEY/REDIS_PASSWORD/TOKEN). Normal provider keys (OPENAI_API_KEY, ...) are unaffected.
- CRLF log injection: CRLFSafeFilter strips CR/LF/control characters from log records.
- Webhook header injection: sanitize_webhook_headers (name pattern, no control chars, deny hop-by-hop/sensitive) at send time, plus a WebhookConfig validator for an early 422.
Bump 0.8.7 -> 0.8.8 (__version__ + Dockerfile C4AI_VER). 30 new behavioral tests; the existing 111 security tests and 112 library config tests still pass.
NOT included (breaking, so deferred to the major): auth-by-default, trust boundary, declarative hooks, output_path removal, base_url/provider removal, loopback bind, redis password, TLS-verify-on, CORS, bounded queue. The exec-hook RCE and unauth-by-default criticals have no non-breaking fix and are closed only in the major (hooks are already off by default).
143 lines
4.7 KiB
Python
143 lines
4.7 KiB
Python
from typing import List, Optional, Dict
|
||
from enum import Enum
|
||
from pydantic import BaseModel, Field, HttpUrl, field_validator
|
||
from utils import FilterType
|
||
|
||
|
||
class CrawlRequest(BaseModel):
    # Request body describing one crawl over a batch of URLs.

    # Between 1 and 100 target URLs per request.
    urls: List[str] = Field(min_length=1, max_length=100)

    # Free-form config dicts; each defaults to a fresh empty dict.
    browser_config: Optional[Dict] = Field(default_factory=dict)
    crawler_config: Optional[Dict] = Field(default_factory=dict)

    # Optional per-URL configs; see the field description for precedence.
    crawler_configs: Optional[List[Dict]] = Field(
        default=None,
        description=(
            "List of per-URL CrawlerRunConfig dicts for arun_many(). "
            "When provided, each config can include a 'url_matcher' pattern "
            "to match against specific URLs. Takes precedence over crawler_config."
        ),
    )
|
||
|
||
|
||
class HookConfig(BaseModel):
    """Configuration for user-provided hooks"""

    # Hook-point name -> Python source text for that hook.
    code: Dict[str, str] = Field(
        default_factory=dict,
        description="Map of hook points to Python code strings"
    )
    # Per-hook execution budget, constrained to 1..120 seconds.
    timeout: int = Field(
        default=30,
        ge=1,
        le=120,
        description="Timeout in seconds for each hook execution"
    )

    # Pydantic v2 configuration. The v1-style ``class Config: schema_extra``
    # is not honoured by v2 (the key was renamed to ``json_schema_extra``),
    # so the example would be missing from the generated JSON schema.
    # A plain dict is used so no extra import is needed.
    model_config = {
        "json_schema_extra": {
            "example": {
                "code": {
                    "on_page_context_created": """
async def hook(page, context, **kwargs):
    # Block images to speed up crawling
    await context.route("**/*.{png,jpg,jpeg,gif}", lambda route: route.abort())
    return page
""",
                    "before_retrieve_html": """
async def hook(page, context, **kwargs):
    # Scroll to load lazy content
    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
    await page.wait_for_timeout(2000)
    return page
"""
                },
                "timeout": 30
            }
        }
    }
|
||
|
||
|
||
class CrawlRequestWithHooks(CrawlRequest):
    """Extended crawl request with hooks support"""

    # Absent (None) unless the caller supplies a hook configuration.
    hooks: Optional[HookConfig] = Field(
        description="Optional user-provided hook functions",
        default=None,
    )
|
||
|
||
class MarkdownRequest(BaseModel):
    """Request body for the /md endpoint."""

    # Target page to convert.
    url: str = Field(..., description="Absolute http/https URL to fetch")

    # Short field names (f, q, c) are part of the wire format; do not rename.
    f: FilterType = Field(FilterType.FIT, description="Content‑filter strategy: fit, raw, bm25, or llm")
    q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters")
    c: Optional[str] = Field("0", description="Cache‑bust / revision counter")

    # Optional LLM overrides.
    provider: Optional[str] = Field(None, description="LLM provider override (e.g., 'anthropic/claude-3-opus')")
    # NOTE(review): the description documents 0.0-2.0 but no ge/le bound is
    # enforced here; left unconstrained to stay backward-compatible.
    temperature: Optional[float] = Field(None, description="LLM temperature override (0.0-2.0)")

    # Still accepted so existing clients do not get a 4xx, but the description
    # no longer advertises it as an override: per the 0.8.8 changelog a
    # request-supplied base_url is ignored on /md and the endpoint is always
    # server-derived.
    base_url: Optional[str] = Field(
        None,
        description=(
            "Accepted for backward compatibility but ignored; "
            "the LLM API base URL is always server-derived"
        ),
    )
|
||
|
||
|
||
class RawCode(BaseModel):
    # Single-field model carrying a code string.
    code: str
|
||
|
||
class HTMLRequest(BaseModel):
    # Single-field model carrying the target URL.
    url: str
|
||
|
||
class ScreenshotRequest(BaseModel):
    # Request model for taking a screenshot of a URL.
    url: str
    screenshot_wait_for: Optional[float] = 2
    wait_for_images: Optional[bool] = False
    # Optional destination path; checked by reject_traversal below.
    output_path: Optional[str] = None

    @field_validator("output_path")
    @classmethod
    def reject_traversal(cls, v):
        # Reject any path with a ".." segment; "\" and "/" both count as
        # separators. None passes through unchanged.
        if v is not None:
            segments = v.replace("\\", "/").split("/")
            if ".." in segments:
                raise ValueError("output_path must not contain path traversal sequences")
        return v
|
||
|
||
class PDFRequest(BaseModel):
    # Request model for rendering a URL to PDF.
    url: str
    # Optional destination path; checked by reject_traversal below.
    output_path: Optional[str] = None

    @field_validator("output_path")
    @classmethod
    def reject_traversal(cls, v):
        # A missing path is allowed as-is.
        if v is None:
            return v
        # Treat "\" like "/" so both separator styles are split the same way,
        # then refuse any ".." path segment.
        normalized = v.replace("\\", "/")
        has_parent_ref = ".." in normalized.split("/")
        if has_parent_ref:
            raise ValueError("output_path must not contain path traversal sequences")
        return v
|
||
|
||
|
||
class JSEndpointRequest(BaseModel):
    # Request model pairing a URL with JavaScript snippets to run.
    url: str
    # Required (no default).
    scripts: List[str] = Field(
        ...,
        description="List of separated JavaScript snippets to execute",
    )
|
||
|
||
|
||
class WebhookConfig(BaseModel):
    """Configuration for webhook notifications."""

    webhook_url: HttpUrl
    webhook_data_in_payload: bool = False
    webhook_headers: Optional[Dict[str, str]] = None

    @field_validator("webhook_headers")
    @classmethod
    def _validate_headers(cls, v):
        # Run caller-supplied headers through webhook.sanitize_webhook_headers
        # at validation time so unsafe outbound headers are caught early (422).
        # The helper is imported lazily, inside the function, to avoid an
        # import cycle. None or an empty mapping is returned untouched.
        # NOTE(review): whether the helper raises or filters is defined in
        # webhook.py, not here - confirm against that module.
        if v:
            from webhook import sanitize_webhook_headers

            return sanitize_webhook_headers(v)
        return v
|
||
|
||
|
||
class WebhookPayload(BaseModel):
    """Payload sent to webhook endpoints."""

    task_id: str
    # "crawl", "llm_extraction", etc.
    task_type: str
    # "completed" or "failed"
    status: str
    # ISO 8601 format
    timestamp: str
    urls: List[str]
    error: Optional[str] = None
    # Included only if webhook_data_in_payload=True
    data: Optional[Dict] = None