Files
crawl4ai/deploy/docker/tests/test_security_0_8_8.py
unclecode aa81e8fe7d security: non-breaking hardening patch (0.8.8)
Backward-compatible fixes for the Docker server - features keep working, only
the unsafe behavior is closed. (The secure-by-default redesign is the later
major.)

- SSRF: replace the explicit blocklist with a single rule (reject any resolved
  IP for which ip.is_global is false), evaluated on embedded IPv4 transition
  forms as well. This closes the gaps: IPv6 unspecified ::, NAT64 64:ff9b::/96,
  6to4 2002::/16, and v4-mapped addresses. Error messages are now opaque (no
  resolved-IP leak).
- output_path arbitrary write: harden validate_output_path with realpath
  containment (defeats a symlinked path component) and write via O_NOFOLLOW
  (write_output_file). output_path stays supported.
- LLM base_url key exfil: ignore a request-supplied base_url in /md, /llm,
  /llm/job; the endpoint is always server-derived. Field still accepted (no
  4xx) for compatibility.
- env:SECRET_KEY exfil gadget: LLMConfig refuses env: resolution of protected
  names (SECRET/PASSWORD/PRIVATE substrings, CRAWL4AI*/AWS_SECRET* prefixes,
  SECRET_KEY/REDIS_PASSWORD/TOKEN). Normal provider keys (OPENAI_API_KEY, ...)
  unaffected.
- CRLF log injection: CRLFSafeFilter strips CR/LF/control from log records.
- Webhook header injection: sanitize_webhook_headers (name pattern, no control
  chars, deny hop-by-hop/sensitive) at send time + a WebhookConfig validator
  for early 422.

Bump 0.8.7 -> 0.8.8 (__version__ + Dockerfile C4AI_VER). 30 new behavioral
tests; existing 111 security tests + 112 library config tests still pass.

NOT included (breaking -> deferred to the major): auth-by-default, trust
boundary, declarative hooks, output_path removal, base_url/provider removal,
loopback bind, redis password, TLS-verify-on, CORS, bounded queue. The
exec-hook RCE and unauth-by-default criticals have no non-breaking fix and are
closed only in the major (hooks are already off by default).
2026-06-02 12:39:04 +00:00

159 lines
6.1 KiB
Python

"""
Behavioral tests for the 0.8.8 non-breaking security patch.

Each fix here is backward-compatible: features keep working, only the unsafe
behavior is closed. (The full secure-by-default redesign is the later major.)

Covers:
- SSRF blocklist gaps closed (NAT64 / 6to4 / :: / v4-mapped, not-is_global)
  + opaque error (no resolved IP leak)
- output_path symlink/TOCTOU hardening (realpath containment + O_NOFOLLOW)
  with the feature kept
- request-supplied LLM base_url ignored (key-exfil vector)
- env:SECRET_KEY exfil gadget blocked in LLMConfig (provider keys still work)
- CRLF-safe logging
- webhook header sanitization
"""
import os
import socket
import sys
import pytest
# This file lives in <docker dir>/tests/, so the Docker server directory is the
# parent of this file's directory. Putting it on sys.path lets the tests import
# the server modules (utils, webhook, schemas) by their top-level names.
_tests_dir = os.path.dirname(os.path.abspath(__file__))
DOCKER_DIR = os.path.dirname(_tests_dir)
if sys.path.count(DOCKER_DIR) == 0:
    sys.path.insert(0, DOCKER_DIR)
def _patch_dns(monkeypatch, ip):
    """Make ``socket.getaddrinfo`` resolve every host name to ``ip``.

    The stub mirrors the shape of a real ``getaddrinfo`` result so the code
    under test sees what the resolver would actually hand it:

    * IPv4 literal -> ``AF_INET`` with a ``(address, port)`` sockaddr
    * IPv6 literal -> ``AF_INET6`` with a
      ``(address, port, flowinfo, scope_id)`` sockaddr

    Args:
        monkeypatch: pytest's ``monkeypatch`` fixture (any object exposing
            ``setattr(target, name, value)`` works).
        ip: Textual IPv4 or IPv6 address that every lookup should return.
    """
    # A colon can only appear in an IPv6 textual address, never in dotted-quad.
    is_ipv6 = ":" in ip

    def fake(host, port=None, *a, **k):
        resolved_port = port or 0
        if is_ipv6:
            family = socket.AF_INET6
            sockaddr = (ip, resolved_port, 0, 0)
        else:
            family = socket.AF_INET
            sockaddr = (ip, resolved_port)
        # 6 is the TCP protocol number, matching SOCK_STREAM.
        return [(family, socket.SOCK_STREAM, 6, "", sockaddr)]

    monkeypatch.setattr(socket, "getaddrinfo", fake)
class TestSsrfGapsClosed:
    """Webhook URL validation versus the address the host name resolves to."""

    # Resolved addresses for which validate_webhook_url must raise.
    _INTERNAL_IPS = [
        "169.254.169.254",  # metadata
        "127.0.0.1",
        "10.0.0.5",
        "192.168.1.1",
        "100.64.0.1",
        "::1",  # v6 loopback
        "::",  # v6 unspecified (was a gap)
        "::ffff:169.254.169.254",  # v4-mapped metadata
        "64:ff9b::a9fe:a9fe",  # NAT64 -> 169.254.169.254 (was a gap)
        "2002:a9fe:a9fe::1",  # 6to4 embedding 169.254.169.254 (was a gap)
    ]

    # Resolved addresses for which validate_webhook_url must not raise.
    _PUBLIC_IPS = ["8.8.8.8", "1.1.1.1"]

    @pytest.mark.parametrize("ip", _INTERNAL_IPS)
    def test_internal_blocked(self, monkeypatch, ip):
        """A host resolving to an internal address is rejected with ValueError."""
        from utils import validate_webhook_url

        _patch_dns(monkeypatch, ip)
        with pytest.raises(ValueError):
            validate_webhook_url("http://target.example/cb")

    @pytest.mark.parametrize("ip", _PUBLIC_IPS)
    def test_public_allowed(self, monkeypatch, ip):
        """A host resolving to a public address passes validation."""
        from utils import validate_webhook_url

        _patch_dns(monkeypatch, ip)
        # Passing means simply returning without an exception.
        validate_webhook_url("http://target.example/cb")

    def test_error_is_opaque(self, monkeypatch):
        """The rejection message must not reveal the resolved address."""
        from utils import validate_webhook_url

        _patch_dns(monkeypatch, "169.254.169.254")
        with pytest.raises(ValueError) as excinfo:
            validate_webhook_url("http://target.example/")
        # Checked after the with-block: inside it the line would be unreachable.
        message = str(excinfo.value)
        assert "169.254" not in message  # no resolved-IP leak
class TestOutputPathHardening:
    """output_path stays usable but must not escape the allowed directory."""

    def test_symlink_escape_rejected(self, monkeypatch, tmp_path):
        """A path that traverses a symlink leaving the allowed dir is refused."""
        import utils
        from fastapi import HTTPException

        allowed = tmp_path / "outputs"
        allowed.mkdir()
        monkeypatch.setattr(utils, "ALLOWED_OUTPUT_DIR", str(allowed))

        # Plant a symlinked subdir that points outside the allowed dir.
        outside = tmp_path / "outside"
        outside.mkdir()
        (allowed / "evil").symlink_to(outside)

        with pytest.raises(HTTPException):
            utils.validate_output_path("evil/pwned.png")  # realpath escapes

    def test_normal_path_ok(self, monkeypatch, tmp_path):
        """An ordinary relative path validates to a location inside the dir."""
        import utils

        allowed = tmp_path / "outputs"
        allowed.mkdir()
        monkeypatch.setattr(utils, "ALLOWED_OUTPUT_DIR", str(allowed))

        p = utils.validate_output_path("sub/shot.png")

        # Compare path components on resolved paths instead of a raw string
        # prefix: a prefix check would also accept a sibling directory such as
        # "<tmp>/outputs_evil/...".
        root = os.path.realpath(str(allowed))
        assert os.path.commonpath([os.path.realpath(p), root]) == root

    def test_write_refuses_symlink_final_component(self, monkeypatch, tmp_path):
        """Writing to a path whose last component is a symlink must fail."""
        import utils

        allowed = tmp_path / "outputs"
        allowed.mkdir()
        target = allowed / "shot.png"

        # Pre-plant a symlink at the final path pointing at a secret.
        secret = tmp_path / "secret"
        secret.write_text("SECRET")
        target.symlink_to(secret)

        with pytest.raises(OSError):
            utils.write_output_file(str(target), b"data")  # O_NOFOLLOW refuses

        # Raising is not enough: the symlink target must also be untouched,
        # otherwise a write-then-fail implementation would still pass.
        assert secret.read_text() == "SECRET"
class TestLlmBaseUrlIgnored:
    """Static source check on api.py for how the LLM base_url is chosen."""

    def test_request_base_url_not_honored_in_source(self):
        """api.py must only use the server-derived base_url expression."""
        # base_url from the request must never be passed to the LLM call.
        api_path = os.path.join(DOCKER_DIR, "api.py")
        # Explicit encoding so the read does not depend on the locale's
        # default (e.g. a non-UTF-8 locale in a minimal container).
        with open(api_path, encoding="utf-8") as f:
            src = f.read()
        # The fallback form that preferred the request value must be gone ...
        assert "base_url or get_llm_base_url" not in src
        # ... and the server-derived form must be present.
        assert "base_url=get_llm_base_url(config" in src
class TestEnvSecretGuard:
    """LLMConfig must refuse ``env:`` lookups of protected variable names."""

    def test_env_secret_key_blocked(self, monkeypatch):
        """``env:SECRET_KEY`` is refused even when the variable is set."""
        from crawl4ai.async_configs import LLMConfig

        # Define the variable so the refusal can only come from the name
        # guard, not from the variable happening to be unset on this machine.
        monkeypatch.setenv("SECRET_KEY", "dummy-secret-value")
        with pytest.raises(ValueError):
            LLMConfig(api_token="env:SECRET_KEY")

    @pytest.mark.parametrize("name", ["REDIS_PASSWORD", "CRAWL4AI_API_TOKEN", "MY_PRIVATE_KEY"])
    def test_other_secrets_blocked(self, monkeypatch, name):
        """Other protected names are refused as well, with the variable set."""
        from crawl4ai.async_configs import LLMConfig

        # Same reasoning as above: exercise the case where there is a value
        # that could actually be leaked.
        monkeypatch.setenv(name, "dummy-secret-value")
        with pytest.raises(ValueError):
            LLMConfig(api_token=f"env:{name}")

    def test_provider_key_still_works(self, monkeypatch):
        """A regular provider key is still resolved from the environment."""
        from crawl4ai.async_configs import LLMConfig

        monkeypatch.setenv("OPENAI_API_KEY", "sk-test-123")
        cfg = LLMConfig(provider="openai/gpt-4o-mini", api_token="env:OPENAI_API_KEY")
        assert cfg.api_token == "sk-test-123"  # normal provider key unaffected
class TestCRLFSafeLogging:
    """CRLFSafeFilter applied to a log record containing CR/LF."""

    def test_crlf_stripped(self):
        """After filtering, the message has no CR or LF but keeps its text."""
        import logging
        from utils import CRLFSafeFilter

        # A message that tries to forge a second log line via CRLF.
        record = logging.LogRecord(
            name="t",
            level=logging.INFO,
            pathname=__file__,
            lineno=1,
            msg="url=http://x/\r\nINJECTED login ok",
            args=None,
            exc_info=None,
        )
        CRLFSafeFilter().filter(record)

        rendered = record.getMessage()
        assert "\r" not in rendered
        assert "\n" not in rendered
        # Only the line breaks go; the surrounding text stays in the message.
        assert "INJECTED" in rendered
class TestWebhookHeaderSanitization:
    """Checks for sanitize_webhook_headers and the WebhookConfig schema."""

    # Header names that sanitize_webhook_headers must refuse outright.
    _DENIED_NAMES = ["Host", "Content-Length", "Authorization", "Cookie"]

    def test_crlf_value_rejected(self):
        """A header value carrying CR/LF raises ValueError."""
        from webhook import sanitize_webhook_headers

        injected = {"X-Foo": "bar\r\nInjected: 1"}
        with pytest.raises(ValueError):
            sanitize_webhook_headers(injected)

    @pytest.mark.parametrize("bad", _DENIED_NAMES)
    def test_hop_by_hop_denied(self, bad):
        """Each denied header name raises ValueError."""
        from webhook import sanitize_webhook_headers

        candidate = {bad: "x"}
        with pytest.raises(ValueError):
            sanitize_webhook_headers(candidate)

    def test_good_header_passes(self):
        """A harmless custom header is returned unchanged."""
        from webhook import sanitize_webhook_headers

        cleaned = sanitize_webhook_headers({"X-Trace-Id": "abc"})
        assert cleaned == {"X-Trace-Id": "abc"}

    def test_schema_rejects_early(self):
        """WebhookConfig raises a validation error for a denied header."""
        from pydantic import ValidationError
        from schemas import WebhookConfig

        with pytest.raises(ValidationError):
            WebhookConfig(
                webhook_url="https://example.com/cb",
                webhook_headers={"Host": "evil"},
            )