Files
crawl4ai/deploy/docker/tests/test_security_0_8_9.py
unclecode cdf2ead7ed security: patch proxy SSRF in Docker server (0.8.9)
0.8.8's SSRF check validated the crawl target URL but not the proxy address, so
an unauthenticated /crawl, /crawl/stream, or /crawl/job could route the browser
through a proxy pointing at an internal IP and reach internal services / cloud
metadata. Reported by Geo (geo-chen).

Fix (backward compatible): validate every proxy destination with the same
not-is_global check used for crawl URLs, before the browser is built -
browser_config.proxy, browser_config.proxy_config.server,
crawler_config.proxy_config.server - and strip proxy/DNS-redirecting flags
(--proxy-server / --proxy-pac-url / --proxy-bypass-list / --host-resolver-rules)
from extra_args. A legitimate public proxy still works; configure proxies via
proxy_config (validated), not raw extra_args flags. _enforce_proxy_safety is
called in both crawl handlers (and covers /crawl/job transitively); HTTPException
passthrough added so the 400 is not masked as a 500.

Bump 0.8.8 -> 0.8.9 (__version__ + Dockerfile). 20 new tests; full security
suite 161 pass. Changelog, release blog, README, SECURITY-CREDITS updated.

This vector was already fixed in the upcoming secure-by-default release; 0.8.9
brings it forward because it is an unauthenticated SSRF.
2026-06-04 06:17:41 +00:00

120 lines
4.4 KiB
Python

"""
Behavioral tests for the 0.8.9 non-breaking security patch.
Closes the proxy-injection SSRF class in the Docker server: an unauthenticated
/crawl could set a proxy (or proxy-redirecting Chromium flag) pointing at an
internal IP and route the browser through it, reaching internal services and
cloud metadata. The crawl-target URL was validated; the proxy address was not.
All fixes are backward compatible: a legitimate public proxy still works; only
non-global proxy hosts are rejected and dangerous --proxy/--host-resolver flags
are stripped from extra_args.
"""
import os
import sys
import pytest
# Put the directory two levels above this file at the front of sys.path
# (presumably the Docker server directory that holds the `utils` and `api`
# modules imported by the tests below -- confirm against the repo layout).
_TESTS_DIR = os.path.dirname(os.path.abspath(__file__))
DOCKER_DIR = os.path.dirname(_TESTS_DIR)
if DOCKER_DIR not in sys.path:
    sys.path.insert(0, DOCKER_DIR)
class TestValidateProxyDestination:
    """Exercise ``utils.validate_proxy_destination`` with literal-IP proxy servers."""

    # IP literals so getaddrinfo is numeric (no network needed).
    _INTERNAL_SERVERS = (
        "http://169.254.169.254:8080",  # cloud metadata
        "http://127.0.0.1:8888",
        "http://10.0.0.5:3128",
        "http://192.168.1.10:3128",
        "169.254.169.254:8080",  # bare host:port (no scheme)
        "socks5://10.1.2.3:1080",
        "http://[::1]:8080",  # ipv6 loopback
        "http://[::ffff:169.254.169.254]:80",  # v4-mapped metadata
    )

    _PUBLIC_SERVERS = (
        "http://8.8.8.8:3128",
        "https://1.1.1.1:443",
        "8.8.8.8:3128",
    )

    @pytest.mark.parametrize("server", _INTERNAL_SERVERS)
    def test_internal_proxy_rejected(self, server):
        """Each internal proxy address must be refused with ``ValueError``."""
        import utils

        validate = utils.validate_proxy_destination
        with pytest.raises(ValueError):
            validate(server)

    @pytest.mark.parametrize("server", _PUBLIC_SERVERS)
    def test_public_proxy_allowed(self, server):
        """A public proxy address is accepted, i.e. the call does not raise."""
        import utils

        validate = utils.validate_proxy_destination
        validate(server)  # no raise

    def test_empty_is_noop(self):
        """An empty string and ``None`` are both accepted without raising."""
        import utils

        for absent in ("", None):
            utils.validate_proxy_destination(absent)  # no raise
class TestScrubExtraArgs:
    """Check which browser flags ``utils.scrub_browser_extra_args`` removes."""

    def test_strips_dangerous_flags(self):
        """Proxy/DNS-redirecting flags are dropped; the other flags keep their order."""
        import utils

        blocked = [
            "--proxy-server=http://10.0.0.1:3128",
            "--host-resolver-rules=MAP * 169.254.169.254",
            "--proxy-bypass-list=*",
            "--proxy-pac-url=http://evil/p.pac",
        ]
        # Blocked flags sit between two harmless ones so ordering is checked too.
        supplied = ["--headless", *blocked, "--disable-gpu"]
        scrubbed = utils.scrub_browser_extra_args(supplied)
        assert scrubbed == ["--headless", "--disable-gpu"]

    def test_keeps_benign(self):
        """A list containing only harmless flags comes back equal to the input."""
        import utils

        benign = ["--headless", "--no-sandbox", "--disable-dev-shm-usage"]
        result = utils.scrub_browser_extra_args(benign)
        assert result == benign

    def test_empty(self):
        """An empty list yields an empty list, and ``None`` is returned as ``None``."""
        import utils

        scrub = utils.scrub_browser_extra_args
        assert scrub([]) == []
        assert scrub(None) is None
class TestEnforceProxySafety:
    """Exercise ``api._enforce_proxy_safety`` across every proxy entry point.

    Covers ``BrowserConfig.proxy_config``, the deprecated ``BrowserConfig.proxy``
    field, ``CrawlerRunConfig.proxy_config`` and proxy flags smuggled in through
    ``BrowserConfig.extra_args``.
    """

    def test_browser_proxy_config_internal_400(self):
        """An internal ``proxy_config.server`` on the browser config yields an opaque 400."""
        import api
        from fastapi import HTTPException
        from crawl4ai import BrowserConfig, ProxyConfig

        b = BrowserConfig(proxy_config=ProxyConfig(server="http://169.254.169.254:8080"))
        with pytest.raises(HTTPException) as e:
            api._enforce_proxy_safety(b, None)
        # Checked after the ``with`` block: statements placed after the raising
        # call inside it would never run.
        assert e.value.status_code == 400
        assert "169.254" not in str(e.value.detail)  # opaque

    def test_deprecated_proxy_field_internal_400(self):
        """An internal address in the deprecated ``proxy`` field is rejected with 400."""
        import api
        from fastapi import HTTPException
        from crawl4ai import BrowserConfig

        b = BrowserConfig(proxy="http://10.0.0.9:3128")
        with pytest.raises(HTTPException) as e:
            api._enforce_proxy_safety(b, None)
        # Pin the status code the test name promises, so a rejection surfacing
        # as any other HTTP error (e.g. a masked 500) fails this test.
        assert e.value.status_code == 400

    def test_crawler_proxy_config_internal_400(self):
        """An internal ``proxy_config.server`` on the crawler config is rejected with 400."""
        import api
        from fastapi import HTTPException
        from crawl4ai import BrowserConfig, CrawlerRunConfig, ProxyConfig

        c = CrawlerRunConfig(proxy_config=ProxyConfig(server="http://192.168.0.2:3128"))
        with pytest.raises(HTTPException) as e:
            api._enforce_proxy_safety(BrowserConfig(), c)
        # Same reasoning as the deprecated-field test: the name says 400.
        assert e.value.status_code == 400

    def test_extra_args_proxy_scrubbed(self):
        """A proxy flag in ``extra_args`` is removed in place rather than rejected."""
        import api
        from crawl4ai import BrowserConfig

        b = BrowserConfig(extra_args=["--proxy-server=http://10.0.0.1", "--headless"])
        api._enforce_proxy_safety(b, None)  # no raise (scrub, not block)
        assert b.extra_args == ["--headless"]

    def test_public_proxy_passes(self):
        """A public proxy address passes through without raising."""
        import api
        from crawl4ai import BrowserConfig, ProxyConfig

        b = BrowserConfig(proxy_config=ProxyConfig(server="http://8.8.8.8:3128"))
        api._enforce_proxy_safety(b, None)  # no raise