mirror of
https://github.com/unclecode/crawl4ai.git
synced 2026-06-10 07:48:50 +00:00
0.8.8's SSRF check validated the crawl target URL but not the proxy address, so an unauthenticated /crawl, /crawl/stream, or /crawl/job could route the browser through a proxy pointing at an internal IP and reach internal services / cloud metadata. Reported by Geo (geo-chen). Fix (backward compatible): validate every proxy destination with the same not-is_global check used for crawl URLs, before the browser is built - browser_config.proxy, browser_config.proxy_config.server, crawler_config.proxy_config.server - and strip proxy/DNS-redirecting flags (--proxy-server / --proxy-pac-url / --proxy-bypass-list / --host-resolver-rules) from extra_args. A legitimate public proxy still works; configure proxies via proxy_config (validated), not raw extra_args flags. _enforce_proxy_safety is called in both crawl handlers (and covers /crawl/job transitively); HTTPException passthrough added so the 400 is not masked as a 500. Bump 0.8.8 -> 0.8.9 (__version__ + Dockerfile). 20 new tests; full security suite 161 pass. Changelog, release blog, README, SECURITY-CREDITS updated. This vector was already fixed in the upcoming secure-by-default release; 0.8.9 brings it forward because it is an unauthenticated SSRF.
120 lines
4.4 KiB
Python
120 lines
4.4 KiB
Python
"""
|
|
Behavioral tests for the 0.8.9 non-breaking security patch.

Closes the proxy-injection SSRF class in the Docker server: an unauthenticated
/crawl could set a proxy (or proxy-redirecting Chromium flag) pointing at an
internal IP and route the browser through it, reaching internal services and
cloud metadata. The crawl-target URL was validated; the proxy address was not.

All fixes are backward compatible: a legitimate public proxy still works; only
non-global proxy hosts are rejected and dangerous --proxy/--host-resolver flags
are stripped from extra_args.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
|
|
import pytest
|
|
|
|
# Directory one level above the folder that holds this test file. It is put at
# the front of sys.path (once) so the function-local `import utils` /
# `import api` statements in the tests below can resolve -- presumably those
# modules live in that directory; confirm against the repository layout.
DOCKER_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if DOCKER_DIR not in sys.path:
    sys.path.insert(0, DOCKER_DIR)
|
|
|
|
|
|
class TestValidateProxyDestination:
    """Exercise ``utils.validate_proxy_destination`` with internal, public and empty inputs."""

    # Only IP literals are used, so getaddrinfo stays numeric (no network needed).
    @pytest.mark.parametrize(
        "server",
        [
            "http://169.254.169.254:8080",  # cloud metadata
            "http://127.0.0.1:8888",
            "http://10.0.0.5:3128",
            "http://192.168.1.10:3128",
            "169.254.169.254:8080",  # bare host:port (no scheme)
            "socks5://10.1.2.3:1080",
            "http://[::1]:8080",  # ipv6 loopback
            "http://[::ffff:169.254.169.254]:80",  # v4-mapped metadata
        ],
    )
    def test_internal_proxy_rejected(self, server):
        """Each listed internal address must make the validator raise ValueError."""
        import utils

        validate = utils.validate_proxy_destination
        with pytest.raises(ValueError):
            validate(server)

    @pytest.mark.parametrize(
        "server",
        [
            "http://8.8.8.8:3128",
            "https://1.1.1.1:443",
            "8.8.8.8:3128",
        ],
    )
    def test_public_proxy_allowed(self, server):
        """Each listed public address must pass through without an exception."""
        import utils

        validate = utils.validate_proxy_destination
        validate(server)  # no raise

    def test_empty_is_noop(self):
        """An empty string and ``None`` are both accepted without an exception."""
        import utils

        for blank in ("", None):
            utils.validate_proxy_destination(blank)  # no raise
|
|
|
|
|
|
class TestScrubExtraArgs:
    """Exercise ``utils.scrub_browser_extra_args`` on mixed, benign and empty inputs."""

    def test_strips_dangerous_flags(self):
        """Proxy / host-resolver flags are dropped; the remaining flags keep their order."""
        import utils

        mixed_flags = [
            "--headless",
            "--proxy-server=http://10.0.0.1:3128",
            "--host-resolver-rules=MAP * 169.254.169.254",
            "--proxy-bypass-list=*",
            "--proxy-pac-url=http://evil/p.pac",
            "--disable-gpu",
        ]
        expected = ["--headless", "--disable-gpu"]

        scrubbed = utils.scrub_browser_extra_args(mixed_flags)
        assert scrubbed == expected

    def test_keeps_benign(self):
        """A list with no proxy-related flags compares equal to the input afterwards."""
        import utils

        benign_flags = ["--headless", "--no-sandbox", "--disable-dev-shm-usage"]
        result = utils.scrub_browser_extra_args(benign_flags)
        assert result == benign_flags

    def test_empty(self):
        """An empty list yields an empty list and ``None`` yields ``None``."""
        import utils

        scrub = utils.scrub_browser_extra_args
        assert scrub([]) == []
        assert scrub(None) is None
|
|
|
|
|
|
class TestEnforceProxySafety:
    """Behavioral checks on ``api._enforce_proxy_safety``.

    The tests expect an internal proxy destination supplied through any of the
    three config fields to surface as an HTTP 400, proxy flags passed via
    ``extra_args`` to be stripped rather than rejected, and a public proxy to
    pass untouched.
    """

    def test_browser_proxy_config_internal_400(self):
        """Internal ``BrowserConfig.proxy_config.server`` -> HTTP 400 with an opaque detail."""
        import api
        from fastapi import HTTPException
        from crawl4ai import BrowserConfig, ProxyConfig

        b = BrowserConfig(proxy_config=ProxyConfig(server="http://169.254.169.254:8080"))
        with pytest.raises(HTTPException) as e:
            api._enforce_proxy_safety(b, None)
        # Assertions sit after the ``with`` block: statements placed after the
        # raising call inside it would never execute.
        assert e.value.status_code == 400
        # The error detail must not echo the rejected address back to the caller.
        assert "169.254" not in str(e.value.detail)  # opaque

    def test_deprecated_proxy_field_internal_400(self):
        """Internal address in the deprecated ``BrowserConfig.proxy`` field -> HTTP 400."""
        import api
        from fastapi import HTTPException
        from crawl4ai import BrowserConfig

        b = BrowserConfig(proxy="http://10.0.0.9:3128")
        with pytest.raises(HTTPException) as e:
            api._enforce_proxy_safety(b, None)
        # Pin the status code the test name promises, so an error surfacing
        # with any other status does not pass silently.
        assert e.value.status_code == 400

    def test_crawler_proxy_config_internal_400(self):
        """Internal ``CrawlerRunConfig.proxy_config.server`` -> HTTP 400."""
        import api
        from fastapi import HTTPException
        from crawl4ai import BrowserConfig, CrawlerRunConfig, ProxyConfig

        c = CrawlerRunConfig(proxy_config=ProxyConfig(server="http://192.168.0.2:3128"))
        with pytest.raises(HTTPException) as e:
            api._enforce_proxy_safety(BrowserConfig(), c)
        # Same status-code guarantee as the browser-level proxy fields.
        assert e.value.status_code == 400

    def test_extra_args_proxy_scrubbed(self):
        """A ``--proxy-server`` flag in ``extra_args`` is removed in place, not rejected."""
        import api
        from crawl4ai import BrowserConfig

        b = BrowserConfig(extra_args=["--proxy-server=http://10.0.0.1", "--headless"])
        api._enforce_proxy_safety(b, None)  # no raise (scrub, not block)
        assert b.extra_args == ["--headless"]

    def test_public_proxy_passes(self):
        """A proxy on a public IP is accepted without an exception."""
        import api
        from crawl4ai import BrowserConfig, ProxyConfig

        b = BrowserConfig(proxy_config=ProxyConfig(server="http://8.8.8.8:3128"))
        api._enforce_proxy_safety(b, None)  # no raise
|