crawl4ai/deploy/docker/tests/test_security_0_8_9.py

"""
Behavioral tests for the 0.8.9 non-breaking security patch.

Closes the proxy-injection SSRF class in the Docker server: an unauthenticated
/crawl could set a proxy (or proxy-redirecting Chromium flag) pointing at an
internal IP and route the browser through it, reaching internal services and
cloud metadata. The crawl-target URL was validated; the proxy address was not.

All fixes are backward compatible: a legitimate public proxy still works; only
non-global proxy hosts are rejected and dangerous --proxy/--host-resolver flags
are stripped from extra_args.
"""

import os
import sys

import pytest

DOCKER_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if DOCKER_DIR not in sys.path:
    sys.path.insert(0, DOCKER_DIR)


class TestValidateProxyDestination:
    # IP literals so getaddrinfo is numeric (no network needed).
    @pytest.mark.parametrize("server", [
        "http://169.254.169.254:8080",   # cloud metadata
        "http://127.0.0.1:8888",
        "http://10.0.0.5:3128",
        "http://192.168.1.10:3128",
        "169.254.169.254:8080",          # bare host:port (no scheme)
        "socks5://10.1.2.3:1080",
        "http://[::1]:8080",             # ipv6 loopback
        "http://[::ffff:169.254.169.254]:80",  # v4-mapped metadata
    ])
    def test_internal_proxy_rejected(self, server):
        import utils
        with pytest.raises(ValueError):
            utils.validate_proxy_destination(server)

    @pytest.mark.parametrize("server", [
        "http://8.8.8.8:3128",
        "https://1.1.1.1:443",
        "8.8.8.8:3128",
    ])
    def test_public_proxy_allowed(self, server):
        import utils
        utils.validate_proxy_destination(server)  # no raise

    def test_empty_is_noop(self):
        import utils
        utils.validate_proxy_destination("")  # no raise
        utils.validate_proxy_destination(None)  # no raise


class TestScrubExtraArgs:
    def test_strips_dangerous_flags(self):
        import utils
        args = [
            "--headless",
            "--proxy-server=http://10.0.0.1:3128",
            "--host-resolver-rules=MAP * 169.254.169.254",
            "--proxy-bypass-list=*",
            "--proxy-pac-url=http://evil/p.pac",
            "--disable-gpu",
        ]
        out = utils.scrub_browser_extra_args(args)
        assert out == ["--headless", "--disable-gpu"]

    def test_keeps_benign(self):
        import utils
        args = ["--headless", "--no-sandbox", "--disable-dev-shm-usage"]
        assert utils.scrub_browser_extra_args(args) == args

    def test_empty(self):
        import utils
        assert utils.scrub_browser_extra_args([]) == []
        assert utils.scrub_browser_extra_args(None) is None


class TestEnforceProxySafety:
    def test_browser_proxy_config_internal_400(self):
        import api
        from fastapi import HTTPException
        from crawl4ai import BrowserConfig, ProxyConfig
        b = BrowserConfig(proxy_config=ProxyConfig(server="http://169.254.169.254:8080"))
        with pytest.raises(HTTPException) as e:
            api._enforce_proxy_safety(b, None)
        assert e.value.status_code == 400
        assert "169.254" not in str(e.value.detail)  # opaque

    def test_deprecated_proxy_field_internal_400(self):
        import api
        from fastapi import HTTPException
        from crawl4ai import BrowserConfig
        b = BrowserConfig(proxy="http://10.0.0.9:3128")
        with pytest.raises(HTTPException):
            api._enforce_proxy_safety(b, None)

    def test_crawler_proxy_config_internal_400(self):
        import api
        from fastapi import HTTPException
        from crawl4ai import BrowserConfig, CrawlerRunConfig, ProxyConfig
        c = CrawlerRunConfig(proxy_config=ProxyConfig(server="http://192.168.0.2:3128"))
        with pytest.raises(HTTPException):
            api._enforce_proxy_safety(BrowserConfig(), c)

    def test_extra_args_proxy_scrubbed(self):
        import api
        from crawl4ai import BrowserConfig
        b = BrowserConfig(extra_args=["--proxy-server=http://10.0.0.1", "--headless"])
        api._enforce_proxy_safety(b, None)  # no raise (scrub, not block)
        assert b.extra_args == ["--headless"]

    def test_public_proxy_passes(self):
        import api
        from crawl4ai import BrowserConfig, ProxyConfig
        b = BrowserConfig(proxy_config=ProxyConfig(server="http://8.8.8.8:3128"))
        api._enforce_proxy_safety(b, None)  # no raise