""" Tests for bug fix batch: PR #1622, #1786, #1796 - #1622: _resolve_head should verify redirect targets are alive - #1786: arun_many should wire mean_delay/max_range into dispatcher - #1796: process_iframes should use DOMParser instead of innerHTML """ import asyncio import pytest from unittest.mock import AsyncMock, MagicMock, patch import httpx # ── PR #1622: Redirect target verification in _resolve_head ────────────── @pytest.fixture def seeder(): """Create an AsyncUrlSeeder with a mocked HTTP client.""" from crawl4ai.async_url_seeder import AsyncUrlSeeder s = AsyncUrlSeeder() s.client = AsyncMock(spec=httpx.AsyncClient) return s def _make_response(status_code, headers=None, url="https://example.com"): """Helper to create a mock httpx Response.""" resp = MagicMock(spec=httpx.Response) resp.status_code = status_code resp.headers = headers or {} resp.url = httpx.URL(url) return resp @pytest.mark.asyncio async def test_resolve_head_direct_2xx(seeder): """Direct 2xx hit should return the URL.""" seeder.client.head = AsyncMock( return_value=_make_response(200, url="https://example.com/page") ) result = await seeder._resolve_head("https://example.com/page") assert result == "https://example.com/page" @pytest.mark.asyncio async def test_resolve_head_redirect_to_live_target(seeder): """3xx redirect to a live target should return the target URL.""" redirect_resp = _make_response( 301, headers={"location": "https://example.com/new-page"} ) target_resp = _make_response(200, url="https://example.com/new-page") seeder.client.head = AsyncMock(side_effect=[redirect_resp, target_resp]) result = await seeder._resolve_head("https://example.com/old-page") assert result == "https://example.com/new-page" assert seeder.client.head.call_count == 2 @pytest.mark.asyncio async def test_resolve_head_redirect_to_dead_target(seeder): """3xx redirect to a dead (non-2xx) target should return None.""" redirect_resp = _make_response( 302, headers={"location": "https://example.com/dead"} ) target_resp = _make_response(404, url="https://example.com/dead") seeder.client.head = AsyncMock(side_effect=[redirect_resp, target_resp]) result = await seeder._resolve_head("https://example.com/old") assert result is None @pytest.mark.asyncio async def test_resolve_head_redirect_target_timeout(seeder): """3xx redirect where target times out should return None.""" redirect_resp = _make_response( 301, headers={"location": "https://example.com/slow"} ) seeder.client.head = AsyncMock( side_effect=[redirect_resp, httpx.TimeoutException("timeout")] ) result = await seeder._resolve_head("https://example.com/old") assert result is None @pytest.mark.asyncio async def test_resolve_head_self_redirect(seeder): """Self-redirect (Location == original URL) should return None.""" redirect_resp = _make_response( 301, headers={"location": "https://example.com/loop"} ) seeder.client.head = AsyncMock(return_value=redirect_resp) result = await seeder._resolve_head("https://example.com/loop") assert result is None # Should NOT make a second request for self-redirect assert seeder.client.head.call_count == 1 @pytest.mark.asyncio async def test_resolve_head_relative_redirect(seeder): """Relative Location header should be resolved against original URL.""" redirect_resp = _make_response(301, headers={"location": "/new-path"}) target_resp = _make_response(200, url="https://example.com/new-path") seeder.client.head = AsyncMock(side_effect=[redirect_resp, target_resp]) result = await seeder._resolve_head("https://example.com/old-path") assert result == "https://example.com/new-path" @pytest.mark.asyncio async def test_resolve_head_4xx_returns_none(seeder): """4xx status should return None.""" seeder.client.head = AsyncMock(return_value=_make_response(404)) result = await seeder._resolve_head("https://example.com/missing") assert result is None @pytest.mark.asyncio async def test_resolve_head_network_error(seeder): """Network error should return None (not raise).""" seeder.client.head = AsyncMock( side_effect=httpx.ConnectError("connection refused") ) result = await seeder._resolve_head("https://example.com/down") assert result is None @pytest.mark.asyncio async def test_resolve_head_no_location_header(seeder): """3xx without Location header should return None.""" seeder.client.head = AsyncMock(return_value=_make_response(301, headers={})) result = await seeder._resolve_head("https://example.com/no-loc") assert result is None # ── PR #1786: mean_delay / max_range wired into dispatcher ─────────────── class TestDispatcherWiring: """Test that arun_many wires CrawlerRunConfig delay params into the dispatcher.""" def test_default_config_values(self): """CrawlerRunConfig should have mean_delay=0.1 and max_range=0.3 by default.""" from crawl4ai.async_configs import CrawlerRunConfig cfg = CrawlerRunConfig() assert cfg.mean_delay == 0.1 assert cfg.max_range == 0.3 def test_custom_config_values(self): """CrawlerRunConfig should accept custom mean_delay and max_range.""" from crawl4ai.async_configs import CrawlerRunConfig cfg = CrawlerRunConfig(mean_delay=2.0, max_range=1.0) assert cfg.mean_delay == 2.0 assert cfg.max_range == 1.0 @pytest.mark.asyncio async def test_dispatcher_uses_config_delays(self): """When no dispatcher is provided, arun_many should create one using config delays.""" from crawl4ai.async_webcrawler import AsyncWebCrawler from crawl4ai.async_configs import CrawlerRunConfig, BrowserConfig from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher, RateLimiter captured_dispatcher = {} original_init = MemoryAdaptiveDispatcher.__init__ def patched_init(self, *args, **kwargs): original_init(self, *args, **kwargs) captured_dispatcher["rate_limiter"] = self.rate_limiter with patch.object(MemoryAdaptiveDispatcher, "__init__", patched_init): # We just need to trigger the dispatcher creation path # We'll patch run_urls to avoid actually crawling with patch.object( MemoryAdaptiveDispatcher, "run_urls", new_callable=AsyncMock ) as mock_run: mock_run.return_value = [] crawler = AsyncWebCrawler(config=BrowserConfig()) crawler.ready = True # skip browser setup crawler.crawler_strategy = MagicMock() cfg = CrawlerRunConfig(mean_delay=2.0, max_range=1.5) try: await crawler.arun_many(urls=["https://example.com"], config=cfg) except Exception: pass # may fail on result processing, that's fine rl = captured_dispatcher.get("rate_limiter") assert rl is not None, "Dispatcher should have been created" assert rl.base_delay == (2.0, 3.5), ( f"Expected base_delay=(2.0, 3.5), got {rl.base_delay}" ) @pytest.mark.asyncio async def test_dispatcher_uses_first_config_from_list(self): """When config is a list, should use the first config's delays.""" from crawl4ai.async_webcrawler import AsyncWebCrawler from crawl4ai.async_configs import CrawlerRunConfig, BrowserConfig from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher, RateLimiter captured_dispatcher = {} original_init = MemoryAdaptiveDispatcher.__init__ def patched_init(self, *args, **kwargs): original_init(self, *args, **kwargs) captured_dispatcher["rate_limiter"] = self.rate_limiter with patch.object(MemoryAdaptiveDispatcher, "__init__", patched_init): with patch.object( MemoryAdaptiveDispatcher, "run_urls", new_callable=AsyncMock ) as mock_run: mock_run.return_value = [] crawler = AsyncWebCrawler(config=BrowserConfig()) crawler.ready = True crawler.crawler_strategy = MagicMock() cfg1 = CrawlerRunConfig(mean_delay=5.0, max_range=2.0) cfg2 = CrawlerRunConfig(mean_delay=0.5, max_range=0.1) try: await crawler.arun_many( urls=["https://a.com", "https://b.com"], config=[cfg1, cfg2], ) except Exception: pass rl = captured_dispatcher.get("rate_limiter") assert rl is not None assert rl.base_delay == (5.0, 7.0), ( f"Expected base_delay=(5.0, 7.0) from first config, got {rl.base_delay}" ) @pytest.mark.asyncio async def test_explicit_dispatcher_not_overridden(self): """When user provides their own dispatcher, config delays should NOT override it.""" from crawl4ai.async_webcrawler import AsyncWebCrawler from crawl4ai.async_configs import CrawlerRunConfig, BrowserConfig from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher, RateLimiter custom_rl = RateLimiter(base_delay=(10.0, 20.0)) custom_dispatcher = MemoryAdaptiveDispatcher(rate_limiter=custom_rl) with patch.object( MemoryAdaptiveDispatcher, "run_urls", new_callable=AsyncMock ) as mock_run: mock_run.return_value = [] crawler = AsyncWebCrawler(config=BrowserConfig()) crawler.ready = True crawler.crawler_strategy = MagicMock() cfg = CrawlerRunConfig(mean_delay=0.5, max_range=0.1) try: await crawler.arun_many( urls=["https://example.com"], config=cfg, dispatcher=custom_dispatcher, ) except Exception: pass # Custom dispatcher's rate limiter should be untouched assert custom_rl.base_delay == (10.0, 20.0) # ── PR #1796: DOMParser in process_iframes ─────────────────────────────── class TestProcessIframesDOMParser: """Verify that process_iframes uses DOMParser instead of innerHTML.""" def test_source_code_uses_domparser(self): """The process_iframes method should use DOMParser, not innerHTML for injection.""" import inspect from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy source = inspect.getsource(AsyncPlaywrightCrawlerStrategy.process_iframes) # Should contain DOMParser usage assert "DOMParser" in source, "process_iframes should use DOMParser" assert "parseFromString" in source, "process_iframes should call parseFromString" assert "doc.body.firstChild" in source, ( "process_iframes should move nodes from parsed doc" ) # The old innerHTML assignment pattern should NOT be present # Note: document.body.innerHTML for READING iframe content is fine # The dangerous pattern is div.innerHTML = `{_iframe}` for WRITING lines = source.split("\n") for line in lines: stripped = line.strip() # Only flag div.innerHTML assignment, not reading from document.body if "div.innerHTML" in stripped and "=" in stripped: pytest.fail( f"Found unsafe innerHTML assignment: {stripped}" ) def test_js_snippet_structure(self): """The JS snippet should properly create DOM nodes from parsed HTML.""" import inspect from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy source = inspect.getsource(AsyncPlaywrightCrawlerStrategy.process_iframes) # Verify the correct pattern: parse then move child nodes assert "new DOMParser()" in source assert "'text/html'" in source assert "appendChild" in source