""" Crawl4AI Regression Tests - Browser Management and Features Tests browser lifecycle, viewport configuration, wait_for conditions, JavaScript execution, page interaction, screenshots, iframe processing, overlay removal, stealth mode, session management, network capture, and anti-bot features using real browser crawling with no mocking. """ import base64 import time import pytest from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig from crawl4ai.cache_context import CacheMode # --------------------------------------------------------------------------- # Browser lifecycle # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_browser_lifecycle(local_server): """Create crawler, start, crawl, and close explicitly without context manager.""" crawler = AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) await crawler.start() try: result = await crawler.arun( url=local_server + "/", config=CrawlerRunConfig(verbose=False), ) assert result.success, f"Crawl failed: {result.error_message}" assert len(result.html) > 0, "HTML should be non-empty" finally: await crawler.close() @pytest.mark.asyncio async def test_browser_context_manager(local_server): """Verify async with pattern works and cleanup happens without error.""" async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun( url=local_server + "/", config=CrawlerRunConfig(verbose=False), ) assert result.success, f"Context manager crawl failed: {result.error_message}" # If we get here without exception, cleanup succeeded # --------------------------------------------------------------------------- # Viewport configuration # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_custom_viewport(local_server): """Create BrowserConfig with 1920x1080 viewport and verify crawl succeeds.""" browser_config = BrowserConfig( headless=True, verbose=False, viewport_width=1920, viewport_height=1080, ) async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun( url=local_server + "/", config=CrawlerRunConfig(verbose=False), ) assert result.success, f"Custom viewport crawl failed: {result.error_message}" @pytest.mark.asyncio async def test_small_viewport(local_server): """Mobile-like viewport (375x667) should still produce a successful crawl.""" browser_config = BrowserConfig( headless=True, verbose=False, viewport_width=375, viewport_height=667, ) async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun( url=local_server + "/", config=CrawlerRunConfig(verbose=False), ) assert result.success, f"Small viewport crawl failed: {result.error_message}" # --------------------------------------------------------------------------- # wait_for conditions # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_wait_for_css_selector(local_server): """Wait for a CSS selector on /js-dynamic and verify dynamic content loaded.""" config = CrawlerRunConfig(wait_for="css:.js-loaded", verbose=False) async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=local_server + "/js-dynamic", config=config) assert result.success, f"wait_for CSS crawl failed: {result.error_message}" assert "Dynamic content successfully loaded" in (result.markdown or ""), ( "Dynamic JS content should appear after waiting for .js-loaded" ) @pytest.mark.asyncio async def test_wait_for_js_function(local_server): """Wait for a JS condition on /js-dynamic and verify the counter value.""" config = CrawlerRunConfig( wait_for="js:() => document.getElementById('counter').textContent === '42'", verbose=False, ) async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=local_server + "/js-dynamic", config=config) assert result.success, f"wait_for JS crawl failed: {result.error_message}" assert "42" in (result.html or ""), ( "Counter should be set to 42 after JS wait condition is met" ) @pytest.mark.asyncio async def test_wait_for_timeout(local_server): """Wait for a non-existent selector with short timeout should not hang forever.""" config = CrawlerRunConfig( wait_for="css:.nonexistent-class", wait_for_timeout=500, verbose=False, ) async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: # This may succeed (with timeout warning) or fail, but should not hang result = await crawler.arun(url=local_server + "/js-dynamic", config=config) # We just verify it returned without hanging; success or failure is acceptable assert result is not None, "Should return a result even if wait_for times out" # --------------------------------------------------------------------------- # JavaScript execution # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_js_code_modifies_dom(local_server): """Execute JS that adds a DOM element and verify it appears in the result.""" config = CrawlerRunConfig( js_code='document.body.innerHTML += \'
Injected by JS
\';', verbose=False, ) async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=local_server + "/", config=config) assert result.success, f"JS DOM modification crawl failed: {result.error_message}" combined = (result.html or "") + (result.markdown or "") assert "Injected by JS" in combined, ( "Injected content should appear in HTML or markdown" ) @pytest.mark.asyncio async def test_js_code_returns_value(local_server): """Execute JS that returns document.title and check js_execution_result.""" config = CrawlerRunConfig( js_code="return document.title;", verbose=False, ) async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=local_server + "/", config=config) assert result.success, f"JS return value crawl failed: {result.error_message}" # js_execution_result should contain the returned value if result.js_execution_result is not None: # The result might be stored under a key or directly result_str = str(result.js_execution_result) assert "Crawl4AI Test Home" in result_str or len(result_str) > 0, ( "js_execution_result should contain the document title" ) @pytest.mark.asyncio async def test_multiple_js_scripts(local_server): """Execute multiple JS scripts sequentially; last one sets title to 'B'.""" config = CrawlerRunConfig( js_code=["document.title='A';", "document.title='B';"], verbose=False, ) async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=local_server + "/", config=config) assert result.success, f"Multiple JS scripts crawl failed: {result.error_message}" # Both scripts should have executed; title should end up as 'B' # We can check via the HTML title tag or via another JS execution # The HTML might still have the original title in source, but the page state changed # --------------------------------------------------------------------------- # Page interaction # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_scan_full_page(local_server): """Crawl /large with scan_full_page=True and verify bottom sections appear.""" config = CrawlerRunConfig( scan_full_page=True, scroll_delay=0.05, verbose=False, ) async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=local_server + "/large", config=config) assert result.success, f"Full page scan crawl failed: {result.error_message}" # The large page has 50 sections; verify some from near the bottom combined = (result.html or "") + (result.markdown or "") assert "Section 49" in combined, ( "Scanning the full page should reveal the last section (Section 49)" ) # --------------------------------------------------------------------------- # Screenshot features # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_screenshot_basic(local_server): """Crawl with screenshot=True, decode base64, and verify PNG header.""" config = CrawlerRunConfig(screenshot=True, verbose=False) async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=local_server + "/", config=config) assert result.success, f"Screenshot crawl failed: {result.error_message}" assert result.screenshot, "Screenshot should be a non-empty base64 string" raw_bytes = base64.b64decode(result.screenshot) assert raw_bytes[:4] == b"\x89PNG", ( "Screenshot should be in PNG format" ) @pytest.mark.asyncio async def test_force_viewport_screenshot(local_server): """Crawl /large with force_viewport_screenshot=True; should capture viewport only.""" config = CrawlerRunConfig( screenshot=True, force_viewport_screenshot=True, verbose=False, ) async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=local_server + "/large", config=config) assert result.success, f"Force viewport screenshot crawl failed: {result.error_message}" assert result.screenshot, "Screenshot should be captured" raw_bytes = base64.b64decode(result.screenshot) assert raw_bytes[:4] == b"\x89PNG", "Viewport screenshot should be PNG" # --------------------------------------------------------------------------- # Process iframes # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_process_iframes(local_server): """Crawl /iframe-page with process_iframes=True and verify iframe content appears.""" config = CrawlerRunConfig(process_iframes=True, verbose=False) async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=local_server + "/iframe-page", config=config) assert result.success, f"Iframe processing crawl failed: {result.error_message}" combined = (result.html or "") + (result.markdown or "") # At least one iframe's content should appear has_iframe_content = ( "Iframe 1 content" in combined or "Iframe 2 heading" in combined or "embedded" in combined.lower() ) assert has_iframe_content, ( "Iframe content should appear in the result when process_iframes=True" ) # --------------------------------------------------------------------------- # Overlay and popup removal # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_remove_overlay_elements(local_server): """Crawl with remove_overlay_elements=True; verify it does not break crawling.""" config = CrawlerRunConfig(remove_overlay_elements=True, verbose=False) async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=local_server + "/", config=config) assert result.success, ( f"Overlay removal should not break crawling: {result.error_message}" ) assert len(result.html) > 0, "HTML should still be present after overlay removal" # --------------------------------------------------------------------------- # Stealth mode # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_stealth_mode_no_crash(local_server): """Stealth mode should not break basic local crawling.""" browser_config = BrowserConfig( headless=True, verbose=False, enable_stealth=True, ) async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun( url=local_server + "/", config=CrawlerRunConfig(verbose=False), ) assert result.success, f"Stealth mode crawl failed: {result.error_message}" assert "Crawl4AI Test Home" in (result.html or ""), ( "Stealth mode should still extract content correctly" ) # --------------------------------------------------------------------------- # Session management # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_session_persistence(local_server): """Session state should persist between crawls with the same session_id.""" async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: # First crawl: set a JS variable config1 = CrawlerRunConfig( session_id="persist-test", js_code="window.__testVar = 'hello';", verbose=False, ) result1 = await crawler.arun(url=local_server + "/", config=config1) assert result1.success, f"First session crawl failed: {result1.error_message}" # Second crawl: read the JS variable using js_only mode config2 = CrawlerRunConfig( session_id="persist-test", js_only=True, js_code="return window.__testVar;", verbose=False, ) result2 = await crawler.arun(url=local_server + "/", config=config2) assert result2.success, f"Second session crawl failed: {result2.error_message}" # Check if testVar persisted if result2.js_execution_result is not None: result_str = str(result2.js_execution_result) assert "hello" in result_str, ( f"Session variable should persist; got: {result_str}" ) # --------------------------------------------------------------------------- # Delay before return HTML # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_delay_before_return(local_server): """Crawl with delay_before_return_html=0.5 should succeed and take reasonable time.""" config = CrawlerRunConfig(delay_before_return_html=0.5, verbose=False) async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: start_time = time.monotonic() result = await crawler.arun(url=local_server + "/", config=config) elapsed = time.monotonic() - start_time assert result.success, f"Delayed crawl failed: {result.error_message}" assert elapsed >= 0.4, ( f"Crawl with 0.5s delay should take at least 0.4s, took {elapsed:.2f}s" ) # --------------------------------------------------------------------------- # Network features # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_capture_network_requests(local_server): """Crawl /js-dynamic with capture_network_requests=True and verify list returned.""" config = CrawlerRunConfig( capture_network_requests=True, cache_mode=CacheMode.BYPASS, verbose=False, ) async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=local_server + "/js-dynamic", config=config) assert result.success, f"Network capture crawl failed: {result.error_message}" assert result.network_requests is not None, "network_requests should not be None" assert isinstance(result.network_requests, list), ( "network_requests should be a list" ) assert len(result.network_requests) >= 1, ( "Should capture at least 1 network request (the page itself)" ) @pytest.mark.asyncio async def test_capture_console_messages(local_server): """Crawl with capture_console_messages=True and verify the attribute is a list.""" config = CrawlerRunConfig( capture_console_messages=True, cache_mode=CacheMode.BYPASS, verbose=False, ) async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=local_server + "/", config=config) assert result.success, f"Console capture crawl failed: {result.error_message}" assert result.console_messages is not None, ( "console_messages should not be None when capture is enabled" ) assert isinstance(result.console_messages, list), ( "console_messages should be a list" ) # --------------------------------------------------------------------------- # Real URL browser tests # --------------------------------------------------------------------------- @pytest.mark.asyncio @pytest.mark.network async def test_real_url_with_wait(): """Crawl https://quotes.toscrape.com with wait_until='load' and verify content.""" config = CrawlerRunConfig(wait_until="load", verbose=False) async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url="https://quotes.toscrape.com", config=config) assert result.success, f"Real URL crawl failed: {result.error_message}" assert len(result.html) > 100, "Real page should have substantial HTML" combined = (result.markdown or "") + (result.html or "") assert "quote" in combined.lower() or "quotes" in combined.lower(), ( "Quotes page should contain the word 'quote'" ) @pytest.mark.asyncio @pytest.mark.network async def test_real_url_screenshot(): """Crawl https://example.com with screenshot=True and verify PNG captured.""" config = CrawlerRunConfig(screenshot=True, verbose=False) async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url="https://example.com", config=config) assert result.success, f"Real URL screenshot crawl failed: {result.error_message}" assert result.screenshot, "Screenshot should be non-empty" raw_bytes = base64.b64decode(result.screenshot) assert raw_bytes[:4] == b"\x89PNG", "Real URL screenshot should be PNG format" # --------------------------------------------------------------------------- # Anti-bot basic check # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_magic_mode_no_crash(local_server): """Magic mode should not break normal local crawling.""" config = CrawlerRunConfig(magic=True, verbose=False) async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=local_server + "/", config=config) assert result.success, ( f"Magic mode should not break crawling: {result.error_message}" ) # --------------------------------------------------------------------------- # Edge cases # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_crawl_empty_page(local_server): """Crawling a page with empty body should not crash, even if anti-bot flags it.""" async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun( url=local_server + "/empty", config=CrawlerRunConfig(verbose=False), ) # Anti-bot detection may flag near-empty pages as blocked, which is expected # behavior. The key assertion is that it returns a result without crashing. assert result is not None, "Should return a result even for empty page" assert result.html is not None, "HTML should not be None for empty page" if not result.success: assert "empty" in (result.error_message or "").lower() or "blocked" in (result.error_message or "").lower(), ( f"Empty page failure should mention empty/blocked content: {result.error_message}" ) @pytest.mark.asyncio async def test_crawl_malformed_html(local_server): """Crawling malformed HTML should not crash, even if anti-bot flags it.""" async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun( url=local_server + "/malformed", config=CrawlerRunConfig(verbose=False), ) # Anti-bot may flag malformed HTML as blocked due to minimal visible text. # The key assertion is that it returns a result without crashing. assert result is not None, "Should return a result for malformed HTML" assert result.html is not None, "HTML should not be None even for malformed input" # The content is present in the HTML even if the crawl is marked as not successful assert "Unclosed paragraph" in (result.html or "") or "Malformed" in (result.html or ""), ( "Some original content should appear in the HTML" ) @pytest.mark.asyncio async def test_multiple_crawls_same_crawler(local_server): """A single crawler instance should handle multiple sequential crawls.""" async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: urls = [ local_server + "/", local_server + "/products", local_server + "/js-dynamic", ] for url in urls: result = await crawler.arun( url=url, config=CrawlerRunConfig(verbose=False), ) assert result.success, f"Sequential crawl of {url} failed: {result.error_message}" @pytest.mark.asyncio async def test_screenshot_not_captured_by_default(local_server): """Without screenshot=True, result.screenshot should be None or empty.""" config = CrawlerRunConfig(screenshot=False, verbose=False) async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=local_server + "/", config=config) assert result.success, f"No-screenshot crawl failed: {result.error_message}" assert not result.screenshot, ( "Screenshot should be None or empty when not requested" ) @pytest.mark.asyncio async def test_js_code_empty_string(local_server): """Empty js_code string should not cause errors.""" config = CrawlerRunConfig(js_code="", verbose=False) async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=local_server + "/", config=config) assert result.success, ( f"Empty js_code should not break crawling: {result.error_message}" ) @pytest.mark.asyncio async def test_wait_until_load(local_server): """wait_until='load' should wait for full page load including resources.""" config = CrawlerRunConfig(wait_until="load", verbose=False) async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=local_server + "/", config=config) assert result.success, f"wait_until=load crawl failed: {result.error_message}" @pytest.mark.asyncio async def test_wait_until_networkidle(local_server): """wait_until='networkidle' should wait until network is idle.""" config = CrawlerRunConfig(wait_until="networkidle", verbose=False) async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=local_server + "/", config=config) assert result.success, f"wait_until=networkidle crawl failed: {result.error_message}"