crawl4ai/tests/regression/test_reg_browser.py

"""
Crawl4AI Regression Tests - Browser Management and Features

Tests browser lifecycle, viewport configuration, wait_for conditions, JavaScript
execution, page interaction, screenshots, iframe processing, overlay removal,
stealth mode, session management, network capture, and anti-bot features using
real browser crawling with no mocking.
"""

import base64
import time

import pytest

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.cache_context import CacheMode


# ---------------------------------------------------------------------------
# Browser lifecycle
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_browser_lifecycle(local_server):
    """Create crawler, start, crawl, and close explicitly without context manager."""
    crawler = AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False))
    await crawler.start()
    try:
        result = await crawler.arun(
            url=local_server + "/",
            config=CrawlerRunConfig(verbose=False),
        )
        assert result.success, f"Crawl failed: {result.error_message}"
        assert len(result.html) > 0, "HTML should be non-empty"
    finally:
        await crawler.close()


@pytest.mark.asyncio
async def test_browser_context_manager(local_server):
    """Verify async with pattern works and cleanup happens without error."""
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        result = await crawler.arun(
            url=local_server + "/",
            config=CrawlerRunConfig(verbose=False),
        )
        assert result.success, f"Context manager crawl failed: {result.error_message}"
    # If we get here without exception, cleanup succeeded


# ---------------------------------------------------------------------------
# Viewport configuration
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_custom_viewport(local_server):
    """Create BrowserConfig with 1920x1080 viewport and verify crawl succeeds."""
    browser_config = BrowserConfig(
        headless=True,
        verbose=False,
        viewport_width=1920,
        viewport_height=1080,
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url=local_server + "/",
            config=CrawlerRunConfig(verbose=False),
        )
        assert result.success, f"Custom viewport crawl failed: {result.error_message}"


@pytest.mark.asyncio
async def test_small_viewport(local_server):
    """Mobile-like viewport (375x667) should still produce a successful crawl."""
    browser_config = BrowserConfig(
        headless=True,
        verbose=False,
        viewport_width=375,
        viewport_height=667,
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url=local_server + "/",
            config=CrawlerRunConfig(verbose=False),
        )
        assert result.success, f"Small viewport crawl failed: {result.error_message}"


# ---------------------------------------------------------------------------
# wait_for conditions
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_wait_for_css_selector(local_server):
    """Wait for a CSS selector on /js-dynamic and verify dynamic content loaded."""
    config = CrawlerRunConfig(wait_for="css:.js-loaded", verbose=False)
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        result = await crawler.arun(url=local_server + "/js-dynamic", config=config)
        assert result.success, f"wait_for CSS crawl failed: {result.error_message}"
        assert "Dynamic content successfully loaded" in (result.markdown or ""), (
            "Dynamic JS content should appear after waiting for .js-loaded"
        )


@pytest.mark.asyncio
async def test_wait_for_js_function(local_server):
    """Wait for a JS condition on /js-dynamic and verify the counter value."""
    config = CrawlerRunConfig(
        wait_for="js:() => document.getElementById('counter').textContent === '42'",
        verbose=False,
    )
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        result = await crawler.arun(url=local_server + "/js-dynamic", config=config)
        assert result.success, f"wait_for JS crawl failed: {result.error_message}"
        assert "42" in (result.html or ""), (
            "Counter should be set to 42 after JS wait condition is met"
        )


@pytest.mark.asyncio
async def test_wait_for_timeout(local_server):
    """Wait for a non-existent selector with short timeout should not hang forever."""
    config = CrawlerRunConfig(
        wait_for="css:.nonexistent-class",
        wait_for_timeout=500,
        verbose=False,
    )
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        # This may succeed (with timeout warning) or fail, but should not hang
        result = await crawler.arun(url=local_server + "/js-dynamic", config=config)
        # We just verify it returned without hanging; success or failure is acceptable
        assert result is not None, "Should return a result even if wait_for times out"


# ---------------------------------------------------------------------------
# JavaScript execution
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_js_code_modifies_dom(local_server):
    """Execute JS that adds a DOM element and verify it appears in the result."""
    config = CrawlerRunConfig(
        js_code='document.body.innerHTML += \'<div id="injected">Injected by JS</div>\';',
        verbose=False,
    )
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        result = await crawler.arun(url=local_server + "/", config=config)
        assert result.success, f"JS DOM modification crawl failed: {result.error_message}"
        combined = (result.html or "") + (result.markdown or "")
        assert "Injected by JS" in combined, (
            "Injected content should appear in HTML or markdown"
        )


@pytest.mark.asyncio
async def test_js_code_returns_value(local_server):
    """Execute JS that returns document.title and check js_execution_result."""
    config = CrawlerRunConfig(
        js_code="return document.title;",
        verbose=False,
    )
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        result = await crawler.arun(url=local_server + "/", config=config)
        assert result.success, f"JS return value crawl failed: {result.error_message}"
        # js_execution_result should contain the returned value
        if result.js_execution_result is not None:
            # The result might be stored under a key or directly
            result_str = str(result.js_execution_result)
            assert "Crawl4AI Test Home" in result_str or len(result_str) > 0, (
                "js_execution_result should contain the document title"
            )


@pytest.mark.asyncio
async def test_multiple_js_scripts(local_server):
    """Execute multiple JS scripts sequentially; last one sets title to 'B'."""
    config = CrawlerRunConfig(
        js_code=["document.title='A';", "document.title='B';"],
        verbose=False,
    )
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        result = await crawler.arun(url=local_server + "/", config=config)
        assert result.success, f"Multiple JS scripts crawl failed: {result.error_message}"
        # Both scripts should have executed; title should end up as 'B'
        # We can check via the HTML title tag or via another JS execution
        # The HTML might still have the original title in source, but the page state changed


# ---------------------------------------------------------------------------
# Page interaction
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_scan_full_page(local_server):
    """Crawl /large with scan_full_page=True and verify bottom sections appear."""
    config = CrawlerRunConfig(
        scan_full_page=True,
        scroll_delay=0.05,
        verbose=False,
    )
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        result = await crawler.arun(url=local_server + "/large", config=config)
        assert result.success, f"Full page scan crawl failed: {result.error_message}"
        # The large page has 50 sections; verify some from near the bottom
        combined = (result.html or "") + (result.markdown or "")
        assert "Section 49" in combined, (
            "Scanning the full page should reveal the last section (Section 49)"
        )


# ---------------------------------------------------------------------------
# Screenshot features
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_screenshot_basic(local_server):
    """Crawl with screenshot=True, decode base64, and verify PNG header."""
    config = CrawlerRunConfig(screenshot=True, verbose=False)
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        result = await crawler.arun(url=local_server + "/", config=config)
        assert result.success, f"Screenshot crawl failed: {result.error_message}"
        assert result.screenshot, "Screenshot should be a non-empty base64 string"
        raw_bytes = base64.b64decode(result.screenshot)
        assert raw_bytes[:4] == b"\x89PNG", (
            "Screenshot should be in PNG format"
        )


@pytest.mark.asyncio
async def test_force_viewport_screenshot(local_server):
    """Crawl /large with force_viewport_screenshot=True; should capture viewport only."""
    config = CrawlerRunConfig(
        screenshot=True,
        force_viewport_screenshot=True,
        verbose=False,
    )
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        result = await crawler.arun(url=local_server + "/large", config=config)
        assert result.success, f"Force viewport screenshot crawl failed: {result.error_message}"
        assert result.screenshot, "Screenshot should be captured"
        raw_bytes = base64.b64decode(result.screenshot)
        assert raw_bytes[:4] == b"\x89PNG", "Viewport screenshot should be PNG"


# ---------------------------------------------------------------------------
# Process iframes
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_process_iframes(local_server):
    """Crawl /iframe-page with process_iframes=True and verify iframe content appears."""
    config = CrawlerRunConfig(process_iframes=True, verbose=False)
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        result = await crawler.arun(url=local_server + "/iframe-page", config=config)
        assert result.success, f"Iframe processing crawl failed: {result.error_message}"
        combined = (result.html or "") + (result.markdown or "")
        # At least one iframe's content should appear
        has_iframe_content = (
            "Iframe 1 content" in combined
            or "Iframe 2 heading" in combined
            or "embedded" in combined.lower()
        )
        assert has_iframe_content, (
            "Iframe content should appear in the result when process_iframes=True"
        )


# ---------------------------------------------------------------------------
# Overlay and popup removal
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_remove_overlay_elements(local_server):
    """Crawl with remove_overlay_elements=True; verify it does not break crawling."""
    config = CrawlerRunConfig(remove_overlay_elements=True, verbose=False)
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        result = await crawler.arun(url=local_server + "/", config=config)
        assert result.success, (
            f"Overlay removal should not break crawling: {result.error_message}"
        )
        assert len(result.html) > 0, "HTML should still be present after overlay removal"


# ---------------------------------------------------------------------------
# Stealth mode
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_stealth_mode_no_crash(local_server):
    """Stealth mode should not break basic local crawling."""
    browser_config = BrowserConfig(
        headless=True,
        verbose=False,
        enable_stealth=True,
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url=local_server + "/",
            config=CrawlerRunConfig(verbose=False),
        )
        assert result.success, f"Stealth mode crawl failed: {result.error_message}"
        assert "Crawl4AI Test Home" in (result.html or ""), (
            "Stealth mode should still extract content correctly"
        )


# ---------------------------------------------------------------------------
# Session management
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_session_persistence(local_server):
    """Session state should persist between crawls with the same session_id."""
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        # First crawl: set a JS variable
        config1 = CrawlerRunConfig(
            session_id="persist-test",
            js_code="window.__testVar = 'hello';",
            verbose=False,
        )
        result1 = await crawler.arun(url=local_server + "/", config=config1)
        assert result1.success, f"First session crawl failed: {result1.error_message}"

        # Second crawl: read the JS variable using js_only mode
        config2 = CrawlerRunConfig(
            session_id="persist-test",
            js_only=True,
            js_code="return window.__testVar;",
            verbose=False,
        )
        result2 = await crawler.arun(url=local_server + "/", config=config2)
        assert result2.success, f"Second session crawl failed: {result2.error_message}"

        # Check if testVar persisted
        if result2.js_execution_result is not None:
            result_str = str(result2.js_execution_result)
            assert "hello" in result_str, (
                f"Session variable should persist; got: {result_str}"
            )


# ---------------------------------------------------------------------------
# Delay before return HTML
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_delay_before_return(local_server):
    """Crawl with delay_before_return_html=0.5 should succeed and take reasonable time."""
    config = CrawlerRunConfig(delay_before_return_html=0.5, verbose=False)
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        start_time = time.monotonic()
        result = await crawler.arun(url=local_server + "/", config=config)
        elapsed = time.monotonic() - start_time

        assert result.success, f"Delayed crawl failed: {result.error_message}"
        assert elapsed >= 0.4, (
            f"Crawl with 0.5s delay should take at least 0.4s, took {elapsed:.2f}s"
        )


# ---------------------------------------------------------------------------
# Network features
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_capture_network_requests(local_server):
    """Crawl /js-dynamic with capture_network_requests=True and verify list returned."""
    config = CrawlerRunConfig(
        capture_network_requests=True,
        cache_mode=CacheMode.BYPASS,
        verbose=False,
    )
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        result = await crawler.arun(url=local_server + "/js-dynamic", config=config)
        assert result.success, f"Network capture crawl failed: {result.error_message}"
        assert result.network_requests is not None, "network_requests should not be None"
        assert isinstance(result.network_requests, list), (
            "network_requests should be a list"
        )
        assert len(result.network_requests) >= 1, (
            "Should capture at least 1 network request (the page itself)"
        )


@pytest.mark.asyncio
async def test_capture_console_messages(local_server):
    """Crawl with capture_console_messages=True and verify the attribute is a list."""
    config = CrawlerRunConfig(
        capture_console_messages=True,
        cache_mode=CacheMode.BYPASS,
        verbose=False,
    )
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        result = await crawler.arun(url=local_server + "/", config=config)
        assert result.success, f"Console capture crawl failed: {result.error_message}"
        assert result.console_messages is not None, (
            "console_messages should not be None when capture is enabled"
        )
        assert isinstance(result.console_messages, list), (
            "console_messages should be a list"
        )


# ---------------------------------------------------------------------------
# Real URL browser tests
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
@pytest.mark.network
async def test_real_url_with_wait():
    """Crawl https://quotes.toscrape.com with wait_until='load' and verify content."""
    config = CrawlerRunConfig(wait_until="load", verbose=False)
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        result = await crawler.arun(url="https://quotes.toscrape.com", config=config)
        assert result.success, f"Real URL crawl failed: {result.error_message}"
        assert len(result.html) > 100, "Real page should have substantial HTML"
        combined = (result.markdown or "") + (result.html or "")
        assert "quote" in combined.lower() or "quotes" in combined.lower(), (
            "Quotes page should contain the word 'quote'"
        )


@pytest.mark.asyncio
@pytest.mark.network
async def test_real_url_screenshot():
    """Crawl https://example.com with screenshot=True and verify PNG captured."""
    config = CrawlerRunConfig(screenshot=True, verbose=False)
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        result = await crawler.arun(url="https://example.com", config=config)
        assert result.success, f"Real URL screenshot crawl failed: {result.error_message}"
        assert result.screenshot, "Screenshot should be non-empty"
        raw_bytes = base64.b64decode(result.screenshot)
        assert raw_bytes[:4] == b"\x89PNG", "Real URL screenshot should be PNG format"


# ---------------------------------------------------------------------------
# Anti-bot basic check
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_magic_mode_no_crash(local_server):
    """Magic mode should not break normal local crawling."""
    config = CrawlerRunConfig(magic=True, verbose=False)
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        result = await crawler.arun(url=local_server + "/", config=config)
        assert result.success, (
            f"Magic mode should not break crawling: {result.error_message}"
        )


# ---------------------------------------------------------------------------
# Edge cases
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_crawl_empty_page(local_server):
    """Crawling a page with empty body should not crash, even if anti-bot flags it."""
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        result = await crawler.arun(
            url=local_server + "/empty",
            config=CrawlerRunConfig(verbose=False),
        )
        # Anti-bot detection may flag near-empty pages as blocked, which is expected
        # behavior. The key assertion is that it returns a result without crashing.
        assert result is not None, "Should return a result even for empty page"
        assert result.html is not None, "HTML should not be None for empty page"
        if not result.success:
            assert "empty" in (result.error_message or "").lower() or "blocked" in (result.error_message or "").lower(), (
                f"Empty page failure should mention empty/blocked content: {result.error_message}"
            )


@pytest.mark.asyncio
async def test_crawl_malformed_html(local_server):
    """Crawling malformed HTML should not crash, even if anti-bot flags it."""
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        result = await crawler.arun(
            url=local_server + "/malformed",
            config=CrawlerRunConfig(verbose=False),
        )
        # Anti-bot may flag malformed HTML as blocked due to minimal visible text.
        # The key assertion is that it returns a result without crashing.
        assert result is not None, "Should return a result for malformed HTML"
        assert result.html is not None, "HTML should not be None even for malformed input"
        # The content is present in the HTML even if the crawl is marked as not successful
        assert "Unclosed paragraph" in (result.html or "") or "Malformed" in (result.html or ""), (
            "Some original content should appear in the HTML"
        )


@pytest.mark.asyncio
async def test_multiple_crawls_same_crawler(local_server):
    """A single crawler instance should handle multiple sequential crawls."""
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        urls = [
            local_server + "/",
            local_server + "/products",
            local_server + "/js-dynamic",
        ]
        for url in urls:
            result = await crawler.arun(
                url=url,
                config=CrawlerRunConfig(verbose=False),
            )
            assert result.success, f"Sequential crawl of {url} failed: {result.error_message}"


@pytest.mark.asyncio
async def test_screenshot_not_captured_by_default(local_server):
    """Without screenshot=True, result.screenshot should be None or empty."""
    config = CrawlerRunConfig(screenshot=False, verbose=False)
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        result = await crawler.arun(url=local_server + "/", config=config)
        assert result.success, f"No-screenshot crawl failed: {result.error_message}"
        assert not result.screenshot, (
            "Screenshot should be None or empty when not requested"
        )


@pytest.mark.asyncio
async def test_js_code_empty_string(local_server):
    """Empty js_code string should not cause errors."""
    config = CrawlerRunConfig(js_code="", verbose=False)
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        result = await crawler.arun(url=local_server + "/", config=config)
        assert result.success, (
            f"Empty js_code should not break crawling: {result.error_message}"
        )


@pytest.mark.asyncio
async def test_wait_until_load(local_server):
    """wait_until='load' should wait for full page load including resources."""
    config = CrawlerRunConfig(wait_until="load", verbose=False)
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        result = await crawler.arun(url=local_server + "/", config=config)
        assert result.success, f"wait_until=load crawl failed: {result.error_message}"


@pytest.mark.asyncio
async def test_wait_until_networkidle(local_server):
    """wait_until='networkidle' should wait until network is idle."""
    config = CrawlerRunConfig(wait_until="networkidle", verbose=False)
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        result = await crawler.arun(url=local_server + "/", config=config)
        assert result.success, f"wait_until=networkidle crawl failed: {result.error_message}"