Files
crawl4ai/tests/regression/test_reg_core_crawl.py
unclecode d788c28315 test: add comprehensive regression test suite (291 tests)
Full regression suite covering all major Crawl4AI subsystems:
- core crawl (arun, arun_many, raw HTML, JS, screenshots, cache, hooks)
- content processing (markdown, citations, BM25/pruning filters, links, images, tables, metadata)
- extraction strategies (JsonCss, JsonXPath, JsonLxml, Regex, Cosine, NoExtraction)
- deep crawl (BFS, DFS, BestFirst, filters, scorers, URL normalization)
- browser management (lifecycle, viewport, wait_for, stealth, sessions, iframes)
- config serialization (BrowserConfig, CrawlerRunConfig, ProxyConfig roundtrips)
- utilities (extract_xml_data, cache modes, content hashing)
- edge cases (empty pages, malformed HTML, unicode, concurrent crawls, error recovery)

Also adds /c4ai-check slash command for testing changes against the suite.
2026-03-08 03:20:52 +00:00

406 lines
18 KiB
Python

"""
Crawl4AI Regression Tests - Core Crawling Functionality
Tests core crawling features including basic crawls, raw HTML, multiple URLs,
screenshots, JavaScript execution, caching, sessions, hooks, network capture,
CSS selectors, excluded tags, timeouts, and status codes.
All tests use real browser crawling with no mocking.
"""
import asyncio
import base64
import pytest
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.cache_context import CacheMode
# ---------------------------------------------------------------------------
# Basic crawl tests
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_basic_crawl(local_server):
    """Fetch the local server's root page and check the core result fields.

    Asserts that the crawl succeeded, that the returned HTML carries an
    ``<h1>`` tag, and that the markdown is a non-empty string.
    """
    browser_cfg = BrowserConfig(headless=True, verbose=False)
    home_url = local_server + "/"
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun(home_url)
        assert result.success, f"Crawl failed: {result.error_message}"
        assert "<h1>" in result.html, "HTML should contain an <h1> tag"
        assert isinstance(result.markdown, str), "Markdown should be a string"
        assert len(result.markdown) > 0, "Markdown should be non-empty"
@pytest.mark.asyncio
@pytest.mark.network
async def test_basic_crawl_real_url():
    """Fetch https://example.com and check that real content came back.

    Marked ``network`` because it reaches out to a public site. The HTML
    must exceed 100 characters and the markdown must exceed 10.
    """
    browser_cfg = BrowserConfig(headless=True, verbose=False)
    target = "https://example.com"
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun(target)
        assert result.success, f"Crawl failed: {result.error_message}"
        assert len(result.html) > 100, "HTML should have substantial content"
        assert len(result.markdown) > 10, "Markdown should have content"
# ---------------------------------------------------------------------------
# Raw HTML crawl tests
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_raw_html_crawl():
    """Feed a ``raw:`` HTML document to the crawler and check its markdown.

    The heading text and the paragraph text must both show up in the
    generated markdown.
    """
    raw_page = "raw:<html><body><h1>Test</h1><p>Hello world</p></body></html>"
    browser_cfg = BrowserConfig(headless=True, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun(raw_page)
        assert result.success, f"Raw HTML crawl failed: {result.error_message}"
        assert "Test" in result.markdown, "Markdown should contain 'Test'"
        assert "Hello" in result.markdown, "Markdown should contain 'Hello'"
@pytest.mark.asyncio
async def test_raw_html_with_base_url():
    """Crawl raw HTML containing relative links with a ``base_url`` configured.

    The document holds two relative anchors and one absolute anchor. The
    test only asserts that the crawl succeeds and that the first anchor's
    text survives somewhere in the markdown or HTML; it does not inspect
    the resolved hrefs themselves.
    """
    anchors = [
        '<a href="/page1">Link 1</a>',
        '<a href="/page2">Link 2</a>',
        '<a href="https://other.com/abs">Absolute</a>',
    ]
    raw_html = "raw:<html><body>" + "".join(anchors) + "</body></html>"
    config = CrawlerRunConfig(base_url="http://example.com")
    browser_cfg = BrowserConfig(headless=True, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun(raw_html, config=config)
        assert result.success, f"Raw HTML with base_url failed: {result.error_message}"
        # Search markdown and HTML together, case-insensitively; either
        # field may be empty/None, in which case it contributes nothing.
        combined = (result.markdown or "").lower() + (result.html or "").lower()
        # Minimal guarantee: the anchor text itself was preserved.
        assert "link 1" in combined, "Link text should be present"
# ---------------------------------------------------------------------------
# Multiple URL crawl tests
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_arun_many(local_server):
    """Run ``arun_many`` over three local pages and require every one to succeed.

    The cache is bypassed. The return value must be a list with exactly
    one entry per requested URL.
    """
    urls = [local_server + path for path in ("/", "/products", "/tables")]
    run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
    browser_cfg = BrowserConfig(headless=True, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        results = await crawler.arun_many(urls, config=run_cfg)
        assert isinstance(results, list), "arun_many should return a list"
        assert len(results) == 3, f"Expected 3 results, got {len(results)}"
        for i, result in enumerate(results):
            assert result.success, f"Result {i} failed: {result.error_message}"
@pytest.mark.asyncio
@pytest.mark.network
async def test_arun_many_real():
    """Run ``arun_many`` against two public sites and require both to succeed.

    Marked ``network`` because it depends on external hosts. The cache is
    bypassed.
    """
    urls = ["https://example.com", "https://quotes.toscrape.com"]
    run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
    browser_cfg = BrowserConfig(headless=True, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        results = await crawler.arun_many(urls, config=run_cfg)
        assert len(results) == 2, f"Expected 2 results, got {len(results)}"
        for result in results:
            assert result.success, f"Real URL crawl failed: {result.error_message}"
# ---------------------------------------------------------------------------
# Screenshot tests
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_screenshot_capture(local_server):
    """Request a screenshot and confirm it decodes to PNG data.

    ``result.screenshot`` must be a non-empty base64 string whose decoded
    bytes begin with the PNG signature.
    """
    png_magic = b"\x89PNG"
    config = CrawlerRunConfig(screenshot=True)
    browser_cfg = BrowserConfig(headless=True, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun(local_server + "/", config=config)
        assert result.success, f"Screenshot crawl failed: {result.error_message}"
        assert result.screenshot, "Screenshot should be a non-empty string"
        assert isinstance(result.screenshot, str), "Screenshot should be a base64 string"
        # Inspect the leading bytes of the decoded image for the PNG header.
        decoded = base64.b64decode(result.screenshot)
        assert decoded.startswith(png_magic), "Screenshot should be in PNG format"
@pytest.mark.asyncio
async def test_screenshot_not_bmp(local_server):
    """Verify screenshot is PNG format, NOT BMP (regression for #1758).

    The crawl must succeed and produce a non-empty screenshot before the
    payload is decoded, so a missing screenshot is reported as a clear
    assertion failure rather than as an error raised by ``b64decode``.
    """
    config = CrawlerRunConfig(screenshot=True)
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        result = await crawler.arun(local_server + "/", config=config)
        assert result.success, f"Screenshot crawl failed: {result.error_message}"
        # Guard the decode: without a screenshot there is nothing to inspect.
        assert result.screenshot, "Screenshot should be a non-empty string"
        raw_bytes = base64.b64decode(result.screenshot)
        # BMP files start with b'BM'
        assert raw_bytes[:2] != b"BM", "Screenshot should NOT be BMP format"
        assert raw_bytes[:4] == b"\x89PNG", "Screenshot should be PNG format"
# ---------------------------------------------------------------------------
# JavaScript execution tests
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_js_execution(local_server):
    """Crawl ``/js-dynamic`` and wait for the ``.js-loaded`` element.

    Once the wait condition is satisfied, the dynamically produced text
    must be present in the markdown.
    """
    expected_text = "Dynamic content successfully loaded"
    config = CrawlerRunConfig(wait_for="css:.js-loaded")
    browser_cfg = BrowserConfig(headless=True, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun(local_server + "/js-dynamic", config=config)
        assert result.success, f"JS dynamic crawl failed: {result.error_message}"
        assert expected_text in result.markdown, (
            "JS-generated content should appear in markdown"
        )
@pytest.mark.asyncio
async def test_js_code_execution(local_server):
    """Run a custom ``js_code`` snippet during the crawl and require success.

    Only the success flag is asserted: the snippet rewrites
    ``document.title``, but whether that change shows up in the captured
    HTML depends on timing, so it is deliberately not checked.
    """
    title_script = "document.title = 'Modified Title';"
    config = CrawlerRunConfig(js_code=title_script)
    browser_cfg = BrowserConfig(headless=True, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun(local_server + "/", config=config)
        assert result.success, f"JS code execution crawl failed: {result.error_message}"
@pytest.mark.asyncio
async def test_js_code_before_wait(local_server):
    """Inject a marker element via ``js_code_before_wait`` and wait for it.

    The script appends a ``div#injected-marker`` to the body; ``wait_for``
    then targets that id, and the div's text must appear in the markdown.
    """
    # The script literal is kept flush-left so its text is unchanged.
    marker_script = """
const div = document.createElement('div');
div.id = 'injected-marker';
div.className = 'injected';
div.textContent = 'Injected by js_code_before_wait';
document.body.appendChild(div);
"""
    config = CrawlerRunConfig(
        wait_for="css:#injected-marker",
        js_code_before_wait=marker_script,
    )
    browser_cfg = BrowserConfig(headless=True, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun(local_server + "/", config=config)
        assert result.success, f"js_code_before_wait crawl failed: {result.error_message}"
        assert "Injected by js_code_before_wait" in result.markdown, (
            "Injected content should appear in markdown"
        )
# ---------------------------------------------------------------------------
# Cache mode tests
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_cache_write_and_read(local_server):
    """Crawl the same URL twice with the cache ENABLED and expect a hit.

    Both crawls must succeed. The cache-hit check runs only when the
    second result reports a truthy ``cache_status``; otherwise it is
    skipped.
    """
    home_url = local_server + "/"
    config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
    browser_cfg = BrowserConfig(headless=True, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        # Initial crawl populates the cache.
        result1 = await crawler.arun(home_url, config=config)
        assert result1.success, f"First crawl failed: {result1.error_message}"
        # Repeat crawl is expected to be served from the cache.
        result2 = await crawler.arun(home_url, config=config)
        assert result2.success, f"Second crawl failed: {result2.error_message}"
        status = result2.cache_status
        if status:
            assert "hit" in status.lower(), (
                f"Second crawl should be a cache hit, got: {result2.cache_status}"
            )
@pytest.mark.asyncio
async def test_cache_bypass(local_server):
    """Crawl with ``CacheMode.BYPASS`` and require a successful, non-empty page."""
    bypass_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
    browser_cfg = BrowserConfig(headless=True, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun(local_server + "/", config=bypass_cfg)
        assert result.success, f"Bypass cache crawl failed: {result.error_message}"
        assert len(result.html) > 0, "HTML should be non-empty even with bypass"
@pytest.mark.asyncio
async def test_cache_disabled(local_server):
    """Crawl with DISABLED cache; second crawl should not be cached.

    Both crawls must succeed, and each success assertion carries the
    crawler's error message so a failure is diagnosable. The no-hit check
    runs only when the second result reports a truthy ``cache_status``.
    """
    config = CrawlerRunConfig(cache_mode=CacheMode.DISABLED)
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        result1 = await crawler.arun(local_server + "/", config=config)
        assert result1.success, f"First crawl failed: {result1.error_message}"
        result2 = await crawler.arun(local_server + "/", config=config)
        assert result2.success, f"Second crawl failed: {result2.error_message}"
        # With DISABLED, there should be no cache hit
        if result2.cache_status:
            assert "hit" not in result2.cache_status.lower(), (
                "DISABLED cache should not produce a cache hit"
            )
# ---------------------------------------------------------------------------
# Session reuse test
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_session_reuse(local_server):
    """Crawl the same page twice under one ``session_id``; both must succeed.

    The cache is bypassed for both crawls.
    """
    home_url = local_server + "/"
    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, session_id="test-session")
    browser_cfg = BrowserConfig(headless=True, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result1 = await crawler.arun(home_url, config=config)
        assert result1.success, f"First session crawl failed: {result1.error_message}"
        result2 = await crawler.arun(home_url, config=config)
        assert result2.success, f"Second session crawl failed: {result2.error_message}"
# ---------------------------------------------------------------------------
# Hooks test
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_hooks_fire(local_server):
    """Register ``before_goto`` and ``after_goto`` hooks and confirm both ran.

    Each hook appends a ``(hook_name, url)`` tuple to a shared list and
    returns the page it was given. After a cache-bypassing crawl, both
    hook names must be present among the recorded calls.
    """
    calls = []

    def make_recorder(hook_name):
        # Build a hook that logs its own name alongside the URL it saw.
        async def recorder(page, context, url, **kwargs):
            calls.append((hook_name, url))
            return page

        return recorder

    run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
    browser_cfg = BrowserConfig(headless=True, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        strategy = crawler.crawler_strategy
        strategy.set_hook("before_goto", make_recorder("before_goto"))
        strategy.set_hook("after_goto", make_recorder("after_goto"))
        result = await crawler.arun(local_server + "/", config=run_cfg)
        assert result.success, f"Hook crawl failed: {result.error_message}"
        fired = {name for name, _ in calls}
        assert "before_goto" in fired, "before_goto hook should have been called"
        assert "after_goto" in fired, "after_goto hook should have been called"
# ---------------------------------------------------------------------------
# Network capture test
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_network_request_capture(local_server):
    """Enable ``capture_network_requests`` and check the captured entries.

    ``result.network_requests`` must be a non-None list with at least one
    entry, and the first entry must contain a ``"url"`` key.
    """
    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, capture_network_requests=True)
    browser_cfg = BrowserConfig(headless=True, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun(local_server + "/", config=config)
        assert result.success, f"Network capture crawl failed: {result.error_message}"
        captured = result.network_requests
        assert captured is not None, "network_requests should not be None"
        assert isinstance(captured, list), "network_requests should be a list"
        assert len(captured) >= 1, "Should capture at least 1 network request"
        # Spot-check the shape of the first captured entry.
        first_entry = captured[0]
        assert "url" in first_entry, (
            "Network request entries should have a 'url' key"
        )
# ---------------------------------------------------------------------------
# CSS selector test
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_css_selector(local_server):
    """Restrict ``/products`` to ``.product-list`` via ``css_selector``.

    The product text must remain in ``result.html`` while the page's
    ``<h1>``, which sits outside the selected container, must be absent.
    """
    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, css_selector=".product-list")
    browser_cfg = BrowserConfig(headless=True, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun(local_server + "/products", config=config)
        assert result.success, f"CSS selector crawl failed: {result.error_message}"
        selected_html = result.html
        # Content inside the selected container is kept.
        assert "Wireless Mouse" in selected_html, "Product content should be in HTML"
        # The "Products" heading lives outside .product-list, so the
        # selector is expected to have dropped it.
        assert "<h1>" not in selected_html, (
            "The h1 outside .product-list should not appear in result.html"
        )
# ---------------------------------------------------------------------------
# Excluded tags test
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_excluded_tags(local_server):
    """Exclude ``nav`` and ``footer`` and confirm neither is in ``cleaned_html``.

    A missing ``cleaned_html`` is treated as an empty string, and the
    check is case-insensitive.
    """
    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, excluded_tags=["nav", "footer"])
    browser_cfg = BrowserConfig(headless=True, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun(local_server + "/", config=config)
        assert result.success, f"Excluded tags crawl failed: {result.error_message}"
        cleaned_lower = (result.cleaned_html or "").lower()
        assert "<nav" not in cleaned_lower, "cleaned_html should not contain nav element"
        assert "<footer" not in cleaned_lower, "cleaned_html should not contain footer element"
# ---------------------------------------------------------------------------
# Page timeout test
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_page_timeout(local_server):
    """Crawl ``/slow`` with ``page_timeout=500`` and tolerate either outcome.

    The slow page takes about two seconds, longer than the timeout. A
    successful result is accepted (some browsers may still return partial
    content); a failed result must carry an error message.
    """
    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, page_timeout=500)
    browser_cfg = BrowserConfig(headless=True, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun(local_server + "/slow", config=config)
        # Only the failure path has something to verify.
        if not result.success:
            assert result.error_message is not None, (
                "Failed crawl should have an error message"
            )
# ---------------------------------------------------------------------------
# Status code tests
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_404_status_code(local_server):
    """Crawl ``/not-found`` and require the result to report status 404."""
    missing_url = local_server + "/not-found"
    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
    browser_cfg = BrowserConfig(headless=True, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun(missing_url, config=config)
        assert result.status_code == 404, (
            f"Expected status code 404, got {result.status_code}"
        )
@pytest.mark.asyncio
async def test_redirect_status(local_server):
    """Crawl ``/redirect`` and check where the crawler ended up.

    The crawl must succeed. When ``redirected_url`` is set, it must either
    end (ignoring a trailing slash) with the last colon-separated segment
    of ``local_server`` (presumably the port; confirm against the
    fixture), or end with ``/``.
    """
    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
    browser_cfg = BrowserConfig(headless=True, verbose=False)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun(local_server + "/redirect", config=config)
        assert result.success, f"Redirect crawl failed: {result.error_message}"
        final_url = result.redirected_url
        # The location check applies only when a redirect target was recorded.
        if final_url:
            server_tail = local_server.rstrip("/").split(":")[-1]
            at_server_root = final_url.rstrip("/").endswith(server_tail)
            assert at_server_root or final_url.endswith("/"), (
                f"Redirected URL should end with /, got: {result.redirected_url}"
            )