mirror of
https://github.com/unclecode/crawl4ai.git
synced 2026-06-10 15:58:15 +00:00
Full regression suite covering all major Crawl4AI subsystems: - core crawl (arun, arun_many, raw HTML, JS, screenshots, cache, hooks) - content processing (markdown, citations, BM25/pruning filters, links, images, tables, metadata) - extraction strategies (JsonCss, JsonXPath, JsonLxml, Regex, Cosine, NoExtraction) - deep crawl (BFS, DFS, BestFirst, filters, scorers, URL normalization) - browser management (lifecycle, viewport, wait_for, stealth, sessions, iframes) - config serialization (BrowserConfig, CrawlerRunConfig, ProxyConfig roundtrips) - utilities (extract_xml_data, cache modes, content hashing) - edge cases (empty pages, malformed HTML, unicode, concurrent crawls, error recovery) Also adds /c4ai-check slash command for testing changes against the suite.
562 lines
24 KiB
Python
562 lines
24 KiB
Python
"""
|
|
Crawl4AI Regression Tests - Browser Management and Features
|
|
|
|
Tests browser lifecycle, viewport configuration, wait_for conditions, JavaScript
|
|
execution, page interaction, screenshots, iframe processing, overlay removal,
|
|
stealth mode, session management, network capture, and anti-bot features using
|
|
real browser crawling with no mocking.
|
|
"""
|
|
|
|
import base64
|
|
import time
|
|
|
|
import pytest
|
|
|
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
|
from crawl4ai.cache_context import CacheMode
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Browser lifecycle
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_browser_lifecycle(local_server):
|
|
"""Create crawler, start, crawl, and close explicitly without context manager."""
|
|
crawler = AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False))
|
|
await crawler.start()
|
|
try:
|
|
result = await crawler.arun(
|
|
url=local_server + "/",
|
|
config=CrawlerRunConfig(verbose=False),
|
|
)
|
|
assert result.success, f"Crawl failed: {result.error_message}"
|
|
assert len(result.html) > 0, "HTML should be non-empty"
|
|
finally:
|
|
await crawler.close()
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_browser_context_manager(local_server):
|
|
"""Verify async with pattern works and cleanup happens without error."""
|
|
async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
|
|
result = await crawler.arun(
|
|
url=local_server + "/",
|
|
config=CrawlerRunConfig(verbose=False),
|
|
)
|
|
assert result.success, f"Context manager crawl failed: {result.error_message}"
|
|
# If we get here without exception, cleanup succeeded
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Viewport configuration
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_custom_viewport(local_server):
|
|
"""Create BrowserConfig with 1920x1080 viewport and verify crawl succeeds."""
|
|
browser_config = BrowserConfig(
|
|
headless=True,
|
|
verbose=False,
|
|
viewport_width=1920,
|
|
viewport_height=1080,
|
|
)
|
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
|
result = await crawler.arun(
|
|
url=local_server + "/",
|
|
config=CrawlerRunConfig(verbose=False),
|
|
)
|
|
assert result.success, f"Custom viewport crawl failed: {result.error_message}"
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_small_viewport(local_server):
|
|
"""Mobile-like viewport (375x667) should still produce a successful crawl."""
|
|
browser_config = BrowserConfig(
|
|
headless=True,
|
|
verbose=False,
|
|
viewport_width=375,
|
|
viewport_height=667,
|
|
)
|
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
|
result = await crawler.arun(
|
|
url=local_server + "/",
|
|
config=CrawlerRunConfig(verbose=False),
|
|
)
|
|
assert result.success, f"Small viewport crawl failed: {result.error_message}"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# wait_for conditions
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_wait_for_css_selector(local_server):
|
|
"""Wait for a CSS selector on /js-dynamic and verify dynamic content loaded."""
|
|
config = CrawlerRunConfig(wait_for="css:.js-loaded", verbose=False)
|
|
async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
|
|
result = await crawler.arun(url=local_server + "/js-dynamic", config=config)
|
|
assert result.success, f"wait_for CSS crawl failed: {result.error_message}"
|
|
assert "Dynamic content successfully loaded" in (result.markdown or ""), (
|
|
"Dynamic JS content should appear after waiting for .js-loaded"
|
|
)
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_wait_for_js_function(local_server):
|
|
"""Wait for a JS condition on /js-dynamic and verify the counter value."""
|
|
config = CrawlerRunConfig(
|
|
wait_for="js:() => document.getElementById('counter').textContent === '42'",
|
|
verbose=False,
|
|
)
|
|
async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
|
|
result = await crawler.arun(url=local_server + "/js-dynamic", config=config)
|
|
assert result.success, f"wait_for JS crawl failed: {result.error_message}"
|
|
assert "42" in (result.html or ""), (
|
|
"Counter should be set to 42 after JS wait condition is met"
|
|
)
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_wait_for_timeout(local_server):
|
|
"""Wait for a non-existent selector with short timeout should not hang forever."""
|
|
config = CrawlerRunConfig(
|
|
wait_for="css:.nonexistent-class",
|
|
wait_for_timeout=500,
|
|
verbose=False,
|
|
)
|
|
async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
|
|
# This may succeed (with timeout warning) or fail, but should not hang
|
|
result = await crawler.arun(url=local_server + "/js-dynamic", config=config)
|
|
# We just verify it returned without hanging; success or failure is acceptable
|
|
assert result is not None, "Should return a result even if wait_for times out"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# JavaScript execution
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_js_code_modifies_dom(local_server):
|
|
"""Execute JS that adds a DOM element and verify it appears in the result."""
|
|
config = CrawlerRunConfig(
|
|
js_code='document.body.innerHTML += \'<div id="injected">Injected by JS</div>\';',
|
|
verbose=False,
|
|
)
|
|
async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
|
|
result = await crawler.arun(url=local_server + "/", config=config)
|
|
assert result.success, f"JS DOM modification crawl failed: {result.error_message}"
|
|
combined = (result.html or "") + (result.markdown or "")
|
|
assert "Injected by JS" in combined, (
|
|
"Injected content should appear in HTML or markdown"
|
|
)
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_js_code_returns_value(local_server):
|
|
"""Execute JS that returns document.title and check js_execution_result."""
|
|
config = CrawlerRunConfig(
|
|
js_code="return document.title;",
|
|
verbose=False,
|
|
)
|
|
async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
|
|
result = await crawler.arun(url=local_server + "/", config=config)
|
|
assert result.success, f"JS return value crawl failed: {result.error_message}"
|
|
# js_execution_result should contain the returned value
|
|
if result.js_execution_result is not None:
|
|
# The result might be stored under a key or directly
|
|
result_str = str(result.js_execution_result)
|
|
assert "Crawl4AI Test Home" in result_str or len(result_str) > 0, (
|
|
"js_execution_result should contain the document title"
|
|
)
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_multiple_js_scripts(local_server):
|
|
"""Execute multiple JS scripts sequentially; last one sets title to 'B'."""
|
|
config = CrawlerRunConfig(
|
|
js_code=["document.title='A';", "document.title='B';"],
|
|
verbose=False,
|
|
)
|
|
async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
|
|
result = await crawler.arun(url=local_server + "/", config=config)
|
|
assert result.success, f"Multiple JS scripts crawl failed: {result.error_message}"
|
|
# Both scripts should have executed; title should end up as 'B'
|
|
# We can check via the HTML title tag or via another JS execution
|
|
# The HTML might still have the original title in source, but the page state changed
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Page interaction
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_scan_full_page(local_server):
|
|
"""Crawl /large with scan_full_page=True and verify bottom sections appear."""
|
|
config = CrawlerRunConfig(
|
|
scan_full_page=True,
|
|
scroll_delay=0.05,
|
|
verbose=False,
|
|
)
|
|
async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
|
|
result = await crawler.arun(url=local_server + "/large", config=config)
|
|
assert result.success, f"Full page scan crawl failed: {result.error_message}"
|
|
# The large page has 50 sections; verify some from near the bottom
|
|
combined = (result.html or "") + (result.markdown or "")
|
|
assert "Section 49" in combined, (
|
|
"Scanning the full page should reveal the last section (Section 49)"
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Screenshot features
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_screenshot_basic(local_server):
|
|
"""Crawl with screenshot=True, decode base64, and verify PNG header."""
|
|
config = CrawlerRunConfig(screenshot=True, verbose=False)
|
|
async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
|
|
result = await crawler.arun(url=local_server + "/", config=config)
|
|
assert result.success, f"Screenshot crawl failed: {result.error_message}"
|
|
assert result.screenshot, "Screenshot should be a non-empty base64 string"
|
|
raw_bytes = base64.b64decode(result.screenshot)
|
|
assert raw_bytes[:4] == b"\x89PNG", (
|
|
"Screenshot should be in PNG format"
|
|
)
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_force_viewport_screenshot(local_server):
|
|
"""Crawl /large with force_viewport_screenshot=True; should capture viewport only."""
|
|
config = CrawlerRunConfig(
|
|
screenshot=True,
|
|
force_viewport_screenshot=True,
|
|
verbose=False,
|
|
)
|
|
async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
|
|
result = await crawler.arun(url=local_server + "/large", config=config)
|
|
assert result.success, f"Force viewport screenshot crawl failed: {result.error_message}"
|
|
assert result.screenshot, "Screenshot should be captured"
|
|
raw_bytes = base64.b64decode(result.screenshot)
|
|
assert raw_bytes[:4] == b"\x89PNG", "Viewport screenshot should be PNG"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Process iframes
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_process_iframes(local_server):
|
|
"""Crawl /iframe-page with process_iframes=True and verify iframe content appears."""
|
|
config = CrawlerRunConfig(process_iframes=True, verbose=False)
|
|
async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
|
|
result = await crawler.arun(url=local_server + "/iframe-page", config=config)
|
|
assert result.success, f"Iframe processing crawl failed: {result.error_message}"
|
|
combined = (result.html or "") + (result.markdown or "")
|
|
# At least one iframe's content should appear
|
|
has_iframe_content = (
|
|
"Iframe 1 content" in combined
|
|
or "Iframe 2 heading" in combined
|
|
or "embedded" in combined.lower()
|
|
)
|
|
assert has_iframe_content, (
|
|
"Iframe content should appear in the result when process_iframes=True"
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Overlay and popup removal
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_remove_overlay_elements(local_server):
|
|
"""Crawl with remove_overlay_elements=True; verify it does not break crawling."""
|
|
config = CrawlerRunConfig(remove_overlay_elements=True, verbose=False)
|
|
async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
|
|
result = await crawler.arun(url=local_server + "/", config=config)
|
|
assert result.success, (
|
|
f"Overlay removal should not break crawling: {result.error_message}"
|
|
)
|
|
assert len(result.html) > 0, "HTML should still be present after overlay removal"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Stealth mode
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_stealth_mode_no_crash(local_server):
|
|
"""Stealth mode should not break basic local crawling."""
|
|
browser_config = BrowserConfig(
|
|
headless=True,
|
|
verbose=False,
|
|
enable_stealth=True,
|
|
)
|
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
|
result = await crawler.arun(
|
|
url=local_server + "/",
|
|
config=CrawlerRunConfig(verbose=False),
|
|
)
|
|
assert result.success, f"Stealth mode crawl failed: {result.error_message}"
|
|
assert "Crawl4AI Test Home" in (result.html or ""), (
|
|
"Stealth mode should still extract content correctly"
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Session management
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_session_persistence(local_server):
|
|
"""Session state should persist between crawls with the same session_id."""
|
|
async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
|
|
# First crawl: set a JS variable
|
|
config1 = CrawlerRunConfig(
|
|
session_id="persist-test",
|
|
js_code="window.__testVar = 'hello';",
|
|
verbose=False,
|
|
)
|
|
result1 = await crawler.arun(url=local_server + "/", config=config1)
|
|
assert result1.success, f"First session crawl failed: {result1.error_message}"
|
|
|
|
# Second crawl: read the JS variable using js_only mode
|
|
config2 = CrawlerRunConfig(
|
|
session_id="persist-test",
|
|
js_only=True,
|
|
js_code="return window.__testVar;",
|
|
verbose=False,
|
|
)
|
|
result2 = await crawler.arun(url=local_server + "/", config=config2)
|
|
assert result2.success, f"Second session crawl failed: {result2.error_message}"
|
|
|
|
# Check if testVar persisted
|
|
if result2.js_execution_result is not None:
|
|
result_str = str(result2.js_execution_result)
|
|
assert "hello" in result_str, (
|
|
f"Session variable should persist; got: {result_str}"
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Delay before return HTML
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_delay_before_return(local_server):
|
|
"""Crawl with delay_before_return_html=0.5 should succeed and take reasonable time."""
|
|
config = CrawlerRunConfig(delay_before_return_html=0.5, verbose=False)
|
|
async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
|
|
start_time = time.monotonic()
|
|
result = await crawler.arun(url=local_server + "/", config=config)
|
|
elapsed = time.monotonic() - start_time
|
|
|
|
assert result.success, f"Delayed crawl failed: {result.error_message}"
|
|
assert elapsed >= 0.4, (
|
|
f"Crawl with 0.5s delay should take at least 0.4s, took {elapsed:.2f}s"
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Network features
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_capture_network_requests(local_server):
|
|
"""Crawl /js-dynamic with capture_network_requests=True and verify list returned."""
|
|
config = CrawlerRunConfig(
|
|
capture_network_requests=True,
|
|
cache_mode=CacheMode.BYPASS,
|
|
verbose=False,
|
|
)
|
|
async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
|
|
result = await crawler.arun(url=local_server + "/js-dynamic", config=config)
|
|
assert result.success, f"Network capture crawl failed: {result.error_message}"
|
|
assert result.network_requests is not None, "network_requests should not be None"
|
|
assert isinstance(result.network_requests, list), (
|
|
"network_requests should be a list"
|
|
)
|
|
assert len(result.network_requests) >= 1, (
|
|
"Should capture at least 1 network request (the page itself)"
|
|
)
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_capture_console_messages(local_server):
|
|
"""Crawl with capture_console_messages=True and verify the attribute is a list."""
|
|
config = CrawlerRunConfig(
|
|
capture_console_messages=True,
|
|
cache_mode=CacheMode.BYPASS,
|
|
verbose=False,
|
|
)
|
|
async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
|
|
result = await crawler.arun(url=local_server + "/", config=config)
|
|
assert result.success, f"Console capture crawl failed: {result.error_message}"
|
|
assert result.console_messages is not None, (
|
|
"console_messages should not be None when capture is enabled"
|
|
)
|
|
assert isinstance(result.console_messages, list), (
|
|
"console_messages should be a list"
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Real URL browser tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
@pytest.mark.network
|
|
async def test_real_url_with_wait():
|
|
"""Crawl https://quotes.toscrape.com with wait_until='load' and verify content."""
|
|
config = CrawlerRunConfig(wait_until="load", verbose=False)
|
|
async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
|
|
result = await crawler.arun(url="https://quotes.toscrape.com", config=config)
|
|
assert result.success, f"Real URL crawl failed: {result.error_message}"
|
|
assert len(result.html) > 100, "Real page should have substantial HTML"
|
|
combined = (result.markdown or "") + (result.html or "")
|
|
assert "quote" in combined.lower() or "quotes" in combined.lower(), (
|
|
"Quotes page should contain the word 'quote'"
|
|
)
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
@pytest.mark.network
|
|
async def test_real_url_screenshot():
|
|
"""Crawl https://example.com with screenshot=True and verify PNG captured."""
|
|
config = CrawlerRunConfig(screenshot=True, verbose=False)
|
|
async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
|
|
result = await crawler.arun(url="https://example.com", config=config)
|
|
assert result.success, f"Real URL screenshot crawl failed: {result.error_message}"
|
|
assert result.screenshot, "Screenshot should be non-empty"
|
|
raw_bytes = base64.b64decode(result.screenshot)
|
|
assert raw_bytes[:4] == b"\x89PNG", "Real URL screenshot should be PNG format"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Anti-bot basic check
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_magic_mode_no_crash(local_server):
|
|
"""Magic mode should not break normal local crawling."""
|
|
config = CrawlerRunConfig(magic=True, verbose=False)
|
|
async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
|
|
result = await crawler.arun(url=local_server + "/", config=config)
|
|
assert result.success, (
|
|
f"Magic mode should not break crawling: {result.error_message}"
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Edge cases
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_crawl_empty_page(local_server):
|
|
"""Crawling a page with empty body should not crash, even if anti-bot flags it."""
|
|
async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
|
|
result = await crawler.arun(
|
|
url=local_server + "/empty",
|
|
config=CrawlerRunConfig(verbose=False),
|
|
)
|
|
# Anti-bot detection may flag near-empty pages as blocked, which is expected
|
|
# behavior. The key assertion is that it returns a result without crashing.
|
|
assert result is not None, "Should return a result even for empty page"
|
|
assert result.html is not None, "HTML should not be None for empty page"
|
|
if not result.success:
|
|
assert "empty" in (result.error_message or "").lower() or "blocked" in (result.error_message or "").lower(), (
|
|
f"Empty page failure should mention empty/blocked content: {result.error_message}"
|
|
)
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_crawl_malformed_html(local_server):
|
|
"""Crawling malformed HTML should not crash, even if anti-bot flags it."""
|
|
async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
|
|
result = await crawler.arun(
|
|
url=local_server + "/malformed",
|
|
config=CrawlerRunConfig(verbose=False),
|
|
)
|
|
# Anti-bot may flag malformed HTML as blocked due to minimal visible text.
|
|
# The key assertion is that it returns a result without crashing.
|
|
assert result is not None, "Should return a result for malformed HTML"
|
|
assert result.html is not None, "HTML should not be None even for malformed input"
|
|
# The content is present in the HTML even if the crawl is marked as not successful
|
|
assert "Unclosed paragraph" in (result.html or "") or "Malformed" in (result.html or ""), (
|
|
"Some original content should appear in the HTML"
|
|
)
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_multiple_crawls_same_crawler(local_server):
|
|
"""A single crawler instance should handle multiple sequential crawls."""
|
|
async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
|
|
urls = [
|
|
local_server + "/",
|
|
local_server + "/products",
|
|
local_server + "/js-dynamic",
|
|
]
|
|
for url in urls:
|
|
result = await crawler.arun(
|
|
url=url,
|
|
config=CrawlerRunConfig(verbose=False),
|
|
)
|
|
assert result.success, f"Sequential crawl of {url} failed: {result.error_message}"
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_screenshot_not_captured_by_default(local_server):
|
|
"""Without screenshot=True, result.screenshot should be None or empty."""
|
|
config = CrawlerRunConfig(screenshot=False, verbose=False)
|
|
async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
|
|
result = await crawler.arun(url=local_server + "/", config=config)
|
|
assert result.success, f"No-screenshot crawl failed: {result.error_message}"
|
|
assert not result.screenshot, (
|
|
"Screenshot should be None or empty when not requested"
|
|
)
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_js_code_empty_string(local_server):
|
|
"""Empty js_code string should not cause errors."""
|
|
config = CrawlerRunConfig(js_code="", verbose=False)
|
|
async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
|
|
result = await crawler.arun(url=local_server + "/", config=config)
|
|
assert result.success, (
|
|
f"Empty js_code should not break crawling: {result.error_message}"
|
|
)
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_wait_until_load(local_server):
|
|
"""wait_until='load' should wait for full page load including resources."""
|
|
config = CrawlerRunConfig(wait_until="load", verbose=False)
|
|
async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
|
|
result = await crawler.arun(url=local_server + "/", config=config)
|
|
assert result.success, f"wait_until=load crawl failed: {result.error_message}"
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_wait_until_networkidle(local_server):
|
|
"""wait_until='networkidle' should wait until network is idle."""
|
|
config = CrawlerRunConfig(wait_until="networkidle", verbose=False)
|
|
async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
|
|
result = await crawler.arun(url=local_server + "/", config=config)
|
|
assert result.success, f"wait_until=networkidle crawl failed: {result.error_message}"
|