mirror of
https://github.com/unclecode/crawl4ai.git
synced 2026-06-10 15:58:15 +00:00
262 lines
9.3 KiB
Python
262 lines
9.3 KiB
Python
"""
|
|
Tests for GitHub issue #1748: scroll_delay config is now properly respected
|
|
in take_screenshot_scroller().
|
|
|
|
Three changes were made to async_crawler_strategy.py:
|
|
A) arun call site now passes scroll_delay from config
|
|
B) _generate_media_from_html call site now passes scroll_delay from config
|
|
C) take_screenshot_scroller reads scroll_delay from kwargs (was hardcoded 0.01)
|
|
|
|
These tests verify that all three paths correctly forward and use scroll_delay.
|
|
"""
|
|
|
|
import pytest
|
|
import asyncio
|
|
import base64
|
|
from io import BytesIO
|
|
from unittest.mock import AsyncMock, MagicMock, patch, call
|
|
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BrowserConfig
|
|
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _make_tiny_jpeg() -> bytes:
    """Return the encoded bytes of a 10x10 solid-red JPEG.

    The image is rendered with Pillow into an in-memory buffer; the result is
    used as the canned return value for mocked screenshot calls.
    """
    # Imported lazily so Pillow is only required when the helper runs.
    from PIL import Image

    with BytesIO() as sink:
        Image.new("RGB", (10, 10), color="red").save(sink, format="JPEG")
        return sink.getvalue()
|
|
|
|
|
|
# JPEG bytes built once at import time and reused by the mocked page.
TINY_JPEG = _make_tiny_jpeg()

# A tall HTML page that exceeds any reasonable screenshot_height_threshold
TALL_HTML = "".join(
    ["<html><body>", "<p>Line of content</p>" * 200, "</body></html>"]
)
|
|
|
|
|
|
def _make_mock_page(viewport_width=1280, viewport_height=200):
    """Build a MagicMock standing in for a Playwright page.

    Args:
        viewport_width: Value stored under ``viewport_size["width"]``.
        viewport_height: Value stored under ``viewport_size["height"]``.

    Returns:
        A MagicMock whose ``set_viewport_size``, ``evaluate`` and
        ``screenshot`` attributes are AsyncMocks; ``evaluate`` resolves to
        None and ``screenshot`` resolves to ``TINY_JPEG``.
    """
    mock_page = MagicMock(
        set_viewport_size=AsyncMock(),
        evaluate=AsyncMock(return_value=None),
        screenshot=AsyncMock(return_value=TINY_JPEG),
    )
    mock_page.viewport_size = dict(width=viewport_width, height=viewport_height)
    return mock_page
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Test 1 — Unit: scroll_delay extracted from kwargs correctly
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_scroll_delay_custom_value_used():
    """
    A scroll_delay passed through kwargs must be the value handed to
    asyncio.sleep.

    take_screenshot_scroller is invoked with scroll_delay=1.5 on a mocked
    page; the recorded sleep durations must include 1.5 and must not include
    the previously hardcoded 0.01.
    """
    strategy_cls = AsyncPlaywrightCrawlerStrategy
    # Allocate without running __init__; only the attributes set below exist.
    scroller = strategy_cls.__new__(strategy_cls)
    scroller.logger = MagicMock()
    scroller.adapter = MagicMock()
    # Reported page height (600) is larger than the mocked viewport height.
    scroller.get_page_dimensions = AsyncMock(
        return_value={"width": 1280, "height": 600}
    )

    fake_page = _make_mock_page()

    sleep_patch = patch(
        "crawl4ai.async_crawler_strategy.asyncio.sleep", new_callable=AsyncMock
    )
    with sleep_patch as sleep_mock:
        encoded = await scroller.take_screenshot_scroller(
            fake_page, scroll_delay=1.5, screenshot_height_threshold=100
        )

    # First positional argument of every recorded sleep call.
    observed_delays = [recorded.args[0] for recorded in sleep_mock.call_args_list]

    # The configured delay must have been slept on at least once ...
    assert 1.5 in observed_delays, (
        f"Expected asyncio.sleep(1.5) but got calls with: {observed_delays}"
    )
    # ... and the legacy constant must be gone.
    assert 0.01 not in observed_delays, (
        f"Old hardcoded 0.01 still present in sleep calls: {observed_delays}"
    )
    # The scroller must hand back a non-empty string.
    assert isinstance(encoded, str)
    assert len(encoded) > 0
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Test 2 — Unit: default scroll_delay is 0.2 when not provided
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_scroll_delay_default_value():
    """
    With no scroll_delay in kwargs, asyncio.sleep must receive 0.2.

    take_screenshot_scroller is invoked without a scroll_delay; the recorded
    sleep durations must include 0.2 and must not include the previously
    hardcoded 0.01.
    """
    strategy_cls = AsyncPlaywrightCrawlerStrategy
    # Allocate without running __init__; only the attributes set below exist.
    scroller = strategy_cls.__new__(strategy_cls)
    scroller.logger = MagicMock()
    scroller.adapter = MagicMock()
    # Reported page height (600) is larger than the mocked viewport height.
    scroller.get_page_dimensions = AsyncMock(
        return_value={"width": 1280, "height": 600}
    )

    fake_page = _make_mock_page()

    sleep_patch = patch(
        "crawl4ai.async_crawler_strategy.asyncio.sleep", new_callable=AsyncMock
    )
    with sleep_patch as sleep_mock:
        encoded = await scroller.take_screenshot_scroller(
            fake_page, screenshot_height_threshold=100
        )

    # First positional argument of every recorded sleep call.
    observed_delays = [recorded.args[0] for recorded in sleep_mock.call_args_list]

    assert 0.2 in observed_delays, (
        f"Expected default asyncio.sleep(0.2) but got calls with: {observed_delays}"
    )
    assert 0.01 not in observed_delays, (
        f"Old hardcoded 0.01 still present in sleep calls: {observed_delays}"
    )
    assert isinstance(encoded, str)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Test 3 — Unit: take_screenshot forwards scroll_delay to take_screenshot_scroller
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_take_screenshot_forwards_scroll_delay():
    """
    take_screenshot must pass scroll_delay through to take_screenshot_scroller.

    page_need_scroll is stubbed to return True and take_screenshot_scroller is
    replaced by an AsyncMock, so the test observes only which keyword
    arguments take_screenshot forwards when called with scroll_delay=2.5.
    """
    # Allocate without running __init__; only the attributes set below exist.
    strategy = AsyncPlaywrightCrawlerStrategy.__new__(AsyncPlaywrightCrawlerStrategy)
    strategy.logger = MagicMock()
    strategy.adapter = MagicMock()

    page = _make_mock_page()

    # page_need_scroll returns True so the scroller path is taken
    strategy.page_need_scroll = AsyncMock(return_value=True)
    strategy.take_screenshot_scroller = AsyncMock(return_value="base64data")

    await strategy.take_screenshot(page, scroll_delay=2.5)

    # Verify take_screenshot_scroller was called exactly once ...
    strategy.take_screenshot_scroller.assert_called_once()
    scroller_call = strategy.take_screenshot_scroller.call_args
    # ... with scroll_delay as a keyword argument. The earlier
    # `or (len(args) > 1 and False)` alternative could never be true, so the
    # keyword check is the whole assertion.
    assert scroller_call.kwargs.get("scroll_delay") == 2.5, (
        f"scroll_delay=2.5 not forwarded. Call was: {scroller_call}"
    )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Test 4 — Integration: full-page screenshot with custom scroll_delay
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_integration_arun_respects_scroll_delay():
    """
    End-to-end check that arun honours the configured scroll_delay.

    Crawls the raw: TALL_HTML document through AsyncWebCrawler with
    screenshot=True, scroll_delay=0.5 and a screenshot_height_threshold of
    100 (chosen to force the scroller path), then inspects the durations
    passed to the patched asyncio.sleep.
    """
    run_config = CrawlerRunConfig(
        screenshot=True,
        scroll_delay=0.5,
        screenshot_height_threshold=100,  # Very low to force scroller
    )
    target_url = f"raw:{TALL_HTML}"

    # NOTE(review): the patch target is resolved through the strategy
    # module's `asyncio` attribute; if that is the stdlib module, sleep is
    # replaced process-wide while the crawler runs — confirm this is intended.
    sleep_patch = patch(
        "crawl4ai.async_crawler_strategy.asyncio.sleep", new_callable=AsyncMock
    )
    with sleep_patch as sleep_mock:
        async with AsyncWebCrawler() as crawler:
            crawl_result = await crawler.arun(target_url, config=run_config)

    assert crawl_result.success, f"Crawl failed: {crawl_result.error_message}"
    assert crawl_result.screenshot is not None, "Expected screenshot data"

    # Check that our custom scroll_delay was used during screenshot capture
    observed_delays = [recorded.args[0] for recorded in sleep_mock.call_args_list]
    assert 0.5 in observed_delays, (
        f"Expected asyncio.sleep(0.5) in screenshot capture but got: {observed_delays}"
    )
    assert 0.01 not in observed_delays, (
        f"Old hardcoded 0.01 still present in sleep calls: {observed_delays}"
    )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Test 5 — Integration: _generate_media_from_html respects scroll_delay
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_integration_generate_media_respects_scroll_delay():
    """
    _generate_media_from_html must honour the configured scroll_delay.

    Calls the strategy's _generate_media_from_html directly with TALL_HTML
    and a config carrying screenshot=True and scroll_delay=0.75, then checks
    that 0.75 appears among the durations passed to the patched
    asyncio.sleep and that 0.01 does not.
    """
    run_config = CrawlerRunConfig(
        screenshot=True,
        scroll_delay=0.75,
        screenshot_height_threshold=100,  # Very low to force scroller
    )

    # NOTE(review): the patch target is resolved through the strategy
    # module's `asyncio` attribute; if that is the stdlib module, sleep is
    # replaced process-wide while the crawler runs — confirm this is intended.
    sleep_patch = patch(
        "crawl4ai.async_crawler_strategy.asyncio.sleep", new_callable=AsyncMock
    )
    with sleep_patch as sleep_mock:
        async with AsyncWebCrawler() as crawler:
            media = await crawler.crawler_strategy._generate_media_from_html(
                TALL_HTML, run_config
            )
            # The call is expected to yield exactly three items; only the
            # screenshot is inspected below.
            screenshot_data, pdf_data, mhtml_data = media

    assert screenshot_data is not None, (
        "Expected screenshot data from _generate_media_from_html"
    )

    observed_delays = [recorded.args[0] for recorded in sleep_mock.call_args_list]
    assert 0.75 in observed_delays, (
        f"Expected asyncio.sleep(0.75) but got: {observed_delays}"
    )
    assert 0.01 not in observed_delays, (
        f"Old hardcoded 0.01 still present in sleep calls: {observed_delays}"
    )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Test 6 — Unit: CrawlerRunConfig default scroll_delay is 0.2
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_crawler_run_config_default_scroll_delay():
    """A CrawlerRunConfig built with no arguments has scroll_delay == 0.2."""
    default_delay = CrawlerRunConfig().scroll_delay
    assert default_delay == 0.2, (
        f"Expected default scroll_delay=0.2, got {default_delay}"
    )
|
|
|
|
|
|
# Allow running this file directly as a script instead of through the pytest CLI.
if __name__ == "__main__":
    # Propagate pytest's exit code so a failing run exits non-zero.
    raise SystemExit(pytest.main([__file__, "-v"]))
|