"""
Crawl4AI Regression Test Suite - Shared Fixtures
Provides a local HTTP test server with crafted pages for deterministic testing,
plus markers for network-dependent tests against real URLs.
Usage:
pytest tests/regression/ -v # all tests
pytest tests/regression/ -v -m "not network" # skip real URL tests
pytest tests/regression/ -v -k "core" # only core tests
"""
import pytest
import socket
import threading
import asyncio
import time
from aiohttp import web
# ---------------------------------------------------------------------------
# Pytest configuration
# ---------------------------------------------------------------------------
def pytest_configure(config):
    """Register the custom ``network`` marker for this test suite.

    Appends a marker declaration to the ``markers`` ini setting so that
    tests tagged ``network`` can be selected or skipped with ``-m`` (for
    example ``-m "not network"``, as shown in the module usage notes).

    Args:
        config: Object exposing ``addinivalue_line(name, line)``; when
            invoked by pytest this is the session's configuration object.
    """
    # The text before the colon is the marker name; the rest is its
    # human-readable description.
    config.addinivalue_line(
        "markers", "network: tests requiring real network access"
    )
# ---------------------------------------------------------------------------
# Test HTML Pages
# ---------------------------------------------------------------------------
# Text of the test site's home page: a welcome/overview section, a feature
# list, a short Crawl4AI usage sample, and an image caption paragraph.
# NOTE(review): presumably served by the local test server described in the
# module docstring - confirm. The name says HTML but no markup is visible in
# this literal; verify the tags were not lost before relying on it.
HOME_HTML = """\
Crawl4AI Test Home
Welcome to the Crawl4AI Test Site
This is a comprehensive test page designed for regression testing of the
Crawl4AI web crawling library. It contains various HTML elements to verify
content extraction, markdown generation, and link discovery work correctly.
Features Overview
The test suite covers multiple aspects of web crawling including content
extraction, JavaScript execution, screenshot capture, and deep crawling
capabilities. Each feature is tested both with local pages and real URLs.
Content extraction and markdown generation
Link discovery and classification
Image extraction and scoring
Table extraction and validation
Code Example
from crawl4ai import AsyncWebCrawler
async with AsyncWebCrawler() as crawler:
result = await crawler.arun("https://example.com")
print(result.markdown)
A stunning landscape photograph showcasing the beauty of mountain scenery
at golden hour. This image demonstrates proper extraction of high-quality
photographs with descriptive alt text and surrounding context.
"""
# Article-style page text seeded with sample values in many formats: e-mail
# addresses, phone numbers, URLs, an IPv4 address, a UUID, currency amounts,
# a percentage, ISO and MM/DD/YYYY dates, 24-hour times, zip codes, an
# @handle, hashtags, and a hex color code.
# NOTE(review): presumably consumed by pattern-extraction tests - confirm
# against the tests that reference this constant.
STRUCTURED_DATA_HTML = """\
Article with Structured Data
Web Crawling Best Practices
By Test Author | Published June 15, 2025
Web crawling is the process of systematically browsing the web to extract
information. Modern crawlers like Crawl4AI provide sophisticated tools for
content extraction, including markdown generation, structured data extraction,
and intelligent link following.
Key Techniques
Understanding how to properly configure a web crawler is essential for
efficient data collection. This includes setting appropriate delays, respecting
robots.txt, and using proper user agents.
Email us at support@crawl4ai.com or sales@example.org for inquiries.
Call us: +1-555-123-4567 or (800) 555-0199
Visit https://crawl4ai.com or https://docs.crawl4ai.com/api/v2
Server IP: 192.168.1.100
Request ID: 550e8400-e29b-41d4-a716-446655440000
Price: $199.99 or EUR 175.50
Completion rate: 95.7%
Published: 2025-03-15
Updated: 03/15/2025
Meeting at 14:30 or 09:00
Zip code: 94105 or 94105-1234
Follow @crawl4ai on social media
Tags: #WebCrawling #DataExtraction #Python
Color theme: #FF5733
"""
# Builds one text section per index in range(num_sections) and collects them
# in the local list `sections`.
# NOTE(review): as visible here this definition looks damaged - the body is
# not indented, the `sections.append(` call is never closed, nothing is
# returned, and the template references `{title}`, which is not defined in
# this function (the "leaf page" / "Back to Hub" text reads like part of a
# different page template). Confirm against the original file before editing.
def _generate_large_html(num_sections=50):
"""Generate a large HTML page with many sections."""
sections = []
for i in range(num_sections):
sections.append(f"""
Section {i}: Important Topic Number {i}
This is paragraph one of section {i}. It contains enough text to be
meaningful for content extraction and markdown generation testing purposes.
The crawler should properly handle large pages with many sections.
This is paragraph two of section {i}. It provides additional context
and detail about topic {i}, ensuring that the content extraction pipeline
can handle substantial amounts of text without issues.
This is a leaf page in the deep crawl hierarchy. It contains substantial
content about {title} to ensure proper extraction at all crawl depths.
The adaptive crawler should find and process this content correctly.
Back to Hub
"""
IFRAME_HTML = """\
Page with Iframes
Main Page Content
This page contains embedded iframes for testing iframe processing.