""" Crawl4AI Regression Test Suite - Shared Fixtures Provides a local HTTP test server with crafted pages for deterministic testing, plus markers for network-dependent tests against real URLs. Usage: pytest tests/regression/ -v # all tests pytest tests/regression/ -v -m "not network" # skip real URL tests pytest tests/regression/ -v -k "core" # only core tests """ import pytest import socket import threading import asyncio import time from aiohttp import web # --------------------------------------------------------------------------- # Pytest configuration # --------------------------------------------------------------------------- def pytest_configure(config): config.addinivalue_line("markers", "network: tests requiring real network access") # --------------------------------------------------------------------------- # Test HTML Pages # --------------------------------------------------------------------------- HOME_HTML = """\ Crawl4AI Test Home

Welcome to the Crawl4AI Test Site

This is a comprehensive test page designed for regression testing of the Crawl4AI web crawling library. It contains various HTML elements to verify content extraction, markdown generation, and link discovery work correctly.

Features Overview

The test suite covers multiple aspects of web crawling including content extraction, JavaScript execution, screenshot capture, and deep crawling capabilities. Each feature is tested both with local pages and real URLs.

Content extraction and markdown generation
Link discovery and classification
Image extraction and scoring
Table extraction and validation

Code Example

from crawl4ai import AsyncWebCrawler

async with AsyncWebCrawler() as crawler:
    result = await crawler.arun("https://example.com")
    print(result.markdown)

Internal Links

Alpha Page Beta Page

External Links

Example.com Crawl4AI GitHub Hero image for testing

""" PRODUCTS_HTML = """\ Product Listing

Products

Wireless Mouse

$29.99

4.5 stars

Ergonomic wireless mouse with precision tracking

Electronics View Details

Mechanical Keyboard

$89.99

4.8 stars

Cherry MX switches with RGB backlighting

Electronics View Details

USB-C Hub

$45.50

4.2 stars

7-in-1 hub with HDMI, USB-A, SD card reader

Accessories View Details

Monitor Stand

$34.99

3.9 stars

Adjustable aluminum monitor riser with storage

Furniture View Details

Webcam HD

$59.00

4.6 stars

1080p webcam with built-in microphone and privacy cover

Electronics View Details

""" TABLES_HTML = """\ Tables Test

Data Tables

Sales Report

Quarter	Revenue	Growth
Q1 2025	$1,234,567	12.5%
Q2 2025	$1,456,789	18.0%
Q3 2025	$1,678,901	15.2%
Q4 2025	$1,890,123	12.6%

Layout Table (should be filtered)

Left column

Right column

Employee Directory

Name	Email	Department	Phone
Alice Johnson	alice@example.com	Engineering	+1-555-0101
Bob Smith	bob@example.com	Marketing	+1-555-0102
Carol White	carol@example.com	Sales	+1-555-0103

""" JS_DYNAMIC_HTML = """\ JS Dynamic Content

Static Section

This content is immediately available in the HTML.

""" LINKS_HTML = """\ Links Collection

Link Collection Page

External Resources

Example Domain GitHub Python Python Docs

Social Media

Twitter Facebook LinkedIn

Duplicate Links

Home Again Example Again

""" IMAGES_HTML = """\ Images Gallery

Image Gallery

A stunning landscape photograph showcasing the beauty of mountain scenery at golden hour. This image demonstrates proper extraction of high-quality photographs with descriptive alt text and surrounding context.

""" STRUCTURED_DATA_HTML = """\ Article with Structured Data

Web Crawling Best Practices

By Test Author | Published June 15, 2025

Web crawling is the process of systematically browsing the web to extract information. Modern crawlers like Crawl4AI provide sophisticated tools for content extraction, including markdown generation, structured data extraction, and intelligent link following.

Key Techniques

Understanding how to properly configure a web crawler is essential for efficient data collection. This includes setting appropriate delays, respecting robots.txt, and using proper user agents.

""" EMPTY_HTML = """\ Empty Page """ MALFORMED_HTML = """\ Malformed Page</head> <body> <div> <p>Unclosed paragraph <p>Another paragraph without closing <img src="/test.jpg" alt="no closing bracket" <a href="/broken>Broken link</a> <div><span>Nested but unclosed <table><tr><td>Cell without closing tags </body> </html>""" REGEX_TEST_HTML = """\ <!DOCTYPE html> <html> <head><title>Regex Test Content

Contact Information

Email us at support@crawl4ai.com or sales@example.org for inquiries.

Call us: +1-555-123-4567 or (800) 555-0199

Visit https://crawl4ai.com or https://docs.crawl4ai.com/api/v2

Server IP: 192.168.1.100

Request ID: 550e8400-e29b-41d4-a716-446655440000

Price: $199.99 or EUR 175.50

Completion rate: 95.7%

Published: 2025-03-15

Updated: 03/15/2025

Meeting at 14:30 or 09:00

Zip code: 94105 or 94105-1234

Follow @crawl4ai on social media

Tags: #WebCrawling #DataExtraction #Python

Color theme: #FF5733

""" def _generate_large_html(num_sections=50): """Generate a large HTML page with many sections.""" sections = [] for i in range(num_sections): sections.append(f"""

Section {i}: Important Topic Number {i}

This is paragraph one of section {i}. It contains enough text to be meaningful for content extraction and markdown generation testing purposes. The crawler should properly handle large pages with many sections.

This is paragraph two of section {i}. It provides additional context and detail about topic {i}, ensuring that the content extraction pipeline can handle substantial amounts of text without issues.

Comprehensive Document

{"".join(sections)} """ LARGE_HTML = _generate_large_html(50) # Deep crawl pages: hub -> sub1,sub2,sub3 -> leaf pages DEEP_HUB_HTML = """\ Deep Crawl Hub

Hub Page

This is the starting point for deep crawl testing.

""" DEEP_SUB_TEMPLATE = """\ Deep Crawl - {title}

{title}

Content about {title}. This sub-page contains links to deeper content.

Leaf A under {title} Leaf B under {title} Back to Hub """

# str.format template for deep-crawl leaf pages; _deep_leaf_handler supplies
# ``title``.
DEEP_LEAF_TEMPLATE = """\ Deep Crawl - {title}

{title}

This is a leaf page in the deep crawl hierarchy. It contains substantial content about {title} to ensure proper extraction at all crawl depths. The adaptive crawler should find and process this content correctly.

Back to Hub """ IFRAME_HTML = """\ Page with Iframes

Main Page Content

This page contains embedded iframes for testing iframe processing.

""" # --------------------------------------------------------------------------- # Server Handlers # --------------------------------------------------------------------------- async def _serve_html(html, content_type="text/html"): return web.Response(text=html, content_type=content_type) async def _home_handler(request): return await _serve_html(HOME_HTML) async def _products_handler(request): return await _serve_html(PRODUCTS_HTML) async def _tables_handler(request): return await _serve_html(TABLES_HTML) async def _js_dynamic_handler(request): return await _serve_html(JS_DYNAMIC_HTML) async def _links_handler(request): return await _serve_html(LINKS_HTML) async def _images_handler(request): return await _serve_html(IMAGES_HTML) async def _structured_handler(request): return await _serve_html(STRUCTURED_DATA_HTML) async def _empty_handler(request): return await _serve_html(EMPTY_HTML) async def _malformed_handler(request): return await _serve_html(MALFORMED_HTML) async def _regex_test_handler(request): return await _serve_html(REGEX_TEST_HTML) async def _large_handler(request): return await _serve_html(LARGE_HTML) async def _iframe_handler(request): return await _serve_html(IFRAME_HTML) async def _redirect_handler(request): raise web.HTTPFound("/") async def _not_found_handler(request): return web.Response( text="404 Not Found" "

Page Not Found

The requested page does not exist.

", status=404, content_type="text/html", ) async def _slow_handler(request): await asyncio.sleep(2) return await _serve_html( "Slow Page" "

Slow Response

This page had a 2-second delay.

" ) async def _deep_hub_handler(request): return await _serve_html(DEEP_HUB_HTML) async def _deep_sub_handler(request): sub_id = request.match_info["sub_id"] titles = {"sub1": "Technology", "sub2": "Science", "sub3": "Arts"} title = titles.get(sub_id, f"Sub {sub_id}") html = DEEP_SUB_TEMPLATE.format(title=title, prefix=sub_id) return await _serve_html(html) async def _deep_leaf_handler(request): sub_id = request.match_info["sub_id"] leaf_id = request.match_info["leaf_id"] title = f"Leaf {leaf_id} under {sub_id}" html = DEEP_LEAF_TEMPLATE.format(title=title) return await _serve_html(html) async def _catch_all_handler(request): """Serve a simple page for any unmatched path (useful for link targets).""" path = request.path return await _serve_html( f"Page: {path}" f"

Page at {path}

" f"

Auto-generated page for path: {path}

" f'Back to Home' ) # --------------------------------------------------------------------------- # Server Setup # --------------------------------------------------------------------------- def _find_free_port(): with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: s.bind(("", 0)) return s.getsockname()[1] def _create_app(): app = web.Application() app.router.add_get("/", _home_handler) app.router.add_get("/products", _products_handler) app.router.add_get("/tables", _tables_handler) app.router.add_get("/js-dynamic", _js_dynamic_handler) app.router.add_get("/links-page", _links_handler) app.router.add_get("/images-page", _images_handler) app.router.add_get("/structured-data", _structured_handler) app.router.add_get("/empty", _empty_handler) app.router.add_get("/malformed", _malformed_handler) app.router.add_get("/regex-test", _regex_test_handler) app.router.add_get("/large", _large_handler) app.router.add_get("/iframe-page", _iframe_handler) app.router.add_get("/redirect", _redirect_handler) app.router.add_get("/not-found", _not_found_handler) app.router.add_get("/slow", _slow_handler) app.router.add_get("/deep/hub", _deep_hub_handler) app.router.add_get("/deep/{sub_id}", _deep_sub_handler) app.router.add_get("/deep/{sub_id}/{leaf_id}", _deep_leaf_handler) # Catch-all for auto-generated pages (internal link targets, etc.) app.router.add_get("/{path:.*}", _catch_all_handler) return app def _run_server(app, host, port, ready_event): loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) runner = web.AppRunner(app) loop.run_until_complete(runner.setup()) site = web.TCPSite(runner, host, port) loop.run_until_complete(site.start()) ready_event.set() try: loop.run_forever() finally: loop.run_until_complete(runner.cleanup()) loop.close() @pytest.fixture(scope="session") def local_server(): """Start a local HTTP test server. 
Returns base URL like 'http://localhost:PORT'.""" port = _find_free_port() app = _create_app() ready = threading.Event() thread = threading.Thread( target=_run_server, args=(app, "localhost", port, ready), daemon=True, ) thread.start() assert ready.wait(timeout=10), "Test server failed to start within 10 seconds" # Small delay to ensure server is fully ready time.sleep(0.2) yield f"http://localhost:{port}" # Daemon thread cleans up automatically # --------------------------------------------------------------------------- # Common test constants # --------------------------------------------------------------------------- # Stable real URLs for network tests REAL_URL_SIMPLE = "https://example.com" REAL_URL_QUOTES = "https://quotes.toscrape.com" REAL_URL_BOOKS = "https://books.toscrape.com"