""" Crawl4AI Regression Test Suite - Shared Fixtures Provides a local HTTP test server with crafted pages for deterministic testing, plus markers for network-dependent tests against real URLs. Usage: pytest tests/regression/ -v # all tests pytest tests/regression/ -v -m "not network" # skip real URL tests pytest tests/regression/ -v -k "core" # only core tests """ import pytest import socket import threading import asyncio import time from aiohttp import web # --------------------------------------------------------------------------- # Pytest configuration # --------------------------------------------------------------------------- def pytest_configure(config): config.addinivalue_line("markers", "network: tests requiring real network access") # --------------------------------------------------------------------------- # Test HTML Pages # --------------------------------------------------------------------------- HOME_HTML = """\ Crawl4AI Test Home

Welcome to the Crawl4AI Test Site

This is a comprehensive test page designed for regression testing of the Crawl4AI web crawling library. It contains various HTML elements to verify content extraction, markdown generation, and link discovery work correctly.

Features Overview

The test suite covers multiple aspects of web crawling including content extraction, JavaScript execution, screenshot capture, and deep crawling capabilities. Each feature is tested both with local pages and real URLs.

Code Example

from crawl4ai import AsyncWebCrawler

async with AsyncWebCrawler() as crawler:
    result = await crawler.arun("https://example.com")
    print(result.markdown)

Contact us at test@example.com for more info.

Internal Links

Alpha Page Beta Page

External Links

Example.com Crawl4AI GitHub Hero image for testing
""" PRODUCTS_HTML = """\ Product Listing

Products

Wireless Mouse

$29.99
4.5 stars

Ergonomic wireless mouse with precision tracking

Electronics View Details

Mechanical Keyboard

$89.99
4.8 stars

Cherry MX switches with RGB backlighting

Electronics View Details

USB-C Hub

$45.50
4.2 stars

7-in-1 hub with HDMI, USB-A, SD card reader

Accessories View Details

Monitor Stand

$34.99
3.9 stars

Adjustable aluminum monitor riser with storage

Furniture View Details

Webcam HD

$59.00
4.6 stars

1080p webcam with built-in microphone and privacy cover

Electronics View Details
""" TABLES_HTML = """\ Tables Test

Data Tables

Sales Report

QuarterRevenueGrowth
Q1 2025$1,234,56712.5%
Q2 2025$1,456,78918.0%
Q3 2025$1,678,90115.2%
Q4 2025$1,890,12312.6%

Layout Table (should be filtered)

Left columnRight column

Employee Directory

NameEmailDepartmentPhone
Alice Johnsonalice@example.comEngineering+1-555-0101
Bob Smithbob@example.comMarketing+1-555-0102
Carol Whitecarol@example.comSales+1-555-0103
""" JS_DYNAMIC_HTML = """\ JS Dynamic Content

Static Section

This content is immediately available in the HTML.

0
""" LINKS_HTML = """\ Links Collection

Link Collection Page

External Resources

Example Domain GitHub Python Python Docs

Social Media

Twitter Facebook LinkedIn

Duplicate Links

Home Again Example Again
""" IMAGES_HTML = """\ Images Gallery

Image Gallery

Beautiful mountain landscape at sunset

A stunning landscape photograph showcasing the beauty of mountain scenery at golden hour. This image demonstrates proper extraction of high-quality photographs with descriptive alt text and surrounding context.

Product photograph Lazy loaded image Responsive image with srcset Company Logo """ STRUCTURED_DATA_HTML = """\ Article with Structured Data

Web Crawling Best Practices

By Test Author | Published June 15, 2025

Web crawling is the process of systematically browsing the web to extract information. Modern crawlers like Crawl4AI provide sophisticated tools for content extraction, including markdown generation, structured data extraction, and intelligent link following.

Key Techniques

Understanding how to properly configure a web crawler is essential for efficient data collection. This includes setting appropriate delays, respecting robots.txt, and using proper user agents.

""" EMPTY_HTML = """\ Empty Page """ MALFORMED_HTML = """\ Malformed Page</head> <body> <div> <p>Unclosed paragraph <p>Another paragraph without closing <img src="/test.jpg" alt="no closing bracket" <a href="/broken>Broken link</a> <div><span>Nested but unclosed <table><tr><td>Cell without closing tags </body> </html>""" REGEX_TEST_HTML = """\ <!DOCTYPE html> <html> <head><title>Regex Test Content

Contact Information

Email us at support@crawl4ai.com or sales@example.org for inquiries.

Call us: +1-555-123-4567 or (800) 555-0199

Visit https://crawl4ai.com or https://docs.crawl4ai.com/api/v2

Server IP: 192.168.1.100

Request ID: 550e8400-e29b-41d4-a716-446655440000

Price: $199.99 or EUR 175.50

Completion rate: 95.7%

Published: 2025-03-15

Updated: 03/15/2025

Meeting at 14:30 or 09:00

Zip code: 94105 or 94105-1234

Follow @crawl4ai on social media

Tags: #WebCrawling #DataExtraction #Python

Color theme: #FF5733

""" def _generate_large_html(num_sections=50): """Generate a large HTML page with many sections.""" sections = [] for i in range(num_sections): sections.append(f"""

Section {i}: Important Topic Number {i}

This is paragraph one of section {i}. It contains enough text to be meaningful for content extraction and markdown generation testing purposes. The crawler should properly handle large pages with many sections.

This is paragraph two of section {i}. It provides additional context and detail about topic {i}, ensuring that the content extraction pipeline can handle substantial amounts of text without issues.

Read more about topic {i}
""") return f"""\ Large Page with Many Sections

Comprehensive Document

{"".join(sections)} """ LARGE_HTML = _generate_large_html(50) # Deep crawl pages: hub -> sub1,sub2,sub3 -> leaf pages DEEP_HUB_HTML = """\ Deep Crawl Hub

Hub Page

This is the starting point for deep crawl testing.

""" DEEP_SUB_TEMPLATE = """\ Deep Crawl - {title}

{title}

Content about {title}. This sub-page contains links to deeper content.

Leaf A under {title} Leaf B under {title} Back to Hub """ DEEP_LEAF_TEMPLATE = """\ Deep Crawl - {title}

{title}

This is a leaf page in the deep crawl hierarchy. It contains substantial content about {title} to ensure proper extraction at all crawl depths. The adaptive crawler should find and process this content correctly.

Back to Hub """ IFRAME_HTML = """\ Page with Iframes

Main Page Content

This page contains embedded iframes for testing iframe processing.

""" # --------------------------------------------------------------------------- # Server Handlers # --------------------------------------------------------------------------- async def _serve_html(html, content_type="text/html"): return web.Response(text=html, content_type=content_type) async def _home_handler(request): return await _serve_html(HOME_HTML) async def _products_handler(request): return await _serve_html(PRODUCTS_HTML) async def _tables_handler(request): return await _serve_html(TABLES_HTML) async def _js_dynamic_handler(request): return await _serve_html(JS_DYNAMIC_HTML) async def _links_handler(request): return await _serve_html(LINKS_HTML) async def _images_handler(request): return await _serve_html(IMAGES_HTML) async def _structured_handler(request): return await _serve_html(STRUCTURED_DATA_HTML) async def _empty_handler(request): return await _serve_html(EMPTY_HTML) async def _malformed_handler(request): return await _serve_html(MALFORMED_HTML) async def _regex_test_handler(request): return await _serve_html(REGEX_TEST_HTML) async def _large_handler(request): return await _serve_html(LARGE_HTML) async def _iframe_handler(request): return await _serve_html(IFRAME_HTML) async def _redirect_handler(request): raise web.HTTPFound("/") async def _not_found_handler(request): return web.Response( text="404 Not Found" "

Page Not Found

The requested page does not exist.

", status=404, content_type="text/html", ) async def _slow_handler(request): await asyncio.sleep(2) return await _serve_html( "Slow Page" "

Slow Response

This page had a 2-second delay.

" ) async def _deep_hub_handler(request): return await _serve_html(DEEP_HUB_HTML) async def _deep_sub_handler(request): sub_id = request.match_info["sub_id"] titles = {"sub1": "Technology", "sub2": "Science", "sub3": "Arts"} title = titles.get(sub_id, f"Sub {sub_id}") html = DEEP_SUB_TEMPLATE.format(title=title, prefix=sub_id) return await _serve_html(html) async def _deep_leaf_handler(request): sub_id = request.match_info["sub_id"] leaf_id = request.match_info["leaf_id"] title = f"Leaf {leaf_id} under {sub_id}" html = DEEP_LEAF_TEMPLATE.format(title=title) return await _serve_html(html) async def _catch_all_handler(request): """Serve a simple page for any unmatched path (useful for link targets).""" path = request.path return await _serve_html( f"Page: {path}" f"

Page at {path}

" f"

Auto-generated page for path: {path}

" f'Back to Home' ) # --------------------------------------------------------------------------- # Server Setup # --------------------------------------------------------------------------- def _find_free_port(): with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: s.bind(("", 0)) return s.getsockname()[1] def _create_app(): app = web.Application() app.router.add_get("/", _home_handler) app.router.add_get("/products", _products_handler) app.router.add_get("/tables", _tables_handler) app.router.add_get("/js-dynamic", _js_dynamic_handler) app.router.add_get("/links-page", _links_handler) app.router.add_get("/images-page", _images_handler) app.router.add_get("/structured-data", _structured_handler) app.router.add_get("/empty", _empty_handler) app.router.add_get("/malformed", _malformed_handler) app.router.add_get("/regex-test", _regex_test_handler) app.router.add_get("/large", _large_handler) app.router.add_get("/iframe-page", _iframe_handler) app.router.add_get("/redirect", _redirect_handler) app.router.add_get("/not-found", _not_found_handler) app.router.add_get("/slow", _slow_handler) app.router.add_get("/deep/hub", _deep_hub_handler) app.router.add_get("/deep/{sub_id}", _deep_sub_handler) app.router.add_get("/deep/{sub_id}/{leaf_id}", _deep_leaf_handler) # Catch-all for auto-generated pages (internal link targets, etc.) app.router.add_get("/{path:.*}", _catch_all_handler) return app def _run_server(app, host, port, ready_event): loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) runner = web.AppRunner(app) loop.run_until_complete(runner.setup()) site = web.TCPSite(runner, host, port) loop.run_until_complete(site.start()) ready_event.set() try: loop.run_forever() finally: loop.run_until_complete(runner.cleanup()) loop.close() @pytest.fixture(scope="session") def local_server(): """Start a local HTTP test server. Returns base URL like 'http://localhost:PORT'.""" port = _find_free_port() app = _create_app() ready = threading.Event() thread = threading.Thread( target=_run_server, args=(app, "localhost", port, ready), daemon=True, ) thread.start() assert ready.wait(timeout=10), "Test server failed to start within 10 seconds" # Small delay to ensure server is fully ready time.sleep(0.2) yield f"http://localhost:{port}" # Daemon thread cleans up automatically # --------------------------------------------------------------------------- # Common test constants # --------------------------------------------------------------------------- # Stable real URLs for network tests REAL_URL_SIMPLE = "https://example.com" REAL_URL_QUOTES = "https://quotes.toscrape.com" REAL_URL_BOOKS = "https://books.toscrape.com"