mirror of
https://github.com/unclecode/crawl4ai.git
synced 2026-06-10 15:58:15 +00:00
Full regression suite covering all major Crawl4AI subsystems: - core crawl (arun, arun_many, raw HTML, JS, screenshots, cache, hooks) - content processing (markdown, citations, BM25/pruning filters, links, images, tables, metadata) - extraction strategies (JsonCss, JsonXPath, JsonLxml, Regex, Cosine, NoExtraction) - deep crawl (BFS, DFS, BestFirst, filters, scorers, URL normalization) - browser management (lifecycle, viewport, wait_for, stealth, sessions, iframes) - config serialization (BrowserConfig, CrawlerRunConfig, ProxyConfig roundtrips) - utilities (extract_xml_data, cache modes, content hashing) - edge cases (empty pages, malformed HTML, unicode, concurrent crawls, error recovery) Also adds /c4ai-check slash command for testing changes against the suite.
629 lines
22 KiB
Python
629 lines
22 KiB
Python
"""
|
|
Crawl4AI Regression Test Suite - Shared Fixtures

Provides a local HTTP test server with crafted pages for deterministic testing,
plus markers for network-dependent tests against real URLs.

Usage:
    pytest tests/regression/ -v                    # all tests
    pytest tests/regression/ -v -m "not network"   # skip real URL tests
    pytest tests/regression/ -v -k "core"          # only core tests
|
|
"""
|
|
|
|
import pytest
|
|
import socket
|
|
import threading
|
|
import asyncio
|
|
import time
|
|
from aiohttp import web
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Pytest configuration
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def pytest_configure(config):
    """Register the custom ``network`` marker on the pytest config.

    Args:
        config: The pytest config object; only ``addinivalue_line`` is used.
    """
    marker_spec = "network: tests requiring real network access"
    config.addinivalue_line("markers", marker_spec)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Test HTML Pages
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Home page (served at "/"): head metadata (description, keywords, Open Graph,
# Twitter), a nav bar, paragraphs, a list, a <pre> code sample, a mailto link,
# internal/external links, two images and a footer.
HOME_HTML = """\
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Crawl4AI Test Home</title>
<meta name="description" content="Regression test page for Crawl4AI">
<meta name="keywords" content="crawl4ai, testing, regression">
<meta property="og:title" content="Test OG Title">
<meta property="og:description" content="Test OG description for social sharing">
<meta property="og:image" content="/images/og-image.jpg">
<meta property="og:type" content="website">
<meta name="twitter:card" content="summary_large_image">
<meta name="twitter:title" content="Test Twitter Title">
</head>
<body>
<nav>
<a href="/">Home</a>
<a href="/products">Products</a>
<a href="/links-page">Links</a>
<a href="/tables">Tables</a>
</nav>
<main>
<h1>Welcome to the Crawl4AI Test Site</h1>
<p>This is a comprehensive test page designed for regression testing of the
Crawl4AI web crawling library. It contains various HTML elements to verify
content extraction, markdown generation, and link discovery work correctly.</p>

<h2>Features Overview</h2>
<p>The test suite covers multiple aspects of web crawling including content
extraction, JavaScript execution, screenshot capture, and deep crawling
capabilities. Each feature is tested both with local pages and real URLs.</p>

<ul>
<li>Content extraction and markdown generation</li>
<li>Link discovery and classification</li>
<li>Image extraction and scoring</li>
<li>Table extraction and validation</li>
</ul>

<h2>Code Example</h2>
<pre><code>from crawl4ai import AsyncWebCrawler

async with AsyncWebCrawler() as crawler:
    result = await crawler.arun("https://example.com")
    print(result.markdown)</code></pre>

<p>Contact us at <a href="mailto:test@example.com">test@example.com</a> for more info.</p>

<h3>Internal Links</h3>
<a href="/page-alpha">Alpha Page</a>
<a href="/page-beta">Beta Page</a>

<h3>External Links</h3>
<a href="https://example.com">Example.com</a>
<a href="https://github.com/unclecode/crawl4ai">Crawl4AI GitHub</a>

<img src="/images/hero.jpg" alt="Hero image for testing" width="800" height="400">
<img src="/images/icon.png" alt="" width="16" height="16">
</main>
<footer>
<p>Footer content - should be excluded with excluded_tags</p>
</footer>
</body>
</html>"""
|
|
|
|
# Product listing (served at "/products"): five ``div.product`` cards, each with
# a name, price, rating (with ``data-stars``), description, category and a
# details link.
PRODUCTS_HTML = """\
<!DOCTYPE html>
<html lang="en">
<head>
<title>Product Listing</title>
<meta name="description" content="Test product listing page">
</head>
<body>
<h1>Products</h1>
<div class="product-list">
<div class="product" data-id="1">
<h2 class="name">Wireless Mouse</h2>
<span class="price">$29.99</span>
<div class="rating" data-stars="4.5">4.5 stars</div>
<p class="description">Ergonomic wireless mouse with precision tracking</p>
<span class="category">Electronics</span>
<a href="/product/1" class="details-link">View Details</a>
</div>
<div class="product" data-id="2">
<h2 class="name">Mechanical Keyboard</h2>
<span class="price">$89.99</span>
<div class="rating" data-stars="4.8">4.8 stars</div>
<p class="description">Cherry MX switches with RGB backlighting</p>
<span class="category">Electronics</span>
<a href="/product/2" class="details-link">View Details</a>
</div>
<div class="product" data-id="3">
<h2 class="name">USB-C Hub</h2>
<span class="price">$45.50</span>
<div class="rating" data-stars="4.2">4.2 stars</div>
<p class="description">7-in-1 hub with HDMI, USB-A, SD card reader</p>
<span class="category">Accessories</span>
<a href="/product/3" class="details-link">View Details</a>
</div>
<div class="product" data-id="4">
<h2 class="name">Monitor Stand</h2>
<span class="price">$34.99</span>
<div class="rating" data-stars="3.9">3.9 stars</div>
<p class="description">Adjustable aluminum monitor riser with storage</p>
<span class="category">Furniture</span>
<a href="/product/4" class="details-link">View Details</a>
</div>
<div class="product" data-id="5">
<h2 class="name">Webcam HD</h2>
<span class="price">$59.00</span>
<div class="rating" data-stars="4.6">4.6 stars</div>
<p class="description">1080p webcam with built-in microphone and privacy cover</p>
<span class="category">Electronics</span>
<a href="/product/5" class="details-link">View Details</a>
</div>
</div>
</body>
</html>"""
|
|
|
|
# Tables page (served at "/tables"): a sales data table, a header-less layout
# table (labelled "should be filtered"), and an employee directory table.
TABLES_HTML = """\
<!DOCTYPE html>
<html>
<head><title>Tables Test</title></head>
<body>
<h1>Data Tables</h1>

<h2>Sales Report</h2>
<table id="sales-table">
<thead>
<tr><th>Quarter</th><th>Revenue</th><th>Growth</th></tr>
</thead>
<tbody>
<tr><td>Q1 2025</td><td>$1,234,567</td><td>12.5%</td></tr>
<tr><td>Q2 2025</td><td>$1,456,789</td><td>18.0%</td></tr>
<tr><td>Q3 2025</td><td>$1,678,901</td><td>15.2%</td></tr>
<tr><td>Q4 2025</td><td>$1,890,123</td><td>12.6%</td></tr>
</tbody>
</table>

<h2>Layout Table (should be filtered)</h2>
<table id="layout-table">
<tr><td>Left column</td><td>Right column</td></tr>
</table>

<h2>Employee Directory</h2>
<table id="employee-table">
<thead>
<tr><th>Name</th><th>Email</th><th>Department</th><th>Phone</th></tr>
</thead>
<tbody>
<tr><td>Alice Johnson</td><td>alice@example.com</td><td>Engineering</td><td>+1-555-0101</td></tr>
<tr><td>Bob Smith</td><td>bob@example.com</td><td>Marketing</td><td>+1-555-0102</td></tr>
<tr><td>Carol White</td><td>carol@example.com</td><td>Sales</td><td>+1-555-0103</td></tr>
</tbody>
</table>
</body>
</html>"""
|
|
|
|
# JS page (served at "/js-dynamic"): static content plus a script that fills
# ``#dynamic-content`` after 300 ms and sets ``#counter`` to "42" after 200 ms.
JS_DYNAMIC_HTML = """\
<!DOCTYPE html>
<html>
<head><title>JS Dynamic Content</title></head>
<body>
<div id="static-content">
<h1>Static Section</h1>
<p>This content is immediately available in the HTML.</p>
</div>
<div id="dynamic-content"></div>
<div id="counter">0</div>
<script>
setTimeout(function() {
document.getElementById('dynamic-content').innerHTML =
'<p class="js-loaded">Dynamic content successfully loaded via JavaScript</p>' +
'<ul><li>Item A</li><li>Item B</li><li>Item C</li></ul>';
}, 300);
setTimeout(function() {
document.getElementById('counter').textContent = '42';
}, 200);
</script>
</body>
</html>"""
|
|
|
|
# Links page (served at "/links-page"): internal navigation links, external
# resources, social media links, and a section repeating two earlier targets.
LINKS_HTML = """\
<!DOCTYPE html>
<html>
<head><title>Links Collection</title></head>
<body>
<h1>Link Collection Page</h1>
<nav>
<h2>Internal Navigation</h2>
<a href="/">Home</a>
<a href="/products">Products</a>
<a href="/tables">Tables</a>
<a href="/about">About Us</a>
<a href="/contact">Contact</a>
<a href="/blog/post-1">Blog Post 1</a>
<a href="/blog/post-2">Blog Post 2</a>
<a href="/docs/api">API Docs</a>
<a href="/docs/guide">User Guide</a>
</nav>
<section>
<h2>External Resources</h2>
<a href="https://example.com">Example Domain</a>
<a href="https://github.com">GitHub</a>
<a href="https://python.org">Python</a>
<a href="https://docs.python.org/3/">Python Docs</a>
</section>
<section>
<h2>Social Media</h2>
<a href="https://twitter.com/example">Twitter</a>
<a href="https://facebook.com/example">Facebook</a>
<a href="https://linkedin.com/company/example">LinkedIn</a>
</section>
<section>
<h2>Duplicate Links</h2>
<a href="/">Home Again</a>
<a href="https://example.com">Example Again</a>
</section>
</body>
</html>"""
|
|
|
|
# Image gallery (served at "/images-page"): seven <img> elements covering a
# large captioned photo, a medium image, a tiny icon without alt text, a
# lazy-loaded image (``data-src``), a ``srcset`` image, a button icon and a logo.
IMAGES_HTML = """\
<!DOCTYPE html>
<html>
<head><title>Images Gallery</title></head>
<body>
<h1>Image Gallery</h1>

<!-- High-quality image: should score high (large, has alt, common format) -->
<div class="hero">
<img src="/images/landscape.jpg" alt="Beautiful mountain landscape at sunset"
width="1200" height="800">
<p>A stunning landscape photograph showcasing the beauty of mountain scenery
at golden hour. This image demonstrates proper extraction of high-quality
photographs with descriptive alt text and surrounding context.</p>
</div>

<!-- Medium quality: decent size, has alt -->
<img src="/images/product-photo.png" alt="Product photograph" width="400" height="300">

<!-- Low quality: small icon, no alt -->
<img src="/images/icon-search.svg" alt="" width="24" height="24">

<!-- Lazy-loaded image -->
<img data-src="/images/lazy-photo.webp" alt="Lazy loaded image" width="600" height="400"
class="lazyload" src="data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==">

<!-- Image with srcset -->
<img src="/images/responsive-sm.jpg"
srcset="/images/responsive-sm.jpg 480w, /images/responsive-md.jpg 800w, /images/responsive-lg.jpg 1200w"
alt="Responsive image with srcset" width="800" height="600">

<!-- Button icon (should be filtered) -->
<button><img src="/images/btn-submit.png" alt="submit" width="100" height="30"></button>

<!-- Logo (should be filtered by pattern) -->
<img src="/images/company-logo.png" alt="Company Logo" width="200" height="50">
</body>
</html>"""
|
|
|
|
# Article page (served at "/structured-data"): Open Graph / article / Twitter
# meta tags plus a JSON-LD ``Article`` block in the head, and a short article
# body. The JSON-LD payload must stay valid JSON.
STRUCTURED_DATA_HTML = """\
<!DOCTYPE html>
<html lang="en">
<head>
<title>Article with Structured Data</title>
<meta name="description" content="An article about web crawling techniques">
<meta property="og:title" content="Web Crawling Best Practices">
<meta property="og:description" content="Learn about modern web crawling">
<meta property="og:image" content="/images/article-cover.jpg">
<meta property="og:type" content="article">
<meta property="article:published_time" content="2025-06-15T10:00:00Z">
<meta property="article:modified_time" content="2025-07-20T14:30:00Z">
<meta name="twitter:card" content="summary_large_image">
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Article",
"headline": "Web Crawling Best Practices",
"author": {"@type": "Person", "name": "Test Author"},
"datePublished": "2025-06-15",
"description": "A comprehensive guide to web crawling"
}
</script>
</head>
<body>
<article>
<h1>Web Crawling Best Practices</h1>
<p class="byline">By Test Author | Published June 15, 2025</p>
<p>Web crawling is the process of systematically browsing the web to extract
information. Modern crawlers like Crawl4AI provide sophisticated tools for
content extraction, including markdown generation, structured data extraction,
and intelligent link following.</p>
<h2>Key Techniques</h2>
<p>Understanding how to properly configure a web crawler is essential for
efficient data collection. This includes setting appropriate delays, respecting
robots.txt, and using proper user agents.</p>
</article>
</body>
</html>"""
|
|
|
|
# Empty page (served at "/empty"): a title and an empty <body>.
EMPTY_HTML = """\
<!DOCTYPE html>
<html><head><title>Empty Page</title></head>
<body></body>
</html>"""
|
|
|
|
# Deliberately broken markup (served at "/malformed"): unclosed <title>, <p>,
# <img>, attribute quote, <div>/<span> and table cells. The breakage is
# intentional test data -- do not "fix" it.
MALFORMED_HTML = """\
<html>
<head><title>Malformed Page</head>
<body>
<div>
<p>Unclosed paragraph
<p>Another paragraph without closing
<img src="/test.jpg" alt="no closing bracket"
<a href="/broken>Broken link</a>
<div><span>Nested but unclosed
<table><tr><td>Cell without closing tags
</body>
</html>"""
|
|
|
|
# Regex fixture page (served at "/regex-test"): one paragraph per pattern kind
# -- emails, phone numbers, URLs, an IPv4 address, a UUID, currency amounts,
# a percentage, ISO and US dates, times, zip codes, a handle, hashtags and a
# hex colour.
REGEX_TEST_HTML = """\
<!DOCTYPE html>
<html>
<head><title>Regex Test Content</title></head>
<body>
<h1>Contact Information</h1>
<p>Email us at support@crawl4ai.com or sales@example.org for inquiries.</p>
<p>Call us: +1-555-123-4567 or (800) 555-0199</p>
<p>Visit https://crawl4ai.com or https://docs.crawl4ai.com/api/v2</p>
<p>Server IP: 192.168.1.100</p>
<p>Request ID: 550e8400-e29b-41d4-a716-446655440000</p>
<p>Price: $199.99 or EUR 175.50</p>
<p>Completion rate: 95.7%</p>
<p>Published: 2025-03-15</p>
<p>Updated: 03/15/2025</p>
<p>Meeting at 14:30 or 09:00</p>
<p>Zip code: 94105 or 94105-1234</p>
<p>Follow @crawl4ai on social media</p>
<p>Tags: #WebCrawling #DataExtraction #Python</p>
<p>Color theme: #FF5733</p>
</body>
</html>"""
|
|
|
|
|
|
def _generate_large_html(num_sections=50):
    """Build one long HTML document made of ``num_sections`` numbered sections.

    Each section holds a heading, two paragraphs and a ``/section/<i>`` link,
    with ``i`` running from 0 to ``num_sections - 1``.

    Args:
        num_sections: How many ``<section>`` elements to emit.

    Returns:
        The complete page as a single string.
    """

    def render_section(idx):
        # Every section starts with a newline so sections stack one per block.
        return f"""
<section id="section-{idx}">
<h2>Section {idx}: Important Topic Number {idx}</h2>
<p>This is paragraph one of section {idx}. It contains enough text to be
meaningful for content extraction and markdown generation testing purposes.
The crawler should properly handle large pages with many sections.</p>
<p>This is paragraph two of section {idx}. It provides additional context
and detail about topic {idx}, ensuring that the content extraction pipeline
can handle substantial amounts of text without issues.</p>
<a href="/section/{idx}">Read more about topic {idx}</a>
</section>"""

    body = "".join(map(render_section, range(num_sections)))
    return f"""\
<!DOCTYPE html>
<html>
<head><title>Large Page with Many Sections</title></head>
<body>
<h1>Comprehensive Document</h1>
{body}
</body>
</html>"""


# Pre-rendered 50-section page, built once at import time.
LARGE_HTML = _generate_large_html(50)
|
|
|
|
|
|
# Deep crawl pages: hub -> sub1,sub2,sub3 -> leaf pages
|
|
# Hub page (served at "/deep/hub"): links to the three sub pages
# /deep/sub1, /deep/sub2 and /deep/sub3.
DEEP_HUB_HTML = """\
<!DOCTYPE html>
<html>
<head><title>Deep Crawl Hub</title></head>
<body>
<h1>Hub Page</h1>
<p>This is the starting point for deep crawl testing.</p>
<nav>
<a href="/deep/sub1">Sub Page 1 - Technology</a>
<a href="/deep/sub2">Sub Page 2 - Science</a>
<a href="/deep/sub3">Sub Page 3 - Arts</a>
</nav>
</body>
</html>"""
|
|
|
|
# ``str.format`` template for a deep-crawl sub page. Placeholders: ``{title}``
# (page heading) and ``{prefix}`` (path segment used in the two leaf links).
DEEP_SUB_TEMPLATE = """\
<!DOCTYPE html>
<html>
<head><title>Deep Crawl - {title}</title></head>
<body>
<h1>{title}</h1>
<p>Content about {title}. This sub-page contains links to deeper content.</p>
<a href="/deep/{prefix}/leaf-a">Leaf A under {title}</a>
<a href="/deep/{prefix}/leaf-b">Leaf B under {title}</a>
<a href="/deep/hub">Back to Hub</a>
</body>
</html>"""
|
|
|
|
# ``str.format`` template for a deep-crawl leaf page. Single placeholder:
# ``{title}``. The only outgoing link points back to the hub.
DEEP_LEAF_TEMPLATE = """\
<!DOCTYPE html>
<html>
<head><title>Deep Crawl - {title}</title></head>
<body>
<h1>{title}</h1>
<p>This is a leaf page in the deep crawl hierarchy. It contains substantial
content about {title} to ensure proper extraction at all crawl depths.
The adaptive crawler should find and process this content correctly.</p>
<a href="/deep/hub">Back to Hub</a>
</body>
</html>"""
|
|
|
|
# Iframe page (served at "/iframe-page"): main content plus two inline
# ``srcdoc`` iframes (``#frame1`` and ``#frame2``).
IFRAME_HTML = """\
<!DOCTYPE html>
<html>
<head><title>Page with Iframes</title></head>
<body>
<h1>Main Page Content</h1>
<p>This page contains embedded iframes for testing iframe processing.</p>
<iframe id="frame1" srcdoc="<html><body><p>Iframe 1 content: embedded text</p></body></html>"
width="400" height="200"></iframe>
<iframe id="frame2" srcdoc="<html><body><h2>Iframe 2 heading</h2><p>More embedded content here</p></body></html>"
width="400" height="200"></iframe>
</body>
</html>"""
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Server Handlers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
async def _serve_html(html, content_type="text/html"):
    """Wrap *html* in an aiohttp ``web.Response``.

    Args:
        html: Response body text.
        content_type: Value passed through as the response content type.

    Returns:
        The constructed ``web.Response``.
    """
    response = web.Response(text=html, content_type=content_type)
    return response
|
|
|
|
|
|
async def _home_handler(request):
    """Serve ``HOME_HTML`` (registered for ``GET /``)."""
    response = await _serve_html(HOME_HTML)
    return response
|
|
|
|
async def _products_handler(request):
    """Serve ``PRODUCTS_HTML`` (registered for ``GET /products``)."""
    response = await _serve_html(PRODUCTS_HTML)
    return response
|
|
|
|
async def _tables_handler(request):
    """Serve ``TABLES_HTML`` (registered for ``GET /tables``)."""
    response = await _serve_html(TABLES_HTML)
    return response
|
|
|
|
async def _js_dynamic_handler(request):
    """Serve ``JS_DYNAMIC_HTML`` (registered for ``GET /js-dynamic``)."""
    response = await _serve_html(JS_DYNAMIC_HTML)
    return response
|
|
|
|
async def _links_handler(request):
    """Serve ``LINKS_HTML`` (registered for ``GET /links-page``)."""
    response = await _serve_html(LINKS_HTML)
    return response
|
|
|
|
async def _images_handler(request):
    """Serve ``IMAGES_HTML`` (registered for ``GET /images-page``)."""
    response = await _serve_html(IMAGES_HTML)
    return response
|
|
|
|
async def _structured_handler(request):
    """Serve ``STRUCTURED_DATA_HTML`` (registered for ``GET /structured-data``)."""
    response = await _serve_html(STRUCTURED_DATA_HTML)
    return response
|
|
|
|
async def _empty_handler(request):
    """Serve ``EMPTY_HTML`` (registered for ``GET /empty``)."""
    response = await _serve_html(EMPTY_HTML)
    return response
|
|
|
|
async def _malformed_handler(request):
    """Serve ``MALFORMED_HTML`` (registered for ``GET /malformed``)."""
    response = await _serve_html(MALFORMED_HTML)
    return response
|
|
|
|
async def _regex_test_handler(request):
    """Serve ``REGEX_TEST_HTML`` (registered for ``GET /regex-test``)."""
    response = await _serve_html(REGEX_TEST_HTML)
    return response
|
|
|
|
async def _large_handler(request):
    """Serve ``LARGE_HTML`` (registered for ``GET /large``)."""
    response = await _serve_html(LARGE_HTML)
    return response
|
|
|
|
async def _iframe_handler(request):
    """Serve ``IFRAME_HTML`` (registered for ``GET /iframe-page``)."""
    response = await _serve_html(IFRAME_HTML)
    return response
|
|
|
|
async def _redirect_handler(request):
    """Redirect to the home page by raising ``web.HTTPFound`` (``GET /redirect``)."""
    target = "/"
    raise web.HTTPFound(target)
|
|
|
|
async def _not_found_handler(request):
    """Return a small HTML error page with HTTP status 404 (``GET /not-found``)."""
    body = (
        "<html><head><title>404 Not Found</title></head>"
        "<body><h1>Page Not Found</h1><p>The requested page does not exist.</p></body></html>"
    )
    return web.Response(text=body, status=404, content_type="text/html")
|
|
|
|
async def _slow_handler(request):
    """Serve a small page after a fixed 2-second pause (``GET /slow``)."""
    delay_seconds = 2
    page = (
        "<html><head><title>Slow Page</title></head>"
        "<body><h1>Slow Response</h1><p>This page had a 2-second delay.</p></body></html>"
    )
    # Non-blocking sleep: other requests keep being served meanwhile.
    await asyncio.sleep(delay_seconds)
    return await _serve_html(page)
|
|
|
|
async def _deep_hub_handler(request):
    """Serve ``DEEP_HUB_HTML`` (registered for ``GET /deep/hub``)."""
    response = await _serve_html(DEEP_HUB_HTML)
    return response
|
|
|
|
async def _deep_sub_handler(request):
    """Render a deep-crawl sub page for ``GET /deep/{sub_id}``.

    ``sub1``/``sub2``/``sub3`` get fixed topic titles; any other ``sub_id``
    falls back to ``"Sub <sub_id>"``. The ``sub_id`` is also used as the path
    prefix of the leaf links in the rendered template.
    """
    known_titles = {"sub1": "Technology", "sub2": "Science", "sub3": "Arts"}
    sub_id = request.match_info["sub_id"]
    if sub_id in known_titles:
        title = known_titles[sub_id]
    else:
        title = f"Sub {sub_id}"
    page = DEEP_SUB_TEMPLATE.format(title=title, prefix=sub_id)
    return await _serve_html(page)
|
|
|
|
async def _deep_leaf_handler(request):
    """Render a deep-crawl leaf page for ``GET /deep/{sub_id}/{leaf_id}``.

    The page title is ``"Leaf <leaf_id> under <sub_id>"``.
    """
    info = request.match_info
    parent = info["sub_id"]
    leaf = info["leaf_id"]
    page = DEEP_LEAF_TEMPLATE.format(title=f"Leaf {leaf} under {parent}")
    return await _serve_html(page)
|
|
|
|
async def _catch_all_handler(request):
    """Serve a simple page for any unmatched path (useful for link targets).

    The requested path is echoed in the title, heading and body, followed by
    a link back to the home page.
    """
    requested = request.path
    head = f"<html><head><title>Page: {requested}</title></head>"
    body = (
        f"<body><h1>Page at {requested}</h1>"
        f"<p>Auto-generated page for path: {requested}</p>"
        '<a href="/">Back to Home</a></body></html>'
    )
    return await _serve_html(head + body)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Server Setup
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _find_free_port():
|
|
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
|
s.bind(("", 0))
|
|
return s.getsockname()[1]
|
|
|
|
|
|
def _create_app():
    """Build the aiohttp application with every test route registered.

    Returns:
        A ``web.Application`` with one GET route per test page, the
        parameterised deep-crawl routes, and a final catch-all.
    """
    # Kept in the original registration order: the fixed "/deep/hub" route is
    # listed before the parameterised "/deep/..." routes, and the catch-all
    # (auto-generated pages for internal link targets, etc.) comes last.
    routes = (
        ("/", _home_handler),
        ("/products", _products_handler),
        ("/tables", _tables_handler),
        ("/js-dynamic", _js_dynamic_handler),
        ("/links-page", _links_handler),
        ("/images-page", _images_handler),
        ("/structured-data", _structured_handler),
        ("/empty", _empty_handler),
        ("/malformed", _malformed_handler),
        ("/regex-test", _regex_test_handler),
        ("/large", _large_handler),
        ("/iframe-page", _iframe_handler),
        ("/redirect", _redirect_handler),
        ("/not-found", _not_found_handler),
        ("/slow", _slow_handler),
        ("/deep/hub", _deep_hub_handler),
        ("/deep/{sub_id}", _deep_sub_handler),
        ("/deep/{sub_id}/{leaf_id}", _deep_leaf_handler),
        ("/{path:.*}", _catch_all_handler),
    )
    application = web.Application()
    for pattern, handler in routes:
        application.router.add_get(pattern, handler)
    return application
|
|
|
|
|
|
def _run_server(app, host, port, ready_event):
    """Run *app* on ``host:port`` inside a private event loop (thread target).

    Creates a new asyncio event loop for the calling thread, starts an aiohttp
    ``AppRunner``/``TCPSite``, signals readiness and then blocks in
    ``run_forever()``.

    Args:
        app: The aiohttp ``web.Application`` to serve.
        host: Host name or address to bind.
        port: TCP port to bind.
        ready_event: ``threading.Event``-like object; ``set()`` is called once
            the site has started listening. It is NOT set if startup fails,
            so a waiting caller times out instead of seeing a false "ready".

    Raises:
        Whatever ``runner.setup()`` or ``site.start()`` raise (for example
        ``OSError`` when the port is already taken). The loop is closed, and
        a successfully set-up runner is cleaned up, before the exception
        propagates.
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        runner = web.AppRunner(app)
        loop.run_until_complete(runner.setup())
        # Only a runner whose setup() completed is cleaned up.
        try:
            site = web.TCPSite(runner, host, port)
            loop.run_until_complete(site.start())
            ready_event.set()
            loop.run_forever()
        finally:
            loop.run_until_complete(runner.cleanup())
    finally:
        # Always release the loop, including when startup itself failed.
        loop.close()
|
|
|
|
|
|
@pytest.fixture(scope="session")
def local_server():
    """Start a local HTTP test server. Returns base URL like 'http://localhost:PORT'.

    The server runs in a daemon thread for the whole test session: a free
    port is picked, the app is built, and the fixture blocks until the server
    thread signals that it is listening.

    Yields:
        The base URL of the running server.

    Raises:
        RuntimeError: If the server does not signal readiness within 10 seconds.
    """
    port = _find_free_port()
    app = _create_app()
    ready = threading.Event()
    thread = threading.Thread(
        target=_run_server,
        args=(app, "localhost", port, ready),
        daemon=True,
    )
    thread.start()
    # Keep the wait() call outside of an ``assert``: under ``python -O`` assert
    # statements are stripped, which would silently skip the wait itself.
    if not ready.wait(timeout=10):
        raise RuntimeError("Test server failed to start within 10 seconds")
    # Small delay to ensure server is fully ready
    time.sleep(0.2)
    yield f"http://localhost:{port}"
    # No explicit teardown: the server thread is a daemon thread, so it does
    # not keep the interpreter alive at exit.
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Common test constants
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Stable real URLs for network tests
|
|
# Real (non-local) URL for network tests: example.com.
REAL_URL_SIMPLE = "https://example.com"
|
|
# Real (non-local) URL for network tests: quotes.toscrape.com
# (presumably a public scraping sandbox -- verify before relying on its page structure).
REAL_URL_QUOTES = "https://quotes.toscrape.com"
|
|
# Real (non-local) URL for network tests: books.toscrape.com
# (presumably a public scraping sandbox -- verify before relying on its page structure).
REAL_URL_BOOKS = "https://books.toscrape.com"
|