mirror of
https://github.com/unclecode/crawl4ai.git
synced 2026-06-10 15:58:15 +00:00
Add DomainMapper class that discovers all URLs under a domain using 8 sources: sitemap, Common Crawl, Wayback Machine, Certificate Transparency (crt.sh), path probing, robots.txt mining, RSS/Atom feeds, and homepage link extraction. Key features: - Subdomain discovery via crt.sh, Wayback, CC, and DNS guessing - Soft-404 detection: fingerprints SPA sites and filters fake pages - Per-host scanning with parallel execution across discovered hosts - URL normalization, deduplication, and source attribution - BM25 relevance scoring with head metadata extraction - Nonsense filter for static assets, webpack chunks, Wayback garbage For superdesign.dev: finds 171 URLs across 11 hosts in ~13s (vs 4 URLs from AsyncUrlSeeder) New files: - crawl4ai/domain_mapper.py (DomainMapper class) - crawl4ai/async_configs.py (DomainMapperConfig) - docs/md_v2/core/domain-mapping.md (documentation) - docs/examples/domain_mapper/domain_mapper_demo.py - 67 tests across unit/integration/adversarial/regression (cherry picked from commit 2d10534a8742177f1d5f521e3174ae66591d3533)
271 lines
8.7 KiB
Python
271 lines
8.7 KiB
Python
"""
|
|
Crawl4AI Regression Tests - DomainMapper
|
|
|
|
Tests DomainMapper functionality: host discovery, soft-404 detection,
|
|
multi-source scanning, post-processing, and crawler integration.
|
|
|
|
All network tests use real endpoints.
|
|
"""
|
|
|
|
import pytest
|
|
import pytest_asyncio
|
|
from crawl4ai import DomainMapper, DomainMapperConfig, AsyncWebCrawler
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Fixtures
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@pytest_asyncio.fixture
async def mapper():
    """Yield a ``DomainMapper`` that stays inside its async context for the test."""
    async with DomainMapper() as domain_mapper:
        # Control returns here after the test, letting the context manager exit.
        yield domain_mapper
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Basic scan tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.network
async def test_basic_scan(mapper):
    """A sitemap-only scan returns at least one record carrying the core keys."""
    sitemap_config = DomainMapperConfig(source="sitemap", extract_head=False, verbose=False)
    found = await mapper.scan("docs.crawl4ai.com", sitemap_config)

    assert len(found) >= 1, "Should find at least 1 URL from sitemap"
    # Each record must expose these keys; they are checked one key at a time.
    for required_key in ("url", "host", "source"):
        assert all(required_key in record for record in found)
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.network
async def test_scan_with_head_extraction(mapper):
    """Head extraction should populate a title on at least one result.

    Runs a sitemap scan of docs.crawl4ai.com capped at three URLs with
    ``extract_head=True`` and requires a truthy ``head_data["title"]`` on at
    least one returned record.
    """
    config = DomainMapperConfig(
        source="sitemap",
        extract_head=True,
        max_urls=3,
        verbose=False,
    )
    results = await mapper.scan("docs.crawl4ai.com", config)
    assert len(results) >= 1
    # ``r.get("head_data", {})`` only covers a *missing* key. A record whose
    # "head_data" is present but None would raise AttributeError and mask the
    # real assertion, so fall back to an empty dict for any falsy value.
    has_title = any((r.get("head_data") or {}).get("title") for r in results)
    assert has_title, "At least one result should have a title"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Host discovery tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.network
async def test_crt_subdomain_discovery(mapper):
    """A ``crt+probe`` scan of superdesign.dev should span three or more hosts."""
    crt_config = DomainMapperConfig(source="crt+probe", extract_head=False, verbose=False)
    records = await mapper.scan("superdesign.dev", crt_config)

    # Collect the distinct hosts that appear across all returned records.
    hosts = set()
    for record in records:
        hosts.add(record["host"])
    assert len(hosts) >= 3, f"Expected >=3 hosts via crt, got {len(hosts)}: {hosts}"
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.network
async def test_dns_subdomain_guessing(mapper):
    """Guessing common prefixes for crawl4ai.com should yield the docs host."""
    candidate_prefixes = ["docs", "www", "api"]
    resolved = await mapper._guess_subdomains(
        "crawl4ai.com",
        candidate_prefixes,
        DomainMapperConfig(),
    )
    # docs.crawl4ai.com is expected to be among the returned hosts.
    assert "docs.crawl4ai.com" in resolved
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Soft-404 detection tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.network
async def test_soft_404_detection(mapper):
    """Fingerprinting app.superdesign.dev should give a 200 status and a title."""
    default_config = DomainMapperConfig()
    fingerprint = await mapper._fingerprint_soft_404("app.superdesign.dev", default_config)

    assert fingerprint is not None
    assert fingerprint.status_code == 200, "SPA should return 200 for nonexistent paths"
    assert fingerprint.title is not None, "Should capture the SPA shell title"
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.network
async def test_soft_404_filters_probes(mapper):
    """With soft-404 detection on, a probe scan of the SPA keeps no probe hits."""
    probe_config = DomainMapperConfig(
        source="probe", soft_404_detection=True, extract_head=False, verbose=False
    )
    records = await mapper.scan("app.superdesign.dev", probe_config)

    # Keep only the records attributed purely to the "probe" source.
    probe_urls = list(filter(lambda record: record["source"] == "probe", records))
    assert len(probe_urls) == 0, "All probe paths on SPA should be soft-404 filtered"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Source isolation tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.network
async def test_sitemap_only_no_cross_contamination(mapper):
    """Every source tag on a ``source='sitemap'`` scan must be ``sitemap``."""
    sitemap_config = DomainMapperConfig(source="sitemap", extract_head=False, verbose=False)
    records = await mapper.scan("docs.crawl4ai.com", sitemap_config)

    # A record's "source" is split on "+"; each piece is checked on its own.
    source_parts = (piece for record in records for piece in record["source"].split("+"))
    for part in source_parts:
        assert part == "sitemap", f"Unexpected source: {part}"
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.network
async def test_probe_only(mapper):
    """A scan using only ``source='probe'`` returns a non-empty list."""
    probe_config = DomainMapperConfig(source="probe", extract_head=False, verbose=False)
    found = await mapper.scan("docs.crawl4ai.com", probe_config)

    assert isinstance(found, list)
    assert len(found) >= 1
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Post-processing tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.network
async def test_max_urls_respected(mapper):
    """The number of results must not exceed the configured ``max_urls``."""
    url_cap = 5
    capped_config = DomainMapperConfig(
        source="sitemap+probe", extract_head=False, max_urls=url_cap, verbose=False
    )
    found = await mapper.scan("docs.crawl4ai.com", capped_config)

    assert len(found) <= url_cap
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.network
async def test_nonsense_filter_removes_assets(mapper):
    """With the nonsense filter on, no result URL ends in .js, .css or .png."""
    filter_config = DomainMapperConfig(
        source="sitemap+homepage",
        extract_head=False,
        filter_nonsense_urls=True,
        verbose=False,
    )
    records = await mapper.scan("docs.crawl4ai.com", filter_config)

    for record in records:
        # Compare case-insensitively by lower-casing the URL first.
        url = record["url"].lower()
        assert not url.endswith(".js"), f"JS file should be filtered: {url}"
        assert not url.endswith(".css"), f"CSS file should be filtered: {url}"
        assert not url.endswith(".png"), f"Image should be filtered: {url}"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Error handling tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@pytest.mark.asyncio
async def test_invalid_source_raises(mapper):
    """An unrecognized source must raise ``ValueError`` matching "Invalid source"."""
    with pytest.raises(ValueError, match="Invalid source"):
        # The config is built inside the ``raises`` block so the error is
        # caught whether it comes from the constructor or from ``scan``.
        bad_config = DomainMapperConfig(source="bogus")
        await mapper.scan("example.com", bad_config)
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.network
async def test_nonexistent_domain(mapper):
    """Scanning a domain that does not exist yields an empty list, not an error."""
    missing_domain = "thiswillneverexist99999.dev"
    scan_config = DomainMapperConfig(source="sitemap+probe", extract_head=False, verbose=False)
    found = await mapper.scan(missing_domain, scan_config)

    assert found == []
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.network
async def test_domain_with_scheme_stripped(mapper):
    """A domain passed with an ``https://`` prefix still produces results."""
    scheme_prefixed = "https://docs.crawl4ai.com"
    sitemap_config = DomainMapperConfig(
        source="sitemap", extract_head=False, max_urls=3, verbose=False
    )
    found = await mapper.scan(scheme_prefixed, sitemap_config)

    assert len(found) >= 1
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Crawler integration tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.network
async def test_amap_domain_on_crawler():
    """``AsyncWebCrawler.amap_domain()`` returns URL-bearing records end-to-end."""
    sitemap_config = DomainMapperConfig(
        source="sitemap", extract_head=False, max_urls=5, verbose=False
    )
    async with AsyncWebCrawler() as crawler:
        found = await crawler.amap_domain("docs.crawl4ai.com", sitemap_config)

        assert len(found) >= 1
        assert all("url" in record for record in found)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Config tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@pytest.mark.asyncio
async def test_config_clone():
    """``clone()`` applies the given overrides and keeps the other fields."""
    base_config = DomainMapperConfig(source="sitemap", max_urls=10, verbose=True)
    overrides = {"max_urls": 20, "force": True}
    cloned = base_config.clone(**overrides)

    # Overridden fields take the new values.
    assert cloned.max_urls == 20
    assert cloned.force is True
    # Fields that were not overridden carry over from the base config.
    assert cloned.source == "sitemap"
    assert cloned.verbose is True
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_config_from_kwargs():
    """``from_kwargs()`` builds a config from a dict of settings."""
    settings = {"source": "crt+probe", "max_urls": 50}
    built = DomainMapperConfig.from_kwargs(settings)

    assert built.source == "crt+probe"
    assert built.max_urls == 50
    assert built.extract_head is True  # default
|