Files
crawl4ai/tests/regression/test_reg_domain_mapper.py
unclecode 9d5bcf78e2 feat: Add DomainMapper for comprehensive domain URL discovery
Add DomainMapper class that discovers all URLs under a domain using
8 sources: sitemap, Common Crawl, Wayback Machine, Certificate
Transparency (crt.sh), path probing, robots.txt mining, RSS/Atom
feeds, and homepage link extraction.

Key features:
- Subdomain discovery via crt.sh, Wayback Machine, Common Crawl, and DNS guessing
- Soft-404 detection: fingerprints SPA sites and filters fake pages
- Per-host scanning with parallel execution across discovered hosts
- URL normalization, deduplication, and source attribution
- BM25 relevance scoring with head metadata extraction
- Nonsense filter for static assets, webpack chunks, Wayback garbage

For superdesign.dev: finds 171 URLs across 11 hosts in ~13s
(vs 4 URLs from AsyncUrlSeeder)

New files:
- crawl4ai/domain_mapper.py (DomainMapper class)
- crawl4ai/async_configs.py (DomainMapperConfig)
- docs/md_v2/core/domain-mapping.md (documentation)
- docs/examples/domain_mapper/domain_mapper_demo.py
- 67 tests across unit/integration/adversarial/regression

(cherry picked from commit 2d10534a8742177f1d5f521e3174ae66591d3533)
2026-06-01 12:58:23 +00:00

271 lines
8.7 KiB
Python

"""
Crawl4AI Regression Tests - DomainMapper
Tests DomainMapper functionality: host discovery, soft-404 detection,
multi-source scanning, post-processing, and crawler integration.
All network tests use real endpoints.
"""
import pytest
import pytest_asyncio
from crawl4ai import DomainMapper, DomainMapperConfig, AsyncWebCrawler
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest_asyncio.fixture
async def mapper():
    """Yield a DomainMapper that stays open while a single test runs.

    The mapper is entered as an async context manager, so its exit hook
    runs once the test consuming this fixture has finished.
    """
    async with DomainMapper() as domain_mapper:
        yield domain_mapper
# ---------------------------------------------------------------------------
# Basic scan tests
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
@pytest.mark.network
async def test_basic_scan(mapper):
    """A sitemap-only scan returns records carrying url, host and source."""
    sitemap_config = DomainMapperConfig(source="sitemap", extract_head=False, verbose=False)
    found = await mapper.scan("docs.crawl4ai.com", sitemap_config)

    assert len(found) >= 1, "Should find at least 1 URL from sitemap"
    # Every record must expose the three core keys; checked one key at a time.
    for required_key in ("url", "host", "source"):
        assert all(required_key in record for record in found)
@pytest.mark.asyncio
@pytest.mark.network
async def test_scan_with_head_extraction(mapper):
    """Head extraction should populate a title on at least one result.

    Scans the docs site via its sitemap with ``extract_head=True`` and a cap
    of three URLs, then checks that some record carries a non-empty
    ``head_data["title"]``.
    """
    config = DomainMapperConfig(
        source="sitemap",
        extract_head=True,
        max_urls=3,
        verbose=False,
    )
    results = await mapper.scan("docs.crawl4ai.com", config)
    assert len(results) >= 1
    # `or {}` also covers a record whose "head_data" key is present but None
    # (presumably possible when a head fetch fails -- verify against the
    # mapper); a plain .get("head_data", {}) would raise AttributeError there
    # instead of letting the assertion below report the real problem.
    has_title = any((r.get("head_data") or {}).get("title") for r in results)
    assert has_title, "At least one result should have a title"
# ---------------------------------------------------------------------------
# Host discovery tests
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
@pytest.mark.network
async def test_crt_subdomain_discovery(mapper):
    """A crt+probe scan of superdesign.dev yields at least three distinct hosts."""
    settings = DomainMapperConfig(source="crt+probe", extract_head=False, verbose=False)
    records = await mapper.scan("superdesign.dev", settings)

    # Collapse the per-URL records down to the unique hosts they belong to.
    hosts = set()
    for record in records:
        hosts.add(record["host"])

    assert len(hosts) >= 3, f"Expected >=3 hosts via crt, got {len(hosts)}: {hosts}"
@pytest.mark.asyncio
@pytest.mark.network
async def test_dns_subdomain_guessing(mapper):
    """Subdomain guessing with common prefixes should report docs.crawl4ai.com."""
    candidate_prefixes = ["docs", "www", "api"]
    resolved = await mapper._guess_subdomains(
        "crawl4ai.com",
        candidate_prefixes,
        DomainMapperConfig(),
    )
    # The docs subdomain is expected to be among the returned hosts.
    assert "docs.crawl4ai.com" in resolved
# ---------------------------------------------------------------------------
# Soft-404 detection tests
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
@pytest.mark.network
async def test_soft_404_detection(mapper):
    """Fingerprinting the SPA host returns a 200 fingerprint with a title."""
    default_config = DomainMapperConfig()
    fingerprint = await mapper._fingerprint_soft_404("app.superdesign.dev", default_config)

    assert fingerprint is not None
    assert fingerprint.status_code == 200, "SPA should return 200 for nonexistent paths"
    assert fingerprint.title is not None, "Should capture the SPA shell title"
@pytest.mark.asyncio
@pytest.mark.network
async def test_soft_404_filters_probes(mapper):
    """With soft-404 detection on, a probe scan of the SPA keeps no probe URLs."""
    options = {
        "source": "probe",
        "soft_404_detection": True,
        "extract_head": False,
        "verbose": False,
    }
    records = await mapper.scan("app.superdesign.dev", DomainMapperConfig(**options))

    # Gather whatever survived with a pure "probe" attribution.
    probe_hits = []
    for record in records:
        if record["source"] == "probe":
            probe_hits.append(record)

    assert len(probe_hits) == 0, "All probe paths on SPA should be soft-404 filtered"
# ---------------------------------------------------------------------------
# Source isolation tests
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
@pytest.mark.network
async def test_sitemap_only_no_cross_contamination(mapper):
    """Every source label on a sitemap-only scan must be exactly 'sitemap'."""
    sitemap_only = DomainMapperConfig(source="sitemap", extract_head=False, verbose=False)
    records = await mapper.scan("docs.crawl4ai.com", sitemap_only)

    for record in records:
        # The source field is split on "+" and each piece checked on its own.
        labels = record["source"].split("+")
        for part in labels:
            assert part == "sitemap", f"Unexpected source: {part}"
@pytest.mark.asyncio
@pytest.mark.network
async def test_probe_only(mapper):
    """A scan restricted to source='probe' returns a non-empty list."""
    probe_config = DomainMapperConfig(source="probe", extract_head=False, verbose=False)
    outcome = await mapper.scan("docs.crawl4ai.com", probe_config)

    assert isinstance(outcome, list)
    assert len(outcome) >= 1
# ---------------------------------------------------------------------------
# Post-processing tests
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
@pytest.mark.network
async def test_max_urls_respected(mapper):
    """A sitemap+probe scan with max_urls=5 returns no more than five records."""
    capped_config = DomainMapperConfig(
        source="sitemap+probe", extract_head=False, max_urls=5, verbose=False
    )
    capped = await mapper.scan("docs.crawl4ai.com", capped_config)

    assert len(capped) <= 5
@pytest.mark.asyncio
@pytest.mark.network
async def test_nonsense_filter_removes_assets(mapper):
    """With filter_nonsense_urls on, no result URL ends in .js, .css or .png."""
    options = {
        "source": "sitemap+homepage",
        "extract_head": False,
        "filter_nonsense_urls": True,
        "verbose": False,
    }
    records = await mapper.scan("docs.crawl4ai.com", DomainMapperConfig(**options))

    for record in records:
        # Compare case-insensitively by lowercasing the URL first.
        url = record["url"].lower()
        assert not url.endswith(".js"), f"JS file should be filtered: {url}"
        assert not url.endswith(".css"), f"CSS file should be filtered: {url}"
        assert not url.endswith(".png"), f"Image should be filtered: {url}"
# ---------------------------------------------------------------------------
# Error handling tests
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_invalid_source_raises(mapper):
    """An unknown source name must surface as ValueError('Invalid source...')."""
    with pytest.raises(ValueError, match="Invalid source"):
        # The config is built inside the guarded block, so a ValueError raised
        # either at construction or during the scan satisfies the expectation.
        bogus_config = DomainMapperConfig(source="bogus")
        await mapper.scan("example.com", bogus_config)
@pytest.mark.asyncio
@pytest.mark.network
async def test_nonexistent_domain(mapper):
    """Scanning a domain that does not exist yields an empty list, not an error."""
    scan_config = DomainMapperConfig(source="sitemap+probe", extract_head=False, verbose=False)
    outcome = await mapper.scan("thiswillneverexist99999.dev", scan_config)

    assert outcome == []
@pytest.mark.asyncio
@pytest.mark.network
async def test_domain_with_scheme_stripped(mapper):
    """Passing the domain with an https:// prefix still produces results."""
    scan_config = DomainMapperConfig(
        source="sitemap", extract_head=False, max_urls=3, verbose=False
    )
    target = "https://docs.crawl4ai.com"
    found = await mapper.scan(target, scan_config)

    assert len(found) >= 1
# ---------------------------------------------------------------------------
# Crawler integration tests
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
@pytest.mark.network
async def test_amap_domain_on_crawler():
    """AsyncWebCrawler.amap_domain() returns at least one record with a url key."""
    async with AsyncWebCrawler() as crawler:
        map_config = DomainMapperConfig(
            source="sitemap", extract_head=False, max_urls=5, verbose=False
        )
        mapped = await crawler.amap_domain("docs.crawl4ai.com", map_config)

        assert len(mapped) >= 1
        assert all("url" in record for record in mapped)
# ---------------------------------------------------------------------------
# Config tests
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_config_clone():
    """clone() applies the given overrides and carries over the other fields."""
    base = DomainMapperConfig(source="sitemap", max_urls=10, verbose=True)
    derived = base.clone(max_urls=20, force=True)

    # Fields passed to clone() take the new values...
    assert derived.max_urls == 20
    assert derived.force is True
    # ...while fields not mentioned keep what the base config had.
    assert derived.source == "sitemap"
    assert derived.verbose is True
@pytest.mark.asyncio
async def test_config_from_kwargs():
    """from_kwargs() honours the supplied keys and leaves extract_head at True."""
    overrides = {"source": "crt+probe", "max_urls": 50}
    built = DomainMapperConfig.from_kwargs(overrides)

    assert built.source == "crt+probe"
    assert built.max_urls == 50
    assert built.extract_head is True  # default