Files
crawl4ai/tests/adversarial/test_domain_mapper_adversarial.py
unclecode 9d5bcf78e2 feat: Add DomainMapper for comprehensive domain URL discovery
Add DomainMapper class that discovers all URLs under a domain using
8 sources: sitemap, Common Crawl, Wayback Machine, Certificate
Transparency (crt.sh), path probing, robots.txt mining, RSS/Atom
feeds, and homepage link extraction.

Key features:
- Subdomain discovery via crt.sh, Wayback, CC, and DNS guessing
- Soft-404 detection: fingerprints SPA sites and filters fake pages
- Per-host scanning with parallel execution across discovered hosts
- URL normalization, deduplication, and source attribution
- BM25 relevance scoring with head metadata extraction
- Nonsense filter for static assets, webpack chunks, Wayback garbage

For superdesign.dev: finds 171 URLs across 11 hosts in ~13s
(vs 4 URLs from AsyncUrlSeeder)

New files:
- crawl4ai/domain_mapper.py (DomainMapper class)
- crawl4ai/async_configs.py (DomainMapperConfig)
- docs/md_v2/core/domain-mapping.md (documentation)
- docs/examples/domain_mapper/domain_mapper_demo.py
- 67 tests across unit/integration/adversarial/regression

(cherry picked from commit 2d10534a8742177f1d5f521e3174ae66591d3533)
2026-06-01 12:58:23 +00:00

137 lines
4.7 KiB
Python

"""Adversarial tests for DomainMapper — edge cases, failures, tough scenarios."""
import asyncio
import pytest
import pytest_asyncio
from crawl4ai import DomainMapper, DomainMapperConfig
# Mark every test in this module with the `network` marker (pytest applies a
# module-level `pytestmark` to all tests collected from the file).
pytestmark = pytest.mark.network
@pytest_asyncio.fixture
async def mapper():
    """Yield a DomainMapper opened as an async context manager.

    The mapper is entered before the requesting test runs and exited once
    the test has finished with it.
    """
    async with DomainMapper() as domain_mapper:
        yield domain_mapper
class TestDomainMapperAdversarial:
    """Adversarial scenarios for DomainMapper.scan and DomainMapperConfig.

    Every test except ``test_config_clone`` calls ``mapper.scan`` through the
    ``mapper`` fixture against a real or deliberately invalid domain.
    """

    @pytest.mark.asyncio
    async def test_nonexistent_domain(self, mapper):
        """Domain that doesn't exist should return empty, not crash."""
        config = DomainMapperConfig(
            source="sitemap+probe",
            extract_head=False,
            verbose=False,
        )
        results = await mapper.scan("thiswillneverexist12345678.dev", config)
        assert results == []

    @pytest.mark.asyncio
    async def test_invalid_source(self, mapper):
        """Invalid source should raise ValueError."""
        config = DomainMapperConfig(source="sitemap+bogus")
        with pytest.raises(ValueError, match="Invalid source"):
            await mapper.scan("example.com", config)

    @pytest.mark.asyncio
    async def test_empty_domain(self, mapper):
        """Empty domain string should not crash."""
        config = DomainMapperConfig(
            source="probe",
            extract_head=False,
            verbose=False,
        )
        results = await mapper.scan("", config)
        # Only the return type is pinned; the contents are unspecified.
        assert isinstance(results, list)

    @pytest.mark.asyncio
    async def test_domain_with_scheme(self, mapper):
        """Domain with https:// prefix should be handled."""
        config = DomainMapperConfig(
            source="sitemap",
            extract_head=False,
            verbose=False,
            max_urls=5,
        )
        results = await mapper.scan("https://docs.crawl4ai.com", config)
        assert len(results) >= 1

    @pytest.mark.asyncio
    async def test_soft_404_filtering_spa(self, mapper):
        """SPA sites (app.superdesign.dev) should have soft-404s filtered."""
        config = DomainMapperConfig(
            source="probe",
            extract_head=False,
            soft_404_detection=True,
            verbose=False,
        )
        results = await mapper.scan("app.superdesign.dev", config)
        # All probed paths on this SPA should be filtered as soft-404s
        # (the site returns 200 for every path with the same shell)
        probe_urls = [r for r in results if r["source"] == "probe"]
        assert len(probe_urls) == 0, \
            f"Expected 0 valid probe URLs on SPA, got {len(probe_urls)}: {[r['url'] for r in probe_urls]}"

    @pytest.mark.asyncio
    async def test_rate_limiting(self, mapper):
        """hits_per_sec=2 should not crash."""
        config = DomainMapperConfig(
            source="probe",
            extract_head=False,
            hits_per_sec=2,
            verbose=False,
        )
        results = await mapper.scan("docs.crawl4ai.com", config)
        assert isinstance(results, list)

    @pytest.mark.asyncio
    async def test_unicode_in_results(self, mapper):
        """Results should handle unicode URLs gracefully."""
        config = DomainMapperConfig(
            source="sitemap",
            extract_head=False,
            verbose=False,
        )
        results = await mapper.scan("docs.crawl4ai.com", config)
        for r in results:
            assert isinstance(r["url"], str)

    @pytest.mark.asyncio
    async def test_config_clone(self):
        """DomainMapperConfig.clone() should apply overrides and keep the rest."""
        config = DomainMapperConfig(source="sitemap", max_urls=10)
        cloned = config.clone(max_urls=20, force=True)
        assert cloned.max_urls == 20
        assert cloned.force is True
        assert cloned.source == "sitemap"  # inherited

    @pytest.mark.asyncio
    async def test_concurrent_host_scanning(self, mapper):
        """Multiple hosts scanned in parallel should not race or duplicate URLs."""
        config = DomainMapperConfig(
            source="sitemap+crt+probe",
            extract_head=False,
            concurrency=20,
            verbose=False,
            force=True,
        )
        results = await mapper.scan("superdesign.dev", config)
        assert isinstance(results, list)
        assert len(results) > 0

        # Normalized dedup should prevent exact duplicates: collect any URL
        # that appears more than once so the failure message names it.
        seen = set()
        duplicates = set()
        for url in (r["url"] for r in results):
            if url in seen:
                duplicates.add(url)
            seen.add(url)
        assert not duplicates, \
            f"Duplicate URLs in scan results: {sorted(duplicates)}"

    @pytest.mark.asyncio
    async def test_probe_without_crt(self, mapper):
        """source='probe' alone should still work (just scans base domain)."""
        config = DomainMapperConfig(
            source="probe",
            extract_head=False,
            verbose=False,
        )
        results = await mapper.scan("docs.crawl4ai.com", config)
        # At least one probed path is expected to be reported.
        assert len(results) >= 1