mirror of
https://github.com/unclecode/crawl4ai.git
synced 2026-06-10 15:58:15 +00:00
Add DomainMapper class that discovers all URLs under a domain using 8 sources:
sitemap, Common Crawl, Wayback Machine, Certificate Transparency (crt.sh),
path probing, robots.txt mining, RSS/Atom feeds, and homepage link extraction.

Key features:
- Subdomain discovery via crt.sh, Wayback, CC, and DNS guessing
- Soft-404 detection: fingerprints SPA sites and filters fake pages
- Per-host scanning with parallel execution across discovered hosts
- URL normalization, deduplication, and source attribution
- BM25 relevance scoring with head metadata extraction
- Nonsense filter for static assets, webpack chunks, Wayback garbage

For superdesign.dev: finds 171 URLs across 11 hosts in ~13s (vs 4 URLs from AsyncUrlSeeder)

New files:
- crawl4ai/domain_mapper.py (DomainMapper class)
- crawl4ai/async_configs.py (DomainMapperConfig)
- docs/md_v2/core/domain-mapping.md (documentation)
- docs/examples/domain_mapper/domain_mapper_demo.py
- 67 tests across unit/integration/adversarial/regression

(cherry picked from commit 2d10534a8742177f1d5f521e3174ae66591d3533)
137 lines
4.7 KiB
Python
137 lines
4.7 KiB
Python
"""Adversarial tests for DomainMapper — edge cases, failures, tough scenarios."""
|
|
import asyncio
|
|
import pytest
|
|
import pytest_asyncio
|
|
from crawl4ai import DomainMapper, DomainMapperConfig
|
|
|
|
|
|
# Module-level pytest marker: applies the ``network`` mark to every test
# collected from this file.
pytestmark = pytest.mark.network
|
|
|
|
|
|
@pytest_asyncio.fixture
async def mapper():
    """Yield a default-constructed DomainMapper opened as an async context manager.

    The mapper stays open for the duration of the requesting test; the
    ``async with`` block exits once the test has finished.
    """
    async with DomainMapper() as domain_mapper:
        yield domain_mapper
|
|
|
|
|
|
class TestDomainMapperAdversarial:
    """Edge-case and failure-mode tests for ``DomainMapper.scan`` and ``DomainMapperConfig``."""

    @pytest.mark.asyncio
    async def test_nonexistent_domain(self, mapper):
        """Domain that doesn't exist should return empty, not crash."""
        config = DomainMapperConfig(
            source="sitemap+probe",
            extract_head=False,
            verbose=False,
        )
        results = await mapper.scan("thiswillneverexist12345678.dev", config)
        assert results == []

    @pytest.mark.asyncio
    async def test_invalid_source(self, mapper):
        """Invalid source should raise ValueError."""
        config = DomainMapperConfig(source="sitemap+bogus")
        with pytest.raises(ValueError, match="Invalid source"):
            await mapper.scan("example.com", config)

    @pytest.mark.asyncio
    async def test_empty_domain(self, mapper):
        """Empty domain string should not crash."""
        config = DomainMapperConfig(
            source="probe",
            extract_head=False,
            verbose=False,
        )
        results = await mapper.scan("", config)
        # Only the return type is checked; the contents are unspecified here.
        assert isinstance(results, list)

    @pytest.mark.asyncio
    async def test_domain_with_scheme(self, mapper):
        """Domain with https:// prefix should be handled."""
        config = DomainMapperConfig(
            source="sitemap",
            extract_head=False,
            verbose=False,
            max_urls=5,
        )
        results = await mapper.scan("https://docs.crawl4ai.com", config)
        assert len(results) >= 1

    @pytest.mark.asyncio
    async def test_soft_404_filtering_spa(self, mapper):
        """SPA sites (app.superdesign.dev) should have soft-404s filtered."""
        config = DomainMapperConfig(
            source="probe",
            extract_head=False,
            soft_404_detection=True,
            verbose=False,
        )
        results = await mapper.scan("app.superdesign.dev", config)
        # All probed paths on this SPA should be filtered as soft-404s
        # (the site returns 200 for every path with the same shell)
        probe_urls = [r for r in results if r["source"] == "probe"]
        assert not probe_urls, (
            f"Expected 0 valid probe URLs on SPA, got {len(probe_urls)}: {[r['url'] for r in probe_urls]}"
        )

    @pytest.mark.asyncio
    async def test_rate_limiting(self, mapper):
        """hits_per_sec=2 should not crash."""
        config = DomainMapperConfig(
            source="probe",
            extract_head=False,
            hits_per_sec=2,
            verbose=False,
        )
        results = await mapper.scan("docs.crawl4ai.com", config)
        assert isinstance(results, list)

    @pytest.mark.asyncio
    async def test_unicode_in_results(self, mapper):
        """Results should handle unicode URLs gracefully."""
        config = DomainMapperConfig(
            source="sitemap",
            extract_head=False,
            verbose=False,
        )
        results = await mapper.scan("docs.crawl4ai.com", config)
        # Every result entry must expose its URL as a plain ``str``.
        for r in results:
            assert isinstance(r["url"], str)

    @pytest.mark.asyncio
    async def test_config_clone(self):
        """DomainMapperConfig.clone() should work."""
        config = DomainMapperConfig(source="sitemap", max_urls=10)
        cloned = config.clone(max_urls=20, force=True)
        assert cloned.max_urls == 20
        assert cloned.force is True
        assert cloned.source == "sitemap"  # inherited

    @pytest.mark.asyncio
    async def test_concurrent_host_scanning(self, mapper):
        """Multiple hosts scanned in parallel should not race."""
        from collections import Counter

        config = DomainMapperConfig(
            source="sitemap+crt+probe",
            extract_head=False,
            concurrency=20,
            verbose=False,
            force=True,
        )
        results = await mapper.scan("superdesign.dev", config)
        assert isinstance(results, list)
        assert len(results) > 0

        # Verify no duplicate URLs in results: normalized dedup should prevent
        # exact duplicates even when several hosts are scanned concurrently.
        urls = [r["url"] for r in results]
        duplicates = [url for url, count in Counter(urls).items() if count > 1]
        assert not duplicates, f"Duplicate URLs in results: {duplicates}"

    @pytest.mark.asyncio
    async def test_probe_without_crt(self, mapper):
        """source='probe' alone should still work (just scans base domain)."""
        config = DomainMapperConfig(
            source="probe",
            extract_head=False,
            verbose=False,
        )
        results = await mapper.scan("docs.crawl4ai.com", config)
        # Should find at least / and /docs; only non-emptiness is asserted.
        assert len(results) >= 1
|