mirror of
https://github.com/unclecode/crawl4ai.git
synced 2026-06-10 15:58:15 +00:00
Add DomainMapper class that discovers all URLs under a domain using 8 sources: sitemap, Common Crawl, Wayback Machine, Certificate Transparency (crt.sh), path probing, robots.txt mining, RSS/Atom feeds, and homepage link extraction. Key features: - Subdomain discovery via crt.sh, Wayback, CC, and DNS guessing - Soft-404 detection: fingerprints SPA sites and filters fake pages - Per-host scanning with parallel execution across discovered hosts - URL normalization, deduplication, and source attribution - BM25 relevance scoring with head metadata extraction - Nonsense filter for static assets, webpack chunks, Wayback garbage For superdesign.dev: finds 171 URLs across 11 hosts in ~13s (vs 4 URLs from AsyncUrlSeeder) New files: - crawl4ai/domain_mapper.py (DomainMapper class) - crawl4ai/async_configs.py (DomainMapperConfig) - docs/md_v2/core/domain-mapping.md (documentation) - docs/examples/domain_mapper/domain_mapper_demo.py - 67 tests across unit/integration/adversarial/regression (cherry picked from commit 2d10534a8742177f1d5f521e3174ae66591d3533)
139 lines
4.8 KiB
Python
139 lines
4.8 KiB
Python
"""Integration tests for DomainMapper — hits real endpoints."""
|
|
import asyncio
|
|
import pytest
|
|
import pytest_asyncio
|
|
from crawl4ai import DomainMapper, DomainMapperConfig
|
|
|
|
|
|
pytestmark = pytest.mark.network
|
|
|
|
|
|
@pytest_asyncio.fixture
|
|
async def mapper():
|
|
async with DomainMapper() as m:
|
|
yield m
|
|
|
|
|
|
class TestDomainMapperE2E:
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_scan_superdesign_dev(self, mapper):
|
|
"""Full scan of superdesign.dev should find >=30 URLs across >=5 hosts."""
|
|
config = DomainMapperConfig(
|
|
source="sitemap+cc+crt+probe+robots+homepage",
|
|
extract_head=False,
|
|
force=True,
|
|
verbose=False,
|
|
)
|
|
results = await mapper.scan("superdesign.dev", config)
|
|
hosts = {r["host"] for r in results}
|
|
|
|
assert len(results) >= 20, f"Expected >=20 URLs, got {len(results)}"
|
|
assert len(hosts) >= 4, f"Expected >=4 hosts, got {len(hosts)}: {hosts}"
|
|
assert any("docs.superdesign.dev" == r["host"] for r in results), \
|
|
"docs.superdesign.dev should be discovered"
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_scan_docs_crawl4ai(self, mapper):
|
|
"""docs.crawl4ai.com has a known good sitemap."""
|
|
config = DomainMapperConfig(
|
|
source="sitemap",
|
|
extract_head=False,
|
|
force=True,
|
|
verbose=False,
|
|
)
|
|
results = await mapper.scan("docs.crawl4ai.com", config)
|
|
assert len(results) >= 5, f"Expected >=5 URLs from sitemap, got {len(results)}"
|
|
assert all(r["source"] == "sitemap" for r in results)
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_sitemap_only_source(self, mapper):
|
|
"""source='sitemap' should not hit CC, crt, or wayback."""
|
|
config = DomainMapperConfig(
|
|
source="sitemap",
|
|
extract_head=False,
|
|
force=True,
|
|
verbose=False,
|
|
)
|
|
results = await mapper.scan("superdesign.dev", config)
|
|
sources = {r["source"] for r in results}
|
|
# Should only have sitemap source
|
|
for s in sources:
|
|
for part in s.split("+"):
|
|
assert part == "sitemap", f"Unexpected source: {part}"
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_crt_discovers_subdomains(self, mapper):
|
|
"""crt source should discover subdomains for superdesign.dev."""
|
|
config = DomainMapperConfig(
|
|
source="crt+probe",
|
|
extract_head=False,
|
|
force=True,
|
|
verbose=False,
|
|
)
|
|
results = await mapper.scan("superdesign.dev", config)
|
|
hosts = {r["host"] for r in results}
|
|
# crt should find at least docs, app, cloud subdomains
|
|
assert len(hosts) >= 3, f"Expected >=3 hosts, got {len(hosts)}: {hosts}"
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_max_urls_limit(self, mapper):
|
|
"""max_urls should cap results."""
|
|
config = DomainMapperConfig(
|
|
source="sitemap+crt+probe",
|
|
extract_head=False,
|
|
max_urls=10,
|
|
force=True,
|
|
verbose=False,
|
|
)
|
|
results = await mapper.scan("superdesign.dev", config)
|
|
assert len(results) <= 10, f"Expected <=10 URLs, got {len(results)}"
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_source_attribution(self, mapper):
|
|
"""Each result should have a source field."""
|
|
config = DomainMapperConfig(
|
|
source="sitemap+probe",
|
|
extract_head=False,
|
|
force=True,
|
|
verbose=False,
|
|
)
|
|
results = await mapper.scan("docs.crawl4ai.com", config)
|
|
for r in results:
|
|
assert "source" in r
|
|
assert r["source"], "Source should not be empty"
|
|
assert "host" in r
|
|
assert "url" in r
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_head_extraction(self, mapper):
|
|
"""extract_head=True should populate head_data with titles."""
|
|
config = DomainMapperConfig(
|
|
source="sitemap",
|
|
extract_head=True,
|
|
max_urls=5,
|
|
force=True,
|
|
verbose=False,
|
|
)
|
|
results = await mapper.scan("docs.crawl4ai.com", config)
|
|
has_title = any(r.get("head_data", {}).get("title") for r in results)
|
|
assert has_title, "At least one result should have a title in head_data"
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_crawler_integration(self):
|
|
"""Test amap_domain() on AsyncWebCrawler works."""
|
|
from crawl4ai import AsyncWebCrawler
|
|
async with AsyncWebCrawler() as crawler:
|
|
results = await crawler.amap_domain(
|
|
"docs.crawl4ai.com",
|
|
DomainMapperConfig(
|
|
source="sitemap",
|
|
extract_head=False,
|
|
force=True,
|
|
verbose=False,
|
|
max_urls=5,
|
|
),
|
|
)
|
|
assert len(results) >= 1
|
|
assert all("url" in r for r in results)
|