mirror of
https://github.com/unclecode/crawl4ai.git
synced 2026-06-10 15:58:15 +00:00
Add DomainMapper class that discovers all URLs under a domain using 8 sources: sitemap, Common Crawl, Wayback Machine, Certificate Transparency (crt.sh), path probing, robots.txt mining, RSS/Atom feeds, and homepage link extraction. Key features: - Subdomain discovery via crt.sh, Wayback, CC, and DNS guessing - Soft-404 detection: fingerprints SPA sites and filters fake pages - Per-host scanning with parallel execution across discovered hosts - URL normalization, deduplication, and source attribution - BM25 relevance scoring with head metadata extraction - Nonsense filter for static assets, webpack chunks, Wayback garbage For superdesign.dev: finds 171 URLs across 11 hosts in ~13s (vs 4 URLs from AsyncUrlSeeder) New files: - crawl4ai/domain_mapper.py (DomainMapper class) - crawl4ai/async_configs.py (DomainMapperConfig) - docs/md_v2/core/domain-mapping.md (documentation) - docs/examples/domain_mapper/domain_mapper_demo.py - 67 tests across unit/integration/adversarial/regression (cherry picked from commit 2d10534a8742177f1d5f521e3174ae66591d3533)
83 lines
2.7 KiB
Python
83 lines
2.7 KiB
Python
"""
|
|
DomainMapper Demo — Comprehensive domain URL discovery

Discovers all URLs under a domain using 8 sources:
sitemap, Common Crawl, Wayback Machine, Certificate Transparency,
path probing, robots.txt mining, RSS/Atom feeds, homepage link extraction.

Usage:
    python domain_mapper_demo.py domain [--source SOURCE] [--query QUERY]
                                        [--max-urls MAX_URLS] [--no-head]

Examples:
    python domain_mapper_demo.py superdesign.dev
    python domain_mapper_demo.py example.com --source sitemap+crt+probe
    python domain_mapper_demo.py docs.crawl4ai.com --query "extraction tutorial"
|
|
"""
|
|
|
|
import asyncio
|
|
import argparse
|
|
from collections import defaultdict
|
|
from crawl4ai import DomainMapper, DomainMapperConfig
|
|
|
|
|
|
async def main():
    """Run the DomainMapper demo from the command line.

    Parses the CLI arguments, scans the requested domain with
    ``DomainMapper``, then prints:

    * a header with the total number of URLs and hosts,
    * the discovered URLs grouped by host (at most 10 shown per host),
      each with its source tag and, when available, title and
      relevance score,
    * a per-source URL count, highest count first.
    """
    parser = argparse.ArgumentParser(description="DomainMapper Demo")
    parser.add_argument("domain", help="Domain to scan (e.g., example.com)")
    parser.add_argument(
        "--source",
        default="sitemap+cc+crt+probe+robots+homepage",
        help="Discovery sources (default: sitemap+cc+crt+probe+robots+homepage)",
    )
    parser.add_argument("--query", default=None, help="BM25 relevance query")
    parser.add_argument("--max-urls", type=int, default=-1, help="Max URLs to return")
    parser.add_argument("--no-head", action="store_true", help="Skip head extraction")
    args = parser.parse_args()

    config = DomainMapperConfig(
        source=args.source,
        extract_head=not args.no_head,
        query=args.query,
        max_urls=args.max_urls,
        verbose=True,
        # NOTE(review): presumably bypasses any cached results — confirm
        # against DomainMapperConfig.
        force=True,
    )

    async with DomainMapper() as mapper:
        results = await mapper.scan(args.domain, config)

    # Group by host
    by_host = defaultdict(list)
    for r in results:
        by_host[r["host"]].append(r)

    # Print results
    print("\n" + "=" * 70)
    print(f"RESULTS: {len(results)} URLs across {len(by_host)} hosts")
    print("=" * 70)

    for host in sorted(by_host):
        urls = by_host[host]
        print(f"\n {host} ({len(urls)} URLs):")
        # Only the first 10 entries per host are listed to keep output short.
        for r in urls[:10]:
            # "head_data" may be missing *or* explicitly None (e.g. when head
            # extraction is skipped); `or {}` covers both, whereas a plain
            # .get(..., {}) default only covers the missing-key case.
            head_data = r.get("head_data") or {}
            title = head_data.get("title", "")
            score = r.get("relevance_score")
            line = f" [{r['source']}] {r['url']}"
            if title:
                line += f"\n Title: {title}"
            if score is not None:
                line += f"\n Score: {score:.3f}"
            print(line)
        if len(urls) > 10:
            print(f" ... and {len(urls) - 10} more")

    # Source breakdown: a result's "source" is a "+"-joined list of tags,
    # so one URL is counted once for every source that reported it.
    source_counts = defaultdict(int)
    for r in results:
        for s in r["source"].split("+"):
            source_counts[s] += 1

    print("\nSource breakdown:")
    for source, count in sorted(source_counts.items(), key=lambda x: -x[1]):
        print(f" {source}: {count} URLs")
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the async demo to completion on a fresh event loop.
    asyncio.run(main())
|