crawl4ai/docs/examples/domain_mapper/domain_mapper_demo.py

"""
DomainMapper Demo — Comprehensive domain URL discovery

Discovers all URLs under a domain using 8 sources:
sitemap, Common Crawl, Wayback Machine, Certificate Transparency,
path probing, robots.txt mining, RSS/Atom feeds, homepage link extraction.

Usage:
    python domain_mapper_demo.py [domain] [--source SOURCE] [--query QUERY]

Examples:
    python domain_mapper_demo.py superdesign.dev
    python domain_mapper_demo.py example.com --source sitemap+crt+probe
    python domain_mapper_demo.py docs.crawl4ai.com --query "extraction tutorial"
"""

import asyncio
import argparse
from collections import defaultdict
from crawl4ai import DomainMapper, DomainMapperConfig


async def main():
    parser = argparse.ArgumentParser(description="DomainMapper Demo")
    parser.add_argument("domain", help="Domain to scan (e.g., example.com)")
    parser.add_argument("--source", default="sitemap+cc+crt+probe+robots+homepage",
                        help="Discovery sources (default: sitemap+cc+crt+probe+robots+homepage)")
    parser.add_argument("--query", default=None, help="BM25 relevance query")
    parser.add_argument("--max-urls", type=int, default=-1, help="Max URLs to return")
    parser.add_argument("--no-head", action="store_true", help="Skip head extraction")
    args = parser.parse_args()

    config = DomainMapperConfig(
        source=args.source,
        extract_head=not args.no_head,
        query=args.query,
        max_urls=args.max_urls,
        verbose=True,
        force=True,
    )

    async with DomainMapper() as mapper:
        results = await mapper.scan(args.domain, config)

    # Group by host
    by_host = defaultdict(list)
    for r in results:
        by_host[r["host"]].append(r)

    # Print results
    print("\n" + "=" * 70)
    print(f"RESULTS: {len(results)} URLs across {len(by_host)} hosts")
    print("=" * 70)

    for host in sorted(by_host.keys()):
        urls = by_host[host]
        print(f"\n  {host} ({len(urls)} URLs):")
        for r in urls[:10]:
            title = r.get("head_data", {}).get("title", "")
            score = r.get("relevance_score")
            line = f"    [{r['source']}] {r['url']}"
            if title:
                line += f"\n      Title: {title}"
            if score is not None:
                line += f"\n      Score: {score:.3f}"
            print(line)
        if len(urls) > 10:
            print(f"    ... and {len(urls) - 10} more")

    # Source breakdown
    source_counts = defaultdict(int)
    for r in results:
        for s in r["source"].split("+"):
            source_counts[s] += 1

    print(f"\nSource breakdown:")
    for source, count in sorted(source_counts.items(), key=lambda x: -x[1]):
        print(f"  {source}: {count} URLs")


if __name__ == "__main__":
    asyncio.run(main())