"""Unit tests for DomainMapper: soft-404, robots.txt, feeds, normalization, nonsense filter."""
import asyncio
import hashlib
import pytest
from unittest.mock import AsyncMock, MagicMock, patch
from crawl4ai.domain_mapper import (
DomainMapper,
Soft404Fingerprint,
_NONSENSE_SUFFIXES,
_ASSET_EXTENSIONS,
)
# ════════════════════════════════════════════════════════════════════════
# Soft-404 Detection
# ════════════════════════════════════════════════════════════════════════
class TestSoft404Detection:
    """Checks for ``DomainMapper._is_soft_404(status, body, fingerprint)``.

    Every test creates the mapper with ``DomainMapper.__new__`` so that
    ``__init__`` is not executed; only the method under test is exercised.

    NOTE(review): the HTML markup inside the ``body`` fixtures was
    reconstructed (the tags had been stripped from the literals, leaving one
    of them syntactically invalid). Confirm the exact markup against the
    original fixtures.
    """

    def test_is_soft_404_title_match(self):
        """A 200 response whose title equals the fingerprint title is flagged."""
        mapper = DomainMapper.__new__(DomainMapper)
        fp = Soft404Fingerprint(
            status_code=200, title="Page Not Found",
            content_length=1234, body_hash="abc123",
        )
        body = (
            b"<html><head><title>Page Not Found</title></head>"
            b"<body>Oops</body></html>"
        )
        assert mapper._is_soft_404(200, body, fp) is True

    def test_is_soft_404_hash_match(self):
        """A matching body hash is enough even when the titles differ."""
        mapper = DomainMapper.__new__(DomainMapper)
        body = (
            b"<html><head><title>Different Title</title></head>"
            b"<body>Error content</body></html>"
        )
        # The fingerprint hash is the MD5 of the first 2048 bytes of the body.
        body_hash = hashlib.md5(body[:2048]).hexdigest()
        fp = Soft404Fingerprint(
            status_code=200, title="Other Title",
            content_length=len(body), body_hash=body_hash,
        )
        assert mapper._is_soft_404(200, body, fp) is True

    def test_is_soft_404_real_404(self):
        """A genuine 404 status is never reported as a *soft* 404."""
        mapper = DomainMapper.__new__(DomainMapper)
        fp = Soft404Fingerprint(
            status_code=200, title="Not Found",
            content_length=100, body_hash="abc",
        )
        body = b"<html><head><title>Not Found</title></head></html>"
        # Real 404 status — NOT a soft-404
        assert mapper._is_soft_404(404, body, fp) is False

    def test_is_soft_404_no_fingerprint(self):
        """Without a fingerprint there is nothing to compare against."""
        mapper = DomainMapper.__new__(DomainMapper)
        body = b"<html><body>Anything</body></html>"
        assert mapper._is_soft_404(200, body, None) is False

    def test_is_soft_404_different_content(self):
        """A page with a different title and hash is not flagged."""
        mapper = DomainMapper.__new__(DomainMapper)
        fp = Soft404Fingerprint(
            status_code=200, title="Not Found",
            content_length=100, body_hash="abc123",
        )
        body = (
            b"<html><head><title>Real Page</title></head>"
            b"<body>Actual content here</body></html>"
        )
        assert mapper._is_soft_404(200, body, fp) is False

    def test_is_soft_404_no_title_in_body(self):
        """A body without any <title> element must not match on title."""
        mapper = DomainMapper.__new__(DomainMapper)
        fp = Soft404Fingerprint(
            status_code=200, title="Not Found",
            content_length=100, body_hash="abc123",
        )
        body = b"<html><body>No title tag</body></html>"
        assert mapper._is_soft_404(200, body, fp) is False
# ════════════════════════════════════════════════════════════════════════
# robots.txt Parsing
# ════════════════════════════════════════════════════════════════════════
class TestRobotsTxtParsing:
    """Checks for ``DomainMapper._scan_robots_txt`` with a mocked HTTP client."""

    @staticmethod
    def _mapper_returning(status_code, text=None):
        """Build a bare mapper whose ``client.get`` yields one canned response.

        ``text`` is only assigned to the response when it is given, so a
        caller that omits it gets the mock's auto-created attribute.
        """
        response = MagicMock()
        response.status_code = status_code
        if text is not None:
            response.text = text

        # __new__ skips __init__; only the attributes set here exist.
        mapper = DomainMapper.__new__(DomainMapper)
        mapper.logger = None
        mapper.client = AsyncMock()
        mapper.client.get = AsyncMock(return_value=response)
        return mapper

    @pytest.mark.asyncio
    async def test_parse_sitemap_directives(self):
        """Sitemap lines and Disallow paths are both collected."""
        lines = (
            "User-agent: *",
            "Disallow: /private/",
            "Sitemap: https://example.com/sitemap.xml",
            "Sitemap: https://example.com/sitemap-posts.xml",
        )
        mapper = self._mapper_returning(200, "".join(f"{line}\n" for line in lines))

        from crawl4ai.async_configs import DomainMapperConfig
        found_sitemaps, found_paths = await mapper._scan_robots_txt(
            "example.com", DomainMapperConfig()
        )

        assert len(found_sitemaps) == 2
        assert "https://example.com/sitemap.xml" in found_sitemaps
        assert "/private/" in found_paths

    @pytest.mark.asyncio
    async def test_parse_disallow_ignores_wildcards(self):
        """Wildcard rules and the bare root path are left out of the paths."""
        lines = (
            "User-agent: *",
            "Disallow: /admin/",
            "Disallow: /search?*",
            "Disallow: /",
            "Allow: /public/",
        )
        mapper = self._mapper_returning(200, "".join(f"{line}\n" for line in lines))

        from crawl4ai.async_configs import DomainMapperConfig
        result = await mapper._scan_robots_txt("example.com", DomainMapperConfig())
        _, found_paths = result

        for expected in ("/admin/", "/public/"):
            assert expected in found_paths
        # Wildcards should be skipped
        assert "/search?*" not in found_paths
        # Single "/" is too short (len <= 1)
        assert "/" not in found_paths

    @pytest.mark.asyncio
    async def test_empty_robots(self):
        """A 404 for robots.txt yields two empty lists."""
        mapper = self._mapper_returning(404)

        from crawl4ai.async_configs import DomainMapperConfig
        found_sitemaps, found_paths = await mapper._scan_robots_txt(
            "example.com", DomainMapperConfig()
        )

        assert found_sitemaps == []
        assert found_paths == []
# ════════════════════════════════════════════════════════════════════════
# Feed Parsing
# ════════════════════════════════════════════════════════════════════════
class TestFeedParsing:
    """Checks for ``DomainMapper._parse_feed_xml(xml_text, feed_url)``.

    NOTE(review): the XML inside the feed fixtures was reconstructed — the
    element tags had been stripped from the literals, leaving fixtures that
    could not contain the URLs the assertions expect. The element layout
    (and the second Atom entry URL) are best-effort; confirm against the
    original fixtures.
    """

    def test_parse_rss_feed(self):
        """Each RSS ``<item>``'s ``<link>`` text is returned as a URL."""
        # No XML declaration and no leading whitespace, so the document is
        # well-formed regardless of how the parser treats the prolog.
        rss = (
            '<rss version="2.0">\n'
            "<channel>\n"
            "<item><link>https://example.com/post-1</link></item>\n"
            "<item><link>https://example.com/post-2</link></item>\n"
            "</channel>\n"
            "</rss>\n"
        )
        mapper = DomainMapper.__new__(DomainMapper)
        urls = mapper._parse_feed_xml(rss, "https://example.com/feed")
        assert len(urls) == 2
        assert "https://example.com/post-1" in urls
        assert "https://example.com/post-2" in urls

    def test_parse_atom_feed(self):
        """Atom ``<entry>`` links carried in ``href`` attributes are returned."""
        atom = (
            '<feed xmlns="http://www.w3.org/2005/Atom">\n'
            '<entry><link href="https://example.com/entry-1"/></entry>\n'
            '<entry><link href="https://example.com/entry-2"/></entry>\n'
            "</feed>\n"
        )
        mapper = DomainMapper.__new__(DomainMapper)
        urls = mapper._parse_feed_xml(atom, "https://example.com/atom.xml")
        assert len(urls) == 2
        assert "https://example.com/entry-1" in urls

    def test_parse_rss_guid_fallback(self):
        """An item with no ``<link>`` falls back to its ``<guid>`` URL."""
        rss = (
            '<rss version="2.0">\n'
            "<channel>\n"
            "<item>\n"
            '<guid isPermaLink="true">https://example.com/guid-post</guid>\n'
            "</item>\n"
            "</channel>\n"
            "</rss>\n"
        )
        mapper = DomainMapper.__new__(DomainMapper)
        urls = mapper._parse_feed_xml(rss, "https://example.com/feed")
        assert "https://example.com/guid-post" in urls

    def test_malformed_feed(self):
        """Input that is not XML produces an empty list rather than raising."""
        mapper = DomainMapper.__new__(DomainMapper)
        urls = mapper._parse_feed_xml("not xml at all <><>", "https://example.com/feed")
        assert urls == []
# ════════════════════════════════════════════════════════════════════════
# URL Normalization & Dedup
# ════════════════════════════════════════════════════════════════════════
class TestNormalizationDedup:
    """Checks for ``DomainMapper._normalize_and_dedup(results, domain)``."""

    @staticmethod
    def _record(url, source):
        """Return a result dict for ``url`` on example.com with a valid status."""
        return {
            "url": url,
            "host": "example.com",
            "source": source,
            "status": "valid",
            "head_data": {},
        }

    def test_trailing_slash_dedup(self):
        """URLs differing only by a trailing slash collapse into one entry."""
        entries = [
            self._record("https://example.com/about", "sitemap"),
            self._record("https://example.com/about/", "probe"),
        ]
        mapper = DomainMapper.__new__(DomainMapper)

        deduped = mapper._normalize_and_dedup(entries, "example.com")

        assert len(deduped) == 1
        merged_source = deduped[0]["source"]
        assert any(tag in merged_source for tag in ("probe", "sitemap"))

    def test_source_merging(self):
        """Duplicate URLs keep both origins, joined with ``+``."""
        entries = [
            self._record("https://example.com/page", "sitemap"),
            self._record("https://example.com/page", "homepage"),
        ]
        mapper = DomainMapper.__new__(DomainMapper)

        deduped = mapper._normalize_and_dedup(entries, "example.com")

        assert len(deduped) == 1
        origin_tags = set(deduped[0]["source"].split("+"))
        assert "sitemap" in origin_tags
        assert "homepage" in origin_tags
# ════════════════════════════════════════════════════════════════════════
# Nonsense Filter
# ════════════════════════════════════════════════════════════════════════
class TestNonsenseFilter:
    """Checks for ``DomainMapper._is_nonsense(url)``.

    ``test_filters_*`` cases expect ``True`` (URL is discarded);
    ``test_keeps_*`` cases expect ``False`` (URL is retained).
    """

    @staticmethod
    def _bare_mapper():
        """Create a mapper via ``__new__`` so ``__init__`` is not run."""
        return DomainMapper.__new__(DomainMapper)

    def test_filters_robots_txt(self):
        """robots.txt itself is filtered."""
        checker = self._bare_mapper()
        assert checker._is_nonsense("https://example.com/robots.txt") is True

    def test_filters_sitemap_xml(self):
        """sitemap.xml itself is filtered."""
        checker = self._bare_mapper()
        assert checker._is_nonsense("https://example.com/sitemap.xml") is True

    def test_filters_js_assets(self):
        """JavaScript bundles are filtered."""
        checker = self._bare_mapper()
        assert checker._is_nonsense("https://example.com/app.bundle.js") is True

    def test_filters_css_assets(self):
        """Stylesheets are filtered."""
        checker = self._bare_mapper()
        assert checker._is_nonsense("https://example.com/style.css") is True

    def test_filters_images(self):
        """PNG and JPG images are filtered."""
        checker = self._bare_mapper()
        for image_url in (
            "https://example.com/logo.png",
            "https://example.com/photo.jpg",
        ):
            assert checker._is_nonsense(image_url) is True

    def test_filters_next_js_chunks(self):
        """A script under ``/_next/static/chunks/`` is filtered."""
        checker = self._bare_mapper()
        chunk_url = "https://example.com/_next/static/chunks/main.js"
        assert checker._is_nonsense(chunk_url) is True

    def test_filters_wayback_garbage(self):
        """Paths beginning with a percent-encoded ``\\n`` sequence are filtered."""
        checker = self._bare_mapper()
        for junk_url in (
            "https://example.com/%5Cn-",
            "https://example.com/%5CnJoin",
        ):
            assert checker._is_nonsense(junk_url) is True

    def test_keeps_login(self):
        """/login is retained."""
        checker = self._bare_mapper()
        assert checker._is_nonsense("https://example.com/login") is False

    def test_keeps_dashboard(self):
        """/dashboard is retained."""
        checker = self._bare_mapper()
        assert checker._is_nonsense("https://example.com/dashboard") is False

    def test_keeps_docs(self):
        """/docs is retained."""
        checker = self._bare_mapper()
        assert checker._is_nonsense("https://example.com/docs") is False

    def test_keeps_api_docs(self):
        """/api-docs is retained."""
        checker = self._bare_mapper()
        assert checker._is_nonsense("https://example.com/api-docs") is False

    def test_filters_dotfiles(self):
        """Dot-prefixed files and directories are filtered."""
        checker = self._bare_mapper()
        for hidden_url in (
            "https://example.com/.env",
            "https://example.com/.git/config",
        ):
            assert checker._is_nonsense(hidden_url) is True

    def test_filters_fonts(self):
        """Web font files are filtered."""
        checker = self._bare_mapper()
        assert checker._is_nonsense("https://example.com/fonts/arial.woff2") is True
# ════════════════════════════════════════════════════════════════════════
# crt.sh Response Parsing
# ════════════════════════════════════════════════════════════════════════
class TestCrtShParsing:
    """Checks for ``DomainMapper._discover_via_crt`` with a mocked HTTP client."""

    @staticmethod
    def _mapper_with_get(get_mock):
        """Build a bare mapper whose ``client.get`` is the supplied mock."""
        # __new__ skips __init__; only the attributes set here exist.
        mapper = DomainMapper.__new__(DomainMapper)
        mapper.logger = None
        mapper.client = AsyncMock()
        mapper.client.get = get_mock
        return mapper

    @staticmethod
    def _json_response(entries):
        """Return a 200 response mock whose ``json()`` gives ``entries``."""
        response = MagicMock()
        response.status_code = 200
        response.json.return_value = entries
        return response

    @pytest.mark.asyncio
    async def test_parse_crt_response(self):
        """Hosts are gathered from the entries, including multi-line name_value."""
        certificates = [
            {"common_name": "example.com", "name_value": "example.com"},
            {"common_name": "docs.example.com", "name_value": "docs.example.com\napi.example.com"},
            {"common_name": "*.example.com", "name_value": "*.example.com"},
        ]
        get_mock = AsyncMock(return_value=self._json_response(certificates))
        mapper = self._mapper_with_get(get_mock)

        from crawl4ai.async_configs import DomainMapperConfig
        hosts = await mapper._discover_via_crt("example.com", DomainMapperConfig())

        for expected_host in ("example.com", "docs.example.com", "api.example.com"):
            assert expected_host in hosts
        # The raw wildcard entry must not appear in the result.
        assert "*.example.com" not in hosts

    @pytest.mark.asyncio
    async def test_crt_filters_unrelated_domains(self):
        """Names that do not belong to the queried domain are dropped."""
        certificates = [
            {"common_name": "example.com", "name_value": "example.com"},
            {"common_name": "evil.com", "name_value": "evil.com"},
        ]
        get_mock = AsyncMock(return_value=self._json_response(certificates))
        mapper = self._mapper_with_get(get_mock)

        from crawl4ai.async_configs import DomainMapperConfig
        hosts = await mapper._discover_via_crt("example.com", DomainMapperConfig())

        assert "example.com" in hosts
        assert "evil.com" not in hosts

    @pytest.mark.asyncio
    async def test_crt_handles_failure(self):
        """An exception from the client results in an empty set."""
        failing_get = AsyncMock(side_effect=Exception("timeout"))
        mapper = self._mapper_with_get(failing_get)

        from crawl4ai.async_configs import DomainMapperConfig
        hosts = await mapper._discover_via_crt("example.com", DomainMapperConfig())

        assert hosts == set()
# ════════════════════════════════════════════════════════════════════════
# Homepage Link Extraction
# ════════════════════════════════════════════════════════════════════════
class TestHomepageLinkExtraction:
    """Checks for ``DomainMapper._scan_homepage`` with a mocked HTTP client.

    NOTE(review): the HTML in both fixtures was reconstructed — the anchor
    and ``<link>`` tags had been stripped from the literals, so none of the
    asserted paths were present in the input. The tag attributes chosen here
    (e.g. ``rel="alternate"``) are best-effort; confirm against the original
    fixtures.
    """

    @staticmethod
    def _mapper_serving(html):
        """Build a bare mapper whose ``client.get`` returns ``html`` from example.com."""
        # __new__ skips __init__; only the attributes set here exist.
        mapper = DomainMapper.__new__(DomainMapper)
        mapper.logger = None
        mapper.client = AsyncMock()
        resp = MagicMock()
        resp.status_code = 200
        resp.text = html
        resp.url = "https://example.com/"
        mapper.client.get = AsyncMock(return_value=resp)
        return mapper

    @pytest.mark.asyncio
    async def test_extract_internal_links(self):
        """Same-site anchors are returned; links to other hosts are not."""
        html = """<html>
<head><title>Test</title></head>
<body>
<a href="/about">About</a>
<a href="/blog">Blog</a>
<a href="https://external.com/page">External</a>
</body>
</html>
"""
        mapper = self._mapper_serving(html)
        from crawl4ai.async_configs import DomainMapperConfig
        urls = await mapper._scan_homepage("example.com", "example.com", DomainMapperConfig())
        # Should have internal links, not external
        assert any("/about" in u for u in urls)
        assert any("/blog" in u for u in urls)
        assert not any("external.com" in u for u in urls)

    @pytest.mark.asyncio
    async def test_extract_link_tags(self):
        """URLs from ``<link>`` elements are returned alongside anchors."""
        html = """<html>
<head>
<title>Test</title>
<link rel="alternate" hreflang="es" href="/es/">
</head>
<body>
<a href="/features">Link</a>
</body>
</html>"""
        mapper = self._mapper_serving(html)
        from crawl4ai.async_configs import DomainMapperConfig
        urls = await mapper._scan_homepage("example.com", "example.com", DomainMapperConfig())
        # Should include link tags
        assert any("/es/" in u for u in urls)
        assert any("/features" in u for u in urls)