Files
crawl4ai/tests/test_issue_1484_css_selector.py
hafezparast 8de83a3590 fix: css_selector ignored in LXML scraping for raw:// URLs (#1484)
css_selector was skipped in _scrap() — only target_elements was
applied. Now css_selector filters the DOM first, then target_elements
narrows within that selection.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-12 20:00:33 +08:00

168 lines
5.5 KiB
Python

"""Tests for issue #1484: css_selector doesn't work but target_elements does.
The bug: When using raw:// URLs, css_selector was accepted by _scrap() but
never applied. Only target_elements worked.
"""
import pytest
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
# Flat fixture page: four sibling <div>s directly under <body> -- "header",
# "main-content" (two paragraphs), "sidebar" (one paragraph) and "footer".
# The distinct "Main paragraph" / "Sidebar content" / "Header content" /
# "Footer content" texts let a test tell which region survived scraping.
SAMPLE_HTML = """
<html>
<body>
<div class="header">Header content</div>
<div class="main-content">
<p>Main paragraph one with enough words to pass threshold easily here.</p>
<p>Main paragraph two with enough words to pass threshold easily here.</p>
</div>
<div class="sidebar">
<p>Sidebar content with enough words to pass the word count threshold easily.</p>
</div>
<div class="footer">Footer content here</div>
</body>
</html>
"""
# Nested fixture page: a "wrapper" <div> holding two children -- "el-drawer"
# (which contains "drawer-body" with the "Drawer content" paragraph) and
# "other" (with the "Other content" paragraph). Used to exercise descendant
# selectors such as ".el-drawer .drawer-body".
NESTED_HTML = """
<html>
<body>
<div class="wrapper">
<div class="el-drawer">
<div class="drawer-body">
<p>Drawer content with enough words to pass threshold filter check.</p>
</div>
</div>
<div class="other">
<p>Other content with enough words to pass threshold filter easily here.</p>
</div>
</div>
</body>
</html>
"""
@pytest.fixture
def scraper():
    """Build the LXML scraping strategy instance handed to a test."""
    strategy = LXMLWebScrapingStrategy()
    return strategy
class TestCssSelectorBasic:
    """Behaviour of ``css_selector`` on its own, without ``target_elements``."""

    def test_css_selector_filters_content(self, scraper):
        """Only content under the matched element should survive."""
        scraped = scraper._scrap(
            url="raw://test", html=SAMPLE_HTML, css_selector=".main-content"
        )
        assert scraped is not None
        # Use the markdown entry when it is present and non-empty; otherwise
        # fall back to the cleaned HTML entry.
        markdown = scraped.get("markdown", "")
        content = markdown if markdown else scraped.get("cleaned_html", "")
        assert "Main paragraph" in content
        assert "Sidebar content" not in content

    def test_css_selector_none_returns_full_content(self, scraper):
        """Passing ``css_selector=None`` should leave the page unfiltered."""
        scraped = scraper._scrap(
            url="raw://test", html=SAMPLE_HTML, css_selector=None
        )
        assert scraped is not None
        cleaned = scraped.get("cleaned_html", "")
        assert "Main paragraph" in cleaned
        assert "Sidebar content" in cleaned

    def test_css_selector_no_match_returns_full_content(self, scraper):
        """A selector that matches nothing should fall back to the full page."""
        scraped = scraper._scrap(
            url="raw://test", html=SAMPLE_HTML, css_selector=".nonexistent-class"
        )
        assert scraped is not None
        cleaned = scraped.get("cleaned_html", "")
        assert "Main paragraph" in cleaned

    def test_css_selector_invalid_falls_back(self, scraper):
        """A syntactically invalid selector should still yield a result."""
        scraped = scraper._scrap(
            url="raw://test", html=SAMPLE_HTML, css_selector="[[[invalid"
        )
        assert scraped is not None

    def test_css_selector_nested_selector(self, scraper):
        """Descendant selectors such as '.el-drawer .drawer-body' should work."""
        scraped = scraper._scrap(
            url="raw://test",
            html=NESTED_HTML,
            css_selector=".el-drawer .drawer-body",
        )
        assert scraped is not None
        cleaned = scraped.get("cleaned_html", "")
        assert "Drawer content" in cleaned
        assert "Other content" not in cleaned
class TestCssSelectorWithTargetElements:
    """css_selector combined with target_elements, plus further selector shapes.

    Every test that expects css_selector to take effect asserts both that the
    matched content is present AND that some unmatched content is absent.
    Presence alone would also hold if css_selector were silently ignored,
    which is the regression tracked by issue #1484.
    """

    def test_css_selector_combined_with_target_elements(self, scraper):
        """css_selector and target_elements together should narrow content."""
        result = scraper._scrap(
            url="raw://test",
            html=SAMPLE_HTML,
            css_selector=".main-content",
            target_elements=["p"],
        )
        assert result is not None
        html = result.get("cleaned_html", "")
        assert "Main paragraph" in html
        assert "Sidebar content" not in html

    def test_target_elements_alone_still_works(self, scraper):
        """target_elements without css_selector should still work."""
        result = scraper._scrap(
            url="raw://test",
            html=SAMPLE_HTML,
            target_elements=[".main-content"],
        )
        assert result is not None
        html = result.get("cleaned_html", "")
        assert "Main paragraph" in html

    def test_css_selector_multiple_matches(self, scraper):
        """css_selector matching multiple elements should include all."""
        result = scraper._scrap(
            url="raw://test",
            html=SAMPLE_HTML,
            css_selector="div p",
        )
        assert result is not None
        html = result.get("cleaned_html", "")
        assert "Main paragraph one" in html
        assert "Main paragraph two" in html
        # The sidebar paragraph is also a <p> inside a <div>, so it is one of
        # the matches and must be kept as well.
        assert "Sidebar content" in html
        # The header text sits directly in a <div>, not in a <p>; it must be
        # gone, otherwise this test would pass with css_selector ignored.
        assert "Header content" not in html

    def test_css_selector_id_selector(self, scraper):
        """ID-based css_selector should work."""
        markup = '<html><body><div id="target"><p>Target content here with enough words.</p></div><div><p>Other stuff</p></div></body></html>'
        result = scraper._scrap(
            url="raw://test",
            html=markup,
            css_selector="#target",
        )
        assert result is not None
        cleaned = result.get("cleaned_html", "")
        assert "Target content" in cleaned
        # Content outside #target must be excluded; without this check the
        # test cannot tell a working selector from an ignored one.
        assert "Other stuff" not in cleaned

    def test_css_selector_excludes_non_matching(self, scraper):
        """Content outside css_selector match should not appear."""
        result = scraper._scrap(
            url="raw://test",
            html=SAMPLE_HTML,
            css_selector=".sidebar",
        )
        assert result is not None
        html = result.get("cleaned_html", "")
        assert "Sidebar content" in html
        assert "Main paragraph" not in html