Files
crawl4ai/tests/test_issue_1213_bm25_dedup.py
Maysam Hafezparast 57b0d09934 fix: deduplicate BM25ContentFilter output (#1213) (#1824)
BM25ContentFilter.filter_content() returned duplicate text chunks when
the same content appeared in multiple DOM elements. Added exact-text
deduplication after threshold filtering, keeping the first occurrence
in document order.

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-12 14:23:34 +08:00

179 lines
7.9 KiB
Python

"""
Tests for GitHub issue #1213: BM25ContentFilter returns multiple copies of
the same text when the DOM contains repeated content blocks.
Fix: added deduplication by chunk text in filter_content(), keeping the
first occurrence in document order.
"""
import pytest
from crawl4ai.content_filter_strategy import BM25ContentFilter
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _wrap_html(body_inner: str) -> str:
"""Wrap body HTML with boilerplate including a title for query extraction."""
return (
"<html><head><title>Python Programming Guide</title>"
'<meta name="description" content="Learn Python programming language"></head>'
f"<body>{body_inner}</body></html>"
)
# Provide enough varied paragraphs so BM25 IDF doesn't go negative.
# The "target" paragraph is the one we'll duplicate; the others provide corpus variety.
TARGET_TEXT = (
"Python is a versatile programming language widely used for web development, "
"data science, machine learning, and automation tasks across the industry."
)
FILLER_PARAGRAPHS = [
"<p>JavaScript powers interactive websites and runs in every modern browser environment.</p>",
"<p>Rust provides memory safety without garbage collection through its ownership model.</p>",
"<p>Go was designed at Google for building scalable network services and cloud infrastructure.</p>",
"<p>Ruby on Rails popularized convention over configuration in web application frameworks.</p>",
"<p>TypeScript adds static type checking to JavaScript for large-scale application development.</p>",
"<p>Java remains dominant in enterprise software and Android mobile application development.</p>",
"<p>C++ is essential for game engines, operating systems, and high-performance computing.</p>",
]
FILLER_BLOCK = "\n".join(FILLER_PARAGRAPHS)
def _make_test_html(target_copies: int = 1, extra_body: str = "") -> str:
"""Build HTML with `target_copies` of TARGET_TEXT + filler for BM25 variety."""
target_block = f"<p>{TARGET_TEXT}</p>\n" * target_copies
return _wrap_html(target_block + FILLER_BLOCK + extra_body)
# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------
class TestBM25Deduplication:
"""Core deduplication tests for issue #1213."""
def test_exact_duplicates_collapsed(self):
"""Identical paragraphs should appear only once in output."""
html = _make_test_html(target_copies=5)
filt = BM25ContentFilter(user_query="Python programming", bm25_threshold=0.01)
results = filt.filter_content(html)
matching = [r for r in results if "versatile" in r]
assert len(matching) == 1
def test_no_duplicates_unchanged(self):
"""Unique paragraphs should all be preserved (none removed by dedup)."""
html = _make_test_html(target_copies=1)
filt = BM25ContentFilter(user_query="programming languages", bm25_threshold=0.01)
results = filt.filter_content(html)
# At least some pass threshold; key point is no false dedup
assert len(results) >= 1
# All results should be unique strings
assert len(results) == len(set(results))
def test_mixed_unique_and_duplicate(self):
"""Duplicates removed, uniques kept."""
extra = f"<p>{TARGET_TEXT}</p><p>{TARGET_TEXT}</p>"
html = _make_test_html(target_copies=1, extra_body=extra)
filt = BM25ContentFilter(user_query="Python programming", bm25_threshold=0.01)
results = filt.filter_content(html)
matching = [r for r in results if "versatile" in r]
assert len(matching) == 1
def test_document_order_preserved(self):
"""First occurrence should be kept, not the last."""
first_text = "Python was created by Guido van Rossum in the early nineties as a successor to the ABC language."
second_text = "Python supports multiple programming paradigms including procedural and functional styles."
body = (
f"<p>{first_text}</p>"
f"<p>{second_text}</p>"
f"<p>{first_text}</p>" # duplicate
+ FILLER_BLOCK
)
html = _wrap_html(body)
filt = BM25ContentFilter(user_query="Python history", bm25_threshold=0.01)
results = filt.filter_content(html)
guido_results = [r for r in results if "Guido" in r]
assert len(guido_results) == 1
def test_empty_html(self):
"""Empty input should return empty list."""
filt = BM25ContentFilter()
assert filt.filter_content("") == []
def test_none_html(self):
"""None input should return empty list."""
filt = BM25ContentFilter()
assert filt.filter_content(None) == []
def test_no_body(self):
"""HTML without explicit body should still work."""
body = f"<p>{TARGET_TEXT}</p>" * 3 + FILLER_BLOCK
filt = BM25ContentFilter(user_query="Python programming", bm25_threshold=0.01)
results = filt.filter_content(body)
matching = [r for r in results if "versatile" in r]
assert len(matching) == 1
def test_single_paragraph_with_filler(self):
"""Single target paragraph — nothing to deduplicate."""
html = _make_test_html(target_copies=1)
filt = BM25ContentFilter(user_query="Python programming", bm25_threshold=0.01)
results = filt.filter_content(html)
matching = [r for r in results if "versatile" in r]
assert len(matching) == 1
def test_high_threshold_filters_all(self):
"""Very high threshold should return nothing."""
html = _make_test_html(target_copies=3)
filt = BM25ContentFilter(bm25_threshold=9999.0)
results = filt.filter_content(html)
assert results == []
def test_duplicate_with_different_tags(self):
"""Same text in different tag types should still be deduplicated."""
text = "Python enables rapid prototyping and development of complex software systems."
extra = f"<p>{text}</p><div>{text}</div>"
html = _make_test_html(target_copies=0, extra_body=extra)
filt = BM25ContentFilter(user_query="Python development", bm25_threshold=0.01)
results = filt.filter_content(html)
matching = [r for r in results if "rapid prototyping" in r]
assert len(matching) == 1
def test_many_duplicates(self):
"""30 identical paragraphs should collapse to 1."""
html = _make_test_html(target_copies=30)
filt = BM25ContentFilter(user_query="Python programming", bm25_threshold=0.01)
results = filt.filter_content(html)
matching = [r for r in results if "versatile" in r]
assert len(matching) == 1
def test_user_query_with_duplicates(self):
"""Dedup works correctly when a user_query is provided."""
html = _make_test_html(target_copies=4)
filt = BM25ContentFilter(user_query="Python web development", bm25_threshold=0.01)
results = filt.filter_content(html)
matching = [r for r in results if "versatile" in r]
assert len(matching) == 1
def test_stemming_disabled_with_duplicates(self):
"""Dedup works with use_stemming=False."""
html = _make_test_html(target_copies=4)
filt = BM25ContentFilter(
user_query="Python programming", bm25_threshold=0.01, use_stemming=False
)
results = filt.filter_content(html)
matching = [r for r in results if "versatile" in r]
assert len(matching) == 1
def test_returns_list_type(self):
"""Output should always be a list of strings."""
html = _make_test_html(target_copies=2)
filt = BM25ContentFilter(user_query="Python", bm25_threshold=0.01)
results = filt.filter_content(html)
assert isinstance(results, list)
for r in results:
assert isinstance(r, str)