mirror of
https://github.com/unclecode/crawl4ai.git
synced 2026-06-11 08:18:01 +00:00
Merge pull request #1845 from hafezparast/fix/maysam-mermaid-svg-text-1043
fix: preserve mermaid diagram text from SVGs during scraping (#1043)
This commit is contained in:
@@ -721,6 +721,30 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
|
||||
elif content_element is None:
|
||||
content_element = body
|
||||
|
||||
# Replace mermaid SVGs with text before they get stripped
|
||||
for svg in body.xpath('.//svg[starts-with(@id, "mermaid-")]'):
|
||||
try:
|
||||
diagram_type = svg.get("aria-roledescription", "diagram")
|
||||
# Extract text from node/edge labels
|
||||
labels = []
|
||||
seen = set()
|
||||
for el in svg.cssselect(".nodeLabel, .label span, .edgeLabel span"):
|
||||
text = el.text_content().strip()
|
||||
if text and text not in seen:
|
||||
seen.add(text)
|
||||
labels.append(text)
|
||||
if labels:
|
||||
# Build a pre block so it survives markdown conversion
|
||||
placeholder = lhtml.Element("pre")
|
||||
code = etree.SubElement(placeholder, "code")
|
||||
code.set("class", "language-mermaid")
|
||||
code.text = f"%% {diagram_type} diagram\n" + "\n".join(labels)
|
||||
parent = svg.getparent()
|
||||
if parent is not None:
|
||||
parent.replace(svg, placeholder)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Remove script and style tags
|
||||
for tag in ["style", "link", "meta", "noscript"]:
|
||||
for element in body.xpath(f".//{tag}"):
|
||||
|
||||
229
tests/test_issue_1043_mermaid_svg.py
Normal file
229
tests/test_issue_1043_mermaid_svg.py
Normal file
@@ -0,0 +1,229 @@
|
||||
"""
|
||||
Tests for issue #1043: Missing Mermaid Flowcharts

Verifies that mermaid SVG diagrams are preserved as text content
during HTML scraping, rather than being stripped entirely.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from lxml import html as lhtml
|
||||
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
||||
|
||||
|
||||
@pytest.fixture
def strategy():
    """Provide a fresh ``LXMLWebScrapingStrategy`` instance to each test."""
    scraper = LXMLWebScrapingStrategy()
    return scraper
|
||||
|
||||
|
||||
def _make_html(body_content: str) -> str:
|
||||
return f"<html><body>{body_content}</body></html>"
|
||||
|
||||
|
||||
# -- Mermaid SVG detection and replacement --
|
||||
|
||||
# Fixture: a mermaid flowchart SVG (id prefixed with "mermaid-") containing
# three node labels and one edge label, surrounded by ordinary paragraphs so
# tests can check that neighbouring content is kept.
FLOWCHART_SVG = """
<div>
  <p>Before diagram</p>
  <svg id="mermaid-abc123" aria-roledescription="flowchart-v2" xmlns="http://www.w3.org/2000/svg">
    <g class="node"><foreignObject><div><span class="nodeLabel">Start</span></div></foreignObject></g>
    <g class="node"><foreignObject><div><span class="nodeLabel">Process Data</span></div></foreignObject></g>
    <g class="node"><foreignObject><div><span class="nodeLabel">End</span></div></foreignObject></g>
    <g class="edgeLabel"><foreignObject><div><span>yes</span></div></foreignObject></g>
  </svg>
  <p>After diagram</p>
</div>
"""
|
||||
|
||||
# Fixture: a mermaid class-diagram SVG whose node labels contain punctuation
# (parentheses, colons, +/- visibility markers).
CLASS_DIAGRAM_SVG = """
<div>
  <svg id="mermaid-def456" aria-roledescription="class" xmlns="http://www.w3.org/2000/svg">
    <g class="node"><foreignObject><div><span class="nodeLabel">MyClass</span></div></foreignObject></g>
    <g class="node"><foreignObject><div><span class="nodeLabel">+method() : void</span></div></foreignObject></g>
    <g class="node"><foreignObject><div><span class="nodeLabel">-field : int</span></div></foreignObject></g>
  </svg>
</div>
"""
|
||||
|
||||
# Fixture: a mermaid sequence-diagram SVG. Participant names live in
# ``.label span`` elements and the message text in an ``.edgeLabel span``.
SEQUENCE_SVG = """
<div>
  <svg id="mermaid-seq789" aria-roledescription="sequence" xmlns="http://www.w3.org/2000/svg">
    <g class="label"><foreignObject><div><span>Alice</span></div></foreignObject></g>
    <g class="label"><foreignObject><div><span>Bob</span></div></foreignObject></g>
    <g class="edgeLabel"><foreignObject><div><span>Hello</span></div></foreignObject></g>
  </svg>
</div>
"""
|
||||
|
||||
|
||||
class TestMermaidSVGDetection:
    """Test that mermaid SVGs are detected by their id prefix."""

    def test_flowchart_svg_detected(self, strategy):
        """Node labels of a mermaid flowchart appear in ``cleaned_html``."""
        html = _make_html(FLOWCHART_SVG)
        result = strategy._scrap("http://test.com", html)
        assert result is not None
        cleaned = result.get("cleaned_html", "")
        assert "Start" in cleaned
        assert "Process Data" in cleaned

    def test_non_mermaid_svg_not_affected(self, strategy):
        """Regular SVGs without mermaid id should be unaffected."""
        html = _make_html("""
        <div>
          <svg id="logo" xmlns="http://www.w3.org/2000/svg">
            <text>Logo Text</text>
          </svg>
          <p>Content here</p>
        </div>
        """)
        result = strategy._scrap("http://test.com", html)
        assert result is not None
        cleaned = result.get("cleaned_html", "")
        # An SVG whose id does not start with "mermaid-" must not be turned
        # into a mermaid code block, and its sibling content must be kept.
        assert "language-mermaid" not in cleaned
        assert "Content here" in cleaned

    def test_mermaid_svg_replaced_with_pre_code(self, strategy):
        """Mermaid SVG should be replaced with pre/code block."""
        html = _make_html(FLOWCHART_SVG)
        result = strategy._scrap("http://test.com", html)
        cleaned = result.get("cleaned_html", "")
        # Check for the placeholder's class specifically: a bare "mermaid"
        # substring would also match the original SVG's id and so could not
        # tell "replaced" apart from "left untouched".
        assert "language-mermaid" in cleaned
        assert "mermaid-abc123" not in cleaned
|
||||
|
||||
|
||||
class TestMermaidTextExtraction:
    """Test that text content is correctly extracted from mermaid SVGs."""

    def test_node_labels_extracted(self, strategy):
        """Every ``.nodeLabel`` text of the flowchart is present."""
        html = _make_html(FLOWCHART_SVG)
        result = strategy._scrap("http://test.com", html)
        cleaned = result.get("cleaned_html", "")
        assert "Start" in cleaned
        assert "Process Data" in cleaned
        assert "End" in cleaned

    def test_edge_labels_extracted(self, strategy):
        """Text inside ``.edgeLabel span`` is present."""
        html = _make_html(FLOWCHART_SVG)
        result = strategy._scrap("http://test.com", html)
        cleaned = result.get("cleaned_html", "")
        assert "yes" in cleaned

    def test_class_diagram_labels_extracted(self, strategy):
        """All three class-diagram labels, including punctuation, are present."""
        html = _make_html(CLASS_DIAGRAM_SVG)
        result = strategy._scrap("http://test.com", html)
        cleaned = result.get("cleaned_html", "")
        assert "MyClass" in cleaned
        assert "+method() : void" in cleaned
        # The last label of the fixture was previously unchecked.
        assert "-field : int" in cleaned

    def test_sequence_diagram_labels_extracted(self, strategy):
        """Participant names and the message label of a sequence diagram are present."""
        html = _make_html(SEQUENCE_SVG)
        result = strategy._scrap("http://test.com", html)
        cleaned = result.get("cleaned_html", "")
        assert "Alice" in cleaned
        assert "Bob" in cleaned
        # The message text lives in an ``.edgeLabel span`` and was previously unchecked.
        assert "Hello" in cleaned

    def test_duplicate_labels_deduplicated(self, strategy):
        """Same label appearing multiple times should only appear once."""
        html = _make_html("""
        <div>
          <svg id="mermaid-dup" aria-roledescription="flowchart-v2" xmlns="http://www.w3.org/2000/svg">
            <g class="node"><foreignObject><div><span class="nodeLabel">Repeated</span></div></foreignObject></g>
            <g class="node"><foreignObject><div><span class="nodeLabel">Repeated</span></div></foreignObject></g>
            <g class="node"><foreignObject><div><span class="nodeLabel">Unique</span></div></foreignObject></g>
          </svg>
        </div>
        """)
        result = strategy._scrap("http://test.com", html)
        cleaned = result.get("cleaned_html", "")
        # Should have Repeated once, not twice
        assert cleaned.count("Repeated") == 1
        assert "Unique" in cleaned
|
||||
|
||||
|
||||
class TestMermaidDiagramType:
    """Test that diagram type is preserved.

    The scraper writes the SVG's ``aria-roledescription`` into a leading
    ``%% <type> diagram`` line of the placeholder. The tests assert that exact
    header rather than a bare substring, because a word such as ``class`` also
    occurs in label text (``MyClass``) and in HTML ``class=`` attributes.
    """

    def test_flowchart_type_preserved(self, strategy):
        """Flowchart header line carries the ``flowchart-v2`` type."""
        html = _make_html(FLOWCHART_SVG)
        result = strategy._scrap("http://test.com", html)
        cleaned = result.get("cleaned_html", "")
        assert "%% flowchart-v2 diagram" in cleaned

    def test_class_type_preserved(self, strategy):
        """Class-diagram header line carries the ``class`` type."""
        html = _make_html(CLASS_DIAGRAM_SVG)
        result = strategy._scrap("http://test.com", html)
        cleaned = result.get("cleaned_html", "")
        assert "%% class diagram" in cleaned

    def test_sequence_type_preserved(self, strategy):
        """Sequence-diagram header line carries the ``sequence`` type."""
        html = _make_html(SEQUENCE_SVG)
        result = strategy._scrap("http://test.com", html)
        cleaned = result.get("cleaned_html", "")
        assert "%% sequence diagram" in cleaned
|
||||
|
||||
|
||||
class TestMermaidSurroundingContent:
    """Content adjacent to a mermaid diagram must remain in the output."""

    def test_text_before_diagram_preserved(self, strategy):
        """The paragraph preceding the SVG is still present."""
        page = _make_html(FLOWCHART_SVG)
        scraped = strategy._scrap("http://test.com", page)
        assert "Before diagram" in scraped.get("cleaned_html", "")

    def test_text_after_diagram_preserved(self, strategy):
        """The paragraph following the SVG is still present."""
        page = _make_html(FLOWCHART_SVG)
        scraped = strategy._scrap("http://test.com", page)
        assert "After diagram" in scraped.get("cleaned_html", "")
|
||||
|
||||
|
||||
class TestMermaidEdgeCases:
    """Test edge cases for mermaid SVG handling."""

    def test_empty_mermaid_svg(self, strategy):
        """SVG with no text content should be handled gracefully."""
        html = _make_html("""
        <div>
          <svg id="mermaid-empty" aria-roledescription="flowchart-v2" xmlns="http://www.w3.org/2000/svg">
            <rect width="100" height="100"/>
          </svg>
          <p>Content</p>
        </div>
        """)
        result = strategy._scrap("http://test.com", html)
        assert result is not None
        cleaned = result.get("cleaned_html", "")
        assert "Content" in cleaned
        # With no labels found, no placeholder block should be emitted.
        assert "language-mermaid" not in cleaned

    def test_multiple_mermaid_svgs(self, strategy):
        """Multiple mermaid diagrams on one page."""
        html = _make_html(FLOWCHART_SVG + CLASS_DIAGRAM_SVG)
        result = strategy._scrap("http://test.com", html)
        cleaned = result.get("cleaned_html", "")
        assert "Start" in cleaned
        assert "MyClass" in cleaned

    def test_mermaid_svg_no_aria(self, strategy):
        """Mermaid SVG without aria-roledescription should use 'diagram' fallback."""
        html = _make_html("""
        <div>
          <svg id="mermaid-noaria" xmlns="http://www.w3.org/2000/svg">
            <g class="node"><foreignObject><div><span class="nodeLabel">Node A</span></div></foreignObject></g>
          </svg>
        </div>
        """)
        result = strategy._scrap("http://test.com", html)
        cleaned = result.get("cleaned_html", "")
        assert "Node A" in cleaned
        # The header is "%% <type> diagram"; with the fallback type "diagram"
        # it reads "%% diagram diagram". Checking only for the word "diagram"
        # would pass for any type because of the fixed suffix.
        assert "%% diagram diagram" in cleaned

    def test_mermaid_svg_malformed_no_crash(self, strategy):
        """Malformed SVG should not crash the scraper."""
        html = _make_html("""
        <div>
          <svg id="mermaid-bad" xmlns="http://www.w3.org/2000/svg">
          </svg>
          <p>Still works</p>
        </div>
        """)
        result = strategy._scrap("http://test.com", html)
        assert result is not None
        cleaned = result.get("cleaned_html", "")
        assert "Still works" in cleaned
        # An SVG without any label elements must not produce a placeholder.
        assert "language-mermaid" not in cleaned
|
||||
Reference in New Issue
Block a user