From 2fd0f4c6a7c43bb4aa26a5c66dddcdb6be22728b Mon Sep 17 00:00:00 2001
From: hafezparast
Date: Sun, 22 Mar 2026 11:43:54 +0800
Subject: [PATCH] fix: preserve mermaid diagram text from SVGs during scraping (#1043)

Mermaid diagrams rendered as SVGs were completely stripped during HTML
cleaning, losing all text content. Now detects SVGs with id="mermaid-*",
extracts node/edge labels, and replaces the SVG with a fenced mermaid
code block containing the diagram type and extracted text.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 crawl4ai/content_scraping_strategy.py |  29 +++
 tests/test_issue_1043_mermaid_svg.py  | 229 ++++++++++++++++++++++++++
 2 files changed, 258 insertions(+)
 create mode 100644 tests/test_issue_1043_mermaid_svg.py

diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py
index ade19aa11..9853f788f 100644
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -721,6 +721,35 @@ def _scrap(
             elif content_element is None:
                 content_element = body
 
+            # Replace mermaid SVGs with text before they get stripped
+            for svg in body.xpath('.//svg[starts-with(@id, "mermaid-")]'):
+                try:
+                    diagram_type = svg.get("aria-roledescription", "diagram")
+                    # Extract text from node/edge labels
+                    labels = []
+                    seen = set()
+                    for el in svg.cssselect(".nodeLabel, .label span, .edgeLabel span"):
+                        text = el.text_content().strip()
+                        if text and text not in seen:
+                            seen.add(text)
+                            labels.append(text)
+                    if labels:
+                        # Build a pre block so it survives markdown conversion
+                        placeholder = lhtml.Element("pre")
+                        code = etree.SubElement(placeholder, "code")
+                        code.set("class", "language-mermaid")
+                        code.text = f"%% {diagram_type} diagram\n" + "\n".join(labels)
+                        # lxml keeps trailing text on the element it follows,
+                        # so carry the SVG's tail over or it is lost on replace
+                        placeholder.tail = svg.tail
+                        parent = svg.getparent()
+                        if parent is not None:
+                            parent.replace(svg, placeholder)
+                except Exception:
+                    # Best-effort: a diagram that cannot be converted is left
+                    # untouched by this pass instead of aborting the scrape
+                    pass
+
             # Remove script and style tags
             for tag in ["style", "link", "meta", "noscript"]:
                 for element in body.xpath(f".//{tag}"):
diff --git 
a/tests/test_issue_1043_mermaid_svg.py b/tests/test_issue_1043_mermaid_svg.py new file mode 100644 index 000000000..31f1e3585 --- /dev/null +++ b/tests/test_issue_1043_mermaid_svg.py @@ -0,0 +1,229 @@ +""" +Tests for issue #1043: Missing Mermaid Flowcharts + +Verifies that mermaid SVG diagrams are preserved as text content +during HTML scraping, rather than being stripped entirely. +""" + +import pytest +from lxml import html as lhtml +from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy + + +@pytest.fixture +def strategy(): + return LXMLWebScrapingStrategy() + + +def _make_html(body_content: str) -> str: + return f"{body_content}" + + +# -- Mermaid SVG detection and replacement -- + +FLOWCHART_SVG = """ +
+

Before diagram

+ +
Start
+
Process Data
+
End
+
yes
+
+

After diagram

+
+""" + +CLASS_DIAGRAM_SVG = """ +
+ +
MyClass
+
+method() : void
+
-field : int
+
+
+""" + +SEQUENCE_SVG = """ +
+ +
Alice
+
Bob
+
Hello
+
+
+""" + + +class TestMermaidSVGDetection: + """Test that mermaid SVGs are detected by their id prefix.""" + + def test_flowchart_svg_detected(self, strategy): + html = _make_html(FLOWCHART_SVG) + result = strategy._scrap("http://test.com", html) + assert result is not None + cleaned = result.get("cleaned_html", "") + assert "Start" in cleaned + assert "Process Data" in cleaned + + def test_non_mermaid_svg_not_affected(self, strategy): + """Regular SVGs without mermaid id should be unaffected.""" + html = _make_html(""" +
+ +

Content here

+
+ """) + result = strategy._scrap("http://test.com", html) + assert result is not None + + def test_mermaid_svg_replaced_with_pre_code(self, strategy): + """Mermaid SVG should be replaced with pre/code block.""" + html = _make_html(FLOWCHART_SVG) + result = strategy._scrap("http://test.com", html) + cleaned = result.get("cleaned_html", "") + assert "language-mermaid" in cleaned or "mermaid" in cleaned.lower() + + +class TestMermaidTextExtraction: + """Test that text content is correctly extracted from mermaid SVGs.""" + + def test_node_labels_extracted(self, strategy): + html = _make_html(FLOWCHART_SVG) + result = strategy._scrap("http://test.com", html) + cleaned = result.get("cleaned_html", "") + assert "Start" in cleaned + assert "Process Data" in cleaned + assert "End" in cleaned + + def test_edge_labels_extracted(self, strategy): + html = _make_html(FLOWCHART_SVG) + result = strategy._scrap("http://test.com", html) + cleaned = result.get("cleaned_html", "") + assert "yes" in cleaned + + def test_class_diagram_labels_extracted(self, strategy): + html = _make_html(CLASS_DIAGRAM_SVG) + result = strategy._scrap("http://test.com", html) + cleaned = result.get("cleaned_html", "") + assert "MyClass" in cleaned + assert "+method() : void" in cleaned + + def test_sequence_diagram_labels_extracted(self, strategy): + html = _make_html(SEQUENCE_SVG) + result = strategy._scrap("http://test.com", html) + cleaned = result.get("cleaned_html", "") + assert "Alice" in cleaned + assert "Bob" in cleaned + + def test_duplicate_labels_deduplicated(self, strategy): + """Same label appearing multiple times should only appear once.""" + html = _make_html(""" +
+ +
Repeated
+
Repeated
+
Unique
+
+
+ """) + result = strategy._scrap("http://test.com", html) + cleaned = result.get("cleaned_html", "") + # Should have Repeated once, not twice + assert cleaned.count("Repeated") == 1 + assert "Unique" in cleaned + + +class TestMermaidDiagramType: + """Test that diagram type is preserved.""" + + def test_flowchart_type_preserved(self, strategy): + html = _make_html(FLOWCHART_SVG) + result = strategy._scrap("http://test.com", html) + cleaned = result.get("cleaned_html", "") + assert "flowchart" in cleaned.lower() + + def test_class_type_preserved(self, strategy): + html = _make_html(CLASS_DIAGRAM_SVG) + result = strategy._scrap("http://test.com", html) + cleaned = result.get("cleaned_html", "") + assert "class" in cleaned.lower() + + def test_sequence_type_preserved(self, strategy): + html = _make_html(SEQUENCE_SVG) + result = strategy._scrap("http://test.com", html) + cleaned = result.get("cleaned_html", "") + assert "sequence" in cleaned.lower() + + +class TestMermaidSurroundingContent: + """Test that surrounding content is preserved.""" + + def test_text_before_diagram_preserved(self, strategy): + html = _make_html(FLOWCHART_SVG) + result = strategy._scrap("http://test.com", html) + cleaned = result.get("cleaned_html", "") + assert "Before diagram" in cleaned + + def test_text_after_diagram_preserved(self, strategy): + html = _make_html(FLOWCHART_SVG) + result = strategy._scrap("http://test.com", html) + cleaned = result.get("cleaned_html", "") + assert "After diagram" in cleaned + + +class TestMermaidEdgeCases: + """Test edge cases for mermaid SVG handling.""" + + def test_empty_mermaid_svg(self, strategy): + """SVG with no text content should be handled gracefully.""" + html = _make_html(""" +
+ + + +

Content

+
+ """) + result = strategy._scrap("http://test.com", html) + assert result is not None + cleaned = result.get("cleaned_html", "") + assert "Content" in cleaned + + def test_multiple_mermaid_svgs(self, strategy): + """Multiple mermaid diagrams on one page.""" + html = _make_html(FLOWCHART_SVG + CLASS_DIAGRAM_SVG) + result = strategy._scrap("http://test.com", html) + cleaned = result.get("cleaned_html", "") + assert "Start" in cleaned + assert "MyClass" in cleaned + + def test_mermaid_svg_no_aria(self, strategy): + """Mermaid SVG without aria-roledescription should use 'diagram' fallback.""" + html = _make_html(""" +
+ +
Node A
+
+
+ """) + result = strategy._scrap("http://test.com", html) + cleaned = result.get("cleaned_html", "") + assert "Node A" in cleaned + assert "diagram" in cleaned.lower() + + def test_mermaid_svg_malformed_no_crash(self, strategy): + """Malformed SVG should not crash the scraper.""" + html = _make_html(""" +
+ + +

Still works

+
+ """) + result = strategy._scrap("http://test.com", html) + assert result is not None + cleaned = result.get("cleaned_html", "") + assert "Still works" in cleaned