diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py
index ade19aa11..9853f788f 100644
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -721,6 +721,30 @@ def _scrap(
elif content_element is None:
content_element = body
+ # Replace mermaid SVGs with text before they get stripped
+ for svg in body.xpath('.//svg[starts-with(@id, "mermaid-")]'):
+ try:
+ diagram_type = svg.get("aria-roledescription", "diagram")
+ # Extract text from node/edge labels
+ labels = []
+ seen = set()
+ for el in svg.cssselect(".nodeLabel, .label span, .edgeLabel span"):
+ text = el.text_content().strip()
+ if text and text not in seen:
+ seen.add(text)
+ labels.append(text)
+ if labels:
+ # Build a pre block so it survives markdown conversion
+ placeholder = lhtml.Element("pre")
+ code = etree.SubElement(placeholder, "code")
+ code.set("class", "language-mermaid")
+ code.text = f"%% {diagram_type} diagram\n" + "\n".join(labels)
+ parent = svg.getparent()
+ if parent is not None:
+ parent.replace(svg, placeholder)
+ except Exception:
+ pass
+
# Remove script and style tags
for tag in ["style", "link", "meta", "noscript"]:
for element in body.xpath(f".//{tag}"):
diff --git a/tests/test_issue_1043_mermaid_svg.py b/tests/test_issue_1043_mermaid_svg.py
new file mode 100644
index 000000000..31f1e3585
--- /dev/null
+++ b/tests/test_issue_1043_mermaid_svg.py
@@ -0,0 +1,229 @@
+"""
+Tests for issue #1043: Missing Mermaid Flowcharts
+
+Verifies that mermaid SVG diagrams are preserved as text content
+during HTML scraping, rather than being stripped entirely.
+"""
+
+import pytest
+from lxml import html as lhtml
+from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
+
+
+@pytest.fixture
+def strategy():
+ return LXMLWebScrapingStrategy()
+
+
+def _make_html(body_content: str) -> str:
+ return f"
+"""
+
+
+class TestMermaidSVGDetection:
+ """Test that mermaid SVGs are detected by their id prefix."""
+
+ def test_flowchart_svg_detected(self, strategy):
+ html = _make_html(FLOWCHART_SVG)
+ result = strategy._scrap("http://test.com", html)
+ assert result is not None
+ cleaned = result.get("cleaned_html", "")
+ assert "Start" in cleaned
+ assert "Process Data" in cleaned
+
+ def test_non_mermaid_svg_not_affected(self, strategy):
+ """Regular SVGs without mermaid id should be unaffected."""
+ html = _make_html("""
+
+
+
Content here
+
+ """)
+ result = strategy._scrap("http://test.com", html)
+ assert result is not None
+
+ def test_mermaid_svg_replaced_with_pre_code(self, strategy):
+ """Mermaid SVG should be replaced with pre/code block."""
+ html = _make_html(FLOWCHART_SVG)
+ result = strategy._scrap("http://test.com", html)
+ cleaned = result.get("cleaned_html", "")
+ assert "language-mermaid" in cleaned or "mermaid" in cleaned.lower()
+
+
+class TestMermaidTextExtraction:
+ """Test that text content is correctly extracted from mermaid SVGs."""
+
+ def test_node_labels_extracted(self, strategy):
+ html = _make_html(FLOWCHART_SVG)
+ result = strategy._scrap("http://test.com", html)
+ cleaned = result.get("cleaned_html", "")
+ assert "Start" in cleaned
+ assert "Process Data" in cleaned
+ assert "End" in cleaned
+
+ def test_edge_labels_extracted(self, strategy):
+ html = _make_html(FLOWCHART_SVG)
+ result = strategy._scrap("http://test.com", html)
+ cleaned = result.get("cleaned_html", "")
+ assert "yes" in cleaned
+
+ def test_class_diagram_labels_extracted(self, strategy):
+ html = _make_html(CLASS_DIAGRAM_SVG)
+ result = strategy._scrap("http://test.com", html)
+ cleaned = result.get("cleaned_html", "")
+ assert "MyClass" in cleaned
+ assert "+method() : void" in cleaned
+
+ def test_sequence_diagram_labels_extracted(self, strategy):
+ html = _make_html(SEQUENCE_SVG)
+ result = strategy._scrap("http://test.com", html)
+ cleaned = result.get("cleaned_html", "")
+ assert "Alice" in cleaned
+ assert "Bob" in cleaned
+
+ def test_duplicate_labels_deduplicated(self, strategy):
+ """Same label appearing multiple times should only appear once."""
+ html = _make_html("""
+
+
+
+ """)
+ result = strategy._scrap("http://test.com", html)
+ cleaned = result.get("cleaned_html", "")
+ # Should have Repeated once, not twice
+ assert cleaned.count("Repeated") == 1
+ assert "Unique" in cleaned
+
+
+class TestMermaidDiagramType:
+ """Test that diagram type is preserved."""
+
+ def test_flowchart_type_preserved(self, strategy):
+ html = _make_html(FLOWCHART_SVG)
+ result = strategy._scrap("http://test.com", html)
+ cleaned = result.get("cleaned_html", "")
+ assert "flowchart" in cleaned.lower()
+
+ def test_class_type_preserved(self, strategy):
+ html = _make_html(CLASS_DIAGRAM_SVG)
+ result = strategy._scrap("http://test.com", html)
+ cleaned = result.get("cleaned_html", "")
+ assert "class" in cleaned.lower()
+
+ def test_sequence_type_preserved(self, strategy):
+ html = _make_html(SEQUENCE_SVG)
+ result = strategy._scrap("http://test.com", html)
+ cleaned = result.get("cleaned_html", "")
+ assert "sequence" in cleaned.lower()
+
+
+class TestMermaidSurroundingContent:
+ """Test that surrounding content is preserved."""
+
+ def test_text_before_diagram_preserved(self, strategy):
+ html = _make_html(FLOWCHART_SVG)
+ result = strategy._scrap("http://test.com", html)
+ cleaned = result.get("cleaned_html", "")
+ assert "Before diagram" in cleaned
+
+ def test_text_after_diagram_preserved(self, strategy):
+ html = _make_html(FLOWCHART_SVG)
+ result = strategy._scrap("http://test.com", html)
+ cleaned = result.get("cleaned_html", "")
+ assert "After diagram" in cleaned
+
+
+class TestMermaidEdgeCases:
+ """Test edge cases for mermaid SVG handling."""
+
+ def test_empty_mermaid_svg(self, strategy):
+ """SVG with no text content should be handled gracefully."""
+ html = _make_html("""
+
+
+
Content
+
+ """)
+ result = strategy._scrap("http://test.com", html)
+ assert result is not None
+ cleaned = result.get("cleaned_html", "")
+ assert "Content" in cleaned
+
+ def test_multiple_mermaid_svgs(self, strategy):
+ """Multiple mermaid diagrams on one page."""
+ html = _make_html(FLOWCHART_SVG + CLASS_DIAGRAM_SVG)
+ result = strategy._scrap("http://test.com", html)
+ cleaned = result.get("cleaned_html", "")
+ assert "Start" in cleaned
+ assert "MyClass" in cleaned
+
+ def test_mermaid_svg_no_aria(self, strategy):
+ """Mermaid SVG without aria-roledescription should use 'diagram' fallback."""
+ html = _make_html("""
+
+
+
+ """)
+ result = strategy._scrap("http://test.com", html)
+ cleaned = result.get("cleaned_html", "")
+ assert "Node A" in cleaned
+ assert "diagram" in cleaned.lower()
+
+ def test_mermaid_svg_malformed_no_crash(self, strategy):
+ """Malformed SVG should not crash the scraper."""
+ html = _make_html("""
+
+
+
Still works
+
+ """)
+ result = strategy._scrap("http://test.com", html)
+ assert result is not None
+ cleaned = result.get("cleaned_html", "")
+ assert "Still works" in cleaned