Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 98 additions & 8 deletions crawl4ai/async_crawler_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -1236,6 +1236,28 @@ async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1, ma
)
current_position = viewport_height

# Install MutationObserver to capture elements removed during
# scrolling. On virtual-scroll pages the DOM recycles elements,
# so removed nodes contain content that would otherwise be lost.
await page.evaluate("""() => {
window.__c4ai_removed = [];
window.__c4ai_observer = new MutationObserver(mutations => {
for (const m of mutations) {
for (const node of m.removedNodes) {
if (node.nodeType === Node.ELEMENT_NODE) {
const text = (node.innerText || '').trim();
if (text.length > 5) {
window.__c4ai_removed.push(node.outerHTML);
}
}
}
}
});
window.__c4ai_observer.observe(document.body, {
childList: true, subtree: true
});
}""")

# await page.evaluate(f"window.scrollTo(0, {current_position})")
await self.safe_scroll(page, 0, current_position, delay=scroll_delay)
# await self.csp_scroll_to(page, 0, current_position)
Expand Down Expand Up @@ -1270,10 +1292,69 @@ async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1, ma
if new_height > total_height:
total_height = new_height

# Disconnect observer and re-inject any recycled elements so that
# the subsequent HTML capture (page.content()) includes them.
merge_result = await page.evaluate(r"""() => {
if (window.__c4ai_observer) {
window.__c4ai_observer.disconnect();
delete window.__c4ai_observer;
}
const removed = window.__c4ai_removed || [];
delete window.__c4ai_removed;
if (removed.length === 0) return { injected: false, count: 0 };

// Fingerprint currently-visible elements so we skip duplicates
const seen = new Set();
for (const el of document.body.querySelectorAll('*')) {
const t = (el.innerText || '').trim().toLowerCase().replace(/\s+/g, ' ');
if (t.length > 5) seen.add(t.substring(0, 200));
}

const uniqueHTML = [];
const tmp = document.createElement('div');
for (const html of removed) {
tmp.innerHTML = html;
const el = tmp.firstElementChild;
if (!el) continue;
const t = (el.innerText || '').trim().toLowerCase().replace(/\s+/g, ' ');
const fp = t.substring(0, 200);
if (t.length > 5 && !seen.has(fp)) {
seen.add(fp);
uniqueHTML.push(html);
}
}
if (uniqueHTML.length === 0) return { injected: false, count: 0 };

const container = document.createElement('div');
container.id = '__c4ai_accumulated_content';
container.style.display = 'none';
container.innerHTML = uniqueHTML.join('\n');
document.body.appendChild(container);
return { injected: true, count: uniqueHTML.length };
}""")

if merge_result and merge_result.get("injected"):
self.logger.info(
message="Virtual scroll detected: re-injected {count} recycled elements",
tag="PAGE_SCAN",
params={"count": merge_result.get("count", 0)},
)

# await page.evaluate("window.scrollTo(0, 0)")
await self.safe_scroll(page, 0, 0)

except Exception as e:
# Clean up observer on error
try:
await page.evaluate("""() => {
if (window.__c4ai_observer) {
window.__c4ai_observer.disconnect();
delete window.__c4ai_observer;
}
delete window.__c4ai_removed;
}""")
except Exception:
pass
self.logger.warning(
message="Failed to perform full page scan: {error}",
tag="PAGE_SCAN",
Expand Down Expand Up @@ -1341,15 +1422,22 @@ async def _handle_virtual_scroll(self, page: Page, config: "VirtualScrollConfig"

// Perform scrolling
while (scrollCount < config.scroll_count) {
// Scroll the container
// Scroll the container; fall back to window if container
// doesn't scroll (e.g. Twitter scrolls the window, not a
// container element).
const prevScrollTop = container.scrollTop;
container.scrollTop += scrollAmount;

const usedWindowScroll = (container.scrollTop === prevScrollTop);
if (usedWindowScroll) {
window.scrollBy(0, scrollAmount);
}

// Wait for content to potentially load
await new Promise(resolve => setTimeout(resolve, config.wait_after_scroll * 1000));

// Get current HTML
const currentHTML = container.innerHTML;

// Determine what changed
if (currentHTML === previousHTML) {
// Case 0: No change - continue scrolling
Expand All @@ -1362,13 +1450,15 @@ async def _handle_virtual_scroll(self, page: Page, config: "VirtualScrollConfig"
console.log(`Scroll ${scrollCount + 1}: Content replaced, capturing chunk`);
htmlChunks.push(previousHTML);
}

// Update previous HTML for next iteration
previousHTML = currentHTML;
scrollCount++;

// Check if we've reached the end
if (container.scrollTop + container.clientHeight >= container.scrollHeight - 10) {

// Check if we've reached the end of scrollable content
const atContainerEnd = container.scrollTop + container.clientHeight >= container.scrollHeight - 10;
const atWindowEnd = window.scrollY + window.innerHeight >= document.documentElement.scrollHeight - 10;
if (usedWindowScroll ? atWindowEnd : atContainerEnd) {
console.log(`Reached end of scrollable content at scroll ${scrollCount}`);
// Capture final chunk if content was replaced
if (htmlChunks.length > 0) {
Expand Down
176 changes: 176 additions & 0 deletions tests/test_repro_731.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
"""
Reproduction test for issue #731:
scan_full_page=True only captures the final elements on virtual-scroll pages.

Creates a local HTML page that simulates virtual scrolling (DOM recycling)
and verifies that scan_full_page captures ALL items, not just the last batch.
"""

import asyncio
import os
import tempfile
import json
from http.server import HTTPServer, SimpleHTTPRequestHandler
import threading

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BrowserConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

# HTML page that simulates virtual scrolling: only 10 items visible at a time,
# content is REPLACED (recycled) as you scroll — mimicking Twitter/X behavior.
# NOTE(review): the inline <script> re-renders the #feed container on every
# window scroll event, so previously visible items are removed from the DOM —
# a naive end-of-scroll capture sees only the final batch of ~10 items.
VIRTUAL_SCROLL_HTML = """
<!DOCTYPE html>
<html>
<head>
<style>
body { margin: 0; font-family: sans-serif; }
.item { height: 80px; padding: 10px; border-bottom: 1px solid #eee; }
.item .name { font-weight: bold; }
.item .handle { color: #666; }
</style>
</head>
<body>
<div id="feed"></div>
<script>
// Simulate 50 total users, but only render 10 at a time (virtual scroll)
const TOTAL = 50;
const VISIBLE = 10;
const allUsers = [];
for (let i = 0; i < TOTAL; i++) {
allUsers.push({ name: `User ${i+1}`, handle: `@user${i+1}` });
}

let startIdx = 0;
const feed = document.getElementById('feed');

function render() {
feed.innerHTML = '';
const end = Math.min(startIdx + VISIBLE, TOTAL);
for (let i = startIdx; i < end; i++) {
const div = document.createElement('div');
div.className = 'item';
div.setAttribute('data-testid', 'UserCell');
div.innerHTML = `<div class="name">${allUsers[i].name}</div><div class="handle">${allUsers[i].handle}</div>`;
feed.appendChild(div);
}
// Set body height to allow scrolling
document.body.style.height = (TOTAL * 80) + 'px';
}

render();

// On scroll, recycle DOM elements (virtual scroll behavior)
window.addEventListener('scroll', () => {
const scrollPos = window.scrollY;
const newStart = Math.min(Math.floor(scrollPos / 80), TOTAL - VISIBLE);
if (newStart !== startIdx) {
startIdx = newStart;
render();
}
});
</script>
</body>
</html>
"""


def start_server(html_dir, port=9731):
    """Serve *html_dir* over HTTP on 127.0.0.1:*port* in a daemon thread.

    Returns the ``HTTPServer`` instance so the caller can ``shutdown()`` it
    when the test finishes.
    """

    class QuietHandler(SimpleHTTPRequestHandler):
        """Handler rooted at *html_dir* that suppresses access logging."""

        def __init__(self, *args, **kwargs):
            kwargs["directory"] = html_dir
            super().__init__(*args, **kwargs)

        def log_message(self, format, *args):
            # Keep test output free of per-request noise.
            pass

    httpd = HTTPServer(("127.0.0.1", port), QuietHandler)
    worker = threading.Thread(target=httpd.serve_forever, daemon=True)
    worker.start()
    return httpd


async def test_scan_full_page_virtual_scroll():
    """
    BUG REPRODUCTION: scan_full_page=True on a virtual-scroll page.

    Serves a locally generated page whose DOM recycles items while scrolling,
    crawls it twice (baseline, then with ``scan_full_page=True``), prints a
    human-readable report, and asserts that the full-page scan recovered at
    least 40 of the 50 unique users.

    Expected: all 50 users captured.
    Actual (bug): only the last ~10 users captured.
    """
    # Single source of truth for the port so the crawled URL cannot drift
    # out of sync with the address the server is bound to.
    port = 9731
    url = f"http://127.0.0.1:{port}/index.html"

    # Write test HTML to a temp dir and serve it
    with tempfile.TemporaryDirectory() as tmpdir:
        html_path = os.path.join(tmpdir, "index.html")
        with open(html_path, "w") as f:
            f.write(VIRTUAL_SCROLL_HTML)

        server = start_server(tmpdir, port=port)

        try:
            schema = {
                "name": "Users",
                "baseSelector": "[data-testid='UserCell']",
                "fields": [
                    {"name": "name", "selector": ".name", "type": "text"},
                    {"name": "handle", "selector": ".handle", "type": "text"},
                ],
            }
            extraction = JsonCssExtractionStrategy(schema)

            # --- Test 1: WITHOUT scan_full_page (baseline) ---
            config_no_scroll = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                extraction_strategy=extraction,
                scan_full_page=False,
            )
            async with AsyncWebCrawler(verbose=True) as crawler:
                result = await crawler.arun(url=url, config=config_no_scroll)

            data_no_scroll = json.loads(result.extracted_content)
            names_no_scroll = [u["name"] for u in data_no_scroll]
            print(f"\n{'='*60}")
            print(f"WITHOUT scan_full_page: {len(data_no_scroll)} users")
            print(f" Users: {names_no_scroll[:5]} ... {names_no_scroll[-3:] if len(names_no_scroll) > 5 else ''}")
            print(f"{'='*60}")

            # --- Test 2: WITH scan_full_page (the bug) ---
            config_scroll = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                extraction_strategy=extraction,
                scan_full_page=True,
                scroll_delay=0.3,
                max_scroll_steps=20,
            )
            async with AsyncWebCrawler(verbose=True) as crawler:
                result = await crawler.arun(url=url, config=config_scroll)

            data_scroll = json.loads(result.extracted_content)
            names_scroll = [u["name"] for u in data_scroll]
            print(f"\n{'='*60}")
            print(f"WITH scan_full_page: {len(data_scroll)} users")
            print(f" Users: {names_scroll[:5]} ... {names_scroll[-3:] if len(names_scroll) > 5 else ''}")
            print(f"{'='*60}")

            # --- Deduplicate by handle (re-injected chunks may repeat rows) ---
            unique_scroll = {u["handle"]: u for u in data_scroll}

            # --- Verdict ---
            print(f"\n{'='*60}")
            print("VERDICT:")
            print(f" Raw extracted: {len(data_scroll)}, Unique by handle: {len(unique_scroll)}")
            missing = [f"@user{i}" for i in range(1, 51) if f"@user{i}" not in unique_scroll]
            if missing:
                print(f" Missing {len(missing)} users: {missing[:10]}{'...' if len(missing) > 10 else ''}")
            else:
                print(f" All 50 users captured!")
            print(f"{'='*60}")

            # Fail loudly under pytest instead of only printing the verdict:
            # the previous version could never fail, so the regression it is
            # meant to guard against would pass CI unnoticed.
            assert len(unique_scroll) >= 40, (
                f"BUG: scan_full_page captured only {len(unique_scroll)}/50 "
                f"unique users: {sorted(unique_scroll.keys())}"
            )
        finally:
            server.shutdown()


# Allow running the reproduction directly as a script (outside pytest).
if __name__ == "__main__":
    asyncio.run(test_scan_full_page_virtual_scroll())