diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index b9de25f6b..55c4e7b98 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -1236,6 +1236,28 @@ async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1, ma ) current_position = viewport_height + # Install MutationObserver to capture elements removed during + # scrolling. On virtual-scroll pages the DOM recycles elements, + # so removed nodes contain content that would otherwise be lost. + await page.evaluate("""() => { + window.__c4ai_removed = []; + window.__c4ai_observer = new MutationObserver(mutations => { + for (const m of mutations) { + for (const node of m.removedNodes) { + if (node.nodeType === Node.ELEMENT_NODE) { + const text = (node.innerText || '').trim(); + if (text.length > 5) { + window.__c4ai_removed.push(node.outerHTML); + } + } + } + } + }); + window.__c4ai_observer.observe(document.body, { + childList: true, subtree: true + }); + }""") + # await page.evaluate(f"window.scrollTo(0, {current_position})") await self.safe_scroll(page, 0, current_position, delay=scroll_delay) # await self.csp_scroll_to(page, 0, current_position) @@ -1270,10 +1292,69 @@ async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1, ma if new_height > total_height: total_height = new_height + # Disconnect observer and re-inject any recycled elements so that + # the subsequent HTML capture (page.content()) includes them. 
+ merge_result = await page.evaluate(r"""() => { + if (window.__c4ai_observer) { + window.__c4ai_observer.disconnect(); + delete window.__c4ai_observer; + } + const removed = window.__c4ai_removed || []; + delete window.__c4ai_removed; + if (removed.length === 0) return { injected: false, count: 0 }; + + // Fingerprint currently-visible elements so we skip duplicates + const seen = new Set(); + for (const el of document.body.querySelectorAll('*')) { + const t = (el.innerText || '').trim().toLowerCase().replace(/\s+/g, ' '); + if (t.length > 5) seen.add(t.substring(0, 200)); + } + + const uniqueHTML = []; + const tmp = document.createElement('div'); + for (const html of removed) { + tmp.innerHTML = html; + const el = tmp.firstElementChild; + if (!el) continue; + const t = (el.innerText || '').trim().toLowerCase().replace(/\s+/g, ' '); + const fp = t.substring(0, 200); + if (t.length > 5 && !seen.has(fp)) { + seen.add(fp); + uniqueHTML.push(html); + } + } + if (uniqueHTML.length === 0) return { injected: false, count: 0 }; + + const container = document.createElement('div'); + container.id = '__c4ai_accumulated_content'; + container.style.display = 'none'; + container.innerHTML = uniqueHTML.join('\n'); + document.body.appendChild(container); + return { injected: true, count: uniqueHTML.length }; + }""") + + if merge_result and merge_result.get("injected"): + self.logger.info( + message="Virtual scroll detected: re-injected {count} recycled elements", + tag="PAGE_SCAN", + params={"count": merge_result.get("count", 0)}, + ) + # await page.evaluate("window.scrollTo(0, 0)") await self.safe_scroll(page, 0, 0) except Exception as e: + # Clean up observer on error + try: + await page.evaluate("""() => { + if (window.__c4ai_observer) { + window.__c4ai_observer.disconnect(); + delete window.__c4ai_observer; + } + delete window.__c4ai_removed; + }""") + except Exception: + pass self.logger.warning( message="Failed to perform full page scan: {error}", tag="PAGE_SCAN", @@ 
-1341,15 +1422,22 @@ async def _handle_virtual_scroll(self, page: Page, config: "VirtualScrollConfig" // Perform scrolling while (scrollCount < config.scroll_count) { - // Scroll the container + // Scroll the container; fall back to window if container + // doesn't scroll (e.g. Twitter scrolls the window, not a + // container element). + const prevScrollTop = container.scrollTop; container.scrollTop += scrollAmount; - + const usedWindowScroll = (container.scrollTop === prevScrollTop); + if (usedWindowScroll) { + window.scrollBy(0, scrollAmount); + } + // Wait for content to potentially load await new Promise(resolve => setTimeout(resolve, config.wait_after_scroll * 1000)); - + // Get current HTML const currentHTML = container.innerHTML; - + // Determine what changed if (currentHTML === previousHTML) { // Case 0: No change - continue scrolling @@ -1362,13 +1450,15 @@ async def _handle_virtual_scroll(self, page: Page, config: "VirtualScrollConfig" console.log(`Scroll ${scrollCount + 1}: Content replaced, capturing chunk`); htmlChunks.push(previousHTML); } - + // Update previous HTML for next iteration previousHTML = currentHTML; scrollCount++; - - // Check if we've reached the end - if (container.scrollTop + container.clientHeight >= container.scrollHeight - 10) { + + // Check if we've reached the end of scrollable content + const atContainerEnd = container.scrollTop + container.clientHeight >= container.scrollHeight - 10; + const atWindowEnd = window.scrollY + window.innerHeight >= document.documentElement.scrollHeight - 10; + if (usedWindowScroll ? 
atWindowEnd : atContainerEnd) { console.log(`Reached end of scrollable content at scroll ${scrollCount}`); // Capture final chunk if content was replaced if (htmlChunks.length > 0) { diff --git a/tests/test_repro_731.py b/tests/test_repro_731.py new file mode 100644 index 000000000..edcc0b7a4 --- /dev/null +++ b/tests/test_repro_731.py @@ -0,0 +1,176 @@ +""" +Reproduction test for issue #731: +scan_full_page=True only captures the final elements on virtual-scroll pages. + +Creates a local HTML page that simulates virtual scrolling (DOM recycling) +and verifies that scan_full_page captures ALL items, not just the last batch. +""" + +import asyncio +import os +import tempfile +import json +from http.server import HTTPServer, SimpleHTTPRequestHandler +import threading + +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BrowserConfig +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy + +# HTML page that simulates virtual scrolling: only 10 items visible at a time, +# content is REPLACED (recycled) as you scroll — mimicking Twitter/X behavior. +VIRTUAL_SCROLL_HTML = """ + + + + + + +
def start_server(html_dir, port=9731):
    """Serve *html_dir* over HTTP on 127.0.0.1 from a daemon thread.

    Args:
        html_dir: Directory whose files are served.
        port: TCP port to bind. Pass 0 to let the OS choose a free port and
            read the chosen port back from ``server.server_address[1]``.

    Returns:
        The running ``HTTPServer``. The caller should call ``shutdown()``
        followed by ``server_close()`` to stop the loop and release the
        listening socket.
    """
    class Handler(SimpleHTTPRequestHandler):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, directory=html_dir, **kwargs)

        def log_message(self, format, *args):
            pass  # suppress per-request logs

    server = HTTPServer(("127.0.0.1", port), Handler)
    thread = threading.Thread(target=server.serve_forever, daemon=True)
    thread.start()
    return server


async def _crawl_users(url, config):
    """Crawl *url* with *config* and return the extracted records as a list."""
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url=url, config=config)
    return json.loads(result.extracted_content)


def _print_summary(label, users):
    """Print how many users were extracted and a short sample of names."""
    names = [u["name"] for u in users]
    tail = names[-3:] if len(names) > 5 else ""
    print(f"\n{'=' * 60}")
    print(f"{label}: {len(users)} users")
    print(f"  Users: {names[:5]} ... {tail}")
    print(f"{'=' * 60}")


async def test_scan_full_page_virtual_scroll():
    """Regression test for issue #731 (scan_full_page on virtual-scroll pages).

    Serves ``VIRTUAL_SCROLL_HTML`` from a temporary directory, crawls it once
    without and once with ``scan_full_page``, and asserts that the scrolling
    crawl extracts at least 40 of the 50 expected unique handles.

    Raises:
        AssertionError: If fewer than 40 unique handles were captured.
    """
    total_users = 50
    min_unique = 40

    with tempfile.TemporaryDirectory() as tmpdir:
        html_path = os.path.join(tmpdir, "index.html")
        with open(html_path, "w", encoding="utf-8") as f:
            f.write(VIRTUAL_SCROLL_HTML)

        # Port 0 = OS-assigned free port, so the test cannot collide with
        # another process (or a parallel test run) holding a fixed port.
        server = start_server(tmpdir, port=0)
        url = f"http://127.0.0.1:{server.server_address[1]}/index.html"

        try:
            schema = {
                "name": "Users",
                "baseSelector": "[data-testid='UserCell']",
                "fields": [
                    {"name": "name", "selector": ".name", "type": "text"},
                    {"name": "handle", "selector": ".handle", "type": "text"},
                ],
            }
            extraction = JsonCssExtractionStrategy(schema)

            # --- Baseline: no scrolling ---
            config_no_scroll = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                extraction_strategy=extraction,
                scan_full_page=False,
            )
            data_no_scroll = await _crawl_users(url, config_no_scroll)
            _print_summary("WITHOUT scan_full_page", data_no_scroll)

            # --- Scenario under test: scan_full_page ---
            config_scroll = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                extraction_strategy=extraction,
                scan_full_page=True,
                scroll_delay=0.3,
                max_scroll_steps=20,
            )
            data_scroll = await _crawl_users(url, config_scroll)
            _print_summary("WITH scan_full_page", data_scroll)

            # Re-injected recycled elements may repeat, so count by handle.
            unique_scroll = {u["handle"]: u for u in data_scroll}
            missing = [
                f"@user{i}"
                for i in range(1, total_users + 1)
                if f"@user{i}" not in unique_scroll
            ]

            print(f"\n{'=' * 60}")
            print("VERDICT:")
            print(
                f"  Raw extracted: {len(data_scroll)}, "
                f"Unique by handle: {len(unique_scroll)}"
            )
            if missing:
                ellipsis = "..." if len(missing) > 10 else ""
                print(f"  Missing {len(missing)} users: {missing[:10]}{ellipsis}")
            else:
                print(f"  All {total_users} users captured!")
            print(f"{'=' * 60}")

            # Fail loudly instead of only printing a verdict.
            assert len(unique_scroll) >= min_unique, (
                f"scan_full_page only captured {len(unique_scroll)}/"
                f"{total_users} unique users; "
                f"captured handles: {sorted(unique_scroll)}"
            )
        finally:
            # shutdown() stops serve_forever(); server_close() releases the
            # listening socket.
            server.shutdown()
            server.server_close()


if __name__ == "__main__":
    asyncio.run(test_scan_full_page_virtual_scroll())