Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 98 additions & 8 deletions crawl4ai/async_crawler_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -1236,6 +1236,28 @@ async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1, ma
)
current_position = viewport_height

# Install MutationObserver to capture elements removed during
# scrolling. On virtual-scroll pages the DOM recycles elements,
# so removed nodes contain content that would otherwise be lost.
await page.evaluate("""() => {
window.__c4ai_removed = [];
window.__c4ai_observer = new MutationObserver(mutations => {
for (const m of mutations) {
for (const node of m.removedNodes) {
if (node.nodeType === Node.ELEMENT_NODE) {
const text = (node.innerText || '').trim();
if (text.length > 5) {
window.__c4ai_removed.push(node.outerHTML);
}
}
}
}
});
window.__c4ai_observer.observe(document.body, {
childList: true, subtree: true
});
}""")

# await page.evaluate(f"window.scrollTo(0, {current_position})")
await self.safe_scroll(page, 0, current_position, delay=scroll_delay)
# await self.csp_scroll_to(page, 0, current_position)
Expand Down Expand Up @@ -1270,10 +1292,69 @@ async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1, ma
if new_height > total_height:
total_height = new_height

# Disconnect observer and re-inject any recycled elements so that
# the subsequent HTML capture (page.content()) includes them.
merge_result = await page.evaluate(r"""() => {
if (window.__c4ai_observer) {
window.__c4ai_observer.disconnect();
delete window.__c4ai_observer;
}
const removed = window.__c4ai_removed || [];
delete window.__c4ai_removed;
if (removed.length === 0) return { injected: false, count: 0 };

// Fingerprint currently-visible elements so we skip duplicates
const seen = new Set();
for (const el of document.body.querySelectorAll('*')) {
const t = (el.innerText || '').trim().toLowerCase().replace(/\s+/g, ' ');
if (t.length > 5) seen.add(t.substring(0, 200));
}

const uniqueHTML = [];
const tmp = document.createElement('div');
for (const html of removed) {
tmp.innerHTML = html;
const el = tmp.firstElementChild;
if (!el) continue;
const t = (el.innerText || '').trim().toLowerCase().replace(/\s+/g, ' ');
const fp = t.substring(0, 200);
if (t.length > 5 && !seen.has(fp)) {
seen.add(fp);
uniqueHTML.push(html);
}
}
if (uniqueHTML.length === 0) return { injected: false, count: 0 };

const container = document.createElement('div');
container.id = '__c4ai_accumulated_content';
container.style.display = 'none';
container.innerHTML = uniqueHTML.join('\n');
document.body.appendChild(container);
return { injected: true, count: uniqueHTML.length };
}""")

if merge_result and merge_result.get("injected"):
self.logger.info(
message="Virtual scroll detected: re-injected {count} recycled elements",
tag="PAGE_SCAN",
params={"count": merge_result.get("count", 0)},
)

# await page.evaluate("window.scrollTo(0, 0)")
await self.safe_scroll(page, 0, 0)

except Exception as e:
# Clean up observer on error
try:
await page.evaluate("""() => {
if (window.__c4ai_observer) {
window.__c4ai_observer.disconnect();
delete window.__c4ai_observer;
}
delete window.__c4ai_removed;
}""")
except Exception:
pass
self.logger.warning(
message="Failed to perform full page scan: {error}",
tag="PAGE_SCAN",
Expand Down Expand Up @@ -1341,15 +1422,22 @@ async def _handle_virtual_scroll(self, page: Page, config: "VirtualScrollConfig"

// Perform scrolling
while (scrollCount < config.scroll_count) {
// Scroll the container
// Scroll the container; fall back to window if container
// doesn't scroll (e.g. Twitter scrolls the window, not a
// container element).
const prevScrollTop = container.scrollTop;
container.scrollTop += scrollAmount;

const usedWindowScroll = (container.scrollTop === prevScrollTop);
if (usedWindowScroll) {
window.scrollBy(0, scrollAmount);
}

// Wait for content to potentially load
await new Promise(resolve => setTimeout(resolve, config.wait_after_scroll * 1000));

// Get current HTML
const currentHTML = container.innerHTML;

// Determine what changed
if (currentHTML === previousHTML) {
// Case 0: No change - continue scrolling
Expand All @@ -1362,13 +1450,15 @@ async def _handle_virtual_scroll(self, page: Page, config: "VirtualScrollConfig"
console.log(`Scroll ${scrollCount + 1}: Content replaced, capturing chunk`);
htmlChunks.push(previousHTML);
}

// Update previous HTML for next iteration
previousHTML = currentHTML;
scrollCount++;

// Check if we've reached the end
if (container.scrollTop + container.clientHeight >= container.scrollHeight - 10) {

// Check if we've reached the end of scrollable content
const atContainerEnd = container.scrollTop + container.clientHeight >= container.scrollHeight - 10;
const atWindowEnd = window.scrollY + window.innerHeight >= document.documentElement.scrollHeight - 10;
if (usedWindowScroll ? atWindowEnd : atContainerEnd) {
console.log(`Reached end of scrollable content at scroll ${scrollCount}`);
// Capture final chunk if content was replaced
if (htmlChunks.length > 0) {
Expand Down
176 changes: 176 additions & 0 deletions tests/test_repro_731.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
"""
Reproduction test for issue #731:
scan_full_page=True only captures the final elements on virtual-scroll pages.

Creates a local HTML page that simulates virtual scrolling (DOM recycling)
and verifies that scan_full_page captures ALL items, not just the last batch.
"""

import asyncio
import os
import tempfile
import json
from http.server import HTTPServer, SimpleHTTPRequestHandler
import threading

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BrowserConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

# HTML page that simulates virtual scrolling: only 10 items visible at a time,
# content is REPLACED (recycled) as you scroll — mimicking Twitter/X behavior.
# NOTE(review): the inline <script> re-renders the #feed container on every
# window scroll event, so previously visible items are removed from the DOM —
# a naive end-of-scroll capture sees only the final batch of ~10 items.
VIRTUAL_SCROLL_HTML = """
<!DOCTYPE html>
<html>
<head>
<style>
body { margin: 0; font-family: sans-serif; }
.item { height: 80px; padding: 10px; border-bottom: 1px solid #eee; }
.item .name { font-weight: bold; }
.item .handle { color: #666; }
</style>
</head>
<body>
<div id="feed"></div>
<script>
// Simulate 50 total users, but only render 10 at a time (virtual scroll)
const TOTAL = 50;
const VISIBLE = 10;
const allUsers = [];
for (let i = 0; i < TOTAL; i++) {
allUsers.push({ name: `User ${i+1}`, handle: `@user${i+1}` });
}

let startIdx = 0;
const feed = document.getElementById('feed');

function render() {
feed.innerHTML = '';
const end = Math.min(startIdx + VISIBLE, TOTAL);
for (let i = startIdx; i < end; i++) {
const div = document.createElement('div');
div.className = 'item';
div.setAttribute('data-testid', 'UserCell');
div.innerHTML = `<div class="name">${allUsers[i].name}</div><div class="handle">${allUsers[i].handle}</div>`;
feed.appendChild(div);
}
// Set body height to allow scrolling
document.body.style.height = (TOTAL * 80) + 'px';
}

render();

// On scroll, recycle DOM elements (virtual scroll behavior)
window.addEventListener('scroll', () => {
const scrollPos = window.scrollY;
const newStart = Math.min(Math.floor(scrollPos / 80), TOTAL - VISIBLE);
if (newStart !== startIdx) {
startIdx = newStart;
render();
}
});
</script>
</body>
</html>
"""


def start_server(html_dir, port=9731):
    """Serve *html_dir* over HTTP on 127.0.0.1:*port* in a daemon thread.

    Returns the ``HTTPServer`` instance so the caller can ``shutdown()`` it
    when the test finishes.
    """

    class QuietHandler(SimpleHTTPRequestHandler):
        """Handler rooted at *html_dir* that suppresses access logging."""

        def __init__(self, *args, **kwargs):
            kwargs["directory"] = html_dir
            super().__init__(*args, **kwargs)

        def log_message(self, format, *args):
            # Keep test output free of per-request noise.
            pass

    httpd = HTTPServer(("127.0.0.1", port), QuietHandler)
    worker = threading.Thread(target=httpd.serve_forever, daemon=True)
    worker.start()
    return httpd


async def test_scan_full_page_virtual_scroll():
    """
    BUG REPRODUCTION: scan_full_page=True on a virtual-scroll page.

    Serves a locally generated page whose DOM recycles items while scrolling,
    crawls it twice (baseline, then with ``scan_full_page=True``), prints a
    human-readable report, and asserts that the full-page scan recovered at
    least 40 of the 50 unique users.

    Expected: all 50 users captured.
    Actual (bug): only the last ~10 users captured.
    """
    # Single source of truth for the port so the crawled URL cannot drift
    # out of sync with the address the server is bound to.
    port = 9731
    url = f"http://127.0.0.1:{port}/index.html"

    # Write test HTML to a temp dir and serve it
    with tempfile.TemporaryDirectory() as tmpdir:
        html_path = os.path.join(tmpdir, "index.html")
        with open(html_path, "w") as f:
            f.write(VIRTUAL_SCROLL_HTML)

        server = start_server(tmpdir, port=port)

        try:
            schema = {
                "name": "Users",
                "baseSelector": "[data-testid='UserCell']",
                "fields": [
                    {"name": "name", "selector": ".name", "type": "text"},
                    {"name": "handle", "selector": ".handle", "type": "text"},
                ],
            }
            extraction = JsonCssExtractionStrategy(schema)

            # --- Test 1: WITHOUT scan_full_page (baseline) ---
            config_no_scroll = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                extraction_strategy=extraction,
                scan_full_page=False,
            )
            async with AsyncWebCrawler(verbose=True) as crawler:
                result = await crawler.arun(url=url, config=config_no_scroll)

            data_no_scroll = json.loads(result.extracted_content)
            names_no_scroll = [u["name"] for u in data_no_scroll]
            print(f"\n{'='*60}")
            print(f"WITHOUT scan_full_page: {len(data_no_scroll)} users")
            print(f" Users: {names_no_scroll[:5]} ... {names_no_scroll[-3:] if len(names_no_scroll) > 5 else ''}")
            print(f"{'='*60}")

            # --- Test 2: WITH scan_full_page (the bug) ---
            config_scroll = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                extraction_strategy=extraction,
                scan_full_page=True,
                scroll_delay=0.3,
                max_scroll_steps=20,
            )
            async with AsyncWebCrawler(verbose=True) as crawler:
                result = await crawler.arun(url=url, config=config_scroll)

            data_scroll = json.loads(result.extracted_content)
            names_scroll = [u["name"] for u in data_scroll]
            print(f"\n{'='*60}")
            print(f"WITH scan_full_page: {len(data_scroll)} users")
            print(f" Users: {names_scroll[:5]} ... {names_scroll[-3:] if len(names_scroll) > 5 else ''}")
            print(f"{'='*60}")

            # --- Deduplicate by handle (re-injected chunks may repeat rows) ---
            unique_scroll = {u["handle"]: u for u in data_scroll}

            # --- Verdict ---
            print(f"\n{'='*60}")
            print("VERDICT:")
            print(f" Raw extracted: {len(data_scroll)}, Unique by handle: {len(unique_scroll)}")
            missing = [f"@user{i}" for i in range(1, 51) if f"@user{i}" not in unique_scroll]
            if missing:
                print(f" Missing {len(missing)} users: {missing[:10]}{'...' if len(missing) > 10 else ''}")
            else:
                print(f" All 50 users captured!")
            print(f"{'='*60}")

            # Fail loudly under pytest instead of only printing the verdict:
            # the previous version could never fail, so the regression it is
            # meant to guard against would pass CI unnoticed.
            assert len(unique_scroll) >= 40, (
                f"BUG: scan_full_page captured only {len(unique_scroll)}/50 "
                f"unique users: {sorted(unique_scroll.keys())}"
            )
        finally:
            server.shutdown()


# Allow running the reproduction directly as a script (outside pytest).
if __name__ == "__main__":
    asyncio.run(test_scan_full_page_virtual_scroll())