diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 44d3040b5..7d2837745 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -937,23 +937,33 @@ def __init__( scroll_count: int = 10, scroll_by: Union[str, int] = "container_height", wait_after_scroll: float = 0.5, + max_no_change: int = 5, + max_captured_elements: int = 10000, ): """ Initialize virtual scroll configuration. - + Args: container_selector: CSS selector for the scrollable container scroll_count: Maximum number of scrolls to perform scroll_by: Amount to scroll - can be: - "container_height": scroll by container's height - - "page_height": scroll by viewport height + - "page_height": scroll by viewport height - int: fixed pixel amount wait_after_scroll: Seconds to wait after each scroll for content to load + max_no_change: Stop scrolling after this many consecutive scrolls with no + new content detected. Prevents wasting time at the end of a feed. + Set to 0 to disable early termination. + max_captured_elements: Maximum number of unique elements to accumulate + before stopping. Prevents browser OOM on very large feeds. + Set to 0 to disable the cap. """ self.container_selector = container_selector self.scroll_count = scroll_count self.scroll_by = scroll_by self.wait_after_scroll = wait_after_scroll + self.max_no_change = max_no_change + self.max_captured_elements = max_captured_elements def to_dict(self) -> dict: """Convert to dictionary for serialization.""" @@ -962,12 +972,18 @@ def to_dict(self) -> dict: "scroll_count": self.scroll_count, "scroll_by": self.scroll_by, "wait_after_scroll": self.wait_after_scroll, + "max_no_change": self.max_no_change, + "max_captured_elements": self.max_captured_elements, } @classmethod def from_dict(cls, data: dict) -> "VirtualScrollConfig": - """Create instance from dictionary.""" - return cls(**data) + """Create instance from dictionary. 
Unknown keys are ignored for + forward-compatibility with newer config versions.""" + known = {"container_selector", "scroll_count", "scroll_by", + "wait_after_scroll", "max_no_change", "max_captured_elements"} + filtered = {k: v for k, v in data.items() if k in known} + return cls(**filtered) class LinkPreviewConfig: """Configuration for link head extraction and scoring.""" diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index b9de25f6b..42ce5be26 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -1200,88 +1200,935 @@ async def get_delayed_content(delay: float = 5.0) -> str: except Exception: pass - # async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1): async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1, max_scroll_steps: Optional[int] = None): """ - Helper method to handle full page scanning. - - How it works: - 1. Get the viewport height. - 2. Scroll to the bottom of the page. - 3. Get the total height of the page. - 4. Scroll back to the top of the page. - 5. Scroll to the bottom of the page again. - 6. Continue scrolling until the bottom of the page is reached. + Progressive full-page scan with automatic DOM recycling detection. + + Five phases: + 1. Setup — viewport, timeout, helpers (fingerprint, cleanOuterHTML, + expandCollapsed). + 2. Detect recycling — fingerprint candidate containers before/after + a probe scroll; also run a MutationObserver for innerHTML-wipe + detection. Collects all recycling containers. + 3. Scroll + capture recycling containers — for each detected + recycling container: deterministic scroll with fingerprint-based + dedup, nested inner-container scrolling, then inject merged HTML. + Supports vertical, horizontal, and 2D zigzag scroll. + 4. Container-scroll pass — scans for overflow-y/x scrollable + containers not handled in phase 3 and scrolls them. + 5. 
Fallback — normal scroll-to-bottom for append-based lazy loading. + + For pages that do NOT recycle DOM nodes the behaviour is identical to + the previous implementation — scroll top to bottom. Args: page (Page): The Playwright page object scroll_delay (float): The delay between page scrolls - max_scroll_steps (Optional[int]): Maximum number of scroll steps to perform. Defaults to 10 to prevent infinite scroll hangs. - + max_scroll_steps (Optional[int]): Maximum number of scroll steps. + If None, scrolls until the bottom is reached. """ - # Default to 10 steps to prevent infinite scroll on dynamic pages - if max_scroll_steps is None: - max_scroll_steps = 10 - + prev_timeout = page._timeout_settings._timeout if hasattr(page, '_timeout_settings') else None try: viewport_size = page.viewport_size if viewport_size is None: await page.set_viewport_size( {"width": self.browser_config.viewport_width, "height": self.browser_config.viewport_height} ) - viewport_size = page.viewport_size - viewport_height = viewport_size.get( - "height", self.browser_config.viewport_height - ) - current_position = viewport_height + # Virtual-scroll pages may need many scroll steps, easily + # exceeding Playwright's default 30s evaluate timeout. + # Temporarily raise it for this call. + page.set_default_timeout(300_000) # 5 minutes + + result = await page.evaluate( + """async (cfg) => { + // ── Config ────────────────────────────────────────────────────────── + const scrollDelay = cfg.scrollDelay; + const maxSteps = cfg.maxSteps; + const memCap = cfg.memCap || 50000; + + // ── Helpers ───────────────────────────────────────────────────────── + const sleep = ms => new Promise(r => setTimeout(r, ms)); + + // Deterministic fingerprint. 
+ // Priority: href on element or child a[href] + // → data-id / data-key / data-index / data-testid / id + // → full text djb2 hash (no truncation, no outerHTML fallback) + function fingerprint(el) { + try { + if (!el || el.nodeType !== 1) return null; + const tag = el.tagName.toLowerCase(); + if (['script','style','link','meta','noscript','br','hr'].indexOf(tag) !== -1) return null; + + // 1. href + const href = el.getAttribute('href') + || (el.querySelector ? (el.querySelector('a[href]') || {getAttribute: ()=>null}).getAttribute('href') : null); + if (href && href.length > 1 && href !== '#' && href !== '/') return 'u:' + href; + + // 2. stable data attributes + const aid = el.getAttribute('data-id') + || el.getAttribute('data-key') + || el.getAttribute('data-index') + || el.getAttribute('data-testid') + || el.id; + if (aid) return 'a:' + aid; + + // 3. full-text djb2 hash — no truncation + const txt = (el.textContent || '').trim(); + if (txt.length > 0) { + let h = 5381; + for (let i = 0; i < txt.length; i++) { + h = ((h << 5) + h + txt.charCodeAt(i)) | 0; + } + return 't:' + h + ':' + txt.length; + } + return null; + } catch(e) { return null; } + } + + // Strip volatile positioning from inline style before capturing. + // Uses cloneNode to avoid mutating live DOM (which could trigger + // framework re-renders or race with MutationObservers). 
+ function cleanOuterHTML(el) { + try { + const clone = el.cloneNode(true); + const s = clone.style; + if (s) { + for (const prop of ['transform', 'translate', 'top', 'left']) { + if (s[prop]) s[prop] = ''; + } + } + return clone.outerHTML; + } catch(e) { + return el.outerHTML; + } + } - # await page.evaluate(f"window.scrollTo(0, {current_position})") - await self.safe_scroll(page, 0, current_position, delay=scroll_delay) - # await self.csp_scroll_to(page, 0, current_position) - # await asyncio.sleep(scroll_delay) + // ── expandCollapsed — preserved exactly ───────────────────────────── + const EXPAND_RE = /^\+\d+\s*more|show\s*(all|more)|load\s*more|see\s*(all|more)|expand/i; + const expandedSet = new WeakSet(); + async function expandCollapsed() { + let count = 0; + for (const el of document.querySelectorAll('[role="button"], button')) { + if (expandedSet.has(el)) continue; + const txt = (el.textContent || '').trim(); + if (EXPAND_RE.test(txt) && !el.closest('nav') && !el.closest('header') && !el.closest('footer')) { + expandedSet.add(el); + try { el.click(); count++; } catch(e) {} + } + } + for (const d of document.querySelectorAll('details:not([open])')) { + if (expandedSet.has(d)) continue; + expandedSet.add(d); + d.setAttribute('open', ''); + count++; + } + for (const el of document.querySelectorAll('[aria-expanded="false"]')) { + if (expandedSet.has(el)) continue; + const txt = (el.textContent || '').trim(); + if (txt.length < 200 && !el.closest('nav') && !el.closest('header')) { + expandedSet.add(el); + try { el.click(); count++; } catch(e) {} + } + } + if (count > 0) await sleep(scrollDelay * 1000); + return count; + } - # total_height = await page.evaluate("document.documentElement.scrollHeight") - dimensions = await self.get_page_dimensions(page) - total_height = dimensions["height"] - - scroll_step_count = 0 - while current_position < total_height: - #### - # NEW FEATURE: Check if we've reached the maximum allowed scroll steps - # This prevents infinite 
scrolling on very long pages or infinite scroll scenarios - # If max_scroll_steps is None, this check is skipped (unlimited scrolling - original behavior) - #### - if max_scroll_steps is not None and scroll_step_count >= max_scroll_steps: - break - current_position = min(current_position + viewport_height, total_height) - await self.safe_scroll(page, 0, current_position, delay=scroll_delay) - - # Increment the step counter for max_scroll_steps tracking - scroll_step_count += 1 - - # await page.evaluate(f"window.scrollTo(0, {current_position})") - # await asyncio.sleep(scroll_delay) + // ── Phase 1: Setup ───────────────────────────────────────────── + let steps = 0; + const limit = (maxSteps && maxSteps > 0) ? maxSteps : 1000000; + let totalExpanded = 0; + let removedCount = 0; + + // ── Phase 2: Detect recycling via content comparison ──────── + // Three recycling patterns exist: + // a) Transform-based: DOM nodes stay, style.transform changes content + // b) innerHTML-wipe: container cleared and rebuilt on scroll + // c) Node swap: individual nodes removed and new ones added + // We detect ALL patterns by: + // 1. Fingerprinting all children of candidate containers before scroll + // 2. Scrolling down + // 3. Fingerprinting again — if fingerprints changed but child count + // didn't grow proportionally, it's recycling + // Also run MutationObserver for innerHTML-wipe detection. 
+ + let recyclingContainers = []; + + // Find candidate containers: elements with 3+ children in the viewport area + function findCandidates() { + const candidates = []; + const elems = document.body.querySelectorAll('*'); + const max = Math.min(elems.length, 5000); + for (let i = 0; i < max; i++) { + const el = elems[i]; + if (el.children.length < 3) continue; + // Check first child tag — list containers have same-tag children + const firstTag = el.children[0].tagName; + let sameTag = 0; + for (const c of el.children) { if (c.tagName === firstTag) sameTag++; } + if (sameTag >= 3) candidates.push(el); + } + return candidates; + } + + // Snapshot: collect fingerprints of all children + function snapshotFps(el) { + const fps = new Set(); + for (const child of el.children) { + if (child.nodeType !== 1) continue; + const f = fingerprint(child); + if (f) fps.add(f); + } + return fps; + } + + // MutationObserver for innerHTML-wipe detection + let hadChildListRecycling = null; // element or null + const recycleParents = new Map(); + const probeObserver = new MutationObserver(mutations => { + for (const mut of mutations) { + if (mut.type !== 'childList') continue; + if (!recycleParents.has(mut.target)) { + recycleParents.set(mut.target, {adds: 0, removes: 0}); + } + const rec = recycleParents.get(mut.target); + if (mut.addedNodes.length > 0) rec.adds += mut.addedNodes.length; + if (mut.removedNodes.length > 0) { + rec.removes += mut.removedNodes.length; + for (const node of mut.removedNodes) { + if (node.nodeType === 1) removedCount++; + } + } + } + }); + probeObserver.observe(document.body, {childList: true, subtree: true}); + + // Snapshot candidates before scroll + const candidates = findCandidates(); + const beforeMap = new Map(); // element → Set + for (const c of candidates) { + beforeMap.set(c, snapshotFps(c)); + } + const beforeChildCounts = new Map(); + for (const c of candidates) { + beforeChildCounts.set(c, c.children.length); + } + + // Helper: find nearest 
scrollable ancestor of el (excluding window). + // Checks both vertical (overflowY) and horizontal (overflowX). + function scrollableAncestor(el) { + let p = el.parentElement; + while (p && p !== document.body && p !== document.documentElement) { + const cs = window.getComputedStyle(p); + const oy = cs.overflowY; + if ((oy === 'scroll' || oy === 'auto') && p.scrollHeight > p.clientHeight) { + return p; + } + const ox = cs.overflowX; + if ((ox === 'scroll' || ox === 'auto') && p.scrollWidth > p.clientWidth) { + return p; + } + p = p.parentElement; + } + return null; + } + + // Scroll down by one viewport height (window-level). + // Also probe each candidate container's own scrollTop/scrollLeft + // for container-level virtual scroll (overflow-y/x: scroll). + // If a candidate is not itself scrollable, scroll its nearest + // scrollable ancestor (e.g. #inner inside #scroller). + const probeDistance = window.innerHeight; + window.scrollBy(0, probeDistance); + const probedAncestors = new Set(); + for (const c of candidates) { + const cs = window.getComputedStyle(c); + const oy = cs.overflowY; + const ox = cs.overflowX; + if ((oy === 'scroll' || oy === 'auto') && c.scrollHeight > c.clientHeight) { + c.scrollTop += c.clientHeight; + } else if ((ox === 'scroll' || ox === 'auto') && c.scrollWidth > c.clientWidth) { + c.scrollLeft += c.clientWidth; + } else { + // Candidate itself doesn't scroll — try scrollable parent + const anc = scrollableAncestor(c); + if (anc && !probedAncestors.has(anc)) { + probedAncestors.add(anc); + const ancCs = window.getComputedStyle(anc); + const ancOx = ancCs.overflowX; + if ((ancOx === 'scroll' || ancOx === 'auto') && anc.scrollWidth > anc.clientWidth) { + anc.scrollLeft += anc.clientWidth; + } else { + anc.scrollTop += anc.clientHeight; + } + } + } + } + await sleep(Math.max(scrollDelay * 1000, 300)); + + // Compare: which containers had their content change? 
+ for (const c of candidates) { + const before = beforeMap.get(c); + const after = snapshotFps(c); + const beforeCount = beforeChildCounts.get(c); + const afterCount = c.children.length; + + // Content changed? + let changed = 0; + for (const f of before) { + if (!after.has(f)) changed++; + } + let added = 0; + for (const f of after) { + if (!before.has(f)) added++; + } + + // Recycling = content changed significantly but child count + // didn't grow (or grew very little). If child count doubled, + // it's append-based, not recycling. + if (changed >= 2 && afterCount <= beforeCount * 1.5) { + recyclingContainers.push(c); + } + } + + // Also check MutationObserver results for childList recycling. + // Require significant mutation count (>= 4) to avoid false positives + // from loading spinners or minor DOM updates (1 add + 1 remove). + // Also check MutationObserver for containers not already detected + // via fingerprint comparison above. + { + const alreadyFound = new Set(recyclingContainers); + for (const [el, counts] of recycleParents) { + if (alreadyFound.has(el)) continue; + if (counts.adds >= 2 && counts.removes >= 2) { + recyclingContainers.push(el); + alreadyFound.add(el); + } + } + } + + probeObserver.disconnect(); + + // Scroll back to top/left (window + all probed containers + their ancestors) + window.scrollTo(0, 0); + for (const c of candidates) { + if (c.scrollTop > 0) c.scrollTop = 0; + if (c.scrollLeft > 0) c.scrollLeft = 0; + } + for (const anc of probedAncestors) { + anc.scrollTop = 0; + anc.scrollLeft = 0; + } + await sleep(scrollDelay * 1000); + + // ── Phase 3: Scroll + capture all recycling containers ──── + let phase2TotalMerged = 0; + let phase2CapReached = false; + + for (let rci = 0; rci < recyclingContainers.length && steps < limit && !phase2CapReached; rci++) { + const recyclingContainer = recyclingContainers[rci]; + const children = recyclingContainer.children; + if (children.length < 2) { + continue; // can't measure, skip + } else { + 
// Measure item dimension — try vertical first, then horizontal. + let itemHeight = Math.abs( + children[1].getBoundingClientRect().top - + children[0].getBoundingClientRect().top + ); + if (itemHeight <= 0) { + // Vertical offset is zero — try horizontal offset + // (items may be laid out horizontally with translateX) + itemHeight = Math.abs( + children[1].getBoundingClientRect().left - + children[0].getBoundingClientRect().left + ); + } + if (itemHeight <= 0) { + itemHeight = children[0].getBoundingClientRect().height; + } + if (itemHeight <= 0) { + itemHeight = children[0].getBoundingClientRect().width; + } + if (itemHeight <= 0) { + continue; // can't determine step size, skip this container + } else { + // Determine total virtual size (height or width). + // Check container's explicit style.height/width first (set by + // virtual-scroll frameworks for scrollbar sizing). + // Fall back to scrollHeight/scrollWidth. + // For window-level scroll, use documentElement dimensions. + let totalHeight = 0; + if (recyclingContainer.style && recyclingContainer.style.width) { + const pw = parseFloat(recyclingContainer.style.width) || 0; + if (pw > 0) totalHeight = pw; + } + if (!totalHeight && recyclingContainer.style && recyclingContainer.style.height) { + totalHeight = parseFloat(recyclingContainer.style.height) || 0; + } + if (!totalHeight || totalHeight <= 0) { + totalHeight = Math.max( + document.documentElement.scrollHeight, + document.documentElement.scrollWidth + ); + } + const totalItems = Math.round(totalHeight / itemHeight); + const cappedTotal = Math.min(totalItems, memCap); + + // ── Phase 3a: Deterministic scroll + capture ──────────── + // captured: fingerprint → cleanOuterHTML (first-seen wins) + const captured = new Map(); + + function captureVisible() { + for (const child of recyclingContainer.children) { + if (child.nodeType !== 1) continue; + const key = fingerprint(child); + if (key && !captured.has(key)) { + captured.set(key, cleanOuterHTML(child)); + 
if (captured.size >= memCap) return; + } + // Also capture children inside nested scrollable + // containers within each recycled child. + try { + const subs = child.querySelectorAll('*'); + for (const isc of subs) { + const ics = window.getComputedStyle(isc); + const iox = ics.overflowX; + const ioy = ics.overflowY; + const hS = (iox === 'scroll' || iox === 'auto') && isc.scrollWidth > isc.clientWidth; + const vS = (ioy === 'scroll' || ioy === 'auto') && isc.scrollHeight > isc.clientHeight; + if (!hS && !vS) continue; + for (const ic of isc.children) { + if (ic.nodeType !== 1) continue; + const ik = fingerprint(ic); + if (ik && !captured.has(ik)) { + captured.set(ik, cleanOuterHTML(ic)); + if (captured.size >= memCap) return; + } + } + } + } catch(e) {} + } + } + + // Scroll nested scrollable containers within visible + // recycled children to reveal nested recycled content. + async function scrollInnerContainers() { + for (const child of recyclingContainer.children) { + if (child.nodeType !== 1) continue; + if (captured.size >= memCap) return; + try { + const subs = child.querySelectorAll('*'); + for (const isc of subs) { + if (captured.size >= memCap) return; + const ics = window.getComputedStyle(isc); + const iox = ics.overflowX; + const ioy = ics.overflowY; + const hS = (iox === 'scroll' || iox === 'auto') && isc.scrollWidth > isc.clientWidth; + const vS = (ioy === 'scroll' || ioy === 'auto') && isc.scrollHeight > isc.clientHeight; + if (!hS && !vS) continue; + let iDim = hS ? isc.clientWidth : isc.clientHeight; + if (isc.children.length >= 2) { + const d = hS + ? 
Math.abs(isc.children[1].getBoundingClientRect().left - isc.children[0].getBoundingClientRect().left) + : Math.abs(isc.children[1].getBoundingClientRect().top - isc.children[0].getBoundingClientRect().top); + if (d > 0) iDim = d; + } + const iStep = Math.max(iDim, iDim * 2); + for (let si = 0; si < 200; si++) { + if (hS) isc.scrollLeft += iStep; + else isc.scrollTop += iStep; + await sleep(Math.min(scrollDelay * 500, 100)); + for (const ic of isc.children) { + if (ic.nodeType !== 1) continue; + const ik = fingerprint(ic); + if (ik && !captured.has(ik)) { + captured.set(ik, cleanOuterHTML(ic)); + } + } + if (captured.size >= memCap) break; + if (hS) { + if (isc.scrollLeft + isc.clientWidth >= isc.scrollWidth - iDim) break; + } else { + if (isc.scrollTop + isc.clientHeight >= isc.scrollHeight - iDim) break; + } + } + if (hS) isc.scrollLeft = 0; + else isc.scrollTop = 0; + } + } catch(e) {} + } + } + + // Second observer: capture items at the moment of removal + // (innerHTML-wipe fallback — ensures we don't miss items + // whose container is wiped rather than recycled). + const wipeObserver = new MutationObserver(mutations => { + for (const mut of mutations) { + if (mut.type !== 'childList') continue; + for (const node of mut.removedNodes) { + if (node.nodeType !== 1) continue; + const key = fingerprint(node); + if (key && !captured.has(key)) { + try { captured.set(key, cleanOuterHTML(node)); } catch(e) {} + } + } + } + }); + wipeObserver.observe(recyclingContainer, {childList: true, subtree: false}); + let capReached = false; + try { + + // Determine the element to actually scroll. + // Priority: + // 1. recyclingContainer itself, if it has overflow-y/x scroll/auto + // and is actually scrollable. + // 2. Its nearest scrollable ancestor (e.g. #scroller wrapping #inner). + // 3. Window (fallback). + // This handles the pattern where the DATA container (#inner) is not + // scrollable but its PARENT (#scroller) is. 
+ let scrollTarget = null; // null → use window + let isHorizontal = false; // true if the scroll axis is horizontal + if ( + recyclingContainer !== document.documentElement && + recyclingContainer !== document.body + ) { + const rcs = window.getComputedStyle(recyclingContainer); + const oy = rcs.overflowY; + const ox = rcs.overflowX; + if ((oy === 'scroll' || oy === 'auto') && + recyclingContainer.scrollHeight > recyclingContainer.clientHeight) { + scrollTarget = recyclingContainer; + isHorizontal = false; + } else if ((ox === 'scroll' || ox === 'auto') && + recyclingContainer.scrollWidth > recyclingContainer.clientWidth) { + scrollTarget = recyclingContainer; + isHorizontal = true; + } else { + // Not scrollable itself — look for a scrollable ancestor + const anc = scrollableAncestor(recyclingContainer); + if (anc) { + scrollTarget = anc; + const ancCs = window.getComputedStyle(anc); + const ancOx = ancCs.overflowX; + if ((ancOx === 'scroll' || ancOx === 'auto') && + anc.scrollWidth > anc.clientWidth && + !(anc.scrollHeight > anc.clientHeight)) { + isHorizontal = true; + } + } + } + } + const useWindowScroll = (scrollTarget === null); + + // Detect 2D scrollable containers (both horizontal + vertical) + let is2D = false; + if (!useWindowScroll) { + const st = scrollTarget; + const stCs = window.getComputedStyle(st); + const stOx = stCs.overflowX; + const stOy = stCs.overflowY; + const hasH = (stOx === 'scroll' || stOx === 'auto') && st.scrollWidth > st.clientWidth; + const hasV = (stOy === 'scroll' || stOy === 'auto') && st.scrollHeight > st.clientHeight; + if (hasH && hasV) is2D = true; + } + + // Measure item width (separate from height) for 2D / horizontal + let itemWidth = itemHeight; + if (recyclingContainer.children.length >= 2) { + const iw = Math.abs( + recyclingContainer.children[1].getBoundingClientRect().left - + recyclingContainer.children[0].getBoundingClientRect().left + ); + if (iw > 0) itemWidth = iw; + } + // For pure horizontal scroll, reuse 
itemHeight variable for width + if (isHorizontal && !is2D) { + itemHeight = itemWidth; + } + + // Capture at position 0 + captureVisible(); + await scrollInnerContainers(); + totalExpanded += await expandCollapsed(); + + let consecutiveEmpty = 0; + + if (is2D) { + // ── 2D zigzag scroll ────────────────────────────────── + // For containers scrollable in both X and Y (e.g. 2D grids), + // sweep horizontally at each vertical band, then step down. + // Use single-item steps to ensure the pool (which may be + // smaller than the visible cell count) renders every cell + // at least once across the sweep. + const vpW = scrollTarget.clientWidth; + const vpH2 = scrollTarget.clientHeight; + const hStep = itemWidth; // one column at a time + const vStep = itemHeight; // one row at a time + + let yPos = 0; + let atBottomRow = false; + while (!atBottomRow && steps < limit && !capReached) { + // Set vertical position and reset horizontal + scrollTarget.scrollTop = yPos; + scrollTarget.scrollLeft = 0; + await sleep(scrollDelay * 1000); + captureVisible(); + steps++; + if (captured.size >= memCap) { capReached = true; break; } + + // Sweep all the way right + while (steps < limit && !capReached) { + scrollTarget.scrollLeft += hStep; + await sleep(scrollDelay * 1000); + captureVisible(); + steps++; + if (captured.size >= memCap) { capReached = true; break; } + if (scrollTarget.scrollLeft + vpW >= scrollTarget.scrollWidth - itemWidth) break; + } + + // Check if at bottom + if (yPos + vpH2 >= scrollTarget.scrollHeight - itemHeight) { + atBottomRow = true; + } else { + yPos += vStep; + } + } + } else { + // ── 1D scroll (original logic) ──────────────────────── + const vpH = useWindowScroll + ? (isHorizontal ? window.innerWidth : window.innerHeight) + : (isHorizontal ? 
scrollTarget.clientWidth : scrollTarget.clientHeight); + const scrollStep = Math.max( + itemHeight, + Math.floor(vpH / itemHeight - 1) * itemHeight + ); + + while (steps < limit && !capReached) { + if (useWindowScroll) { + if (isHorizontal) { + window.scrollBy(scrollStep, 0); + } else { + window.scrollBy(0, scrollStep); + } + } else { + if (isHorizontal) { + scrollTarget.scrollLeft += scrollStep; + } else { + scrollTarget.scrollTop += scrollStep; + } + } + await sleep(scrollDelay * 1000); + totalExpanded += await expandCollapsed(); + + const beforeSize = captured.size; + captureVisible(); + await scrollInnerContainers(); + const newItems = captured.size - beforeSize; + + if (newItems === 0) { + consecutiveEmpty++; + let atEnd; + if (useWindowScroll) { + if (isHorizontal) { + atEnd = (window.scrollX + window.innerWidth >= document.documentElement.scrollWidth - itemHeight * 2); + } else { + atEnd = (window.scrollY + window.innerHeight >= document.documentElement.scrollHeight - itemHeight * 2); + } + } else { + if (isHorizontal) { + atEnd = (scrollTarget.scrollLeft + scrollTarget.clientWidth >= scrollTarget.scrollWidth - itemHeight * 2); + } else { + atEnd = (scrollTarget.scrollTop + scrollTarget.clientHeight >= scrollTarget.scrollHeight - itemHeight * 2); + } + } + if (consecutiveEmpty >= 3 && atEnd) break; + } else { + consecutiveEmpty = 0; + } + + steps++; + + if (captured.size >= memCap) { capReached = true; break; } + + if (useWindowScroll) { + if (isHorizontal) { + const scrolled = window.scrollX || window.pageXOffset || 0; + const docWidth = document.documentElement.scrollWidth; + if (scrolled + window.innerWidth >= docWidth - itemHeight) break; + } else { + const scrolled = window.scrollY || window.pageYOffset || 0; + const docHeight = document.documentElement.scrollHeight; + if (scrolled + window.innerHeight >= docHeight - itemHeight) break; + } + } else { + if (isHorizontal) { + if (scrollTarget.scrollLeft + scrollTarget.clientWidth >= + 
scrollTarget.scrollWidth - itemHeight) break; + } else { + if (scrollTarget.scrollTop + scrollTarget.clientHeight >= + scrollTarget.scrollHeight - itemHeight) break; + } + } + } + } + + // Final capture at scroll bottom + captureVisible(); + await scrollInnerContainers(); + + } finally { wipeObserver.disconnect(); } + + // ── Phase 3b: Inject results for this container ───────── + if (captured.size > recyclingContainer.children.length) { + recyclingContainer.style.display = 'none'; + const mergedDiv = document.createElement('div'); + mergedDiv.id = 'crawl4ai-merged-' + rci; + mergedDiv.innerHTML = Array.from(captured.values()).join('\\n'); + recyclingContainer.parentElement.insertBefore(mergedDiv, recyclingContainer); + } + phase2TotalMerged += captured.size; + if (capReached) phase2CapReached = true; + } + } + } // end for each recyclingContainer + + // ── Phase 3 complete — store result, fall through to Phase 4 ── + // Even if Phase 3 handled recycling containers, there may be + // OTHER scrollable containers (nested scroll, overflow-y/x) + // that Phase 4 should also scroll. + let phase2Result = null; + if (phase2TotalMerged > 0) { + window.scrollTo(0, 0); + phase2Result = { + recyclingDetected: true, + removedCount: removedCount, + totalMerged: phase2TotalMerged, + scrollSteps: steps, + capReached: phase2CapReached, + expandedGroups: totalExpanded, + phase: 3, + phase2Containers: recyclingContainers + }; + } + + // ── Phase 4: Container-scroll (overflow-y/x: scroll/auto) ── + // Handles scrollable containers not caught by Phase 3's + // recycling detection. Uses the same fingerprint + Map + // approach. Detects both vertical and horizontal containers. 
+ const allEls3 = document.querySelectorAll('*'); + const scanLimit = Math.min(allEls3.length, 5000); + const scrollContainers = []; + for (let i = 0; i < scanLimit; i++) { + try { + const el = allEls3[i]; + const cs3 = window.getComputedStyle(el); + const oy = cs3.overflowY; + const ox = cs3.overflowX; + if ((oy === 'auto' || oy === 'scroll') && + el.scrollHeight > el.clientHeight * 2 && + el.clientHeight > 50 && + el.children.length > 0) { + scrollContainers.push({el: el, horizontal: false}); + } else if ((ox === 'auto' || ox === 'scroll') && + el.scrollWidth > el.clientWidth * 2 && + el.clientWidth > 50 && + el.children.length > 0) { + scrollContainers.push({el: el, horizontal: true}); + } + } catch(e) { continue; } + } + + let phase3Merged = 0; + let phase3CapReached = false; + + // Collect containers already handled by Phase 3 to skip them + const phase3Handled = new Set(); + if (phase2Result && phase2Result.phase2Containers) { + for (const rc of phase2Result.phase2Containers) { + phase3Handled.add(rc); + } + } + + for (let ci = 0; ci < scrollContainers.length && steps < limit && !phase3CapReached; ci++) { + try { + const ct = scrollContainers[ci].el; + const ctHoriz = scrollContainers[ci].horizontal; + if (ctHoriz ? ct.clientWidth === 0 : ct.clientHeight === 0) continue; + // Skip containers already handled by Phase 3 + if (phase3Handled.has(ct)) continue; + // Skip containers that are display:none (hidden by Phase 3) + if (ct.style.display === 'none') continue; + // Skip containers inside a display:none parent (merged by Phase 3) + if (ct.closest('[style*="display: none"]') || ct.closest('[style*="display:none"]')) continue; + + // Measure item dimension from first two children + let ctItemDim = ctHoriz ? ct.clientWidth : ct.clientHeight; // fallback + if (ct.children.length >= 2) { + const ih = ctHoriz + ? 
Math.abs(ct.children[1].getBoundingClientRect().left - ct.children[0].getBoundingClientRect().left) + : Math.abs(ct.children[1].getBoundingClientRect().top - ct.children[0].getBoundingClientRect().top); + if (ih > 0) ctItemDim = ih; + } else if (ct.children.length === 1) { + const ih = ctHoriz + ? ct.children[0].getBoundingClientRect().width + : ct.children[0].getBoundingClientRect().height; + if (ih > 0) ctItemDim = ih; + } - # new_height = await page.evaluate("document.documentElement.scrollHeight") - dimensions = await self.get_page_dimensions(page) - new_height = dimensions["height"] + const ctCaptured = new Map(); - if new_height > total_height: - total_height = new_height + function ctCaptureVisible() { + for (const child of ct.children) { + if (child.nodeType !== 1) continue; + const key = fingerprint(child); + if (key && !ctCaptured.has(key)) { + ctCaptured.set(key, cleanOuterHTML(child)); + if (ctCaptured.size >= memCap) return; + } + } + } + + ctCaptureVisible(); + let ctConsecutiveEmpty = 0; + + for (let si = 0; si < 1000 && ctConsecutiveEmpty < 5 && steps < limit; si++) { + if (ctHoriz) { + ct.scrollLeft += ctItemDim; + } else { + ct.scrollTop += ctItemDim; + } + await sleep(scrollDelay * 1000); + steps++; + + const beforeSize = ctCaptured.size; + ctCaptureVisible(); + const newItems = ctCaptured.size - beforeSize; + + if (newItems === 0) { + ctConsecutiveEmpty++; + } else { + ctConsecutiveEmpty = 0; + } + + if (ctCaptured.size >= memCap) { phase3CapReached = true; break; } + if (ctHoriz) { + if (ct.scrollLeft + ct.clientWidth >= ct.scrollWidth - ctItemDim) break; + } else { + if (ct.scrollTop + ct.clientHeight >= ct.scrollHeight - ctItemDim) break; + } + } + + // Only inject if we found more items than currently visible + if (ctCaptured.size > ct.children.length) { + ct.innerHTML = Array.from(ctCaptured.values()).join('\\n'); + phase3Merged += ctCaptured.size; + } + } catch(e) { continue; } + } + + if (phase3Merged > 0 || phase2Result) { + 
window.scrollTo(0, document.documentElement.scrollHeight); + const totalMerged = (phase2Result ? phase2Result.totalMerged : 0) + phase3Merged; + return { + recyclingDetected: true, + removedCount: totalMerged, + totalMerged: totalMerged, + scrollSteps: steps, + capReached: phase3CapReached || (phase2Result && phase2Result.capReached), + expandedGroups: totalExpanded, + phase: phase2Result ? 3 : 4 + }; + } - # await page.evaluate("window.scrollTo(0, 0)") - await self.safe_scroll(page, 0, 0) + // No recycling and no container-scroll found. + // Do a normal scroll-to-bottom for append-based infinite scroll + // (items are lazy-loaded as user scrolls down). + let totalHeight = document.documentElement.scrollHeight; + const vpHeight = window.innerHeight; + let pos = 0; + while (pos < totalHeight && steps < limit) { + pos = Math.min(pos + vpHeight, totalHeight); + window.scrollTo(0, pos); + await sleep(scrollDelay * 1000); + totalExpanded += await expandCollapsed(); + const nh = document.documentElement.scrollHeight; + if (nh > totalHeight) totalHeight = nh; + steps++; + } + window.scrollTo(0, document.documentElement.scrollHeight); + return { + recyclingDetected: false, + scrollSteps: steps, + removedButNotRecycled: removedCount + }; + }""", + { + "scrollDelay": scroll_delay, + "maxSteps": max_scroll_steps if max_scroll_steps else 0, + "memCap": 50000, + }, + ) + if result and result.get("recyclingDetected"): + if result.get("capReached"): + self.logger.warning( + message="DOM recycling detected — recovered {removed} elements, " + "{total} total after merge ({steps} scroll steps) " + "[MEMORY CAP REACHED — results may be incomplete]", + tag="PAGE_SCAN", + params={ + "removed": result.get("removedCount", 0), + "total": result.get("totalMerged", 0), + "steps": result.get("scrollSteps", 0), + }, + ) + else: + expanded = result.get("expandedGroups", 0) + extra = f", expanded {expanded} collapsed groups" if expanded else "" + self.logger.success( + message="DOM recycling 
detected — recovered {removed} elements, " + "{total} total after merge ({steps} scroll steps)" + + extra, + tag="PAGE_SCAN", + params={ + "removed": result.get("removedCount", 0), + "total": result.get("totalMerged", 0), + "steps": result.get("scrollSteps", 0), + }, + ) + else: + removed_not_recycled = result.get("removedButNotRecycled", 0) if result else 0 + if removed_not_recycled > 0: + self.logger.warning( + message="Full page scan completed but {count} DOM removals " + "were not confirmed as recycling. Content may be incomplete.", + tag="PAGE_SCAN", + params={"count": removed_not_recycled}, + ) + else: + self.logger.info( + message="Full page scan completed in {steps} scroll steps", + tag="PAGE_SCAN", + params={"steps": result.get("scrollSteps", 0) if result else 0}, + ) + + except (asyncio.CancelledError, KeyboardInterrupt): + raise except Exception as e: - self.logger.warning( - message="Failed to perform full page scan: {error}", + self.logger.error( + message="Full page scan failed: {error}. 
HTML will contain only " + "the first viewport of content.", tag="PAGE_SCAN", params={"error": str(e)}, ) - else: - # await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") - await self.safe_scroll(page, 0, total_height) + finally: + # Always restore the previous timeout, even on error + if prev_timeout is not None: + page.set_default_timeout(prev_timeout) + else: + page.set_default_timeout(30_000) async def _handle_virtual_scroll(self, page: Page, config: "VirtualScrollConfig"): """ @@ -1316,113 +2163,158 @@ async def _handle_virtual_scroll(self, page: Page, config: "VirtualScrollConfig" params={"selector": config.container_selector} ) - # JavaScript function to handle virtual scroll capture virtual_scroll_js = """ async (config) => { const container = document.querySelector(config.container_selector); if (!container) { throw new Error(`Container not found: ${config.container_selector}`); } - - // List to store HTML chunks when content is replaced + const htmlChunks = []; let previousHTML = container.innerHTML; + let previousChildCount = container.children.length; let scrollCount = 0; - + let consecutiveNoChange = 0; + const maxNoChange = config.max_no_change || 5; + const maxCaptured = config.max_captured_elements || 0; + let totalCapturedCount = 0; + let capReached = false; + // Determine scroll amount let scrollAmount; if (typeof config.scroll_by === 'number') { scrollAmount = config.scroll_by; } else if (config.scroll_by === 'page_height') { scrollAmount = window.innerHeight; - } else { // container_height + } else { scrollAmount = container.offsetHeight; } - - // Perform scrolling - while (scrollCount < config.scroll_count) { - // Scroll the container - container.scrollTop += scrollAmount; - - // Wait for content to potentially load + + let useWindowScroll = false; + const prevScrollTop = container.scrollTop; + container.scrollTop += 1; + if (container.scrollTop === prevScrollTop) { + useWindowScroll = true; + + } else { + container.scrollTop = 
prevScrollTop; + } + + function doScroll() { + if (useWindowScroll) { + window.scrollBy(0, scrollAmount); + } else { + container.scrollTop += scrollAmount; + } + } + + function isAtEnd() { + if (useWindowScroll) { + return window.scrollY + window.innerHeight >= document.documentElement.scrollHeight - 10; + } + return container.scrollTop + container.clientHeight >= container.scrollHeight - 10; + } + + function getElementFingerprint(el) { + try { + const attrId = el.getAttribute('data-id') + || el.getAttribute('data-index') + || el.getAttribute('data-key') + || el.getAttribute('data-testid') + || el.id; + if (attrId) return 'attr:' + attrId; + const txt = (el.innerText || '').trim(); + if (txt.length > 10) + return 'text:' + txt.toLowerCase().replace(/[\\s\\W]/g, '').substring(0, 200); + if (el.outerHTML && el.outerHTML.length > 0) + return 'html:' + el.outerHTML.length + ':' + el.outerHTML.substring(0, 120); + return null; + } catch(e) { return null; } + } + + function isContentAppended(prevHTML, currHTML, prevCount) { + try { + const currCount = container.children.length; + if (currCount <= prevCount) return false; + if (prevCount > 0 && currCount > prevCount && container.children[0]) { + const firstChild = container.children[0]; + const prefix = firstChild.outerHTML.substring(0, Math.min(100, firstChild.outerHTML.length)); + return prevHTML.startsWith(prefix); + } + return false; + } catch(e) { return false; } + } + + while (scrollCount < config.scroll_count && !capReached) { + doScroll(); await new Promise(resolve => setTimeout(resolve, config.wait_after_scroll * 1000)); - - // Get current HTML + const currentHTML = container.innerHTML; - - // Determine what changed + if (currentHTML === previousHTML) { - // Case 0: No change - continue scrolling - console.log(`Scroll ${scrollCount + 1}: No change in content`); - } else if (currentHTML.startsWith(previousHTML)) { - // Case 1: New items appended - content already in page - console.log(`Scroll ${scrollCount + 1}: 
New items appended`); + consecutiveNoChange++; + if (maxNoChange > 0 && consecutiveNoChange >= maxNoChange) { + break; + } + } else if (isContentAppended(previousHTML, currentHTML, previousChildCount)) { + consecutiveNoChange = 0; } else { - // Case 2: Items replaced - capture the previous HTML - console.log(`Scroll ${scrollCount + 1}: Content replaced, capturing chunk`); htmlChunks.push(previousHTML); + totalCapturedCount += container.children.length; + consecutiveNoChange = 0; + if (maxCaptured > 0 && totalCapturedCount >= maxCaptured) { + capReached = true; + } } - - // Update previous HTML for next iteration + previousHTML = currentHTML; + previousChildCount = container.children.length; scrollCount++; - - // Check if we've reached the end - if (container.scrollTop + container.clientHeight >= container.scrollHeight - 10) { - console.log(`Reached end of scrollable content at scroll ${scrollCount}`); - // Capture final chunk if content was replaced - if (htmlChunks.length > 0) { - htmlChunks.push(currentHTML); - } - break; - } + + if (isAtEnd()) { break; } } - - // If we have chunks (case 2 occurred), merge them + + if (htmlChunks.length > 0) { + htmlChunks.push(previousHTML); + } + if (htmlChunks.length > 0) { - console.log(`Merging ${htmlChunks.length} HTML chunks`); - - // Parse all chunks to extract unique elements const tempDiv = document.createElement('div'); - const seenTexts = new Set(); + const seenFingerprints = new Set(); const uniqueElements = []; - - // Process each chunk + for (const chunk of htmlChunks) { tempDiv.innerHTML = chunk; - const elements = tempDiv.children; - - for (let i = 0; i < elements.length; i++) { - const element = elements[i]; - // Normalize text for deduplication - const normalizedText = element.innerText - .toLowerCase() - .replace(/[\\s\\W]/g, ''); // Remove spaces and symbols - - if (!seenTexts.has(normalizedText)) { - seenTexts.add(normalizedText); + const elements = Array.from(tempDiv.children); + for (const element of 
elements) { + const fp = getElementFingerprint(element); + if (fp && !seenFingerprints.has(fp)) { + seenFingerprints.add(fp); + uniqueElements.push(element.outerHTML); + } else if (!fp) { uniqueElements.push(element.outerHTML); } } } - - // Replace container content with merged unique elements + container.innerHTML = uniqueElements.join('\\n'); - console.log(`Merged ${uniqueElements.length} unique elements from ${htmlChunks.length} chunks`); - + return { success: true, chunksCount: htmlChunks.length, uniqueCount: uniqueElements.length, - replaced: true + replaced: true, + usedWindowScroll: useWindowScroll, + capReached: capReached }; } else { - console.log('No content replacement detected, all content remains in page'); return { success: true, chunksCount: 0, uniqueCount: 0, - replaced: false + replaced: false, + usedWindowScroll: useWindowScroll, + capReached: false }; } } @@ -1432,12 +2324,18 @@ async def _handle_virtual_scroll(self, page: Page, config: "VirtualScrollConfig" result = await self.adapter.evaluate(page, virtual_scroll_js, config.to_dict()) if result.get("replaced", False): + extra = "" + if result.get("usedWindowScroll"): + extra += " (window scroll fallback)" + if result.get("capReached"): + extra += " [memory cap reached]" self.logger.success( - message="Virtual scroll completed. Merged {unique} unique elements from {chunks} chunks", + message="Virtual scroll completed. 
Merged {unique} unique elements from {chunks} chunks{extra}", tag="VSCROLL", params={ "unique": result.get("uniqueCount", 0), - "chunks": result.get("chunksCount", 0) + "chunks": result.get("chunksCount", 0), + "extra": extra, } ) else: diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index ade19aa11..2ddcc2b1b 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -562,6 +562,15 @@ def remove_empty_elements_fast(self, root, word_count_threshold=5): ): parent = el.getparent() if parent is not None: + # Preserve tail text before removal — tail belongs to + # the parent's content flow, not the removed element + tail = el.tail + if tail: + prev = el.getprevious() + if prev is not None: + prev.tail = (prev.tail or "") + tail + else: + parent.text = (parent.text or "") + tail parent.remove(el) return root diff --git a/test_virtual_scroll_compat.py b/test_virtual_scroll_compat.py new file mode 100644 index 000000000..9412bcdf2 --- /dev/null +++ b/test_virtual_scroll_compat.py @@ -0,0 +1,2117 @@ +""" +Comprehensive virtual scroll compatibility test suite. 
+ +Covers 13 distinct scroll/virtualisation patterns: + + Test 1 — Transform-based virtual scroll (50 items, translateY) + Test 2 — innerHTML-wipe virtual scroll (50 items, PR #1853 exact pattern) + Test 3 — Append-based infinite scroll (100 quotes, no DOM recycling) + Test 4 — Container-level virtual scroll (200 rows, overflow-y: scroll) + Test 5 — Transform-based stress test (1000 items) + Test 6 — Real site: quotes.toscrape.com/scroll + Test 7 — Variable row heights (80 items, non-uniform heights) + Test 8 — Horizontal virtual scroll (60 items, translateX) + Test 9 — 2D grid virtualisation (10x10 = 100 cells) + Test 10 — Multiple virtual containers on same page (40 + 30 items) + Test 11 — Nested virtual scroll (5 categories x 10 items) + Test 12 — Async/setTimeout-loaded items (50 items) + Test 13 — Small virtual section in large static page (60 items) + +Each local test is served from a self-contained HTML file via HTTPServer +on a unique port. All tests use JsonCssExtractionStrategy + scan_full_page=True. +""" + +from __future__ import annotations + +import asyncio +import json +import os +import tempfile +import threading +from http.server import HTTPServer, SimpleHTTPRequestHandler + +from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy + +# --------------------------------------------------------------------------- +# HTML fixtures +# --------------------------------------------------------------------------- + +# Test 1 — Transform-based virtual scroll +# Items are positioned with CSS transform: translateY(Npx). +# Container has an explicit style.height = TOTAL * ITEM_HEIGHT. +# Only ~10 items exist in the DOM at any time. +# On scroll the transform of each live node is updated (moved into / out of +# the visible band) — this mirrors how React/Next.js virtual lists work +# (e.g. skills.sh, Twitter feed, Tanstack Virtual). 
+TRANSFORM_SCROLL_HTML = """ + + + + + + +
+ + + +""" + +# Test 2 — innerHTML-wipe virtual scroll (PR #1853 exact pattern) +# Container.innerHTML = '' then new items are appended on every scroll. +# No transforms, no explicit height on the container itself. +# Body height is set to TOTAL * ITEM_HEIGHT to allow the window to scroll. +INNERHTML_WIPE_HTML = """ + + + + + + +
+ + + +""" + +# Test 3 — Append-based infinite scroll (no DOM recycling) +# Items are only ever appended; nothing is ever removed. +APPEND_SCROLL_HTML = """ + + + + + + +
+ + + +""" + +# Test 4 — Container-level virtual scroll (overflow-y: scroll on a div) +# The container element itself scrolls (not the window). +# Items inside use position: absolute + top offset — recycled on container scroll. +CONTAINER_SCROLL_HTML = """ + + + + + + +

Container Scroll (200 rows)

+
+
+
+ + + +""" + +# Test 8 — Horizontal virtual scroll (60 items, translateX) +# Container scrolls horizontally via overflow-x: scroll. +# Items are positioned with transform: translateX(Npx) and recycled +# from a pool of ~8 DOM nodes — same pattern as vertical virtual scroll +# but on the X axis. +HORIZONTAL_SCROLL_HTML = """ + + + + + + +

Horizontal Virtual Scroll (60 items)

+
+
+
+ + + +""" + +# Test 5 — Transform-based, 1000 items (stress test) +# Identical logic to Test 1 but scaled to 1000 items. +TRANSFORM_SCROLL_1000_HTML = TRANSFORM_SCROLL_HTML.replace( + "var TOTAL = 50;", + "var TOTAL = 1000;", +) + +# Test 7 — Variable Row Heights virtual scroll +# 80 items where each item N has height 40 + (N % 5) * 20 px (40-120px). +# Uses transform-based recycling (translateY) with a pool of ~15 DOM nodes. +# Container style.height is the SUM of all item heights. +# The scroll render uses cumulative-sum lookup to find visible items. +VARIABLE_ROW_HEIGHTS_HTML = """ + + + + + + +
+ + + +""" + +# Test 8 — 2D Grid Virtualisation +# A 10x10 grid (100 cells total) where only ~20 DOM nodes exist at any time. +# Both horizontal AND vertical scrolling is needed to reveal all cells. +# Each cell is position: absolute with left/top computed from col/row. +# On scroll, the pool is recycled for the visible 2D viewport region. +GRID_2D_SCROLL_HTML = """ + + + + + + +

2D Grid Virtual Scroll (10x10 = 100 cells)

+
+
+
+ + + +""" + +# Test 9 — WebSocket/Async-Loaded Items (setTimeout simulating async fetch) +# Items are appended in batches of 10 (50 total) via setTimeout when the +# user scrolls near the bottom. A loading spinner appears during the 300ms +# delay. Items are never recycled — this is an append pattern, but with an +# async gap that can trip up crawlers that check "at bottom" before the new +# content has arrived. +ASYNC_LOADED_HTML = """ + + + + + + +
+
Loading...
+ + + +""" + +# Test 10 — Nested Virtual Scroll (outer vertical + inner horizontal) +# OUTER: 5 categories recycled vertically via window scroll (each 200px tall). +# INNER: Each visible category contains a HORIZONTAL scrollable list of 10 items, +# recycled horizontally via overflow-x: scroll. +# Total: 5 category links + 50 inner item links = 55 unique links. +NESTED_VIRTUAL_SCROLL_HTML = """ + + + + + + +
+ + + +""" + + +# --------------------------------------------------------------------------- +# Server helper +# --------------------------------------------------------------------------- + +def start_server(html_dir: str, port: int) -> HTTPServer: + """Start a simple HTTP server in a daemon thread.""" + + class _Handler(SimpleHTTPRequestHandler): + def __init__(self, *args, **kwargs): + super().__init__(*args, directory=html_dir, **kwargs) + + def log_message(self, fmt, *args): # silence access log + pass + + server = HTTPServer(("127.0.0.1", port), _Handler) + t = threading.Thread(target=server.serve_forever, daemon=True) + t.start() + return server + + +# --------------------------------------------------------------------------- +# Individual test coroutines +# --------------------------------------------------------------------------- + +async def test_transform_virtual_scroll() -> bool: + """ + Test 1: Transform-based virtual scroll — 50 items. + + Items use CSS transform: translateY(Npx). The container has an explicit + style.height. Only ~10 DOM nodes exist at a time; on scroll the pool is + recycled by updating each node's transform. This is how React/Next.js + virtual lists work (skills.sh, Twitter, Tanstack Virtual). + + Fingerprint: each item has a unique link. + Expected: capture all 50 items. 
+ """ + print("=" * 70) + print("TEST 1: Transform-based virtual scroll — 50 items (translateY)") + print("=" * 70) + + with tempfile.TemporaryDirectory() as tmpdir: + with open(os.path.join(tmpdir, "index.html"), "w") as fh: + fh.write(TRANSFORM_SCROLL_HTML) + server = start_server(tmpdir, 9741) + try: + schema = { + "name": "Items", + "baseSelector": ".item", + "fields": [ + {"name": "title", "selector": ".title", "type": "text"}, + {"name": "link", "selector": ".meta a", "type": "attribute", "attribute": "href"}, + ], + } + async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler: + cfg = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + extraction_strategy=JsonCssExtractionStrategy(schema), + scan_full_page=True, + scroll_delay=0.3, + ) + result = await crawler.arun(url="http://127.0.0.1:9741/index.html", config=cfg) + + data = json.loads(result.extracted_content) + # Deduplicate by unique href (/item/1 … /item/50) + unique_links = {d["link"] for d in data if d.get("link")} + expected = {f"/item/{i}" for i in range(1, 51)} + missing = sorted(expected - unique_links, key=lambda s: int(s.split("/")[-1])) + + print(f" Raw extracted : {len(data)}") + print(f" Unique by href: {len(unique_links)}/50") + if missing: + show = missing[:10] + tail = "..." if len(missing) > 10 else "" + print(f" Missing : {show}{tail}") + passed = len(unique_links) >= 45 + print(f" Result : {'PASS' if passed else 'FAIL'}") + return passed + finally: + server.shutdown() + + +async def test_innerhtml_wipe_virtual_scroll() -> bool: + """ + Test 2: innerHTML-wipe virtual scroll — 50 items (PR #1853 exact pattern). + + On every scroll event the container's innerHTML is cleared and freshly + rendered items are appended. No transforms, no explicit container height. + Body height is set to TOTAL * ITEM_HEIGHT so the window can scroll. + + Fingerprint: each item has a unique link. + Expected: capture all 50 items. 
+ """ + print("\n" + "=" * 70) + print("TEST 2: innerHTML-wipe virtual scroll — 50 items (PR #1853 pattern)") + print("=" * 70) + + with tempfile.TemporaryDirectory() as tmpdir: + with open(os.path.join(tmpdir, "index.html"), "w") as fh: + fh.write(INNERHTML_WIPE_HTML) + server = start_server(tmpdir, 9742) + try: + schema = { + "name": "Users", + "baseSelector": "[data-testid='UserCell']", + "fields": [ + {"name": "name", "selector": ".name", "type": "text"}, + {"name": "handle", "selector": ".handle", "type": "text"}, + {"name": "link", "selector": ".handle a", "type": "attribute", "attribute": "href"}, + ], + } + async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler: + cfg = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + extraction_strategy=JsonCssExtractionStrategy(schema), + scan_full_page=True, + scroll_delay=0.2, + ) + result = await crawler.arun(url="http://127.0.0.1:9742/index.html", config=cfg) + + data = json.loads(result.extracted_content) + unique_links = {d["link"] for d in data if d.get("link")} + expected = {f"/profile/{i}" for i in range(1, 51)} + missing = sorted(expected - unique_links, key=lambda s: int(s.split("/")[-1])) + + print(f" Raw extracted : {len(data)}") + print(f" Unique by href: {len(unique_links)}/50") + if missing: + show = missing[:10] + tail = "..." if len(missing) > 10 else "" + print(f" Missing : {show}{tail}") + passed = len(unique_links) >= 45 + print(f" Result : {'PASS' if passed else 'FAIL'}") + return passed + finally: + server.shutdown() + + +async def test_append_infinite_scroll() -> bool: + """ + Test 3: Append-based infinite scroll — 100 quotes. + + Items are only ever appended to the DOM; nothing is ever removed. + This is the classic infinite scroll pattern (no virtualisation at all). + This test must not regress — crawl4ai has always handled this correctly. + + Fingerprint: each quote has a unique link. + Expected: capture all 100 items. 
+ """ + print("\n" + "=" * 70) + print("TEST 3: Append-based infinite scroll — 100 quotes (regression guard)") + print("=" * 70) + + with tempfile.TemporaryDirectory() as tmpdir: + with open(os.path.join(tmpdir, "index.html"), "w") as fh: + fh.write(APPEND_SCROLL_HTML) + server = start_server(tmpdir, 9743) + try: + schema = { + "name": "Quotes", + "baseSelector": ".quote", + "fields": [ + {"name": "text", "selector": ".text", "type": "text"}, + {"name": "author", "selector": ".author", "type": "text"}, + {"name": "link", "selector": ".author a", "type": "attribute", "attribute": "href"}, + ], + } + async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler: + cfg = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + extraction_strategy=JsonCssExtractionStrategy(schema), + scan_full_page=True, + scroll_delay=0.2, + ) + result = await crawler.arun(url="http://127.0.0.1:9743/index.html", config=cfg) + + data = json.loads(result.extracted_content) + unique_links = {d["link"] for d in data if d.get("link")} + expected = {f"/author/{i}" for i in range(1, 101)} + missing = sorted(expected - unique_links, key=lambda s: int(s.split("/")[-1])) + + print(f" Raw extracted : {len(data)}") + print(f" Unique by href: {len(unique_links)}/100") + if missing: + show = missing[:10] + tail = "..." if len(missing) > 10 else "" + print(f" Missing : {show}{tail}") + passed = len(unique_links) >= 90 + print(f" Result : {'PASS' if passed else 'FAIL'}") + return passed + finally: + server.shutdown() + + +async def test_container_scroll() -> bool: + """ + Test 4: Container-level virtual scroll — 200 rows. + + The scrolling happens on a fixed-height div (overflow-y: scroll), not on + the window. Inside the container a tall inner wrapper provides scroll + height; rows use position: absolute + top offset and are recycled on + container scroll events. + + Fingerprint: each row has a unique link. + Expected: capture all 200 rows. 
+ """ + print("\n" + "=" * 70) + print("TEST 4: Container-level virtual scroll — 200 rows (overflow-y: scroll)") + print("=" * 70) + + with tempfile.TemporaryDirectory() as tmpdir: + with open(os.path.join(tmpdir, "index.html"), "w") as fh: + fh.write(CONTAINER_SCROLL_HTML) + server = start_server(tmpdir, 9744) + try: + schema = { + "name": "Rows", + "baseSelector": ".row", + "fields": [ + {"name": "label", "selector": "", "type": "text"}, + {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"}, + ], + } + async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler: + cfg = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + extraction_strategy=JsonCssExtractionStrategy(schema), + scan_full_page=True, + scroll_delay=0.1, + ) + result = await crawler.arun(url="http://127.0.0.1:9744/index.html", config=cfg) + + data = json.loads(result.extracted_content) + unique_links = {d["link"] for d in data if d.get("link")} + expected = {f"/row/{i}" for i in range(1, 201)} + missing = sorted(expected - unique_links, key=lambda s: int(s.split("/")[-1])) + + print(f" Raw extracted : {len(data)}") + print(f" Unique by href: {len(unique_links)}/200") + if missing: + show = missing[:10] + tail = "..." if len(missing) > 10 else "" + print(f" Missing : {show}{tail}") + passed = len(unique_links) >= 180 + print(f" Result : {'PASS' if passed else 'FAIL'}") + return passed + finally: + server.shutdown() + + +async def test_transform_stress_1000() -> bool: + """ + Test 5: Transform-based virtual scroll — 1000 items (stress test). + + Same DOM-recycling / translateY mechanism as Test 1 but scaled to 1000 + items. Validates that the crawler's snapshot-and-deduplicate strategy + holds up under a large item count without running out of memory or + missing large swathes of the list. + + Fingerprint: each item has a unique link. + Expected: capture all 1000 items. 
+ """ + print("\n" + "=" * 70) + print("TEST 5: Transform-based virtual scroll — 1000 items (stress test)") + print("=" * 70) + + with tempfile.TemporaryDirectory() as tmpdir: + with open(os.path.join(tmpdir, "index.html"), "w") as fh: + fh.write(TRANSFORM_SCROLL_1000_HTML) + server = start_server(tmpdir, 9745) + try: + schema = { + "name": "Items", + "baseSelector": ".item", + "fields": [ + {"name": "title", "selector": ".title", "type": "text"}, + {"name": "link", "selector": ".meta a", "type": "attribute", "attribute": "href"}, + ], + } + async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler: + cfg = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + extraction_strategy=JsonCssExtractionStrategy(schema), + scan_full_page=True, + scroll_delay=0.05, + ) + result = await crawler.arun(url="http://127.0.0.1:9745/index.html", config=cfg) + + data = json.loads(result.extracted_content) + unique_links = {d["link"] for d in data if d.get("link")} + expected = {f"/item/{i}" for i in range(1, 1001)} + missing_count = len(expected - unique_links) + + print(f" Raw extracted : {len(data)}") + print(f" Unique by href: {len(unique_links)}/1000") + print(f" Missing : {missing_count}") + passed = len(unique_links) >= 950 + print(f" Result : {'PASS' if passed else 'FAIL'}") + return passed + finally: + server.shutdown() + + +async def test_real_site_quotes() -> bool: + """ + Test 6: Real site — quotes.toscrape.com/scroll. + + Append-based infinite scroll on a live public site. This validates that + real-world behaviour matches what the synthetic Test 3 exercises. + + Expected: capture all 100 quotes (or >=90 to allow for network variance). 
+ """ + print("\n" + "=" * 70) + print("TEST 6: Real site — quotes.toscrape.com/scroll") + print("=" * 70) + + schema = { + "name": "Quotes", + "baseSelector": ".quote", + "fields": [ + {"name": "text", "selector": ".text", "type": "text"}, + {"name": "author", "selector": ".author", "type": "text"}, + ], + } + + async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler: + cfg = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + extraction_strategy=JsonCssExtractionStrategy(schema), + scan_full_page=True, + scroll_delay=0.5, + ) + result = await crawler.arun(url="https://quotes.toscrape.com/scroll", config=cfg) + + data = json.loads(result.extracted_content) + unique = {d["text"]: d for d in data if d.get("text")} + + print(f" Raw extracted : {len(data)}") + print(f" Unique quotes : {len(unique)}/100") + passed = len(unique) >= 90 + print(f" Result : {'PASS' if passed else 'FAIL'}") + return passed + + +async def test_variable_row_heights() -> bool: + """ + Test 7: Variable Row Heights virtual scroll -- 80 items. + + Items use CSS transform: translateY(Npx) but each item has a DIFFERENT + height (40 + (N % 5) * 20 px, ranging from 40px to 120px). Container + style.height is the sum of all item heights. The scroll render uses a + cumulative-sum / binary-search approach to find visible items. Pool of + ~15 DOM nodes recycled on window scroll. + + This tests whether _handle_full_page_scan works when itemHeight is NOT + uniform -- the Phase 4 scroll step calculation must not skip items. + + Fingerprint: each item has a unique link. + Expected: capture >=72 of 80 items (90%). 
+ """ + print("\n" + "=" * 70) + print("TEST 7: Variable Row Heights virtual scroll -- 80 items") + print("=" * 70) + + with tempfile.TemporaryDirectory() as tmpdir: + with open(os.path.join(tmpdir, "index.html"), "w") as fh: + fh.write(VARIABLE_ROW_HEIGHTS_HTML) + server = start_server(tmpdir, 9751) + try: + schema = { + "name": "Items", + "baseSelector": ".item", + "fields": [ + {"name": "title", "selector": ".title", "type": "text"}, + {"name": "link", "selector": ".meta a", "type": "attribute", "attribute": "href"}, + ], + } + async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler: + cfg = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + extraction_strategy=JsonCssExtractionStrategy(schema), + scan_full_page=True, + scroll_delay=0.3, + ) + result = await crawler.arun(url="http://127.0.0.1:9751/index.html", config=cfg) + + data = json.loads(result.extracted_content) + unique_links = {d["link"] for d in data if d.get("link")} + expected = {f"/varh/{i}" for i in range(1, 81)} + missing = sorted(expected - unique_links, key=lambda s: int(s.split("/")[-1])) + + print(f" Raw extracted : {len(data)}") + print(f" Unique by href: {len(unique_links)}/80") + if missing: + show = missing[:10] + tail = "..." if len(missing) > 10 else "" + print(f" Missing : {show}{tail}") + passed = len(unique_links) >= 72 + print(f" Result : {'PASS' if passed else 'FAIL'}") + return passed + finally: + server.shutdown() + + +async def test_horizontal_virtual_scroll() -> bool: + """ + Test 8: Horizontal virtual scroll — 60 items (translateX). + + Container scrolls horizontally (overflow-x: scroll) with items positioned + via transform: translateX(Npx). Pool of ~8 DOM nodes recycled on + horizontal scroll. This tests that _handle_full_page_scan detects and + scrolls horizontal virtual scroll containers, not just vertical ones. + + Fingerprint: each item has a unique link. + Expected: capture >=54 of 60 items (90%). 
+ """ + print("\n" + "=" * 70) + print("TEST 8: Horizontal virtual scroll — 60 items (translateX)") + print("=" * 70) + + with tempfile.TemporaryDirectory() as tmpdir: + with open(os.path.join(tmpdir, "index.html"), "w") as fh: + fh.write(HORIZONTAL_SCROLL_HTML) + server = start_server(tmpdir, 9752) + try: + schema = { + "name": "Cards", + "baseSelector": ".hitem", + "fields": [ + {"name": "title", "selector": ".card-title", "type": "text"}, + {"name": "link", "selector": ".card-link a", "type": "attribute", "attribute": "href"}, + ], + } + async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler: + cfg = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + extraction_strategy=JsonCssExtractionStrategy(schema), + scan_full_page=True, + scroll_delay=0.3, + ) + result = await crawler.arun(url="http://127.0.0.1:9752/index.html", config=cfg) + + data = json.loads(result.extracted_content) + unique_links = {d["link"] for d in data if d.get("link")} + expected = {f"/hscroll/{i}" for i in range(1, 61)} + missing = sorted(expected - unique_links, key=lambda s: int(s.split("/")[-1])) + + print(f" Raw extracted : {len(data)}") + print(f" Unique by href: {len(unique_links)}/60") + if missing: + show = missing[:10] + tail = "..." if len(missing) > 10 else "" + print(f" Missing : {show}{tail}") + passed = len(unique_links) >= 54 + print(f" Result : {'PASS' if passed else 'FAIL'}") + return passed + finally: + server.shutdown() + + +async def test_2d_grid_virtual_scroll() -> bool: + """ + Test 9: 2D Grid Virtualisation — 10x10 = 100 cells. + + A grid container scrolls both horizontally AND vertically. Only ~20 + DOM nodes exist at any time; they are recycled as the user scrolls in + either direction. Each cell uses position: absolute with left/top + calculated from its column/row. 
+ + This tests whether _handle_full_page_scan can handle containers where + scrollWidth > clientWidth AND scrollHeight > clientHeight — it needs + to scroll in a zigzag pattern to visit all 2D regions. + + Fingerprint: each cell has a unique link. + Expected: capture >=90 of 100 cells (90%). + """ + print("\n" + "=" * 70) + print("TEST 9: 2D Grid Virtualisation — 10x10 = 100 cells") + print("=" * 70) + + with tempfile.TemporaryDirectory() as tmpdir: + with open(os.path.join(tmpdir, "index.html"), "w") as fh: + fh.write(GRID_2D_SCROLL_HTML) + server = start_server(tmpdir, 9753) + try: + schema = { + "name": "Cells", + "baseSelector": ".cell", + "fields": [ + {"name": "label", "selector": "strong", "type": "text"}, + {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"}, + ], + } + async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler: + cfg = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + extraction_strategy=JsonCssExtractionStrategy(schema), + scan_full_page=True, + scroll_delay=0.3, + ) + result = await crawler.arun(url="http://127.0.0.1:9753/index.html", config=cfg) + + data = json.loads(result.extracted_content) + unique_links = {d["link"] for d in data if d.get("link")} + expected = {f"/cell/{r}-{c}" for r in range(10) for c in range(10)} + missing = sorted(expected - unique_links) + + print(f" Raw extracted : {len(data)}") + print(f" Unique by href: {len(unique_links)}/100") + if missing: + show = missing[:10] + tail = "..." if len(missing) > 10 else "" + print(f" Missing : {show}{tail}") + passed = len(unique_links) >= 90 + print(f" Result : {'PASS' if passed else 'FAIL'}") + return passed + finally: + server.shutdown() + + +async def test_async_loaded_items() -> bool: + """ + Test 12: WebSocket/Async-Loaded Items — 50 items. + + Simulates async-loaded content (like a chat feed or real-time dashboard). + 50 items total, loaded in batches of 10 via setTimeout with a 300ms delay.
+ On scroll near the bottom, a loading spinner appears, then after the delay + new items are appended. Items are NOT recycled — they accumulate. + + The async delay is the key challenge: the crawler may detect "at bottom" + before the new batch has been appended by setTimeout, causing early exit. + + Fingerprint: each item has a unique link. + Expected: capture >=45 of 50 items (90%). + """ + print("\n" + "=" * 70) + print("TEST 12: WebSocket/Async-Loaded Items — 50 items (setTimeout)") + print("=" * 70) + + with tempfile.TemporaryDirectory() as tmpdir: + with open(os.path.join(tmpdir, "index.html"), "w") as fh: + fh.write(ASYNC_LOADED_HTML) + server = start_server(tmpdir, 9756) + try: + schema = { + "name": "FeedItems", + "baseSelector": ".feed-item", + "fields": [ + {"name": "title", "selector": ".title", "type": "text"}, + {"name": "link", "selector": ".link a", "type": "attribute", "attribute": "href"}, + ], + } + async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler: + cfg = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + extraction_strategy=JsonCssExtractionStrategy(schema), + scan_full_page=True, + scroll_delay=0.5, + ) + result = await crawler.arun(url="http://127.0.0.1:9756/index.html", config=cfg) + + data = json.loads(result.extracted_content) + unique_links = {d["link"] for d in data if d.get("link")} + expected = {f"/async/{i}" for i in range(1, 51)} + missing = sorted(expected - unique_links, key=lambda s: int(s.split("/")[-1])) + + print(f" Raw extracted : {len(data)}") + print(f" Unique by href: {len(unique_links)}/50") + if missing: + show = missing[:10] + tail = "..." if len(missing) > 10 else "" + print(f" Missing : {show}{tail}") + passed = len(unique_links) >= 45 + print(f" Result : {'PASS' if passed else 'FAIL'}") + return passed + finally: + server.shutdown() + + +async def test_nested_virtual_scroll() -> bool: + """ + Test 11: Nested Virtual Scroll — 5 categories x 10 items = 55 links.
+ + OUTER: 5 categories recycled vertically via window scroll (translateY). + Each category div has a link . + INNER: Each visible category contains a HORIZONTAL scrollable list of + 10 items, recycled horizontally via overflow-x: scroll with position + absolute + left offset. Each item has . + + Total: 5 category links + 50 inner item links = 55 unique links. + This tests nested scroll-within-scroll: the outer vertical scroll + recycles categories, while each category's inner horizontal scroll + recycles items. + + Expected: capture >=45 of 55 total unique links. + """ + print("\n" + "=" * 70) + print("TEST 11: Nested Virtual Scroll — 5 cats x 10 items = 55 links") + print("=" * 70) + + with tempfile.TemporaryDirectory() as tmpdir: + with open(os.path.join(tmpdir, "index.html"), "w") as fh: + fh.write(NESTED_VIRTUAL_SCROLL_HTML) + server = start_server(tmpdir, 9755) + try: + async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler: + cfg = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + scan_full_page=True, + scroll_delay=0.3, + ) + result = await crawler.arun(url="http://127.0.0.1:9755/index.html", config=cfg) + + # Extract links from result.links (internal) and also + # scan the raw HTML for /cat/ hrefs as a fallback. 
+ import re + unique_links = set() + # From result.links + if hasattr(result, "links") and result.links: + for link in result.links.get("internal", []): + href = link.get("href", "") + # Normalise: strip origin, keep path + if "/cat/" in href: + path = "/" + href.split("/cat/", 1)[1] + unique_links.add("/cat/" + path.lstrip("/")) + # Also scan raw HTML for any /cat/ hrefs + for m in re.findall(r'href="(/cat/[^"]+)"', result.html or ""): + if m.startswith("/cat/"): + unique_links.add(m) + + # Expected links + cat_links = {f"/cat/{i}" for i in range(1, 6)} + item_links = {f"/cat/{c}/item/{m}" for c in range(1, 6) for m in range(1, 11)} + expected = cat_links | item_links # 55 total + + found_cats = cat_links & unique_links + found_items = item_links & unique_links + missing = sorted(expected - unique_links) + + print(f" Unique links : {len(unique_links & expected)}/55") + print(f" Cat links : {len(found_cats)}/5") + print(f" Item links : {len(found_items)}/50") + if missing: + show = missing[:15] + tail = "..." if len(missing) > 15 else "" + print(f" Missing : {show}{tail}") + passed = len(unique_links & expected) >= 45 + print(f" Result : {'PASS' if passed else 'FAIL'}") + return passed + finally: + server.shutdown() + + +# Test 10 — Multiple virtual containers on same page +# TWO independent pool-based virtual scroll containers side-by-side. +# Container A (#scroller-a): 40 items, height 400px, overflow-y: scroll +# Container B (#scroller-b): 30 items, height 300px, overflow-y: scroll +# Both use position: absolute + pool recycling independently. +MULTIPLE_CONTAINERS_HTML = """ + + + + + + +

Multiple Virtual Containers

+
+
+

List A (40 items)

+

Product catalog items

+
+
+
+
+ +
+

Dashboard

+

This is static content between two virtual scroll lists.

+

Both lists scroll independently with their own recycling pools.

+
+ +
+

List B (30 items)

+

Recent activity feed

+
+
+
+
+
+ + + +""" + + +async def test_multiple_virtual_containers() -> bool: + """ + Test 9: Multiple virtual containers on same page. + + TWO independent pool-based virtual scroll containers side-by-side. + Container A (#scroller-a): 40 items, height 400px, overflow-y: scroll + Container B (#scroller-b): 30 items, height 300px, overflow-y: scroll + Both use position: absolute + pool recycling independently. + + Fingerprint: /list-a/N links in container A, /list-b/N links in container B. + Expected: capture >=36 items from list-a AND >=27 items from list-b (90% each). + """ + print("\n" + "=" * 70) + print("TEST 9: Multiple virtual containers -- 40 + 30 items (side by side)") + print("=" * 70) + + with tempfile.TemporaryDirectory() as tmpdir: + with open(os.path.join(tmpdir, "index.html"), "w") as fh: + fh.write(MULTIPLE_CONTAINERS_HTML) + server = start_server(tmpdir, 9754) + try: + schema = { + "name": "Entries", + "baseSelector": ".entry", + "fields": [ + {"name": "label", "selector": "", "type": "text"}, + {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"}, + ], + } + async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler: + cfg = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + extraction_strategy=JsonCssExtractionStrategy(schema), + scan_full_page=True, + scroll_delay=0.1, + ) + result = await crawler.arun(url="http://127.0.0.1:9754/index.html", config=cfg) + + data = json.loads(result.extracted_content) + + # Separate links by container + links_a = {d["link"] for d in data if d.get("link") and d["link"].startswith("/list-a/")} + links_b = {d["link"] for d in data if d.get("link") and d["link"].startswith("/list-b/")} + expected_a = {f"/list-a/{i}" for i in range(1, 41)} + expected_b = {f"/list-b/{i}" for i in range(1, 31)} + missing_a = sorted(expected_a - links_a, key=lambda s: int(s.split("/")[-1])) + missing_b = sorted(expected_b - links_b, key=lambda s: int(s.split("/")[-1])) + + print(f" Raw extracted : 
{len(data)}") + print(f" List A by href : {len(links_a)}/40") + if missing_a: + show = missing_a[:10] + tail = "..." if len(missing_a) > 10 else "" + print(f" List A missing : {show}{tail}") + print(f" List B by href : {len(links_b)}/30") + if missing_b: + show = missing_b[:10] + tail = "..." if len(missing_b) > 10 else "" + print(f" List B missing : {show}{tail}") + + passed_a = len(links_a) >= 36 + passed_b = len(links_b) >= 27 + passed = passed_a and passed_b + print(f" List A : {'PASS' if passed_a else 'FAIL'} (>=36)") + print(f" List B : {'PASS' if passed_b else 'FAIL'} (>=27)") + print(f" Result : {'PASS' if passed else 'FAIL'}") + return passed + finally: + server.shutdown() + + +# Test — Large page with small virtual scroll section in the middle +# A ~2000px+ page of static content with a 400px overflow-y:scroll container +# embedded in the middle. The container has 60 virtual items (pool of ~12 DOM +# nodes, position: absolute, recycled on container scroll). +SMALL_VIRTUAL_IN_LARGE_PAGE_HTML = """ + + + + + + + +
+ +
+

Welcome to Our Platform

+

This is a large page with a virtual scroll widget embedded in the + middle. The scanner must handle both the static content and the virtual + scroll container to capture everything.

+
+ +
+

This is static content that appears ABOVE the virtual scroll + section. It contains important information that must be captured by the crawler. + Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor + incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis + nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.

+
+ +
+

Data Feed

+
+
+
+
+ + + +
+

Final section of static content at the very bottom of the page. + This must also be captured by the scanner. The page total height exceeds 2000px + with the virtual scroll widget in the middle.

+
+ + + + +""" + + +async def test_small_virtual_in_large_page() -> bool: + """ + Test: Large page with small virtual scroll section in the middle. + + A 2000px+ page of static content (header, hero, text, footer) with a + 400px overflow-y:scroll container embedded in the middle. The container + holds 60 virtual items (pool of ~12 DOM nodes, position: absolute, + recycled on container scroll). + + The scanner must: + 1. Scroll the page to reach the container (it is ~1000px down) + 2. Scroll the container to capture all 60 items + 3. Continue scrolling the page to capture static content below + + Fingerprint: each item has a unique link. + Expected: capture >=54 of 60 items (90%) from the virtual section. + Also verify that static content above and below is in result.html. + """ + print("\n" + "=" * 70) + print("TEST 13: Large page with small virtual scroll in the middle — 60 items") + print("=" * 70) + + with tempfile.TemporaryDirectory() as tmpdir: + with open(os.path.join(tmpdir, "index.html"), "w") as fh: + fh.write(SMALL_VIRTUAL_IN_LARGE_PAGE_HTML) + server = start_server(tmpdir, 9757) + try: + schema = { + "name": "MidItems", + "baseSelector": ".vitem", + "fields": [ + {"name": "label", "selector": "", "type": "text"}, + {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"}, + ], + } + async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler: + cfg = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + extraction_strategy=JsonCssExtractionStrategy(schema), + scan_full_page=True, + scroll_delay=0.2, + ) + result = await crawler.arun(url="http://127.0.0.1:9757/index.html", config=cfg) + + data = json.loads(result.extracted_content) + unique_links = {d["link"] for d in data if d.get("link")} + expected = {f"/mid/{i}" for i in range(1, 61)} + missing = sorted(expected - unique_links, key=lambda s: int(s.split("/")[-1])) + + print(f" Raw extracted : {len(data)}") + print(f" Unique by href: {len(unique_links)}/60") + if missing: + 
show = missing[:10] + tail = "..." if len(missing) > 10 else "" + print(f" Missing : {show}{tail}") + + # Check static content above/below is present in result.html + html = result.html or "" + has_hero = "hero-title" in html or "Welcome to Our Platform" in html + has_above = "above-text" in html or "ABOVE the virtual scroll" in html + has_footer = "footer-text" in html or "BELOW the virtual scroll" in html + has_below = "below-text" in html or "very bottom of the page" in html + has_nav = "nav-home" in html or "/nav/home" in html + + print(f" Static content checks:") + print(f" Navigation : {'OK' if has_nav else 'MISSING'}") + print(f" Hero : {'OK' if has_hero else 'MISSING'}") + print(f" Above text : {'OK' if has_above else 'MISSING'}") + print(f" Footer : {'OK' if has_footer else 'MISSING'}") + print(f" Below text : {'OK' if has_below else 'MISSING'}") + + static_ok = has_hero and has_above and has_footer and has_below and has_nav + items_ok = len(unique_links) >= 54 # 90% of 60 + passed = items_ok and static_ok + + if not items_ok: + print(f" FAIL: Only captured {len(unique_links)}/60 items (need >=54)") + if not static_ok: + print(f" FAIL: Some static content is missing from result.html") + + print(f" Result : {'PASS' if passed else 'FAIL'}") + return passed + finally: + server.shutdown() + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + +async def main() -> None: + results: dict[str, bool] = {} + + results["Test 1 — Transform virtual scroll (50 items)"] = await test_transform_virtual_scroll() + results["Test 2 — innerHTML-wipe virtual scroll (50 items)"]= await test_innerhtml_wipe_virtual_scroll() + results["Test 3 — Append infinite scroll (100 quotes)"] = await test_append_infinite_scroll() + results["Test 4 — Container scroll (200 rows)"] = await test_container_scroll() + results["Test 5 — Transform stress test (1000 items)"] = 
await test_transform_stress_1000() + results["Test 6 — Real site: quotes.toscrape.com"] = await test_real_site_quotes() + results["Test 7 — Variable Row Heights (80 items)"] = await test_variable_row_heights() + results["Test 8 — Horizontal virtual scroll (60 items)"] = await test_horizontal_virtual_scroll() + results["Test 9 — 2D grid virtualisation (100 cells)"] = await test_2d_grid_virtual_scroll() + results["Test 10 — Multiple virtual containers (40+30)"] = await test_multiple_virtual_containers() + results["Test 11 — Nested virtual scroll (55 links)"] = await test_nested_virtual_scroll() + results["Test 12 — Async-loaded items (50 items)"] = await test_async_loaded_items() + results["Test 13 — Small virtual in large page (60 items)"] = await test_small_virtual_in_large_page() + + print("\n" + "=" * 70) + print("SUMMARY") + print("=" * 70) + for name, passed in results.items(): + tag = "PASS" if passed else "FAIL" + print(f" [{tag}] {name}") + print("=" * 70) + + total = sum(results.values()) + print(f"\n {total}/{len(results)} tests passed\n") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/tests/test_virtual_scroll.py b/tests/test_virtual_scroll.py index 1e7a7890e..31e3d88d7 100644 --- a/tests/test_virtual_scroll.py +++ b/tests/test_virtual_scroll.py @@ -1,197 +1,513 @@ """ -Test virtual scroll implementation according to the design: -- Create a page with virtual scroll that replaces content -- Verify all 1000 items are captured +Tests for virtual scroll and progressive full-page scan. 
+ +Covers: +- VirtualScrollConfig: container DOM recycling, dedup, early termination, memory cap +- scan_full_page: window-level DOM recycling (issue #731), lazy-load backward compat +- VirtualScrollConfig window.scrollBy fallback +- Config serialization, from_dict forward-compat, error handling """ import asyncio +import re +import socket +import tempfile +import threading +from functools import partial +import http.server import os -from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, VirtualScrollConfig, CacheMode, BrowserConfig - -async def test_virtual_scroll(): - """Test virtual scroll with content replacement (true virtual scroll)""" - - # Create test HTML with true virtual scroll that replaces content - test_html = ''' - - - - - -

Virtual Scroll Test - 1000 Items

-
- - - - ''' - - # Save test HTML to a file - import tempfile - - with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as f: - f.write(test_html) - test_file_path = f.name - - httpd = None - old_cwd = os.getcwd() - + +import pytest + +from crawl4ai import ( + AsyncWebCrawler, + BrowserConfig, + CacheMode, + CrawlerRunConfig, + VirtualScrollConfig, +) + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +def _find_free_port() -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", 0)) + return s.getsockname()[1] + + +@pytest.fixture(scope="module") +def browser_config(): + return BrowserConfig(headless=True) + + +class _TestServer: + """Lightweight HTTP server. Uses directory= to avoid os.chdir.""" + + def __init__(self, html: str): + self._tmpdir = tempfile.mkdtemp() + self._filepath = os.path.join(self._tmpdir, "page.html") + with open(self._filepath, "w") as f: + f.write(html) + self.port = _find_free_port() + handler = partial(http.server.SimpleHTTPRequestHandler, directory=self._tmpdir) + self._httpd = http.server.HTTPServer(("127.0.0.1", self.port), handler) + self._thread = threading.Thread(target=self._httpd.serve_forever, daemon=True) + self._thread.start() + self.url = f"http://127.0.0.1:{self.port}/page.html" + + def shutdown(self): + self._httpd.shutdown() + os.unlink(self._filepath) + os.rmdir(self._tmpdir) + + +# --------------------------------------------------------------------------- +# HTML templates — container-level virtual scroll (uses .format()) +# --------------------------------------------------------------------------- + +CONTAINER_VSCROLL_HTML = """ + +
+ +""" + +# --------------------------------------------------------------------------- +# HTML templates — static strings (no .format()) +# --------------------------------------------------------------------------- + +NO_ATTR_VSCROLL_HTML = """ + +
+ +""" + +SAME_TEXT_VSCROLL_HTML = """ + +
+ +""" + +STATIC_HTML = """ + +
+
Static item 1
+
Static item 2
+
Static item 3
+
+ +""" + +WINDOW_RECYCLE_HTML = """ + +

Feed

+
+ +""" + +LAZY_LOAD_HTML = """ + +
+ +""" + + +# --------------------------------------------------------------------------- +# VirtualScrollConfig tests +# --------------------------------------------------------------------------- + +@pytest.mark.asyncio +async def test_vscroll_captures_all_items(browser_config): + """100 items, 10 per page, DOM recycling — all must be captured.""" + server = _TestServer(CONTAINER_VSCROLL_HTML.format(total=100, per_page=10)) + try: + await asyncio.sleep(0.3) + config = CrawlerRunConfig( + virtual_scroll_config=VirtualScrollConfig( + container_selector="#container", + scroll_count=15, + scroll_by="container_height", + wait_after_scroll=0.15, + ), + cache_mode=CacheMode.BYPASS, + ) + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url=server.url, config=config) + + indices = set(int(m) for m in re.findall(r'data-index="(\d+)"', result.html)) + assert indices == set(range(100)), f"Missing: {set(range(100)) - indices}" + finally: + server.shutdown() + + +@pytest.mark.asyncio +async def test_vscroll_final_chunk_not_lost(browser_config): + """scroll_count exhausted before bottom — last chunk must still be captured.""" + server = _TestServer(CONTAINER_VSCROLL_HTML.format(total=200, per_page=10)) + try: + await asyncio.sleep(0.3) + config = CrawlerRunConfig( + virtual_scroll_config=VirtualScrollConfig( + container_selector="#container", + scroll_count=5, + scroll_by="container_height", + wait_after_scroll=0.15, + max_no_change=0, + ), + cache_mode=CacheMode.BYPASS, + ) + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url=server.url, config=config) + + indices = sorted(set(int(m) for m in re.findall(r'data-index="(\d+)"', result.html))) + assert len(indices) > 10, f"Only {len(indices)} items — final chunk likely lost" + assert set(range(max(indices) + 1)) == set(indices), "Gaps in captured range" + finally: + server.shutdown() + + +@pytest.mark.asyncio +async def 
test_vscroll_early_termination(browser_config): + """Static content with high scroll_count — must stop early via max_no_change.""" + server = _TestServer(STATIC_HTML) + try: + await asyncio.sleep(0.3) + config = CrawlerRunConfig( + virtual_scroll_config=VirtualScrollConfig( + container_selector="#container", + scroll_count=50, + scroll_by=100, + wait_after_scroll=0.05, + max_no_change=3, + ), + cache_mode=CacheMode.BYPASS, + ) + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url=server.url, config=config) + + indices = set(int(m) for m in re.findall(r'data-index="(\d+)"', result.html)) + assert indices == {0, 1, 2} + finally: + server.shutdown() + + +@pytest.mark.asyncio +async def test_vscroll_text_dedup_no_attributes(browser_config): + """Elements with no data-id/id — text-based dedup must capture unique profiles.""" + server = _TestServer(NO_ATTR_VSCROLL_HTML) + try: + await asyncio.sleep(0.3) + config = CrawlerRunConfig( + virtual_scroll_config=VirtualScrollConfig( + container_selector="#container", + scroll_count=15, + scroll_by="container_height", + wait_after_scroll=0.15, + ), + cache_mode=CacheMode.BYPASS, + ) + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url=server.url, config=config) + + profiles = set(int(p) for p in re.findall(r"Profile (\d+) joined", result.html)) + assert len(profiles) >= 30, f"Only {len(profiles)}/50 profiles captured" + finally: + server.shutdown() + + +@pytest.mark.asyncio +async def test_vscroll_attr_dedup_same_text(browser_config): + """Items with identical text but different data-index — all must survive.""" + server = _TestServer(SAME_TEXT_VSCROLL_HTML) try: - # Start a simple HTTP server - import http.server - import socketserver - import threading - import random - - # Find available port - for _ in range(10): - PORT = random.randint(8000, 9999) - try: - Handler = http.server.SimpleHTTPRequestHandler - 
os.chdir(os.path.dirname(test_file_path)) - httpd = socketserver.TCPServer(("", PORT), Handler) - break - except OSError: - continue - - if httpd is None: - raise RuntimeError("Could not find available port") - - server_thread = threading.Thread(target=httpd.serve_forever) - server_thread.daemon = True - server_thread.start() - - # Give server time to start - await asyncio.sleep(0.5) - - # Configure virtual scroll - # With 10 items per page and 1000 total, we need 100 pages - # Let's do 120 scrolls to ensure we get everything - virtual_config = VirtualScrollConfig( - container_selector="#container", - scroll_count=120, - scroll_by="container_height", # Scroll by container height - wait_after_scroll=0.1 # Quick wait for test + await asyncio.sleep(0.3) + config = CrawlerRunConfig( + virtual_scroll_config=VirtualScrollConfig( + container_selector="#container", + scroll_count=10, + scroll_by="container_height", + wait_after_scroll=0.15, + ), + cache_mode=CacheMode.BYPASS, ) - + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url=server.url, config=config) + + indices = set(int(m) for m in re.findall(r'data-index="(\d+)"', result.html)) + assert len(indices) >= 18, f"Only {len(indices)}/20 survived dedup" + finally: + server.shutdown() + + +@pytest.mark.asyncio +async def test_vscroll_window_fallback(browser_config): + """Container scrollTop has no effect — must fall back to window.scrollBy.""" + server = _TestServer(WINDOW_RECYCLE_HTML) + try: + await asyncio.sleep(0.3) config = CrawlerRunConfig( - virtual_scroll_config=virtual_config, + virtual_scroll_config=VirtualScrollConfig( + container_selector="#feed", + scroll_count=20, + scroll_by="page_height", + wait_after_scroll=0.15, + ), cache_mode=CacheMode.BYPASS, - verbose=True ) - - browserConfig = BrowserConfig( - headless= False + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url=server.url, config=config) + + indices = 
set(int(m) for m in re.findall(r'data-index="(\d+)"', result.html)) + assert len(indices) >= 80, f"Only {len(indices)}/100 — window fallback may have failed" + finally: + server.shutdown() + + +@pytest.mark.asyncio +async def test_vscroll_memory_cap(browser_config): + """max_captured_elements prevents unbounded accumulation.""" + server = _TestServer(CONTAINER_VSCROLL_HTML.format(total=500, per_page=10)) + try: + await asyncio.sleep(0.3) + config = CrawlerRunConfig( + virtual_scroll_config=VirtualScrollConfig( + container_selector="#container", + scroll_count=60, + scroll_by="container_height", + wait_after_scroll=0.1, + max_captured_elements=50, + ), + cache_mode=CacheMode.BYPASS, ) - - async with AsyncWebCrawler(verbose=True, config=browserConfig) as crawler: - result = await crawler.arun( - url=f"http://localhost:{PORT}/{os.path.basename(test_file_path)}", - config=config - ) - - # Count all items in the result - import re - items = re.findall(r'data-index="(\d+)"', result.html) - unique_indices = sorted(set(int(idx) for idx in items)) - - print(f"\n{'='*60}") - print(f"TEST RESULTS:") - print(f"HTML Length: {len(result.html)}") - print(f"Total items found: {len(items)}") - print(f"Unique items: {len(unique_indices)}") - - if unique_indices: - print(f"Item indices: {min(unique_indices)} to {max(unique_indices)}") - print(f"Expected: 0 to 999") - - # Check for gaps - expected = set(range(1000)) - actual = set(unique_indices) - missing = expected - actual - - if missing: - print(f"\n❌ FAILED! Missing {len(missing)} items") - print(f"Missing indices: {sorted(missing)[:10]}{'...' if len(missing) > 10 else ''}") - else: - print(f"\n✅ SUCCESS! All 1000 items captured!") - - # Show some sample items - print(f"\nSample items from result:") - sample_items = re.findall(r'
]*>([^<]+)
', result.html)[:5] - for item in sample_items: - print(f" - {item}") - - print(f"{'='*60}\n") - + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url=server.url, config=config) + + indices = set(int(m) for m in re.findall(r'data-index="(\d+)"', result.html)) + assert 20 <= len(indices) <= 150, f"Cap didn't work: {len(indices)} items" + finally: + server.shutdown() + + +# --------------------------------------------------------------------------- +# scan_full_page tests (issue #731) +# --------------------------------------------------------------------------- + +@pytest.mark.asyncio +async def test_scan_full_page_window_recycling(browser_config): + """Issue #731: scan_full_page=True on window-level DOM recycling page.""" + server = _TestServer(WINDOW_RECYCLE_HTML) + try: + await asyncio.sleep(0.3) + config = CrawlerRunConfig( + scan_full_page=True, + scroll_delay=0.15, + cache_mode=CacheMode.BYPASS, + ) + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url=server.url, config=config) + + indices = set(int(m) for m in re.findall(r'data-index="(\d+)"', result.html)) + assert len(indices) >= 90, f"Only {len(indices)}/100 captured with scan_full_page" + finally: + server.shutdown() + + +@pytest.mark.asyncio +async def test_scan_full_page_lazy_load(browser_config): + """Backward compat: lazy-load page (no recycling) still works.""" + server = _TestServer(LAZY_LOAD_HTML) + try: + await asyncio.sleep(0.3) + config = CrawlerRunConfig( + scan_full_page=True, + scroll_delay=0.2, + cache_mode=CacheMode.BYPASS, + ) + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url=server.url, config=config) + + indices = set(int(m) for m in re.findall(r'data-index="(\d+)"', result.html)) + assert len(indices) >= 40, f"Only {len(indices)}/50 — lazy load regression" + finally: + server.shutdown() + + +# 
--------------------------------------------------------------------------- +# Config unit tests +# --------------------------------------------------------------------------- + +@pytest.mark.asyncio +async def test_config_serialization(): + """VirtualScrollConfig round-trips through to_dict/from_dict.""" + cfg = VirtualScrollConfig( + container_selector="#feed", + scroll_count=20, + scroll_by=300, + wait_after_scroll=0.8, + max_no_change=7, + max_captured_elements=5000, + ) + d = cfg.to_dict() + assert d["max_no_change"] == 7 + assert d["max_captured_elements"] == 5000 + + restored = VirtualScrollConfig.from_dict(d) + assert restored.max_no_change == 7 + assert restored.scroll_by == 300 + + +@pytest.mark.asyncio +async def test_config_from_dict_ignores_unknown_keys(): + """from_dict must not crash on keys from a newer config version.""" + d = { + "container_selector": "#x", + "scroll_count": 5, + "unknown_future_field": 42, + "another_new_thing": True, + } + cfg = VirtualScrollConfig.from_dict(d) + assert cfg.container_selector == "#x" + assert cfg.scroll_count == 5 + assert cfg.max_no_change == 5 # default + + +@pytest.mark.asyncio +async def test_vscroll_container_not_found(browser_config): + """Wrong container selector — crawl must complete without crashing.""" + server = _TestServer(STATIC_HTML) + try: + await asyncio.sleep(0.3) + config = CrawlerRunConfig( + virtual_scroll_config=VirtualScrollConfig( + container_selector="#nonexistent", + scroll_count=3, + scroll_by=100, + wait_after_scroll=0.05, + ), + cache_mode=CacheMode.BYPASS, + ) + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url=server.url, config=config) + + assert result.html is not None, "Crawl should return HTML even if vscroll fails" + assert len(result.html) > 0 finally: - # Clean up - if httpd: - httpd.shutdown() - os.chdir(old_cwd) - os.unlink(test_file_path) - -if __name__ == "__main__": - asyncio.run(test_virtual_scroll()) \ No newline at end of 
file + server.shutdown()