diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 36b999fd1..a860683ba 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -650,6 +650,37 @@ async def arun( return CrawlResultContainer(crawl_result) else: + # If an extraction strategy is configured, re-run the + # processing pipeline on cached HTML so the strategy is + # applied (e.g. LLMExtractionStrategy). Without this, + # the cache-hit path returns stale/empty extracted_content. + if config.extraction_strategy and html: + from urllib.parse import urlparse as _urlparse + crawl_result = await self.aprocess_html( + url=url, html=html, + extracted_content=extracted_content, + config=config, + screenshot_data=cached_result.screenshot, + pdf_data=cached_result.pdf, + verbose=config.verbose, + is_raw_html=url.startswith("raw:"), + redirected_url=cached_result.redirected_url or url, + original_scheme=_urlparse(url).scheme, + **kwargs, + ) + crawl_result.cache_status = cached_result.cache_status + crawl_result.status_code = cached_result.status_code + crawl_result.redirected_url = cached_result.redirected_url or url + crawl_result.response_headers = cached_result.response_headers + + self.logger.url_status( + url=cache_context.display_url, + success=crawl_result.success, + timing=time.perf_counter() - start_time, + tag="COMPLETE", + ) + return CrawlResultContainer(crawl_result) + self.logger.url_status( url=cache_context.display_url, success=True, diff --git a/tests/test_issue_1455_cache_extraction.py b/tests/test_issue_1455_cache_extraction.py new file mode 100644 index 000000000..1fd3ba81b --- /dev/null +++ b/tests/test_issue_1455_cache_extraction.py @@ -0,0 +1,142 @@ +""" +Reproduction test for issue #1455: +Extraction strategy is skipped when cache_mode=ENABLED and cache hits. + +Uses JsonCssExtractionStrategy (no LLM needed) to verify that extraction +runs on cached HTML, not just on fresh fetches. +""" + +import asyncio +import json +import socket +import threading +import time +import pytest +from aiohttp import web + +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy + + +PRODUCT_PAGE = """ + + +Product Catalog - Test Store + + + +

Test Store Product Catalog

+ +
+
+

Welcome to our store. Browse our selection of quality products below. +We offer competitive prices and fast shipping on all orders.

+
+ Widget A + $9.99 +

A high-quality widget for everyday use. Built to last with premium materials.

+
+
+ Widget B + $19.99 +

Our premium widget with advanced features and extended warranty included.

+
+
+ + +""" + +SCHEMA = { + "name": "Products", + "baseSelector": "div.product[data-testid='product']", + "fields": [ + {"name": "name", "selector": "span.name", "type": "text"}, + {"name": "price", "selector": "span.price", "type": "text"}, + ], +} + + +def _find_free_port(): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", 0)) + return s.getsockname()[1] + + +@pytest.fixture(scope="module") +def test_server(): + port = _find_free_port() + + async def handle(request): + return web.Response(text=PRODUCT_PAGE, content_type="text/html") + + app = web.Application() + app.router.add_get("/products", handle) + + ready = threading.Event() + + def run(): + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + runner = web.AppRunner(app) + loop.run_until_complete(runner.setup()) + site = web.TCPSite(runner, "localhost", port) + loop.run_until_complete(site.start()) + ready.set() + loop.run_forever() + + t = threading.Thread(target=run, daemon=True) + t.start() + assert ready.wait(timeout=10) + time.sleep(0.2) + yield f"http://localhost:{port}" + + +@pytest.mark.asyncio +async def test_extraction_runs_on_cache_hit(test_server): + """ + Bug #1455: extraction strategy must run even when result comes from cache. + + 1. First crawl WITHOUT extraction (populates cache) + 2. Second crawl WITH extraction + cache_mode=ENABLED (cache hit) + 3. Verify extracted_content is populated (not empty) + """ + url = f"{test_server}/products" + + # Step 1: Warm the cache (no extraction strategy) + config_warm = CrawlerRunConfig( + cache_mode=CacheMode.ENABLED, + ) + async with AsyncWebCrawler(verbose=False) as crawler: + result1 = await crawler.arun(url=url, config=config_warm) + assert result1.success + + # Step 2: Crawl again WITH extraction strategy (should hit cache) + extraction = JsonCssExtractionStrategy(SCHEMA) + config_extract = CrawlerRunConfig( + cache_mode=CacheMode.ENABLED, + extraction_strategy=extraction, + ) + async with AsyncWebCrawler(verbose=False) as crawler: + result2 = await crawler.arun(url=url, config=config_extract) + + assert result2.success + data = json.loads(result2.extracted_content) + assert len(data) == 2, f"Expected 2 products, got {len(data)}" + assert data[0]["name"] == "Widget A" + assert data[1]["name"] == "Widget B" + + +@pytest.mark.asyncio +async def test_cache_without_extraction_still_works(test_server): + """Cache hit without extraction strategy should still return normally.""" + url = f"{test_server}/products" + + config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED) + async with AsyncWebCrawler(verbose=False) as crawler: + result = await crawler.arun(url=url, config=config) + assert result.success + assert "Widget A" in result.html + + +if __name__ == "__main__": + asyncio.run(test_extraction_runs_on_cache_hit.__wrapped__(None))