From 3011f0695e87ce4b1c41cfd9ef8070a7972b9e94 Mon Sep 17 00:00:00 2001
From: hafezparast
Date: Wed, 25 Mar 2026 07:53:03 +0800
Subject: [PATCH] fix: run extraction strategy on cache hits when configured (#1455)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When cache_mode=ENABLED and a URL was already cached, the cache-hit path
returned the old CrawlResult directly without calling aprocess_html().
This meant extraction strategies (LLM, CSS, etc.) were never applied to
cached content — extracted_content was empty or stale.

Now, when a cache hit occurs and config.extraction_strategy is set, the
processing pipeline runs on the cached HTML so the extraction strategy
is applied. Cache hits without an extraction strategy continue to return
immediately (no behavior change).

Co-Authored-By: Claude Opus 4.6 (1M context)
---
Reviewer notes:
- The HTML tags inside PRODUCT_PAGE were reconstructed from the surviving
  text and from SCHEMA's selectors; please verify the markup against the
  original commit before applying.
- The __main__ guard now delegates to pytest.main() so the test_server
  fixture is set up when the file is run directly.
- Indentation of the async_webcrawler.py hunk assumes the `else:` branch
  sits at 16 spaces inside arun(); confirm against the target file.

 crawl4ai/async_webcrawler.py              |  31 +++++
 tests/test_issue_1455_cache_extraction.py | 142 ++++++++++++++++++++++
 2 files changed, 173 insertions(+)
 create mode 100644 tests/test_issue_1455_cache_extraction.py

diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index 36b999fd1..a860683ba 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -650,6 +650,37 @@ async def arun(
                     return CrawlResultContainer(crawl_result)
 
                 else:
+                    # If an extraction strategy is configured, re-run the
+                    # processing pipeline on cached HTML so the strategy is
+                    # applied (e.g. LLMExtractionStrategy). Without this,
+                    # the cache-hit path returns stale/empty extracted_content.
+                    if config.extraction_strategy and html:
+                        from urllib.parse import urlparse as _urlparse
+                        crawl_result = await self.aprocess_html(
+                            url=url, html=html,
+                            extracted_content=extracted_content,
+                            config=config,
+                            screenshot_data=cached_result.screenshot,
+                            pdf_data=cached_result.pdf,
+                            verbose=config.verbose,
+                            is_raw_html=url.startswith("raw:"),
+                            redirected_url=cached_result.redirected_url or url,
+                            original_scheme=_urlparse(url).scheme,
+                            **kwargs,
+                        )
+                        crawl_result.cache_status = cached_result.cache_status
+                        crawl_result.status_code = cached_result.status_code
+                        crawl_result.redirected_url = cached_result.redirected_url or url
+                        crawl_result.response_headers = cached_result.response_headers
+
+                        self.logger.url_status(
+                            url=cache_context.display_url,
+                            success=crawl_result.success,
+                            timing=time.perf_counter() - start_time,
+                            tag="COMPLETE",
+                        )
+                        return CrawlResultContainer(crawl_result)
+
                     self.logger.url_status(
                         url=cache_context.display_url,
                         success=True,
diff --git a/tests/test_issue_1455_cache_extraction.py b/tests/test_issue_1455_cache_extraction.py
new file mode 100644
index 000000000..1fd3ba81b
--- /dev/null
+++ b/tests/test_issue_1455_cache_extraction.py
@@ -0,0 +1,142 @@
+"""
+Reproduction test for issue #1455:
+Extraction strategy is skipped when cache_mode=ENABLED and cache hits.
+
+Uses JsonCssExtractionStrategy (no LLM needed) to verify that extraction
+runs on cached HTML, not just on fresh fetches.
+"""
+
+import asyncio
+import json
+import socket
+import threading
+import time
+import pytest
+from aiohttp import web
+
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+
+
+PRODUCT_PAGE = """
+<!DOCTYPE html>
+<html>
+<head><title>Product Catalog - Test Store</title></head>
+<body>
+<header>
+<div class="banner">
+<h1>Test Store Product Catalog</h1>
+</div>
+</header>
+<div class="catalog">
+<p>Welcome to our store. Browse our selection of quality products below.
+We offer competitive prices and fast shipping on all orders.</p>
+<div class="product" data-testid="product">
+  <span class="name">Widget A</span>
+  <span class="price">$9.99</span>
+  <p class="description">A high-quality widget for everyday use. Built to last with premium materials.</p>
+</div>
+<div class="product" data-testid="product">
+  <span class="name">Widget B</span>
+  <span class="price">$19.99</span>
+  <p class="description">Our premium widget with advanced features and extended warranty included.</p>
+</div>
+</div>
+</body>
+</html>
+"""
+
+SCHEMA = {
+    "name": "Products",
+    "baseSelector": "div.product[data-testid='product']",
+    "fields": [
+        {"name": "name", "selector": "span.name", "type": "text"},
+        {"name": "price", "selector": "span.price", "type": "text"},
+    ],
+}
+
+
+def _find_free_port():
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(("", 0))
+        return s.getsockname()[1]
+
+
+@pytest.fixture(scope="module")
+def test_server():
+    port = _find_free_port()
+
+    async def handle(request):
+        return web.Response(text=PRODUCT_PAGE, content_type="text/html")
+
+    app = web.Application()
+    app.router.add_get("/products", handle)
+
+    ready = threading.Event()
+
+    def run():
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+        runner = web.AppRunner(app)
+        loop.run_until_complete(runner.setup())
+        site = web.TCPSite(runner, "localhost", port)
+        loop.run_until_complete(site.start())
+        ready.set()
+        loop.run_forever()
+
+    t = threading.Thread(target=run, daemon=True)
+    t.start()
+    assert ready.wait(timeout=10)
+    time.sleep(0.2)
+    yield f"http://localhost:{port}"
+
+
+@pytest.mark.asyncio
+async def test_extraction_runs_on_cache_hit(test_server):
+    """
+    Bug #1455: extraction strategy must run even when result comes from cache.
+
+    1. First crawl WITHOUT extraction (populates cache)
+    2. Second crawl WITH extraction + cache_mode=ENABLED (cache hit)
+    3. Verify extracted_content is populated (not empty)
+    """
+    url = f"{test_server}/products"
+
+    # Step 1: Warm the cache (no extraction strategy)
+    config_warm = CrawlerRunConfig(
+        cache_mode=CacheMode.ENABLED,
+    )
+    async with AsyncWebCrawler(verbose=False) as crawler:
+        result1 = await crawler.arun(url=url, config=config_warm)
+        assert result1.success
+
+    # Step 2: Crawl again WITH extraction strategy (should hit cache)
+    extraction = JsonCssExtractionStrategy(SCHEMA)
+    config_extract = CrawlerRunConfig(
+        cache_mode=CacheMode.ENABLED,
+        extraction_strategy=extraction,
+    )
+    async with AsyncWebCrawler(verbose=False) as crawler:
+        result2 = await crawler.arun(url=url, config=config_extract)
+
+    assert result2.success
+    data = json.loads(result2.extracted_content)
+    assert len(data) == 2, f"Expected 2 products, got {len(data)}"
+    assert data[0]["name"] == "Widget A"
+    assert data[1]["name"] == "Widget B"
+
+
+@pytest.mark.asyncio
+async def test_cache_without_extraction_still_works(test_server):
+    """Cache hit without extraction strategy should still return normally."""
+    url = f"{test_server}/products"
+
+    config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
+    async with AsyncWebCrawler(verbose=False) as crawler:
+        result = await crawler.arun(url=url, config=config)
+        assert result.success
+        assert "Widget A" in result.html
+
+
+if __name__ == "__main__":
+    raise SystemExit(pytest.main([__file__, "-v"]))