diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index 36b999fd1..a860683ba 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -650,6 +650,37 @@ async def arun(
return CrawlResultContainer(crawl_result)
else:
+ # If an extraction strategy is configured, re-run the
+ # processing pipeline on cached HTML so the strategy is
+ # applied (e.g. LLMExtractionStrategy). Without this,
+ # the cache-hit path returns stale/empty extracted_content.
+ if config.extraction_strategy and html:
+ from urllib.parse import urlparse as _urlparse
+ crawl_result = await self.aprocess_html(
+ url=url, html=html,
+ extracted_content=extracted_content,
+ config=config,
+ screenshot_data=cached_result.screenshot,
+ pdf_data=cached_result.pdf,
+ verbose=config.verbose,
+ is_raw_html=url.startswith("raw:"),
+ redirected_url=cached_result.redirected_url or url,
+ original_scheme=_urlparse(url).scheme,
+ **kwargs,
+ )
+ crawl_result.cache_status = cached_result.cache_status
+ crawl_result.status_code = cached_result.status_code
+ crawl_result.redirected_url = cached_result.redirected_url or url
+ crawl_result.response_headers = cached_result.response_headers
+
+ self.logger.url_status(
+ url=cache_context.display_url,
+ success=crawl_result.success,
+ timing=time.perf_counter() - start_time,
+ tag="COMPLETE",
+ )
+ return CrawlResultContainer(crawl_result)
+
self.logger.url_status(
url=cache_context.display_url,
success=True,
diff --git a/tests/test_issue_1455_cache_extraction.py b/tests/test_issue_1455_cache_extraction.py
new file mode 100644
index 000000000..1fd3ba81b
--- /dev/null
+++ b/tests/test_issue_1455_cache_extraction.py
@@ -0,0 +1,142 @@
+"""
+Reproduction test for issue #1455:
+Extraction strategy is skipped when cache_mode=ENABLED and cache hits.
+
+Uses JsonCssExtractionStrategy (no LLM needed) to verify that extraction
+runs on cached HTML, not just on fresh fetches.
+"""
+
+import asyncio
+import json
+import socket
+import threading
+import time
+import pytest
+from aiohttp import web
+
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+
+
+PRODUCT_PAGE = """
+<!DOCTYPE html>
+<html>
+<head>
+<meta charset="utf-8">
+<title>Product Catalog - Test Store</title>
+</head>
+<body>
+<h1>Product Catalog</h1>
+<p>
+Welcome to our store. Browse our selection of quality products below.
+We offer competitive prices and fast shipping on all orders.
+</p>
+<div class="product" data-testid="product">
+<span class="name">Widget A</span>
+<span class="price">$9.99</span>
+<p>A high-quality widget for everyday use. Built to last with premium materials.</p>
+</div>
+
+<div class="product" data-testid="product">
+<span class="name">Widget B</span>
+<span class="price">$19.99</span>
+<p>Our premium widget with advanced features and extended warranty included.</p>
+</div>
+</body>
+</html>
+"""
+
+SCHEMA = {
+ "name": "Products",
+ "baseSelector": "div.product[data-testid='product']",
+ "fields": [
+ {"name": "name", "selector": "span.name", "type": "text"},
+ {"name": "price", "selector": "span.price", "type": "text"},
+ ],
+}
+
+
+def _find_free_port():
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+ s.bind(("", 0))
+ return s.getsockname()[1]
+
+
+@pytest.fixture(scope="module")
+def test_server():
+ port = _find_free_port()
+
+ async def handle(request):
+ return web.Response(text=PRODUCT_PAGE, content_type="text/html")
+
+ app = web.Application()
+ app.router.add_get("/products", handle)
+
+ ready = threading.Event()
+
+ def run():
+ loop = asyncio.new_event_loop()
+ asyncio.set_event_loop(loop)
+ runner = web.AppRunner(app)
+ loop.run_until_complete(runner.setup())
+ site = web.TCPSite(runner, "localhost", port)
+ loop.run_until_complete(site.start())
+ ready.set()
+ loop.run_forever()
+
+ t = threading.Thread(target=run, daemon=True)
+ t.start()
+ assert ready.wait(timeout=10)
+ time.sleep(0.2)
+ yield f"http://localhost:{port}"
+
+
+@pytest.mark.asyncio
+async def test_extraction_runs_on_cache_hit(test_server):
+ """
+ Bug #1455: extraction strategy must run even when result comes from cache.
+
+ 1. First crawl WITHOUT extraction (populates cache)
+ 2. Second crawl WITH extraction + cache_mode=ENABLED (cache hit)
+ 3. Verify extracted_content is populated (not empty)
+ """
+ url = f"{test_server}/products"
+
+ # Step 1: Warm the cache (no extraction strategy)
+ config_warm = CrawlerRunConfig(
+ cache_mode=CacheMode.ENABLED,
+ )
+ async with AsyncWebCrawler(verbose=False) as crawler:
+ result1 = await crawler.arun(url=url, config=config_warm)
+ assert result1.success
+
+ # Step 2: Crawl again WITH extraction strategy (should hit cache)
+ extraction = JsonCssExtractionStrategy(SCHEMA)
+ config_extract = CrawlerRunConfig(
+ cache_mode=CacheMode.ENABLED,
+ extraction_strategy=extraction,
+ )
+ async with AsyncWebCrawler(verbose=False) as crawler:
+ result2 = await crawler.arun(url=url, config=config_extract)
+
+ assert result2.success
+ data = json.loads(result2.extracted_content)
+ assert len(data) == 2, f"Expected 2 products, got {len(data)}"
+ assert data[0]["name"] == "Widget A"
+ assert data[1]["name"] == "Widget B"
+
+
+@pytest.mark.asyncio
+async def test_cache_without_extraction_still_works(test_server):
+ """Cache hit without extraction strategy should still return normally."""
+ url = f"{test_server}/products"
+
+ config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
+ async with AsyncWebCrawler(verbose=False) as crawler:
+ result = await crawler.arun(url=url, config=config)
+ assert result.success
+ assert "Widget A" in result.html
+
+
+if __name__ == "__main__":
+    raise SystemExit(pytest.main([__file__, "-v"]))