1 change: 1 addition & 0 deletions tests/e2e/test_scrapy/__init__.py
@@ -0,0 +1 @@

Empty file.
17 changes: 17 additions & 0 deletions tests/e2e/test_scrapy/actor_source/__main__.py
@@ -0,0 +1,17 @@
from __future__ import annotations

from scrapy.utils.reactor import install_reactor

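# The asyncio reactor has to be installed before anything else from Scrapy or the
# Actor code is imported, hence the deferred imports below (see the noqa markers).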
install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')

import os # noqa: E402, I001

from apify.scrapy import initialize_logging, run_scrapy_actor # noqa: E402

from .main import main # noqa: E402

os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings'

if __name__ == '__main__':
    initialize_logging()
    run_scrapy_actor(main())
10 changes: 10 additions & 0 deletions tests/e2e/test_scrapy/actor_source/items.py
@@ -0,0 +1,10 @@
from __future__ import annotations

from scrapy import Field, Item


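# Item container for the fields scraped from a product detail page.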
class ProductItem(Item):
    name = Field()
    url = Field()
    price = Field()
    description = Field()
16 changes: 16 additions & 0 deletions tests/e2e/test_scrapy/actor_source/main.py
@@ -0,0 +1,16 @@
from __future__ import annotations # noqa: I001

from scrapy.crawler import CrawlerRunner
from scrapy.utils.defer import deferred_to_future

from apify import Actor
from apify.scrapy import apply_apify_settings

from .spiders import Spider # ty: ignore[unresolved-import]


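# apply_apify_settings() layers the Apify-specific components (scheduler, dataset
# push pipeline, etc.) on top of the Scrapy project settings.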
async def main() -> None:
    async with Actor:
        settings = apply_apify_settings()
        runner = CrawlerRunner(settings)
        await deferred_to_future(runner.crawl(Spider, start_urls=['http://localhost:8080/']))
19 changes: 19 additions & 0 deletions tests/e2e/test_scrapy/actor_source/main_custom_pipeline.py
@@ -0,0 +1,19 @@
from __future__ import annotations # noqa: I001

import os

from scrapy.crawler import CrawlerRunner
from scrapy.utils.defer import deferred_to_future

from apify import Actor
from apify.scrapy import apply_apify_settings

from .spiders import Spider # ty: ignore[unresolved-import]


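# Same flow as main.py, but SCRAPY_SETTINGS_MODULE is repointed at the settings
# variant that registers the custom PriceCleanerPipeline.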
async def main() -> None:
    async with Actor:
        os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings_custom_pipeline'
        settings = apply_apify_settings()
        runner = CrawlerRunner(settings)
        await deferred_to_future(runner.crawl(Spider, start_urls=['http://localhost:8080/']))
17 changes: 17 additions & 0 deletions tests/e2e/test_scrapy/actor_source/pipelines.py
@@ -0,0 +1,17 @@
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from scrapy import Item, Spider


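# Normalizes scraped prices by stripping the leading dollar sign, e.g. '$19.99' -> '19.99'.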
class PriceCleanerPipeline:
    def process_item(
        self,
        item: Item,
        _: Spider,
    ) -> Item:
        if 'price' in item and isinstance(item['price'], str):
            item['price'] = item['price'].lstrip('$')
        return item
86 changes: 86 additions & 0 deletions tests/e2e/test_scrapy/actor_source/server.py
@@ -0,0 +1,86 @@
from __future__ import annotations

import asyncio
import logging
from collections.abc import Awaitable, Callable, Coroutine
from typing import Any

from uvicorn import Config
from uvicorn.server import Server

Receive = Callable[[], Awaitable[dict[str, Any]]]
Send = Callable[[dict[str, Any]], Coroutine[None, None, None]]

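# Static catalogue backing the product detail pages below.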
_PRODUCTS = {
    '1': {'name': 'Widget A', 'price': '$19.99', 'description': 'A basic widget for everyday use'},
    '2': {'name': 'Widget B', 'price': '$29.99', 'description': 'An advanced widget with extra features'},
    '3': {'name': 'Widget C', 'price': '$39.99', 'description': 'A premium widget for professionals'},
}


async def _send_html(send: Send, html: str, status: int = 200) -> None:
    await send(
        {
            'type': 'http.response.start',
            'status': status,
            'headers': [[b'content-type', b'text/html; charset=utf-8']],
        }
    )
    await send({'type': 'http.response.body', 'body': html.encode()})


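# Minimal ASGI app emulating a tiny e-commerce site: a home page linking to three
# product detail pages and an /about page; any other path returns 404.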
async def app(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    assert scope['type'] == 'http'
    path = scope['path']

    if path == '/':
        await _send_html(
            send,
            '<html><head><title>E-commerce Test Store</title></head><body>'
            '<h1>Welcome to Test Store</h1>'
            '<a href="/products/1">Widget A</a>'
            '<a href="/products/2">Widget B</a>'
            '<a href="/products/3">Widget C</a>'
            '<a href="/about">About Us</a>'
            '</body></html>',
        )
    elif path.startswith('/products/'):
        product = _PRODUCTS.get(path.split('/')[-1])
        if product:
            await _send_html(
                send,
                f'<html><head><title>{product["name"]}</title></head><body>'
                f'<h1>{product["name"]}</h1>'
                f'<span class="price">{product["price"]}</span>'
                f'<p class="description">{product["description"]}</p>'
                f'<a href="/">Back to Home</a>'
                f'</body></html>',
            )
        else:
            await _send_html(send, '<html><body>Not Found</body></html>', 404)
    elif path == '/about':
        await _send_html(
            send,
            '<html><head><title>About Us</title></head><body>'
            '<h1>About Test Store</h1>'
            '<p class="description">We sell the best widgets in the world.</p>'
            '<a href="/">Back to Home</a>'
            '</body></html>',
        )
    else:
        await _send_html(send, '<html><body>Not Found</body></html>', 404)


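# Runs the server standalone on port 8080, with logging silenced so it does not
# pollute the test output.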
if __name__ == '__main__':
    asyncio.run(
        Server(
            config=Config(
                app=app,
                lifespan='off',
                loop='asyncio',
                port=8080,
                log_config=None,
                log_level=logging.CRITICAL,
            )
        ).serve()
    )
8 changes: 8 additions & 0 deletions tests/e2e/test_scrapy/actor_source/settings.py
@@ -0,0 +1,8 @@
BOT_NAME = 'testbot'
LOG_LEVEL = 'INFO'
NEWSPIDER_MODULE = 'src.spiders'
ROBOTSTXT_OBEY = False
SPIDER_MODULES = ['src.spiders']
TELNETCONSOLE_ENABLED = False
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
HTTPCACHE_ENABLED = False
5 changes: 5 additions & 0 deletions tests/e2e/test_scrapy/actor_source/settings_custom_pipeline.py
@@ -0,0 +1,5 @@
from src.settings import * # noqa: F403 # ty: ignore[unresolved-import]

ITEM_PIPELINES = {
    'src.pipelines.PriceCleanerPipeline': 100,
}
34 changes: 34 additions & 0 deletions tests/e2e/test_scrapy/actor_source/spider_basic.py
@@ -0,0 +1,34 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any

from scrapy import Request, Spider

if TYPE_CHECKING:
    from collections.abc import Generator

    from scrapy.http.response import Response


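# Baseline spider: follows product links from the start page and yields one plain dict per product.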
class BasicSpider(Spider):
    name = 'basic_spider'

    def __init__(self, start_urls: list[str], *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.start_urls = start_urls

    def start_requests(self) -> Generator[Request, None, None]:
        for url in self.start_urls:
            yield Request(url, callback=self.parse)

    def parse(self, response: Response) -> Generator[dict | Request, None, None]:
        for link in response.css('a[href*="/products/"]::attr(href)').getall():
            yield response.follow(link, callback=self.parse_product)

    def parse_product(self, response: Response) -> Generator[dict, None, None]:
        yield {
            'url': response.url,
            'name': response.css('h1::text').get(''),
            'price': response.css('span.price::text').get(''),
            'description': response.css('p.description::text').get(''),
        }
36 changes: 36 additions & 0 deletions tests/e2e/test_scrapy/actor_source/spider_cb_kwargs.py
@@ -0,0 +1,36 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any

from scrapy import Request, Spider

if TYPE_CHECKING:
    from collections.abc import Generator

    from scrapy.http.response import Response


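# Exercises cb_kwargs: the listing page title is forwarded to parse_product as 'source'.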
class CbKwargsSpider(Spider):
    name = 'cb_kwargs_spider'

    def __init__(self, start_urls: list[str], *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.start_urls = start_urls

    def start_requests(self) -> Generator[Request, None, None]:
        for url in self.start_urls:
            yield Request(url, callback=self.parse)

    def parse(self, response: Response) -> Generator[Request, None, None]:
        source = response.css('title::text').get('')
        for link in response.css('a[href*="/products/"]::attr(href)').getall():
            yield response.follow(link, callback=self.parse_product, cb_kwargs={'source': source})

    def parse_product(self, response: Response, source: str) -> Generator[dict, None, None]:
        yield {
            'url': response.url,
            'name': response.css('h1::text').get(''),
            'price': response.css('span.price::text').get(''),
            'description': response.css('p.description::text').get(''),
            'source': source,
        }
29 changes: 29 additions & 0 deletions tests/e2e/test_scrapy/actor_source/spider_crawl.py
@@ -0,0 +1,29 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

if TYPE_CHECKING:
    from collections.abc import Generator

    from scrapy.http.response import Response


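# CrawlSpider variant: product pages are discovered via a LinkExtractor rule
# instead of a manual parse callback.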
class CrawlProductSpider(CrawlSpider):
    name = 'crawl_product_spider'

    rules = (Rule(LinkExtractor(allow=r'/products/'), callback='parse_product'),)

    def __init__(self, start_urls: list[str], *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.start_urls = start_urls

    def parse_product(self, response: Response) -> Generator[dict, None, None]:
        yield {
            'url': response.url,
            'name': response.css('h1::text').get(''),
            'price': response.css('span.price::text').get(''),
            'description': response.css('p.description::text').get(''),
        }
34 changes: 34 additions & 0 deletions tests/e2e/test_scrapy/actor_source/spider_custom_pipeline.py
@@ -0,0 +1,34 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any

from scrapy import Request, Spider

if TYPE_CHECKING:
    from collections.abc import Generator

    from scrapy.http.response import Response


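# Same crawl as the basic spider; paired with settings_custom_pipeline.py to
# exercise a user-defined item pipeline.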
class CustomPipelineSpider(Spider):
    name = 'custom_pipeline_spider'

    def __init__(self, start_urls: list[str], *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.start_urls = start_urls

    def start_requests(self) -> Generator[Request, None, None]:
        for url in self.start_urls:
            yield Request(url, callback=self.parse)

    def parse(self, response: Response) -> Generator[Request, None, None]:
        for link in response.css('a[href*="/products/"]::attr(href)').getall():
            yield response.follow(link, callback=self.parse_product)

    def parse_product(self, response: Response) -> Generator[dict, None, None]:
        yield {
            'url': response.url,
            'name': response.css('h1::text').get(''),
            'price': response.css('span.price::text').get(''),
            'description': response.css('p.description::text').get(''),
        }
46 changes: 46 additions & 0 deletions tests/e2e/test_scrapy/actor_source/spider_itemloader.py
@@ -0,0 +1,46 @@
from __future__ import annotations # noqa: I001

from typing import TYPE_CHECKING, Any

from itemloaders.processors import MapCompose, TakeFirst
from scrapy import Request, Spider
from scrapy.loader import ItemLoader

from src.items import ProductItem # ty: ignore[unresolved-import]

if TYPE_CHECKING:
    from collections.abc import Generator

    from scrapy.http.response import Response


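# Loader with whitespace-stripping input processors and a TakeFirst output
# processor, so every field is emitted as a single cleaned string.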
class ProductItemLoader(ItemLoader):
    default_item_class = ProductItem
    default_output_processor = TakeFirst()
    name_in = MapCompose(str.strip)
    price_in = MapCompose(str.strip)
    description_in = MapCompose(str.strip)


class ItemLoaderSpider(Spider):
    name = 'itemloader_spider'

    def __init__(self, start_urls: list[str], *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.start_urls = start_urls

    def start_requests(self) -> Generator[Request, None, None]:
        for url in self.start_urls:
            yield Request(url, callback=self.parse)

    def parse(self, response: Response) -> Generator[Request, None, None]:
        for link in response.css('a[href*="/products/"]::attr(href)').getall():
            yield response.follow(link, callback=self.parse_product)

    def parse_product(self, response: Response) -> Generator[ProductItem, None, None]:
        loader = ProductItemLoader(response=response)  # ty: ignore[invalid-argument-type]
        loader.add_value('url', response.url)
        loader.add_css('name', 'h1::text')
        loader.add_css('price', 'span.price::text')
        loader.add_css('description', 'p.description::text')
        yield loader.load_item()