1 change: 1 addition & 0 deletions tests/e2e/test_scrapy/__init__.py
@@ -0,0 +1 @@

Empty file.
17 changes: 17 additions & 0 deletions tests/e2e/test_scrapy/actor_source/__main__.py
@@ -0,0 +1,17 @@
from __future__ import annotations

from scrapy.utils.reactor import install_reactor

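# The asyncio reactor has to be installed before anything else from Scrapy or the
# Actor code is imported, hence the deferred imports below (see the noqa markers).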
install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')

import os # noqa: E402, I001

from apify.scrapy import initialize_logging, run_scrapy_actor # noqa: E402

from .main import main # noqa: E402

os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings'

if __name__ == '__main__':
    initialize_logging()
    run_scrapy_actor(main())
10 changes: 10 additions & 0 deletions tests/e2e/test_scrapy/actor_source/items.py
@@ -0,0 +1,10 @@
from __future__ import annotations

from scrapy import Field, Item


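# Item container for the fields scraped from a product detail page.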
class ProductItem(Item):
    name = Field()
    url = Field()
    price = Field()
    description = Field()
16 changes: 16 additions & 0 deletions tests/e2e/test_scrapy/actor_source/main.py
@@ -0,0 +1,16 @@
from __future__ import annotations # noqa: I001

from scrapy.crawler import CrawlerRunner
from scrapy.utils.defer import deferred_to_future

from apify import Actor
from apify.scrapy import apply_apify_settings

from .spiders import Spider # ty: ignore[unresolved-import]


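# apply_apify_settings() layers the Apify-specific components (scheduler, dataset
# push pipeline, etc.) on top of the Scrapy project settings.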
async def main() -> None:
    async with Actor:
        settings = apply_apify_settings()
        runner = CrawlerRunner(settings)
        await deferred_to_future(runner.crawl(Spider, start_urls=['http://localhost:8080/']))
19 changes: 19 additions & 0 deletions tests/e2e/test_scrapy/actor_source/main_custom_pipeline.py
@@ -0,0 +1,19 @@
from __future__ import annotations # noqa: I001

import os

from scrapy.crawler import CrawlerRunner
from scrapy.utils.defer import deferred_to_future

from apify import Actor
from apify.scrapy import apply_apify_settings

from .spiders import Spider # ty: ignore[unresolved-import]


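# Same flow as main.py, but SCRAPY_SETTINGS_MODULE is repointed at the settings
# variant that registers the custom PriceCleanerPipeline.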
async def main() -> None:
    async with Actor:
        os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings_custom_pipeline'
        settings = apply_apify_settings()
        runner = CrawlerRunner(settings)
        await deferred_to_future(runner.crawl(Spider, start_urls=['http://localhost:8080/']))
17 changes: 17 additions & 0 deletions tests/e2e/test_scrapy/actor_source/pipelines.py
@@ -0,0 +1,17 @@
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from scrapy import Item, Spider


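# Normalizes scraped prices by stripping the leading dollar sign, e.g. '$19.99' -> '19.99'.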
class PriceCleanerPipeline:
    def process_item(
        self,
        item: Item,
        _: Spider,
    ) -> Item:
        if 'price' in item and isinstance(item['price'], str):
            item['price'] = item['price'].lstrip('$')
        return item
86 changes: 86 additions & 0 deletions tests/e2e/test_scrapy/actor_source/server.py
@@ -0,0 +1,86 @@
from __future__ import annotations

import asyncio
import logging
from collections.abc import Awaitable, Callable, Coroutine
from typing import Any

from uvicorn import Config
from uvicorn.server import Server

Receive = Callable[[], Awaitable[dict[str, Any]]]
Send = Callable[[dict[str, Any]], Coroutine[None, None, None]]

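# Static catalogue backing the product detail pages below.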
_PRODUCTS = {
    '1': {'name': 'Widget A', 'price': '$19.99', 'description': 'A basic widget for everyday use'},
    '2': {'name': 'Widget B', 'price': '$29.99', 'description': 'An advanced widget with extra features'},
    '3': {'name': 'Widget C', 'price': '$39.99', 'description': 'A premium widget for professionals'},
}


async def _send_html(send: Send, html: str, status: int = 200) -> None:
    await send(
        {
            'type': 'http.response.start',
            'status': status,
            'headers': [[b'content-type', b'text/html; charset=utf-8']],
        }
    )
    await send({'type': 'http.response.body', 'body': html.encode()})


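# Minimal ASGI app emulating a tiny e-commerce site: a home page linking to three
# product detail pages and an /about page; any other path returns 404.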
async def app(scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    assert scope['type'] == 'http'
    path = scope['path']

    if path == '/':
        await _send_html(
            send,
            '<html><head><title>E-commerce Test Store</title></head><body>'
            '<h1>Welcome to Test Store</h1>'
            '<a href="/products/1">Widget A</a>'
            '<a href="/products/2">Widget B</a>'
            '<a href="/products/3">Widget C</a>'
            '<a href="/about">About Us</a>'
            '</body></html>',
        )
    elif path.startswith('/products/'):
        product = _PRODUCTS.get(path.split('/')[-1])
        if product:
            await _send_html(
                send,
                f'<html><head><title>{product["name"]}</title></head><body>'
                f'<h1>{product["name"]}</h1>'
                f'<span class="price">{product["price"]}</span>'
                f'<p class="description">{product["description"]}</p>'
                f'<a href="/">Back to Home</a>'
                f'</body></html>',
            )
        else:
            await _send_html(send, '<html><body>Not Found</body></html>', 404)
    elif path == '/about':
        await _send_html(
            send,
            '<html><head><title>About Us</title></head><body>'
            '<h1>About Test Store</h1>'
            '<p class="description">We sell the best widgets in the world.</p>'
            '<a href="/">Back to Home</a>'
            '</body></html>',
        )
    else:
        await _send_html(send, '<html><body>Not Found</body></html>', 404)


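# Runs the server standalone on port 8080, with logging silenced so it does not
# pollute the test output.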
if __name__ == '__main__':
    asyncio.run(
        Server(
            config=Config(
                app=app,
                lifespan='off',
                loop='asyncio',
                port=8080,
                log_config=None,
                log_level=logging.CRITICAL,
            )
        ).serve()
    )
8 changes: 8 additions & 0 deletions tests/e2e/test_scrapy/actor_source/settings.py
@@ -0,0 +1,8 @@
BOT_NAME = 'testbot'
LOG_LEVEL = 'INFO'
NEWSPIDER_MODULE = 'src.spiders'
ROBOTSTXT_OBEY = False
SPIDER_MODULES = ['src.spiders']
TELNETCONSOLE_ENABLED = False
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
HTTPCACHE_ENABLED = False
5 changes: 5 additions & 0 deletions tests/e2e/test_scrapy/actor_source/settings_custom_pipeline.py
@@ -0,0 +1,5 @@
from src.settings import * # noqa: F403 # ty: ignore[unresolved-import]

ITEM_PIPELINES = {
    'src.pipelines.PriceCleanerPipeline': 100,
}
34 changes: 34 additions & 0 deletions tests/e2e/test_scrapy/actor_source/spider_basic.py
@@ -0,0 +1,34 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any

from scrapy import Request, Spider

if TYPE_CHECKING:
    from collections.abc import Generator

    from scrapy.http.response import Response


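# Baseline spider: follows product links from the start page and yields one plain dict per product.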
class BasicSpider(Spider):
    name = 'basic_spider'

    def __init__(self, start_urls: list[str], *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.start_urls = start_urls

    def start_requests(self) -> Generator[Request, None, None]:
        for url in self.start_urls:
            yield Request(url, callback=self.parse)

    def parse(self, response: Response) -> Generator[dict | Request, None, None]:
        for link in response.css('a[href*="/products/"]::attr(href)').getall():
            yield response.follow(link, callback=self.parse_product)

    def parse_product(self, response: Response) -> Generator[dict, None, None]:
        yield {
            'url': response.url,
            'name': response.css('h1::text').get(''),
            'price': response.css('span.price::text').get(''),
            'description': response.css('p.description::text').get(''),
        }
36 changes: 36 additions & 0 deletions tests/e2e/test_scrapy/actor_source/spider_cb_kwargs.py
@@ -0,0 +1,36 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any

from scrapy import Request, Spider

if TYPE_CHECKING:
    from collections.abc import Generator

    from scrapy.http.response import Response


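# Exercises cb_kwargs: the listing page title is forwarded to parse_product as 'source'.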
class CbKwargsSpider(Spider):
    name = 'cb_kwargs_spider'

    def __init__(self, start_urls: list[str], *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.start_urls = start_urls

    def start_requests(self) -> Generator[Request, None, None]:
        for url in self.start_urls:
            yield Request(url, callback=self.parse)

    def parse(self, response: Response) -> Generator[Request, None, None]:
        source = response.css('title::text').get('')
        for link in response.css('a[href*="/products/"]::attr(href)').getall():
            yield response.follow(link, callback=self.parse_product, cb_kwargs={'source': source})

    def parse_product(self, response: Response, source: str) -> Generator[dict, None, None]:
        yield {
            'url': response.url,
            'name': response.css('h1::text').get(''),
            'price': response.css('span.price::text').get(''),
            'description': response.css('p.description::text').get(''),
            'source': source,
        }
29 changes: 29 additions & 0 deletions tests/e2e/test_scrapy/actor_source/spider_crawl.py
@@ -0,0 +1,29 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

if TYPE_CHECKING:
    from collections.abc import Generator

    from scrapy.http.response import Response


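# CrawlSpider variant: product pages are discovered via a LinkExtractor rule
# instead of a manual parse callback.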
class CrawlProductSpider(CrawlSpider):
    name = 'crawl_product_spider'

    rules = (Rule(LinkExtractor(allow=r'/products/'), callback='parse_product'),)

    def __init__(self, start_urls: list[str], *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.start_urls = start_urls

    def parse_product(self, response: Response) -> Generator[dict, None, None]:
        yield {
            'url': response.url,
            'name': response.css('h1::text').get(''),
            'price': response.css('span.price::text').get(''),
            'description': response.css('p.description::text').get(''),
        }
34 changes: 34 additions & 0 deletions tests/e2e/test_scrapy/actor_source/spider_custom_pipeline.py
@@ -0,0 +1,34 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any

from scrapy import Request, Spider

if TYPE_CHECKING:
    from collections.abc import Generator

    from scrapy.http.response import Response


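# Same crawl as the basic spider; paired with settings_custom_pipeline.py to
# exercise a user-defined item pipeline.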
class CustomPipelineSpider(Spider):
    name = 'custom_pipeline_spider'

    def __init__(self, start_urls: list[str], *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.start_urls = start_urls

    def start_requests(self) -> Generator[Request, None, None]:
        for url in self.start_urls:
            yield Request(url, callback=self.parse)

    def parse(self, response: Response) -> Generator[Request, None, None]:
        for link in response.css('a[href*="/products/"]::attr(href)').getall():
            yield response.follow(link, callback=self.parse_product)

    def parse_product(self, response: Response) -> Generator[dict, None, None]:
        yield {
            'url': response.url,
            'name': response.css('h1::text').get(''),
            'price': response.css('span.price::text').get(''),
            'description': response.css('p.description::text').get(''),
        }
46 changes: 46 additions & 0 deletions tests/e2e/test_scrapy/actor_source/spider_itemloader.py
@@ -0,0 +1,46 @@
from __future__ import annotations # noqa: I001

from typing import TYPE_CHECKING, Any

from itemloaders.processors import MapCompose, TakeFirst
from scrapy import Request, Spider
from scrapy.loader import ItemLoader

from src.items import ProductItem # ty: ignore[unresolved-import]

if TYPE_CHECKING:
    from collections.abc import Generator

    from scrapy.http.response import Response


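# Loader with whitespace-stripping input processors and a TakeFirst output
# processor, so every field is emitted as a single cleaned string.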
class ProductItemLoader(ItemLoader):
    default_item_class = ProductItem
    default_output_processor = TakeFirst()
    name_in = MapCompose(str.strip)
    price_in = MapCompose(str.strip)
    description_in = MapCompose(str.strip)


class ItemLoaderSpider(Spider):
    name = 'itemloader_spider'

    def __init__(self, start_urls: list[str], *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.start_urls = start_urls

    def start_requests(self) -> Generator[Request, None, None]:
        for url in self.start_urls:
            yield Request(url, callback=self.parse)

    def parse(self, response: Response) -> Generator[Request, None, None]:
        for link in response.css('a[href*="/products/"]::attr(href)').getall():
            yield response.follow(link, callback=self.parse_product)

    def parse_product(self, response: Response) -> Generator[ProductItem, None, None]:
        loader = ProductItemLoader(response=response)  # ty: ignore[invalid-argument-type]
        loader.add_value('url', response.url)
        loader.add_css('name', 'h1::text')
        loader.add_css('price', 'span.price::text')
        loader.add_css('description', 'p.description::text')
        yield loader.load_item()