From 0479f04ad25833cdd91d15dd4a87b04edccf3398 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 12 Feb 2026 19:00:57 +0100 Subject: [PATCH 1/9] test: add e2e tests for Crawlee crawlers running as Apify Actors Add 6 e2e tests (one per crawler type) verifying that each Crawlee crawler works correctly when deployed as an Actor on the Apify platform. Each test exercises link discovery, data extraction (push_data), and KVS storage against a local 5-page e-commerce test server. Crawlers covered: BasicCrawler, HttpCrawler, BeautifulSoupCrawler, ParselCrawler, PlaywrightCrawler, AdaptivePlaywrightCrawler. Co-Authored-By: Claude Opus 4.6 --- tests/e2e/test_crawlee_crawlers/__init__.py | 0 tests/e2e/test_crawlee_crawlers/conftest.py | 144 ++++++++++++++++++ .../test_adaptive_playwright_crawler.py | 60 ++++++++ .../test_basic_crawler.py | 90 +++++++++++ .../test_beautifulsoup_crawler.py | 56 +++++++ .../test_http_crawler.py | 61 ++++++++ .../test_parsel_crawler.py | 55 +++++++ .../test_playwright_crawler.py | 60 ++++++++ 8 files changed, 526 insertions(+) create mode 100644 tests/e2e/test_crawlee_crawlers/__init__.py create mode 100644 tests/e2e/test_crawlee_crawlers/conftest.py create mode 100644 tests/e2e/test_crawlee_crawlers/test_adaptive_playwright_crawler.py create mode 100644 tests/e2e/test_crawlee_crawlers/test_basic_crawler.py create mode 100644 tests/e2e/test_crawlee_crawlers/test_beautifulsoup_crawler.py create mode 100644 tests/e2e/test_crawlee_crawlers/test_http_crawler.py create mode 100644 tests/e2e/test_crawlee_crawlers/test_parsel_crawler.py create mode 100644 tests/e2e/test_crawlee_crawlers/test_playwright_crawler.py diff --git a/tests/e2e/test_crawlee_crawlers/__init__.py b/tests/e2e/test_crawlee_crawlers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/e2e/test_crawlee_crawlers/conftest.py b/tests/e2e/test_crawlee_crawlers/conftest.py new file mode 100644 index 00000000..402bb77c --- /dev/null +++ b/tests/e2e/test_crawlee_crawlers/conftest.py @@ -0,0 +1,144 @@ +from __future__ import annotations + +import os +import sys +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from apify_client.clients.resource_clients import ActorClientAsync + + from apify._models import ActorRun + +_PYTHON_VERSION = os.getenv('INTEGRATION_TESTS_PYTHON_VERSION') or '.'.join(str(x) for x in sys.version_info[:2]) + +_TEST_SERVER_PY = """\ +import asyncio +import logging +from collections.abc import Awaitable, Callable, Coroutine +from typing import Any + +from uvicorn import Config +from uvicorn.server import Server + +Receive = Callable[[], Awaitable[dict[str, Any]]] +Send = Callable[[dict[str, Any]], Coroutine[None, None, None]] + +_PRODUCTS = { + '1': {'name': 'Widget A', 'price': '$19.99', 'description': 'A basic widget for everyday use'}, + '2': {'name': 'Widget B', 'price': '$29.99', 'description': 'An advanced widget with extra features'}, + '3': {'name': 'Widget C', 'price': '$39.99', 'description': 'A premium widget for professionals'}, +} + + +async def _send_html(send: Send, html: str, status: int = 200) -> None: + await send({ + 'type': 'http.response.start', + 'status': status, + 'headers': [[b'content-type', b'text/html; charset=utf-8']], + }) + await send({'type': 'http.response.body', 'body': html.encode()}) + + +async def app(scope: dict[str, Any], _receive: Receive, send: Send) -> None: + assert scope['type'] == 'http' + path = scope['path'] + + if path == '/': + await _send_html( + send, + 'E-commerce Test Store' + '

<h1>Welcome to Test Store</h1>' + '<a href="/products/1">Widget A</a>' + '<a href="/products/2">Widget B</a>' + '<a href="/products/3">Widget C</a>' + '<a href="/about">About Us</a>' + '</body></html>', + ) + elif path.startswith('/products/'): + product = _PRODUCTS.get(path.split('/')[-1]) + if product: + await _send_html( + send, + f'<html><head><title>{product["name"]}</title></head><body>' + f'<h1>{product["name"]}</h1>' + f'<span class="price">{product["price"]}</span>' + f'<p class="description">{product["description"]}</p>' + f'<a href="/">Back to Home</a>' + f'</body></html>', + ) + else: + await _send_html(send, 'Not Found', 404) + elif path == '/about': + await _send_html( + send, + '<html><head><title>About Us</title></head><body>' + '<h1>About Test Store</h1>' + '<p>We sell the best widgets in the world.</p>

' + 'Back to Home' + '', + ) + else: + await _send_html(send, 'Not Found', 404) + + +if __name__ == '__main__': + asyncio.run( + Server( + config=Config( + app=app, + lifespan='off', + loop='asyncio', + port=8080, + log_config=None, + log_level=logging.CRITICAL, + ) + ).serve() + ) +""" + +_PLAYWRIGHT_DOCKERFILE = f"""\ +FROM apify/actor-python-playwright:{_PYTHON_VERSION} + +COPY . ./ + +RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* + +RUN pip install --force-reinstall -r requirements.txt + +CMD ["sh", "-c", "python server.py & python -m src"] +""" + +_EXPECTED_PRODUCTS = { + 'Widget A': {'price': '$19.99', 'description': 'A basic widget for everyday use'}, + 'Widget B': {'price': '$29.99', 'description': 'An advanced widget with extra features'}, + 'Widget C': {'price': '$39.99', 'description': 'A premium widget for professionals'}, +} + + +async def _verify_crawler_results( + actor: ActorClientAsync, + run_result: ActorRun, + expected_crawler_type: str, +) -> None: + """Verify dataset items and KVS record after a crawler Actor run.""" + assert run_result.status == 'SUCCEEDED' + + # Verify dataset items. + items = await actor.last_run().dataset().list_items() + assert items.count == 3 + + items_by_name = {item['name']: item for item in items.items} + + for name, expected in _EXPECTED_PRODUCTS.items(): + assert name in items_by_name, f'Missing product: {name}' + item = items_by_name[name] + assert 'url' in item + assert item['price'] == expected['price'] + assert item['description'] == expected['description'] + + # Verify KVS record. + kvs_record = await actor.last_run().key_value_store().get_record('CRAWLER_RESULT') + assert kvs_record is not None + result = kvs_record['value'] + assert result['crawler_type'] == expected_crawler_type + assert result['pages_visited_count'] >= 5 diff --git a/tests/e2e/test_crawlee_crawlers/test_adaptive_playwright_crawler.py b/tests/e2e/test_crawlee_crawlers/test_adaptive_playwright_crawler.py new file mode 100644 index 00000000..6733a6da --- /dev/null +++ b/tests/e2e/test_crawlee_crawlers/test_adaptive_playwright_crawler.py @@ -0,0 +1,60 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from .conftest import _PLAYWRIGHT_DOCKERFILE, _TEST_SERVER_PY, _verify_crawler_results + +if TYPE_CHECKING: + from ..conftest import MakeActorFunction, RunActorFunction + +_ADAPTIVE_PLAYWRIGHT_CRAWLER_MAIN_PY = """\ +from __future__ import annotations + +from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext + +from apify import Actor + + +async def main() -> None: + async with Actor: + pages_visited: list[str] = [] + crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser() + + @crawler.router.default_handler + async def handler(context: AdaptivePlaywrightCrawlingContext) -> None: + pages_visited.append(context.request.url) + await context.enqueue_links() + + if '/products/' in context.request.url: + name = context.parsed_content.css('h1::text').get('').strip() + price = context.parsed_content.css('span.price::text').get('').strip() + description = context.parsed_content.css('p.description::text').get('').strip() + if name: + await context.push_data({ + 'url': context.request.url, + 'name': name, + 'price': price, + 'description': description, + }) + + await crawler.run(['http://localhost:8080/']) + + await Actor.set_value('CRAWLER_RESULT', { + 'pages_visited_count': len(pages_visited), + 'crawler_type': 'AdaptivePlaywrightCrawler', + }) +""" + + +async def 
test_adaptive_playwright_crawler(make_actor: MakeActorFunction, run_actor: RunActorFunction) -> None: + actor = await make_actor( + label='crawl-adaptive', + source_files={ + 'server.py': _TEST_SERVER_PY, + 'src/main.py': _ADAPTIVE_PLAYWRIGHT_CRAWLER_MAIN_PY, + 'Dockerfile': _PLAYWRIGHT_DOCKERFILE, + }, + additional_requirements=['crawlee[all]>=1.0.0,<2.0.0'], + ) + run_result = await run_actor(actor) + await _verify_crawler_results(actor, run_result, 'AdaptivePlaywrightCrawler') diff --git a/tests/e2e/test_crawlee_crawlers/test_basic_crawler.py b/tests/e2e/test_crawlee_crawlers/test_basic_crawler.py new file mode 100644 index 00000000..07187c58 --- /dev/null +++ b/tests/e2e/test_crawlee_crawlers/test_basic_crawler.py @@ -0,0 +1,90 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from .conftest import _TEST_SERVER_PY, _verify_crawler_results + +if TYPE_CHECKING: + from ..conftest import MakeActorFunction, RunActorFunction + +_BASIC_CRAWLER_MAIN_PY = """\ +from __future__ import annotations + +from html.parser import HTMLParser + +from crawlee._types import BasicCrawlingContext +from crawlee.crawlers import BasicCrawler + +from apify import Actor + + +class _PageParser(HTMLParser): + def __init__(self) -> None: + super().__init__() + self.links: list[str] = [] + self.data: dict[str, str] = {} + self._in_tag: str | None = None + self._in_class: str = '' + + def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: + attrs_dict = dict(attrs) + if tag == 'a' and 'href' in attrs_dict: + self.links.append(attrs_dict['href']) + self._in_tag = tag + self._in_class = attrs_dict.get('class', '') or '' + + def handle_endtag(self, tag: str) -> None: + self._in_tag = None + self._in_class = '' + + def handle_data(self, data: str) -> None: + text = data.strip() + if not text: + return + if self._in_tag == 'h1': + self.data['name'] = text + elif self._in_tag == 'span' and self._in_class == 'price': + self.data['price'] = text + elif self._in_tag == 'p' and self._in_class == 'description': + self.data['description'] = text + + +async def main() -> None: + async with Actor: + pages_visited: list[str] = [] + crawler = BasicCrawler() + + @crawler.router.default_handler + async def handler(context: BasicCrawlingContext) -> None: + pages_visited.append(context.request.url) + + response = await context.send_request(context.request.url) + html = (await response.read()).decode() + + parser = _PageParser() + parser.feed(html) + + base_url = 'http://localhost:8080' + await context.add_requests( + [f'{base_url}{link}' for link in parser.links if link.startswith('/')] + ) + + if '/products/' in context.request.url and parser.data.get('name'): + await context.push_data({'url': context.request.url, **parser.data}) + + await crawler.run(['http://localhost:8080/']) + + await Actor.set_value('CRAWLER_RESULT', { + 'pages_visited_count': len(pages_visited), + 'crawler_type': 'BasicCrawler', + }) +""" + + +async def test_basic_crawler(make_actor: MakeActorFunction, run_actor: RunActorFunction) -> None: + actor = await make_actor( + label='crawl-basic', + source_files={'server.py': _TEST_SERVER_PY, 'src/main.py': _BASIC_CRAWLER_MAIN_PY}, + ) + run_result = await run_actor(actor) + await _verify_crawler_results(actor, run_result, 'BasicCrawler') diff --git a/tests/e2e/test_crawlee_crawlers/test_beautifulsoup_crawler.py b/tests/e2e/test_crawlee_crawlers/test_beautifulsoup_crawler.py new file mode 100644 index 00000000..f7981286 --- /dev/null +++ 
b/tests/e2e/test_crawlee_crawlers/test_beautifulsoup_crawler.py @@ -0,0 +1,56 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from .conftest import _TEST_SERVER_PY, _verify_crawler_results + +if TYPE_CHECKING: + from ..conftest import MakeActorFunction, RunActorFunction + +_BEAUTIFULSOUP_CRAWLER_MAIN_PY = """\ +from __future__ import annotations + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + +from apify import Actor + + +async def main() -> None: + async with Actor: + pages_visited: list[str] = [] + crawler = BeautifulSoupCrawler() + + @crawler.router.default_handler + async def handler(context: BeautifulSoupCrawlingContext) -> None: + pages_visited.append(context.request.url) + await context.enqueue_links() + + if '/products/' in context.request.url: + name_tag = context.soup.find('h1') + price_tag = context.soup.find('span', class_='price') + desc_tag = context.soup.find('p', class_='description') + if name_tag: + await context.push_data({ + 'url': context.request.url, + 'name': name_tag.get_text(strip=True), + 'price': price_tag.get_text(strip=True) if price_tag else '', + 'description': desc_tag.get_text(strip=True) if desc_tag else '', + }) + + await crawler.run(['http://localhost:8080/']) + + await Actor.set_value('CRAWLER_RESULT', { + 'pages_visited_count': len(pages_visited), + 'crawler_type': 'BeautifulSoupCrawler', + }) +""" + + +async def test_beautifulsoup_crawler(make_actor: MakeActorFunction, run_actor: RunActorFunction) -> None: + actor = await make_actor( + label='crawl-bsoup', + source_files={'server.py': _TEST_SERVER_PY, 'src/main.py': _BEAUTIFULSOUP_CRAWLER_MAIN_PY}, + additional_requirements=['crawlee[beautifulsoup]>=1.0.0,<2.0.0'], + ) + run_result = await run_actor(actor) + await _verify_crawler_results(actor, run_result, 'BeautifulSoupCrawler') diff --git a/tests/e2e/test_crawlee_crawlers/test_http_crawler.py b/tests/e2e/test_crawlee_crawlers/test_http_crawler.py new file mode 100644 index 00000000..cb0b45ec --- /dev/null +++ b/tests/e2e/test_crawlee_crawlers/test_http_crawler.py @@ -0,0 +1,61 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from .conftest import _TEST_SERVER_PY, _verify_crawler_results + +if TYPE_CHECKING: + from ..conftest import MakeActorFunction, RunActorFunction + +_HTTP_CRAWLER_MAIN_PY = """\ +from __future__ import annotations + +import re + +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext + +from apify import Actor + + +async def main() -> None: + async with Actor: + pages_visited: list[str] = [] + crawler = HttpCrawler() + + @crawler.router.default_handler + async def handler(context: HttpCrawlingContext) -> None: + pages_visited.append(context.request.url) + html = (await context.http_response.read()).decode() + + links = re.findall(r'href="(/[^"]*)"', html) + base_url = 'http://localhost:8080' + await context.add_requests([f'{base_url}{link}' for link in links]) + + if '/products/' in context.request.url: + name_match = re.search(r'

<h1>(.*?)</h1>', html) + price_match = re.search(r'<span class="price">(.*?)</span>', html) + desc_match = re.search(r'<p class="description">(.*?)</p>

', html) + if name_match: + await context.push_data({ + 'url': context.request.url, + 'name': name_match.group(1), + 'price': price_match.group(1) if price_match else '', + 'description': desc_match.group(1) if desc_match else '', + }) + + await crawler.run(['http://localhost:8080/']) + + await Actor.set_value('CRAWLER_RESULT', { + 'pages_visited_count': len(pages_visited), + 'crawler_type': 'HttpCrawler', + }) +""" + + +async def test_http_crawler(make_actor: MakeActorFunction, run_actor: RunActorFunction) -> None: + actor = await make_actor( + label='crawl-http', + source_files={'server.py': _TEST_SERVER_PY, 'src/main.py': _HTTP_CRAWLER_MAIN_PY}, + ) + run_result = await run_actor(actor) + await _verify_crawler_results(actor, run_result, 'HttpCrawler') diff --git a/tests/e2e/test_crawlee_crawlers/test_parsel_crawler.py b/tests/e2e/test_crawlee_crawlers/test_parsel_crawler.py new file mode 100644 index 00000000..02aeb934 --- /dev/null +++ b/tests/e2e/test_crawlee_crawlers/test_parsel_crawler.py @@ -0,0 +1,55 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from .conftest import _TEST_SERVER_PY, _verify_crawler_results + +if TYPE_CHECKING: + from ..conftest import MakeActorFunction, RunActorFunction + +_PARSEL_CRAWLER_MAIN_PY = """\ +from __future__ import annotations + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext + +from apify import Actor + + +async def main() -> None: + async with Actor: + pages_visited: list[str] = [] + crawler = ParselCrawler() + + @crawler.router.default_handler + async def handler(context: ParselCrawlingContext) -> None: + pages_visited.append(context.request.url) + await context.enqueue_links() + + if '/products/' in context.request.url: + name = context.selector.css('h1::text').get('').strip() + price = context.selector.css('span.price::text').get('').strip() + description = context.selector.css('p.description::text').get('').strip() + if name: + await context.push_data({ + 'url': context.request.url, + 'name': name, + 'price': price, + 'description': description, + }) + + await crawler.run(['http://localhost:8080/']) + + await Actor.set_value('CRAWLER_RESULT', { + 'pages_visited_count': len(pages_visited), + 'crawler_type': 'ParselCrawler', + }) +""" + + +async def test_parsel_crawler(make_actor: MakeActorFunction, run_actor: RunActorFunction) -> None: + actor = await make_actor( + label='crawl-parsel', + source_files={'server.py': _TEST_SERVER_PY, 'src/main.py': _PARSEL_CRAWLER_MAIN_PY}, + ) + run_result = await run_actor(actor) + await _verify_crawler_results(actor, run_result, 'ParselCrawler') diff --git a/tests/e2e/test_crawlee_crawlers/test_playwright_crawler.py b/tests/e2e/test_crawlee_crawlers/test_playwright_crawler.py new file mode 100644 index 00000000..85b3571f --- /dev/null +++ b/tests/e2e/test_crawlee_crawlers/test_playwright_crawler.py @@ -0,0 +1,60 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from .conftest import _PLAYWRIGHT_DOCKERFILE, _TEST_SERVER_PY, _verify_crawler_results + +if TYPE_CHECKING: + from ..conftest import MakeActorFunction, RunActorFunction + +_PLAYWRIGHT_CRAWLER_MAIN_PY = """\ +from __future__ import annotations + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + +from apify import Actor + + +async def main() -> None: + async with Actor: + pages_visited: list[str] = [] + crawler = PlaywrightCrawler() + + @crawler.router.default_handler + async def handler(context: PlaywrightCrawlingContext) -> None: + 
pages_visited.append(context.request.url) + await context.enqueue_links() + + if '/products/' in context.request.url: + name = await context.page.locator('h1').text_content() + price = await context.page.locator('span.price').text_content() + description = await context.page.locator('p.description').text_content() + if name: + await context.push_data({ + 'url': context.request.url, + 'name': name.strip(), + 'price': (price or '').strip(), + 'description': (description or '').strip(), + }) + + await crawler.run(['http://localhost:8080/']) + + await Actor.set_value('CRAWLER_RESULT', { + 'pages_visited_count': len(pages_visited), + 'crawler_type': 'PlaywrightCrawler', + }) +""" + + +async def test_playwright_crawler(make_actor: MakeActorFunction, run_actor: RunActorFunction) -> None: + actor = await make_actor( + label='crawl-playwright', + source_files={ + 'server.py': _TEST_SERVER_PY, + 'src/main.py': _PLAYWRIGHT_CRAWLER_MAIN_PY, + 'Dockerfile': _PLAYWRIGHT_DOCKERFILE, + }, + additional_requirements=['crawlee[playwright]>=1.0.0,<2.0.0'], + ) + run_result = await run_actor(actor) + await _verify_crawler_results(actor, run_result, 'PlaywrightCrawler') From e7424a0f7bcb9ff683e1654c3807adde12350316 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 12 Feb 2026 19:27:31 +0100 Subject: [PATCH 2/9] test: increase memory to 1024MB for Playwright-based e2e tests The Playwright browser process uses ~244MB at startup, exceeding the 256MB default. Both PlaywrightCrawler and AdaptivePlaywrightCrawler tests timed out due to memory pressure. Add memory_mbytes parameter to make_actor and set it to 1024MB for Playwright tests. Co-Authored-By: Claude Opus 4.6 --- tests/e2e/conftest.py | 5 ++++- .../test_adaptive_playwright_crawler.py | 1 + tests/e2e/test_crawlee_crawlers/test_playwright_crawler.py | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py index cb894087..79ae7e0c 100644 --- a/tests/e2e/conftest.py +++ b/tests/e2e/conftest.py @@ -190,6 +190,7 @@ def __call__( main_py: str | None = None, source_files: Mapping[str, str | bytes] | None = None, additional_requirements: list[str] | None = None, + memory_mbytes: int = 256, ) -> Awaitable[ActorClientAsync]: """Create a temporary Actor from the given main function or source files. @@ -204,6 +205,7 @@ def __call__( main_py: The `src/main.py` file of the Actor. source_files: A dictionary of the source files of the Actor. additional_requirements: A list of additional requirements to be added to the `requirements.txt`. + memory_mbytes: The default memory allocation for the Actor run in MB. Returns: A resource client for the created Actor. 
@@ -229,6 +231,7 @@ async def _make_actor( main_py: str | None = None, source_files: Mapping[str, str | bytes] | None = None, additional_requirements: list[str] | None = None, + memory_mbytes: int = 256, ) -> ActorClientAsync: if not (main_func or main_py or source_files): raise TypeError('One of `main_func`, `main_py` or `source_files` arguments must be specified') @@ -298,7 +301,7 @@ async def _make_actor( created_actor = await client.actors().create( name=actor_name, default_run_build='latest', - default_run_memory_mbytes=256, + default_run_memory_mbytes=memory_mbytes, default_run_timeout_secs=600, versions=[ { diff --git a/tests/e2e/test_crawlee_crawlers/test_adaptive_playwright_crawler.py b/tests/e2e/test_crawlee_crawlers/test_adaptive_playwright_crawler.py index 6733a6da..0c1edb96 100644 --- a/tests/e2e/test_crawlee_crawlers/test_adaptive_playwright_crawler.py +++ b/tests/e2e/test_crawlee_crawlers/test_adaptive_playwright_crawler.py @@ -55,6 +55,7 @@ async def test_adaptive_playwright_crawler(make_actor: MakeActorFunction, run_ac 'Dockerfile': _PLAYWRIGHT_DOCKERFILE, }, additional_requirements=['crawlee[all]>=1.0.0,<2.0.0'], + memory_mbytes=1024, ) run_result = await run_actor(actor) await _verify_crawler_results(actor, run_result, 'AdaptivePlaywrightCrawler') diff --git a/tests/e2e/test_crawlee_crawlers/test_playwright_crawler.py b/tests/e2e/test_crawlee_crawlers/test_playwright_crawler.py index 85b3571f..5b48b479 100644 --- a/tests/e2e/test_crawlee_crawlers/test_playwright_crawler.py +++ b/tests/e2e/test_crawlee_crawlers/test_playwright_crawler.py @@ -55,6 +55,7 @@ async def test_playwright_crawler(make_actor: MakeActorFunction, run_actor: RunA 'Dockerfile': _PLAYWRIGHT_DOCKERFILE, }, additional_requirements=['crawlee[playwright]>=1.0.0,<2.0.0'], + memory_mbytes=1024, ) run_result = await run_actor(actor) await _verify_crawler_results(actor, run_result, 'PlaywrightCrawler') From cfa347a292e079d7b625a01fa8f9e534e07ca87f Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 13 Feb 2026 09:27:32 +0100 Subject: [PATCH 3/9] test: extract inline Python strings to dedicated files in crawlee crawler e2e tests Move Actor source code from triple-quoted string constants into standalone files under actor_source/, so they benefit from syntax highlighting, linting, and type-checking. Load them at runtime via Path.read_text() helpers. 
Co-Authored-By: Claude Opus 4.6 --- .../actor_source/Dockerfile.playwright | 9 ++ .../main_adaptive_playwright_crawler.py | 40 ++++++++ .../actor_source/main_basic_crawler.py | 74 ++++++++++++++ .../main_beautifulsoup_crawler.py | 40 ++++++++ .../actor_source/main_http_crawler.py | 46 +++++++++ .../actor_source/main_parsel_crawler.py | 40 ++++++++ .../actor_source/main_playwright_crawler.py | 40 ++++++++ .../actor_source/server.py | 84 ++++++++++++++++ tests/e2e/test_crawlee_crawlers/conftest.py | 98 ++----------------- .../test_adaptive_playwright_crawler.py | 46 +-------- .../test_basic_crawler.py | 80 +-------------- .../test_beautifulsoup_crawler.py | 45 +-------- .../test_http_crawler.py | 51 +--------- .../test_parsel_crawler.py | 45 +-------- .../test_playwright_crawler.py | 46 +-------- 15 files changed, 409 insertions(+), 375 deletions(-) create mode 100644 tests/e2e/test_crawlee_crawlers/actor_source/Dockerfile.playwright create mode 100644 tests/e2e/test_crawlee_crawlers/actor_source/main_adaptive_playwright_crawler.py create mode 100644 tests/e2e/test_crawlee_crawlers/actor_source/main_basic_crawler.py create mode 100644 tests/e2e/test_crawlee_crawlers/actor_source/main_beautifulsoup_crawler.py create mode 100644 tests/e2e/test_crawlee_crawlers/actor_source/main_http_crawler.py create mode 100644 tests/e2e/test_crawlee_crawlers/actor_source/main_parsel_crawler.py create mode 100644 tests/e2e/test_crawlee_crawlers/actor_source/main_playwright_crawler.py create mode 100644 tests/e2e/test_crawlee_crawlers/actor_source/server.py diff --git a/tests/e2e/test_crawlee_crawlers/actor_source/Dockerfile.playwright b/tests/e2e/test_crawlee_crawlers/actor_source/Dockerfile.playwright new file mode 100644 index 00000000..99c0e5f7 --- /dev/null +++ b/tests/e2e/test_crawlee_crawlers/actor_source/Dockerfile.playwright @@ -0,0 +1,9 @@ +FROM apify/actor-python-playwright:PYTHON_VERSION_PLACEHOLDER + +COPY . 
./ + +RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* + +RUN pip install --force-reinstall -r requirements.txt + +CMD ["sh", "-c", "python server.py & python -m src"] diff --git a/tests/e2e/test_crawlee_crawlers/actor_source/main_adaptive_playwright_crawler.py b/tests/e2e/test_crawlee_crawlers/actor_source/main_adaptive_playwright_crawler.py new file mode 100644 index 00000000..928a238d --- /dev/null +++ b/tests/e2e/test_crawlee_crawlers/actor_source/main_adaptive_playwright_crawler.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext + +from apify import Actor + + +async def main() -> None: + async with Actor: + pages_visited: list[str] = [] + crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser() + + @crawler.router.default_handler + async def handler(context: AdaptivePlaywrightCrawlingContext) -> None: + pages_visited.append(context.request.url) + await context.enqueue_links() + + if '/products/' in context.request.url: + name = context.parsed_content.css('h1::text').get('').strip() + price = context.parsed_content.css('span.price::text').get('').strip() + description = context.parsed_content.css('p.description::text').get('').strip() + if name: + await context.push_data( + { + 'url': context.request.url, + 'name': name, + 'price': price, + 'description': description, + } + ) + + await crawler.run(['http://localhost:8080/']) + + await Actor.set_value( + 'CRAWLER_RESULT', + { + 'pages_visited_count': len(pages_visited), + 'crawler_type': 'AdaptivePlaywrightCrawler', + }, + ) diff --git a/tests/e2e/test_crawlee_crawlers/actor_source/main_basic_crawler.py b/tests/e2e/test_crawlee_crawlers/actor_source/main_basic_crawler.py new file mode 100644 index 00000000..2c5cdcff --- /dev/null +++ b/tests/e2e/test_crawlee_crawlers/actor_source/main_basic_crawler.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +from html.parser import HTMLParser +from typing import TYPE_CHECKING + +from crawlee.crawlers import BasicCrawler + +from apify import Actor + +if TYPE_CHECKING: + from crawlee._types import BasicCrawlingContext + + +class _PageParser(HTMLParser): + def __init__(self) -> None: + super().__init__() + self.links: list[str] = [] + self.data: dict[str, str] = {} + self._in_tag: str | None = None + self._in_class: str = '' + + def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: + attrs_dict = dict(attrs) + if tag == 'a' and (href := attrs_dict.get('href')): + self.links.append(href) + self._in_tag = tag + self._in_class = attrs_dict.get('class', '') or '' + + def handle_endtag(self, tag: str) -> None: # noqa: ARG002 + self._in_tag = None + self._in_class = '' + + def handle_data(self, data: str) -> None: + text = data.strip() + if not text: + return + if self._in_tag == 'h1': + self.data['name'] = text + elif self._in_tag == 'span' and self._in_class == 'price': + self.data['price'] = text + elif self._in_tag == 'p' and self._in_class == 'description': + self.data['description'] = text + + +async def main() -> None: + async with Actor: + pages_visited: list[str] = [] + crawler = BasicCrawler() + + @crawler.router.default_handler + async def handler(context: BasicCrawlingContext) -> None: + pages_visited.append(context.request.url) + + response = await context.send_request(context.request.url) + html = (await response.read()).decode() + + parser = _PageParser() + parser.feed(html) + + base_url = 'http://localhost:8080' + await 
context.add_requests([f'{base_url}{link}' for link in parser.links if link.startswith('/')]) + + if '/products/' in context.request.url and parser.data.get('name'): + await context.push_data({'url': context.request.url, **parser.data}) + + await crawler.run(['http://localhost:8080/']) + + await Actor.set_value( + 'CRAWLER_RESULT', + { + 'pages_visited_count': len(pages_visited), + 'crawler_type': 'BasicCrawler', + }, + ) diff --git a/tests/e2e/test_crawlee_crawlers/actor_source/main_beautifulsoup_crawler.py b/tests/e2e/test_crawlee_crawlers/actor_source/main_beautifulsoup_crawler.py new file mode 100644 index 00000000..e613a475 --- /dev/null +++ b/tests/e2e/test_crawlee_crawlers/actor_source/main_beautifulsoup_crawler.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + +from apify import Actor + + +async def main() -> None: + async with Actor: + pages_visited: list[str] = [] + crawler = BeautifulSoupCrawler() + + @crawler.router.default_handler + async def handler(context: BeautifulSoupCrawlingContext) -> None: + pages_visited.append(context.request.url) + await context.enqueue_links() + + if '/products/' in context.request.url: + name_tag = context.soup.find('h1') + price_tag = context.soup.find('span', class_='price') + desc_tag = context.soup.find('p', class_='description') + if name_tag: + await context.push_data( + { + 'url': context.request.url, + 'name': name_tag.get_text(strip=True), + 'price': price_tag.get_text(strip=True) if price_tag else '', + 'description': desc_tag.get_text(strip=True) if desc_tag else '', + } + ) + + await crawler.run(['http://localhost:8080/']) + + await Actor.set_value( + 'CRAWLER_RESULT', + { + 'pages_visited_count': len(pages_visited), + 'crawler_type': 'BeautifulSoupCrawler', + }, + ) diff --git a/tests/e2e/test_crawlee_crawlers/actor_source/main_http_crawler.py b/tests/e2e/test_crawlee_crawlers/actor_source/main_http_crawler.py new file mode 100644 index 00000000..5f96b6d7 --- /dev/null +++ b/tests/e2e/test_crawlee_crawlers/actor_source/main_http_crawler.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +import re + +from crawlee.crawlers import HttpCrawler, HttpCrawlingContext + +from apify import Actor + + +async def main() -> None: + async with Actor: + pages_visited: list[str] = [] + crawler = HttpCrawler() + + @crawler.router.default_handler + async def handler(context: HttpCrawlingContext) -> None: + pages_visited.append(context.request.url) + html = (await context.http_response.read()).decode() + + links = re.findall(r'href="(/[^"]*)"', html) + base_url = 'http://localhost:8080' + await context.add_requests([f'{base_url}{link}' for link in links]) + + if '/products/' in context.request.url: + name_match = re.search(r'

<h1>(.*?)</h1>', html) + price_match = re.search(r'<span class="price">(.*?)</span>', html) + desc_match = re.search(r'<p class="description">(.*?)</p>

', html) + if name_match: + await context.push_data( + { + 'url': context.request.url, + 'name': name_match.group(1), + 'price': price_match.group(1) if price_match else '', + 'description': desc_match.group(1) if desc_match else '', + } + ) + + await crawler.run(['http://localhost:8080/']) + + await Actor.set_value( + 'CRAWLER_RESULT', + { + 'pages_visited_count': len(pages_visited), + 'crawler_type': 'HttpCrawler', + }, + ) diff --git a/tests/e2e/test_crawlee_crawlers/actor_source/main_parsel_crawler.py b/tests/e2e/test_crawlee_crawlers/actor_source/main_parsel_crawler.py new file mode 100644 index 00000000..253cfa4a --- /dev/null +++ b/tests/e2e/test_crawlee_crawlers/actor_source/main_parsel_crawler.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext + +from apify import Actor + + +async def main() -> None: + async with Actor: + pages_visited: list[str] = [] + crawler = ParselCrawler() + + @crawler.router.default_handler + async def handler(context: ParselCrawlingContext) -> None: + pages_visited.append(context.request.url) + await context.enqueue_links() + + if '/products/' in context.request.url: + name = context.selector.css('h1::text').get('').strip() + price = context.selector.css('span.price::text').get('').strip() + description = context.selector.css('p.description::text').get('').strip() + if name: + await context.push_data( + { + 'url': context.request.url, + 'name': name, + 'price': price, + 'description': description, + } + ) + + await crawler.run(['http://localhost:8080/']) + + await Actor.set_value( + 'CRAWLER_RESULT', + { + 'pages_visited_count': len(pages_visited), + 'crawler_type': 'ParselCrawler', + }, + ) diff --git a/tests/e2e/test_crawlee_crawlers/actor_source/main_playwright_crawler.py b/tests/e2e/test_crawlee_crawlers/actor_source/main_playwright_crawler.py new file mode 100644 index 00000000..b149ad7e --- /dev/null +++ b/tests/e2e/test_crawlee_crawlers/actor_source/main_playwright_crawler.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + +from apify import Actor + + +async def main() -> None: + async with Actor: + pages_visited: list[str] = [] + crawler = PlaywrightCrawler() + + @crawler.router.default_handler + async def handler(context: PlaywrightCrawlingContext) -> None: + pages_visited.append(context.request.url) + await context.enqueue_links() + + if '/products/' in context.request.url: + name = await context.page.locator('h1').text_content() + price = await context.page.locator('span.price').text_content() + description = await context.page.locator('p.description').text_content() + if name: + await context.push_data( + { + 'url': context.request.url, + 'name': name.strip(), + 'price': (price or '').strip(), + 'description': (description or '').strip(), + } + ) + + await crawler.run(['http://localhost:8080/']) + + await Actor.set_value( + 'CRAWLER_RESULT', + { + 'pages_visited_count': len(pages_visited), + 'crawler_type': 'PlaywrightCrawler', + }, + ) diff --git a/tests/e2e/test_crawlee_crawlers/actor_source/server.py b/tests/e2e/test_crawlee_crawlers/actor_source/server.py new file mode 100644 index 00000000..6ae639c7 --- /dev/null +++ b/tests/e2e/test_crawlee_crawlers/actor_source/server.py @@ -0,0 +1,84 @@ +import asyncio +import logging +from collections.abc import Awaitable, Callable, Coroutine +from typing import Any + +from uvicorn import Config +from uvicorn.server import Server + +Receive = 
Callable[[], Awaitable[dict[str, Any]]] +Send = Callable[[dict[str, Any]], Coroutine[None, None, None]] + +_PRODUCTS = { + '1': {'name': 'Widget A', 'price': '$19.99', 'description': 'A basic widget for everyday use'}, + '2': {'name': 'Widget B', 'price': '$29.99', 'description': 'An advanced widget with extra features'}, + '3': {'name': 'Widget C', 'price': '$39.99', 'description': 'A premium widget for professionals'}, +} + + +async def _send_html(send: Send, html: str, status: int = 200) -> None: + await send( + { + 'type': 'http.response.start', + 'status': status, + 'headers': [[b'content-type', b'text/html; charset=utf-8']], + } + ) + await send({'type': 'http.response.body', 'body': html.encode()}) + + +async def app(scope: dict[str, Any], _receive: Receive, send: Send) -> None: + assert scope['type'] == 'http' + path = scope['path'] + + if path == '/': + await _send_html( + send, + 'E-commerce Test Store' + '

<h1>Welcome to Test Store</h1>' + '<a href="/products/1">Widget A</a>' + '<a href="/products/2">Widget B</a>' + '<a href="/products/3">Widget C</a>' + '<a href="/about">About Us</a>' + '</body></html>', + ) + elif path.startswith('/products/'): + product = _PRODUCTS.get(path.split('/')[-1]) + if product: + await _send_html( + send, + f'<html><head><title>{product["name"]}</title></head><body>' + f'<h1>{product["name"]}</h1>' + f'<span class="price">{product["price"]}</span>' + f'<p class="description">{product["description"]}</p>' + f'<a href="/">Back to Home</a>' + f'</body></html>', + ) + else: + await _send_html(send, 'Not Found', 404) + elif path == '/about': + await _send_html( + send, + '<html><head><title>About Us</title></head><body>' + '<h1>About Test Store</h1>' + '<p>We sell the best widgets in the world.</p>

' + 'Back to Home' + '', + ) + else: + await _send_html(send, 'Not Found', 404) + + +if __name__ == '__main__': + asyncio.run( + Server( + config=Config( + app=app, + lifespan='off', + loop='asyncio', + port=8080, + log_config=None, + log_level=logging.CRITICAL, + ) + ).serve() + ) diff --git a/tests/e2e/test_crawlee_crawlers/conftest.py b/tests/e2e/test_crawlee_crawlers/conftest.py index 402bb77c..cf15e482 100644 --- a/tests/e2e/test_crawlee_crawlers/conftest.py +++ b/tests/e2e/test_crawlee_crawlers/conftest.py @@ -2,6 +2,7 @@ import os import sys +from pathlib import Path from typing import TYPE_CHECKING if TYPE_CHECKING: @@ -11,102 +12,19 @@ _PYTHON_VERSION = os.getenv('INTEGRATION_TESTS_PYTHON_VERSION') or '.'.join(str(x) for x in sys.version_info[:2]) -_TEST_SERVER_PY = """\ -import asyncio -import logging -from collections.abc import Awaitable, Callable, Coroutine -from typing import Any +_ACTOR_SOURCE_DIR = Path(__file__).parent / 'actor_source' -from uvicorn import Config -from uvicorn.server import Server -Receive = Callable[[], Awaitable[dict[str, Any]]] -Send = Callable[[dict[str, Any]], Coroutine[None, None, None]] - -_PRODUCTS = { - '1': {'name': 'Widget A', 'price': '$19.99', 'description': 'A basic widget for everyday use'}, - '2': {'name': 'Widget B', 'price': '$29.99', 'description': 'An advanced widget with extra features'}, - '3': {'name': 'Widget C', 'price': '$39.99', 'description': 'A premium widget for professionals'}, -} +def _read_actor_source(filename: str) -> str: + return (_ACTOR_SOURCE_DIR / filename).read_text() -async def _send_html(send: Send, html: str, status: int = 200) -> None: - await send({ - 'type': 'http.response.start', - 'status': status, - 'headers': [[b'content-type', b'text/html; charset=utf-8']], - }) - await send({'type': 'http.response.body', 'body': html.encode()}) - - -async def app(scope: dict[str, Any], _receive: Receive, send: Send) -> None: - assert scope['type'] == 'http' - path = scope['path'] - - if path == '/': - await _send_html( - send, - 'E-commerce Test Store' - '

<h1>Welcome to Test Store</h1>' - '<a href="/products/1">Widget A</a>' - '<a href="/products/2">Widget B</a>' - '<a href="/products/3">Widget C</a>' - '<a href="/about">About Us</a>' - '</body></html>', - ) - elif path.startswith('/products/'): - product = _PRODUCTS.get(path.split('/')[-1]) - if product: - await _send_html( - send, - f'<html><head><title>{product["name"]}</title></head><body>' - f'<h1>{product["name"]}</h1>' - f'<span class="price">{product["price"]}</span>' - f'<p class="description">{product["description"]}</p>' - f'<a href="/">Back to Home</a>' - f'</body></html>', - ) - else: - await _send_html(send, 'Not Found', 404) - elif path == '/about': - await _send_html( - send, - '<html><head><title>About Us</title></head><body>' - '<h1>About Test Store</h1>' - '<p>We sell the best widgets in the world.</p>

' - 'Back to Home' - '', - ) - else: - await _send_html(send, 'Not Found', 404) - - -if __name__ == '__main__': - asyncio.run( - Server( - config=Config( - app=app, - lifespan='off', - loop='asyncio', - port=8080, - log_config=None, - log_level=logging.CRITICAL, - ) - ).serve() +def _get_playwright_dockerfile() -> str: + return _read_actor_source('Dockerfile.playwright').replace( + 'PYTHON_VERSION_PLACEHOLDER', + _PYTHON_VERSION, ) -""" - -_PLAYWRIGHT_DOCKERFILE = f"""\ -FROM apify/actor-python-playwright:{_PYTHON_VERSION} - -COPY . ./ - -RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* - -RUN pip install --force-reinstall -r requirements.txt -CMD ["sh", "-c", "python server.py & python -m src"] -""" _EXPECTED_PRODUCTS = { 'Widget A': {'price': '$19.99', 'description': 'A basic widget for everyday use'}, diff --git a/tests/e2e/test_crawlee_crawlers/test_adaptive_playwright_crawler.py b/tests/e2e/test_crawlee_crawlers/test_adaptive_playwright_crawler.py index 0c1edb96..da1c9c21 100644 --- a/tests/e2e/test_crawlee_crawlers/test_adaptive_playwright_crawler.py +++ b/tests/e2e/test_crawlee_crawlers/test_adaptive_playwright_crawler.py @@ -2,57 +2,19 @@ from typing import TYPE_CHECKING -from .conftest import _PLAYWRIGHT_DOCKERFILE, _TEST_SERVER_PY, _verify_crawler_results +from .conftest import _get_playwright_dockerfile, _read_actor_source, _verify_crawler_results if TYPE_CHECKING: from ..conftest import MakeActorFunction, RunActorFunction -_ADAPTIVE_PLAYWRIGHT_CRAWLER_MAIN_PY = """\ -from __future__ import annotations - -from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext - -from apify import Actor - - -async def main() -> None: - async with Actor: - pages_visited: list[str] = [] - crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser() - - @crawler.router.default_handler - async def handler(context: AdaptivePlaywrightCrawlingContext) -> None: - pages_visited.append(context.request.url) - await context.enqueue_links() - - if '/products/' in context.request.url: - name = context.parsed_content.css('h1::text').get('').strip() - price = context.parsed_content.css('span.price::text').get('').strip() - description = context.parsed_content.css('p.description::text').get('').strip() - if name: - await context.push_data({ - 'url': context.request.url, - 'name': name, - 'price': price, - 'description': description, - }) - - await crawler.run(['http://localhost:8080/']) - - await Actor.set_value('CRAWLER_RESULT', { - 'pages_visited_count': len(pages_visited), - 'crawler_type': 'AdaptivePlaywrightCrawler', - }) -""" - async def test_adaptive_playwright_crawler(make_actor: MakeActorFunction, run_actor: RunActorFunction) -> None: actor = await make_actor( label='crawl-adaptive', source_files={ - 'server.py': _TEST_SERVER_PY, - 'src/main.py': _ADAPTIVE_PLAYWRIGHT_CRAWLER_MAIN_PY, - 'Dockerfile': _PLAYWRIGHT_DOCKERFILE, + 'server.py': _read_actor_source('server.py'), + 'src/main.py': _read_actor_source('main_adaptive_playwright_crawler.py'), + 'Dockerfile': _get_playwright_dockerfile(), }, additional_requirements=['crawlee[all]>=1.0.0,<2.0.0'], memory_mbytes=1024, diff --git a/tests/e2e/test_crawlee_crawlers/test_basic_crawler.py b/tests/e2e/test_crawlee_crawlers/test_basic_crawler.py index 07187c58..3a96ddae 100644 --- a/tests/e2e/test_crawlee_crawlers/test_basic_crawler.py +++ b/tests/e2e/test_crawlee_crawlers/test_basic_crawler.py @@ -2,89 +2,19 @@ from typing import TYPE_CHECKING -from .conftest import _TEST_SERVER_PY, 
_verify_crawler_results +from .conftest import _read_actor_source, _verify_crawler_results if TYPE_CHECKING: from ..conftest import MakeActorFunction, RunActorFunction -_BASIC_CRAWLER_MAIN_PY = """\ -from __future__ import annotations - -from html.parser import HTMLParser - -from crawlee._types import BasicCrawlingContext -from crawlee.crawlers import BasicCrawler - -from apify import Actor - - -class _PageParser(HTMLParser): - def __init__(self) -> None: - super().__init__() - self.links: list[str] = [] - self.data: dict[str, str] = {} - self._in_tag: str | None = None - self._in_class: str = '' - - def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: - attrs_dict = dict(attrs) - if tag == 'a' and 'href' in attrs_dict: - self.links.append(attrs_dict['href']) - self._in_tag = tag - self._in_class = attrs_dict.get('class', '') or '' - - def handle_endtag(self, tag: str) -> None: - self._in_tag = None - self._in_class = '' - - def handle_data(self, data: str) -> None: - text = data.strip() - if not text: - return - if self._in_tag == 'h1': - self.data['name'] = text - elif self._in_tag == 'span' and self._in_class == 'price': - self.data['price'] = text - elif self._in_tag == 'p' and self._in_class == 'description': - self.data['description'] = text - - -async def main() -> None: - async with Actor: - pages_visited: list[str] = [] - crawler = BasicCrawler() - - @crawler.router.default_handler - async def handler(context: BasicCrawlingContext) -> None: - pages_visited.append(context.request.url) - - response = await context.send_request(context.request.url) - html = (await response.read()).decode() - - parser = _PageParser() - parser.feed(html) - - base_url = 'http://localhost:8080' - await context.add_requests( - [f'{base_url}{link}' for link in parser.links if link.startswith('/')] - ) - - if '/products/' in context.request.url and parser.data.get('name'): - await context.push_data({'url': context.request.url, **parser.data}) - - await crawler.run(['http://localhost:8080/']) - - await Actor.set_value('CRAWLER_RESULT', { - 'pages_visited_count': len(pages_visited), - 'crawler_type': 'BasicCrawler', - }) -""" - async def test_basic_crawler(make_actor: MakeActorFunction, run_actor: RunActorFunction) -> None: actor = await make_actor( label='crawl-basic', - source_files={'server.py': _TEST_SERVER_PY, 'src/main.py': _BASIC_CRAWLER_MAIN_PY}, + source_files={ + 'server.py': _read_actor_source('server.py'), + 'src/main.py': _read_actor_source('main_basic_crawler.py'), + }, ) run_result = await run_actor(actor) await _verify_crawler_results(actor, run_result, 'BasicCrawler') diff --git a/tests/e2e/test_crawlee_crawlers/test_beautifulsoup_crawler.py b/tests/e2e/test_crawlee_crawlers/test_beautifulsoup_crawler.py index f7981286..978a501f 100644 --- a/tests/e2e/test_crawlee_crawlers/test_beautifulsoup_crawler.py +++ b/tests/e2e/test_crawlee_crawlers/test_beautifulsoup_crawler.py @@ -2,54 +2,19 @@ from typing import TYPE_CHECKING -from .conftest import _TEST_SERVER_PY, _verify_crawler_results +from .conftest import _read_actor_source, _verify_crawler_results if TYPE_CHECKING: from ..conftest import MakeActorFunction, RunActorFunction -_BEAUTIFULSOUP_CRAWLER_MAIN_PY = """\ -from __future__ import annotations - -from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext - -from apify import Actor - - -async def main() -> None: - async with Actor: - pages_visited: list[str] = [] - crawler = BeautifulSoupCrawler() - - @crawler.router.default_handler - async 
def handler(context: BeautifulSoupCrawlingContext) -> None: - pages_visited.append(context.request.url) - await context.enqueue_links() - - if '/products/' in context.request.url: - name_tag = context.soup.find('h1') - price_tag = context.soup.find('span', class_='price') - desc_tag = context.soup.find('p', class_='description') - if name_tag: - await context.push_data({ - 'url': context.request.url, - 'name': name_tag.get_text(strip=True), - 'price': price_tag.get_text(strip=True) if price_tag else '', - 'description': desc_tag.get_text(strip=True) if desc_tag else '', - }) - - await crawler.run(['http://localhost:8080/']) - - await Actor.set_value('CRAWLER_RESULT', { - 'pages_visited_count': len(pages_visited), - 'crawler_type': 'BeautifulSoupCrawler', - }) -""" - async def test_beautifulsoup_crawler(make_actor: MakeActorFunction, run_actor: RunActorFunction) -> None: actor = await make_actor( label='crawl-bsoup', - source_files={'server.py': _TEST_SERVER_PY, 'src/main.py': _BEAUTIFULSOUP_CRAWLER_MAIN_PY}, + source_files={ + 'server.py': _read_actor_source('server.py'), + 'src/main.py': _read_actor_source('main_beautifulsoup_crawler.py'), + }, additional_requirements=['crawlee[beautifulsoup]>=1.0.0,<2.0.0'], ) run_result = await run_actor(actor) diff --git a/tests/e2e/test_crawlee_crawlers/test_http_crawler.py b/tests/e2e/test_crawlee_crawlers/test_http_crawler.py index cb0b45ec..398078ed 100644 --- a/tests/e2e/test_crawlee_crawlers/test_http_crawler.py +++ b/tests/e2e/test_crawlee_crawlers/test_http_crawler.py @@ -2,60 +2,19 @@ from typing import TYPE_CHECKING -from .conftest import _TEST_SERVER_PY, _verify_crawler_results +from .conftest import _read_actor_source, _verify_crawler_results if TYPE_CHECKING: from ..conftest import MakeActorFunction, RunActorFunction -_HTTP_CRAWLER_MAIN_PY = """\ -from __future__ import annotations - -import re - -from crawlee.crawlers import HttpCrawler, HttpCrawlingContext - -from apify import Actor - - -async def main() -> None: - async with Actor: - pages_visited: list[str] = [] - crawler = HttpCrawler() - - @crawler.router.default_handler - async def handler(context: HttpCrawlingContext) -> None: - pages_visited.append(context.request.url) - html = (await context.http_response.read()).decode() - - links = re.findall(r'href="(/[^"]*)"', html) - base_url = 'http://localhost:8080' - await context.add_requests([f'{base_url}{link}' for link in links]) - - if '/products/' in context.request.url: - name_match = re.search(r'

<h1>(.*?)</h1>', html) - price_match = re.search(r'<span class="price">(.*?)</span>', html) - desc_match = re.search(r'<p class="description">(.*?)</p>

', html) - if name_match: - await context.push_data({ - 'url': context.request.url, - 'name': name_match.group(1), - 'price': price_match.group(1) if price_match else '', - 'description': desc_match.group(1) if desc_match else '', - }) - - await crawler.run(['http://localhost:8080/']) - - await Actor.set_value('CRAWLER_RESULT', { - 'pages_visited_count': len(pages_visited), - 'crawler_type': 'HttpCrawler', - }) -""" - async def test_http_crawler(make_actor: MakeActorFunction, run_actor: RunActorFunction) -> None: actor = await make_actor( label='crawl-http', - source_files={'server.py': _TEST_SERVER_PY, 'src/main.py': _HTTP_CRAWLER_MAIN_PY}, + source_files={ + 'server.py': _read_actor_source('server.py'), + 'src/main.py': _read_actor_source('main_http_crawler.py'), + }, ) run_result = await run_actor(actor) await _verify_crawler_results(actor, run_result, 'HttpCrawler') diff --git a/tests/e2e/test_crawlee_crawlers/test_parsel_crawler.py b/tests/e2e/test_crawlee_crawlers/test_parsel_crawler.py index 02aeb934..beb1a341 100644 --- a/tests/e2e/test_crawlee_crawlers/test_parsel_crawler.py +++ b/tests/e2e/test_crawlee_crawlers/test_parsel_crawler.py @@ -2,54 +2,19 @@ from typing import TYPE_CHECKING -from .conftest import _TEST_SERVER_PY, _verify_crawler_results +from .conftest import _read_actor_source, _verify_crawler_results if TYPE_CHECKING: from ..conftest import MakeActorFunction, RunActorFunction -_PARSEL_CRAWLER_MAIN_PY = """\ -from __future__ import annotations - -from crawlee.crawlers import ParselCrawler, ParselCrawlingContext - -from apify import Actor - - -async def main() -> None: - async with Actor: - pages_visited: list[str] = [] - crawler = ParselCrawler() - - @crawler.router.default_handler - async def handler(context: ParselCrawlingContext) -> None: - pages_visited.append(context.request.url) - await context.enqueue_links() - - if '/products/' in context.request.url: - name = context.selector.css('h1::text').get('').strip() - price = context.selector.css('span.price::text').get('').strip() - description = context.selector.css('p.description::text').get('').strip() - if name: - await context.push_data({ - 'url': context.request.url, - 'name': name, - 'price': price, - 'description': description, - }) - - await crawler.run(['http://localhost:8080/']) - - await Actor.set_value('CRAWLER_RESULT', { - 'pages_visited_count': len(pages_visited), - 'crawler_type': 'ParselCrawler', - }) -""" - async def test_parsel_crawler(make_actor: MakeActorFunction, run_actor: RunActorFunction) -> None: actor = await make_actor( label='crawl-parsel', - source_files={'server.py': _TEST_SERVER_PY, 'src/main.py': _PARSEL_CRAWLER_MAIN_PY}, + source_files={ + 'server.py': _read_actor_source('server.py'), + 'src/main.py': _read_actor_source('main_parsel_crawler.py'), + }, ) run_result = await run_actor(actor) await _verify_crawler_results(actor, run_result, 'ParselCrawler') diff --git a/tests/e2e/test_crawlee_crawlers/test_playwright_crawler.py b/tests/e2e/test_crawlee_crawlers/test_playwright_crawler.py index 5b48b479..0bbfa3ee 100644 --- a/tests/e2e/test_crawlee_crawlers/test_playwright_crawler.py +++ b/tests/e2e/test_crawlee_crawlers/test_playwright_crawler.py @@ -2,57 +2,19 @@ from typing import TYPE_CHECKING -from .conftest import _PLAYWRIGHT_DOCKERFILE, _TEST_SERVER_PY, _verify_crawler_results +from .conftest import _get_playwright_dockerfile, _read_actor_source, _verify_crawler_results if TYPE_CHECKING: from ..conftest import MakeActorFunction, RunActorFunction -_PLAYWRIGHT_CRAWLER_MAIN_PY = """\ 
-from __future__ import annotations - -from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext - -from apify import Actor - - -async def main() -> None: - async with Actor: - pages_visited: list[str] = [] - crawler = PlaywrightCrawler() - - @crawler.router.default_handler - async def handler(context: PlaywrightCrawlingContext) -> None: - pages_visited.append(context.request.url) - await context.enqueue_links() - - if '/products/' in context.request.url: - name = await context.page.locator('h1').text_content() - price = await context.page.locator('span.price').text_content() - description = await context.page.locator('p.description').text_content() - if name: - await context.push_data({ - 'url': context.request.url, - 'name': name.strip(), - 'price': (price or '').strip(), - 'description': (description or '').strip(), - }) - - await crawler.run(['http://localhost:8080/']) - - await Actor.set_value('CRAWLER_RESULT', { - 'pages_visited_count': len(pages_visited), - 'crawler_type': 'PlaywrightCrawler', - }) -""" - async def test_playwright_crawler(make_actor: MakeActorFunction, run_actor: RunActorFunction) -> None: actor = await make_actor( label='crawl-playwright', source_files={ - 'server.py': _TEST_SERVER_PY, - 'src/main.py': _PLAYWRIGHT_CRAWLER_MAIN_PY, - 'Dockerfile': _PLAYWRIGHT_DOCKERFILE, + 'server.py': _read_actor_source('server.py'), + 'src/main.py': _read_actor_source('main_playwright_crawler.py'), + 'Dockerfile': _get_playwright_dockerfile(), }, additional_requirements=['crawlee[playwright]>=1.0.0,<2.0.0'], memory_mbytes=1024, From d7d3adf514cfa98abc4f28d638a0f93e5d8d88bb Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 13 Feb 2026 09:33:31 +0100 Subject: [PATCH 4/9] test: remove underscore prefix from shared test helper functions These functions are imported across modules, so they are part of the test package's public API and shouldn't use the private convention. 
Co-Authored-By: Claude Opus 4.6 --- tests/e2e/test_crawlee_crawlers/conftest.py | 8 ++++---- .../test_adaptive_playwright_crawler.py | 10 +++++----- tests/e2e/test_crawlee_crawlers/test_basic_crawler.py | 8 ++++---- .../test_beautifulsoup_crawler.py | 8 ++++---- tests/e2e/test_crawlee_crawlers/test_http_crawler.py | 8 ++++---- tests/e2e/test_crawlee_crawlers/test_parsel_crawler.py | 8 ++++---- .../test_crawlee_crawlers/test_playwright_crawler.py | 10 +++++----- 7 files changed, 30 insertions(+), 30 deletions(-) diff --git a/tests/e2e/test_crawlee_crawlers/conftest.py b/tests/e2e/test_crawlee_crawlers/conftest.py index cf15e482..e66ca7d9 100644 --- a/tests/e2e/test_crawlee_crawlers/conftest.py +++ b/tests/e2e/test_crawlee_crawlers/conftest.py @@ -15,12 +15,12 @@ _ACTOR_SOURCE_DIR = Path(__file__).parent / 'actor_source' -def _read_actor_source(filename: str) -> str: +def read_actor_source(filename: str) -> str: return (_ACTOR_SOURCE_DIR / filename).read_text() -def _get_playwright_dockerfile() -> str: - return _read_actor_source('Dockerfile.playwright').replace( +def get_playwright_dockerfile() -> str: + return read_actor_source('Dockerfile.playwright').replace( 'PYTHON_VERSION_PLACEHOLDER', _PYTHON_VERSION, ) @@ -33,7 +33,7 @@ def _get_playwright_dockerfile() -> str: } -async def _verify_crawler_results( +async def verify_crawler_results( actor: ActorClientAsync, run_result: ActorRun, expected_crawler_type: str, diff --git a/tests/e2e/test_crawlee_crawlers/test_adaptive_playwright_crawler.py b/tests/e2e/test_crawlee_crawlers/test_adaptive_playwright_crawler.py index da1c9c21..e8b22b2f 100644 --- a/tests/e2e/test_crawlee_crawlers/test_adaptive_playwright_crawler.py +++ b/tests/e2e/test_crawlee_crawlers/test_adaptive_playwright_crawler.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING -from .conftest import _get_playwright_dockerfile, _read_actor_source, _verify_crawler_results +from .conftest import get_playwright_dockerfile, read_actor_source, verify_crawler_results if TYPE_CHECKING: from ..conftest import MakeActorFunction, RunActorFunction @@ -12,12 +12,12 @@ async def test_adaptive_playwright_crawler(make_actor: MakeActorFunction, run_ac actor = await make_actor( label='crawl-adaptive', source_files={ - 'server.py': _read_actor_source('server.py'), - 'src/main.py': _read_actor_source('main_adaptive_playwright_crawler.py'), - 'Dockerfile': _get_playwright_dockerfile(), + 'server.py': read_actor_source('server.py'), + 'src/main.py': read_actor_source('main_adaptive_playwright_crawler.py'), + 'Dockerfile': get_playwright_dockerfile(), }, additional_requirements=['crawlee[all]>=1.0.0,<2.0.0'], memory_mbytes=1024, ) run_result = await run_actor(actor) - await _verify_crawler_results(actor, run_result, 'AdaptivePlaywrightCrawler') + await verify_crawler_results(actor, run_result, 'AdaptivePlaywrightCrawler') diff --git a/tests/e2e/test_crawlee_crawlers/test_basic_crawler.py b/tests/e2e/test_crawlee_crawlers/test_basic_crawler.py index 3a96ddae..58dc44f5 100644 --- a/tests/e2e/test_crawlee_crawlers/test_basic_crawler.py +++ b/tests/e2e/test_crawlee_crawlers/test_basic_crawler.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING -from .conftest import _read_actor_source, _verify_crawler_results +from .conftest import read_actor_source, verify_crawler_results if TYPE_CHECKING: from ..conftest import MakeActorFunction, RunActorFunction @@ -12,9 +12,9 @@ async def test_basic_crawler(make_actor: MakeActorFunction, run_actor: RunActorF actor = await make_actor( label='crawl-basic', source_files={ - 
'server.py': _read_actor_source('server.py'), - 'src/main.py': _read_actor_source('main_basic_crawler.py'), + 'server.py': read_actor_source('server.py'), + 'src/main.py': read_actor_source('main_basic_crawler.py'), }, ) run_result = await run_actor(actor) - await _verify_crawler_results(actor, run_result, 'BasicCrawler') + await verify_crawler_results(actor, run_result, 'BasicCrawler') diff --git a/tests/e2e/test_crawlee_crawlers/test_beautifulsoup_crawler.py b/tests/e2e/test_crawlee_crawlers/test_beautifulsoup_crawler.py index 978a501f..81a8cd37 100644 --- a/tests/e2e/test_crawlee_crawlers/test_beautifulsoup_crawler.py +++ b/tests/e2e/test_crawlee_crawlers/test_beautifulsoup_crawler.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING -from .conftest import _read_actor_source, _verify_crawler_results +from .conftest import read_actor_source, verify_crawler_results if TYPE_CHECKING: from ..conftest import MakeActorFunction, RunActorFunction @@ -12,10 +12,10 @@ async def test_beautifulsoup_crawler(make_actor: MakeActorFunction, run_actor: R actor = await make_actor( label='crawl-bsoup', source_files={ - 'server.py': _read_actor_source('server.py'), - 'src/main.py': _read_actor_source('main_beautifulsoup_crawler.py'), + 'server.py': read_actor_source('server.py'), + 'src/main.py': read_actor_source('main_beautifulsoup_crawler.py'), }, additional_requirements=['crawlee[beautifulsoup]>=1.0.0,<2.0.0'], ) run_result = await run_actor(actor) - await _verify_crawler_results(actor, run_result, 'BeautifulSoupCrawler') + await verify_crawler_results(actor, run_result, 'BeautifulSoupCrawler') diff --git a/tests/e2e/test_crawlee_crawlers/test_http_crawler.py b/tests/e2e/test_crawlee_crawlers/test_http_crawler.py index 398078ed..69821139 100644 --- a/tests/e2e/test_crawlee_crawlers/test_http_crawler.py +++ b/tests/e2e/test_crawlee_crawlers/test_http_crawler.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING -from .conftest import _read_actor_source, _verify_crawler_results +from .conftest import read_actor_source, verify_crawler_results if TYPE_CHECKING: from ..conftest import MakeActorFunction, RunActorFunction @@ -12,9 +12,9 @@ async def test_http_crawler(make_actor: MakeActorFunction, run_actor: RunActorFu actor = await make_actor( label='crawl-http', source_files={ - 'server.py': _read_actor_source('server.py'), - 'src/main.py': _read_actor_source('main_http_crawler.py'), + 'server.py': read_actor_source('server.py'), + 'src/main.py': read_actor_source('main_http_crawler.py'), }, ) run_result = await run_actor(actor) - await _verify_crawler_results(actor, run_result, 'HttpCrawler') + await verify_crawler_results(actor, run_result, 'HttpCrawler') diff --git a/tests/e2e/test_crawlee_crawlers/test_parsel_crawler.py b/tests/e2e/test_crawlee_crawlers/test_parsel_crawler.py index beb1a341..c4f002ec 100644 --- a/tests/e2e/test_crawlee_crawlers/test_parsel_crawler.py +++ b/tests/e2e/test_crawlee_crawlers/test_parsel_crawler.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING -from .conftest import _read_actor_source, _verify_crawler_results +from .conftest import read_actor_source, verify_crawler_results if TYPE_CHECKING: from ..conftest import MakeActorFunction, RunActorFunction @@ -12,9 +12,9 @@ async def test_parsel_crawler(make_actor: MakeActorFunction, run_actor: RunActor actor = await make_actor( label='crawl-parsel', source_files={ - 'server.py': _read_actor_source('server.py'), - 'src/main.py': _read_actor_source('main_parsel_crawler.py'), + 'server.py': read_actor_source('server.py'), + 
'src/main.py': read_actor_source('main_parsel_crawler.py'), }, ) run_result = await run_actor(actor) - await _verify_crawler_results(actor, run_result, 'ParselCrawler') + await verify_crawler_results(actor, run_result, 'ParselCrawler') diff --git a/tests/e2e/test_crawlee_crawlers/test_playwright_crawler.py b/tests/e2e/test_crawlee_crawlers/test_playwright_crawler.py index 0bbfa3ee..0fa7ecff 100644 --- a/tests/e2e/test_crawlee_crawlers/test_playwright_crawler.py +++ b/tests/e2e/test_crawlee_crawlers/test_playwright_crawler.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING -from .conftest import _get_playwright_dockerfile, _read_actor_source, _verify_crawler_results +from .conftest import get_playwright_dockerfile, read_actor_source, verify_crawler_results if TYPE_CHECKING: from ..conftest import MakeActorFunction, RunActorFunction @@ -12,12 +12,12 @@ async def test_playwright_crawler(make_actor: MakeActorFunction, run_actor: RunA actor = await make_actor( label='crawl-playwright', source_files={ - 'server.py': _read_actor_source('server.py'), - 'src/main.py': _read_actor_source('main_playwright_crawler.py'), - 'Dockerfile': _get_playwright_dockerfile(), + 'server.py': read_actor_source('server.py'), + 'src/main.py': read_actor_source('main_playwright_crawler.py'), + 'Dockerfile': get_playwright_dockerfile(), }, additional_requirements=['crawlee[playwright]>=1.0.0,<2.0.0'], memory_mbytes=1024, ) run_result = await run_actor(actor) - await _verify_crawler_results(actor, run_result, 'PlaywrightCrawler') + await verify_crawler_results(actor, run_result, 'PlaywrightCrawler') From 5f34e280e22fe9b242e1406c6aba8f3a0b9bf807 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 13 Feb 2026 10:26:11 +0100 Subject: [PATCH 5/9] test: remove unused INTEGRATION_TESTS_PYTHON_VERSION and version constraints The env var was never set anywhere. Use sys.version_info directly. Also drop version constraints from additional_requirements in e2e tests. 
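The substitution itself is just a string replace on the Dockerfile template: the fixture formats the running interpreter's major.minor version and swaps it in for the placeholder. A minimal sketch of the idea (the template line and base image tag below are illustrative, not the actual Dockerfile contents):

    import sys

    # 'major.minor' of the interpreter running the tests, e.g. '3.12'.
    python_version = f'{sys.version_info[0]}.{sys.version_info[1]}'

    # Hypothetical template line; the real Dockerfile lives in actor_source_base.
    template = 'FROM apify/actor-python:BASE_IMAGE_VERSION_PLACEHOLDER'
    dockerfile = template.replace('BASE_IMAGE_VERSION_PLACEHOLDER', python_version)
    print(dockerfile)  # e.g. FROM apify/actor-python:3.12 on Python 3.12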
Co-Authored-By: Claude Opus 4.6 --- tests/e2e/conftest.py | 7 ++----- tests/e2e/test_crawlee_crawlers/conftest.py | 3 +-- .../test_adaptive_playwright_crawler.py | 2 +- .../test_crawlee_crawlers/test_beautifulsoup_crawler.py | 2 +- tests/e2e/test_crawlee_crawlers/test_playwright_crawler.py | 2 +- 5 files changed, 6 insertions(+), 10 deletions(-) diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py index 79ae7e0c..4ae56fc6 100644 --- a/tests/e2e/conftest.py +++ b/tests/e2e/conftest.py @@ -168,12 +168,9 @@ def actor_base_source_files(sdk_wheel_path: Path) -> dict[str, str | bytes]: 'APIFY_SDK_WHEEL_PLACEHOLDER', f'./{sdk_wheel_file_name}' ) - current_major_minor_python_version = '.'.join([str(x) for x in sys.version_info[:2]]) - integration_tests_python_version = ( - os.getenv('INTEGRATION_TESTS_PYTHON_VERSION') or current_major_minor_python_version - ) + python_version = f'{sys.version_info[0]}.{sys.version_info[1]}' source_files['Dockerfile'] = str(source_files['Dockerfile']).replace( - 'BASE_IMAGE_VERSION_PLACEHOLDER', integration_tests_python_version + 'BASE_IMAGE_VERSION_PLACEHOLDER', python_version ) return source_files diff --git a/tests/e2e/test_crawlee_crawlers/conftest.py b/tests/e2e/test_crawlee_crawlers/conftest.py index e66ca7d9..310841d7 100644 --- a/tests/e2e/test_crawlee_crawlers/conftest.py +++ b/tests/e2e/test_crawlee_crawlers/conftest.py @@ -1,6 +1,5 @@ from __future__ import annotations -import os import sys from pathlib import Path from typing import TYPE_CHECKING @@ -10,7 +9,7 @@ from apify._models import ActorRun -_PYTHON_VERSION = os.getenv('INTEGRATION_TESTS_PYTHON_VERSION') or '.'.join(str(x) for x in sys.version_info[:2]) +_PYTHON_VERSION = f'{sys.version_info[0]}.{sys.version_info[1]}' _ACTOR_SOURCE_DIR = Path(__file__).parent / 'actor_source' diff --git a/tests/e2e/test_crawlee_crawlers/test_adaptive_playwright_crawler.py b/tests/e2e/test_crawlee_crawlers/test_adaptive_playwright_crawler.py index e8b22b2f..9be75a57 100644 --- a/tests/e2e/test_crawlee_crawlers/test_adaptive_playwright_crawler.py +++ b/tests/e2e/test_crawlee_crawlers/test_adaptive_playwright_crawler.py @@ -16,7 +16,7 @@ async def test_adaptive_playwright_crawler(make_actor: MakeActorFunction, run_ac 'src/main.py': read_actor_source('main_adaptive_playwright_crawler.py'), 'Dockerfile': get_playwright_dockerfile(), }, - additional_requirements=['crawlee[all]>=1.0.0,<2.0.0'], + additional_requirements=['crawlee[all]'], memory_mbytes=1024, ) run_result = await run_actor(actor) diff --git a/tests/e2e/test_crawlee_crawlers/test_beautifulsoup_crawler.py b/tests/e2e/test_crawlee_crawlers/test_beautifulsoup_crawler.py index 81a8cd37..d3f0c7a0 100644 --- a/tests/e2e/test_crawlee_crawlers/test_beautifulsoup_crawler.py +++ b/tests/e2e/test_crawlee_crawlers/test_beautifulsoup_crawler.py @@ -15,7 +15,7 @@ async def test_beautifulsoup_crawler(make_actor: MakeActorFunction, run_actor: R 'server.py': read_actor_source('server.py'), 'src/main.py': read_actor_source('main_beautifulsoup_crawler.py'), }, - additional_requirements=['crawlee[beautifulsoup]>=1.0.0,<2.0.0'], + additional_requirements=['crawlee[beautifulsoup]'], ) run_result = await run_actor(actor) await verify_crawler_results(actor, run_result, 'BeautifulSoupCrawler') diff --git a/tests/e2e/test_crawlee_crawlers/test_playwright_crawler.py b/tests/e2e/test_crawlee_crawlers/test_playwright_crawler.py index 0fa7ecff..6f4ff22a 100644 --- a/tests/e2e/test_crawlee_crawlers/test_playwright_crawler.py +++ 
b/tests/e2e/test_crawlee_crawlers/test_playwright_crawler.py @@ -16,7 +16,7 @@ async def test_playwright_crawler(make_actor: MakeActorFunction, run_actor: RunA 'src/main.py': read_actor_source('main_playwright_crawler.py'), 'Dockerfile': get_playwright_dockerfile(), }, - additional_requirements=['crawlee[playwright]>=1.0.0,<2.0.0'], + additional_requirements=['crawlee[playwright]'], memory_mbytes=1024, ) run_result = await run_actor(actor) From f8b2b457444877d27328fdbbfef4eb7b72ccd9c5 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 13 Feb 2026 10:30:22 +0100 Subject: [PATCH 6/9] test: rename Dockerfile.playwright to playwright.Dockerfile Co-Authored-By: Claude Opus 4.6 --- .../{Dockerfile.playwright => playwright.Dockerfile} | 0 tests/e2e/test_crawlee_crawlers/actor_source/server.py | 2 ++ tests/e2e/test_crawlee_crawlers/conftest.py | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) rename tests/e2e/test_crawlee_crawlers/actor_source/{Dockerfile.playwright => playwright.Dockerfile} (100%) diff --git a/tests/e2e/test_crawlee_crawlers/actor_source/Dockerfile.playwright b/tests/e2e/test_crawlee_crawlers/actor_source/playwright.Dockerfile similarity index 100% rename from tests/e2e/test_crawlee_crawlers/actor_source/Dockerfile.playwright rename to tests/e2e/test_crawlee_crawlers/actor_source/playwright.Dockerfile diff --git a/tests/e2e/test_crawlee_crawlers/actor_source/server.py b/tests/e2e/test_crawlee_crawlers/actor_source/server.py index 6ae639c7..20aff81a 100644 --- a/tests/e2e/test_crawlee_crawlers/actor_source/server.py +++ b/tests/e2e/test_crawlee_crawlers/actor_source/server.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import asyncio import logging from collections.abc import Awaitable, Callable, Coroutine diff --git a/tests/e2e/test_crawlee_crawlers/conftest.py b/tests/e2e/test_crawlee_crawlers/conftest.py index 310841d7..b9b7ef8e 100644 --- a/tests/e2e/test_crawlee_crawlers/conftest.py +++ b/tests/e2e/test_crawlee_crawlers/conftest.py @@ -19,7 +19,7 @@ def read_actor_source(filename: str) -> str: def get_playwright_dockerfile() -> str: - return read_actor_source('Dockerfile.playwright').replace( + return read_actor_source('playwright.Dockerfile').replace( 'PYTHON_VERSION_PLACEHOLDER', _PYTHON_VERSION, ) From 2144a1fa23292bc74405613466877604817f66d0 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 13 Feb 2026 16:05:55 +0100 Subject: [PATCH 7/9] test: merge server.py files and add max_crawl_depth to crawlee crawler e2e tests Consolidate the two separate server.py files (actor_source_base and test_crawlee_crawlers/actor_source) into a single base server with a category-based depth structure and an infinite /deep/N chain. Add max_crawl_depth=2 to all crawler constructors to test depth limiting. 
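For context on the new constructor argument: max_crawl_depth counts link hops from the start request, so the homepage is depth 0, the categories, /about and /deep/1 are depth 1, and the products and /deep/2 are depth 2; anything deeper is never enqueued. A minimal standalone sketch of the same setup outside the Actor harness (import paths assume a recent crawlee release that exposes crawlers under crawlee.crawlers):

    import asyncio

    from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


    async def main() -> None:
        # Start URL is depth 0; /deep/3 (depth 3) would exceed the limit and is never enqueued.
        crawler = ParselCrawler(max_crawl_depth=2)

        @crawler.router.default_handler
        async def handler(context: ParselCrawlingContext) -> None:
            context.log.info(f'Visiting {context.request.url}')
            await context.enqueue_links()

        await crawler.run(['http://localhost:8080/'])


    if __name__ == '__main__':
        asyncio.run(main())

Run against the test server described below, this visits at most the nine reachable pages and stops the /deep/ chain at /deep/2.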
Co-Authored-By: Claude Opus 4.6 --- tests/e2e/actor_source_base/server.py | 152 +++++++++++------- .../main_adaptive_playwright_crawler.py | 2 +- .../actor_source/main_basic_crawler.py | 2 +- .../main_beautifulsoup_crawler.py | 2 +- .../actor_source/main_http_crawler.py | 2 +- .../actor_source/main_parsel_crawler.py | 2 +- .../actor_source/main_playwright_crawler.py | 2 +- .../actor_source/server.py | 86 ---------- tests/e2e/test_crawlee_crawlers/conftest.py | 3 + .../test_adaptive_playwright_crawler.py | 1 - .../test_basic_crawler.py | 1 - .../test_beautifulsoup_crawler.py | 1 - .../test_http_crawler.py | 1 - .../test_parsel_crawler.py | 1 - .../test_playwright_crawler.py | 1 - 15 files changed, 102 insertions(+), 157 deletions(-) delete mode 100644 tests/e2e/test_crawlee_crawlers/actor_source/server.py diff --git a/tests/e2e/actor_source_base/server.py b/tests/e2e/actor_source_base/server.py index c21ecd9e..a632bc7f 100644 --- a/tests/e2e/actor_source_base/server.py +++ b/tests/e2e/actor_source_base/server.py @@ -1,31 +1,40 @@ -""" -Test server is infinite server http://localhost:8080/{any_number} and each page has links to the next 10 pages. -For example: - http://localhost:8080/ contains links: -http://localhost:8080/0, http://localhost:8080/1, ..., http://localhost:8080/9 +"""Test HTTP server for e2e tests. + +Serves an e-commerce test website with a category-based structure for testing crawl depth: - http://localhost:8080/1 contains links: -http://localhost:8080/10, http://localhost:8080/11, ..., http://localhost:8080/19 + / (depth 0) - Homepage with links to categories, about page, and deep chain + /categories/electronics (depth 1) - Links to products 1 and 2 + /categories/home (depth 1) - Links to product 3 + /about (depth 1) - About page + /deep/1 (depth 1) -> /deep/2 (depth 2) -> /deep/3 (depth 3) -> ... (infinite chain) + /products/1 (depth 2) - Widget A + /products/2 (depth 2) - Widget B + /products/3 (depth 2) - Widget C -... and so on. +With max_crawl_depth=2, the crawler reaches all products but does not go beyond /deep/2. """ +from __future__ import annotations + import asyncio import logging from collections.abc import Awaitable, Callable, Coroutine -from socket import socket from typing import Any from uvicorn import Config from uvicorn.server import Server -from yarl import URL Receive = Callable[[], Awaitable[dict[str, Any]]] Send = Callable[[dict[str, Any]], Coroutine[None, None, None]] +_PRODUCTS = { + '1': {'name': 'Widget A', 'price': '$19.99', 'description': 'A basic widget for everyday use'}, + '2': {'name': 'Widget B', 'price': '$29.99', 'description': 'An advanced widget with extra features'}, + '3': {'name': 'Widget C', 'price': '$39.99', 'description': 'A premium widget for professionals'}, +} + -async def send_html_response(send: Send, html_content: bytes, status: int = 200) -> None: - """Send an HTML response to the client.""" +async def _send_html(send: Send, html: str, status: int = 200) -> None: await send( { 'type': 'http.response.start', @@ -33,62 +42,87 @@ async def send_html_response(send: Send, html_content: bytes, status: int = 200) 'headers': [[b'content-type', b'text/html; charset=utf-8']], } ) - await send({'type': 'http.response.body', 'body': html_content}) + await send({'type': 'http.response.body', 'body': html.encode()}) -async def app(scope: dict[str, Any], _: Receive, send: Send) -> None: - """Main ASGI application handler that routes requests to specific handlers. - - Args: - scope: The ASGI connection scope. 
- _: The ASGI receive function. - send: The ASGI send function. - """ +async def app(scope: dict[str, Any], _receive: Receive, send: Send) -> None: assert scope['type'] == 'http' path = scope['path'] - links = '\n'.join(f'<a href="{path}{i}">{path}{i}</a>' for i in range(10)) - await send_html_response( - send, - f"""\ -<html><head> - <title>Title for {path}</title> -</head> -<body> - {links} -</body></html>""".encode(), - ) - - -class TestServer(Server): - """A test HTTP server implementation based on Uvicorn Server.""" - - @property - def url(self) -> URL: - """Get the base URL of the server. - - Returns: - A URL instance with the server's base URL. - """ - protocol = 'https' if self.config.is_ssl else 'http' - return URL(f'{protocol}://{self.config.host}:{self.config.port}/') - - async def serve(self, sockets: list[socket] | None = None) -> None: - """Run the server.""" - if sockets: - raise RuntimeError('Simple TestServer does not support custom sockets') - self.restart_requested = asyncio.Event() - - loop = asyncio.get_event_loop() - tasks = { - loop.create_task(super().serve()), - } - await asyncio.wait(tasks)
+    if path == '/': + await _send_html( + send, + '<html><head><title>E-commerce Test Store</title></head>' + '<body><h1>Welcome to Test Store</h1>' + '<a href="/categories/electronics">Electronics</a>' + '<a href="/categories/home">Home & Garden</a>' + '<a href="/about">About Us</a>' + '<a href="/deep/1">Explore More</a>' + '</body></html>', + )
+    elif path == '/categories/electronics': + await _send_html( + send, + '<html><head><title>Electronics</title></head>' + '<body><h1>Electronics</h1>' + '<a href="/products/1">Widget A</a>' + '<a href="/products/2">Widget B</a>' + '<a href="/">Back to Home</a>' + '</body></html>', + )
+    elif path == '/categories/home': + await _send_html( + send, + '<html><head><title>Home & Garden</title></head>' + '<body><h1>Home & Garden</h1>' + '<a href="/products/3">Widget C</a>' + '<a href="/">Back to Home</a>' + '</body></html>', + )
+    elif path.startswith('/products/'): + product = _PRODUCTS.get(path.split('/')[-1]) + if product: + await _send_html( + send, + f'<html><head><title>{product["name"]}</title></head>' + f'<body><h1>{product["name"]}</h1>' + f'<span class="price">{product["price"]}</span>' + f'<p class="description">{product["description"]}</p>' + f'<a href="/">Back to Home</a>' + f'</body></html>', + ) + else: + await _send_html(send, 'Not Found', 404)
+    elif path == '/about': + await _send_html( + send, + '<html><head><title>About Us</title></head>' + '<body><h1>About Test Store</h1>' + '<p>We sell the best widgets in the world.</p>' + '<a href="/">Back to Home</a>' + '</body></html>', + )
+    elif path.startswith('/deep/'): + try: + n = int(path.split('/')[-1]) + except ValueError: + await _send_html(send, 'Not Found', 404) + return + await _send_html( + send, + f'<html><head><title>Deep Page {n}</title></head>' + f'<body><h1>Deep Page {n}</h1>' + f'<a href="/deep/{n + 1}">Go Deeper</a>' + f'<a href="/">Back to Home</a>' + f'</body></html>', + )
+    else: + await _send_html(send, 'Not Found', 404) if __name__ == '__main__': asyncio.run( - TestServer( + Server( config=Config( app=app, lifespan='off',
diff --git a/tests/e2e/test_crawlee_crawlers/actor_source/main_adaptive_playwright_crawler.py b/tests/e2e/test_crawlee_crawlers/actor_source/main_adaptive_playwright_crawler.py index 928a238d..55e0ec8f 100644 --- a/tests/e2e/test_crawlee_crawlers/actor_source/main_adaptive_playwright_crawler.py +++ b/tests/e2e/test_crawlee_crawlers/actor_source/main_adaptive_playwright_crawler.py @@ -8,7 +8,7 @@ async def main() -> None: async with Actor: pages_visited: list[str] = [] - crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser() + crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(max_crawl_depth=2) @crawler.router.default_handler async def handler(context: AdaptivePlaywrightCrawlingContext) -> None:
diff --git a/tests/e2e/test_crawlee_crawlers/actor_source/main_basic_crawler.py b/tests/e2e/test_crawlee_crawlers/actor_source/main_basic_crawler.py index 2c5cdcff..98ec114b 100644 --- a/tests/e2e/test_crawlee_crawlers/actor_source/main_basic_crawler.py +++ b/tests/e2e/test_crawlee_crawlers/actor_source/main_basic_crawler.py @@ -45,7 +45,7 @@ def handle_data(self, data: str) -> None: async def main() -> None: async with Actor: pages_visited: list[str] = [] - crawler = BasicCrawler() + crawler = BasicCrawler(max_crawl_depth=2) @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None:
diff --git a/tests/e2e/test_crawlee_crawlers/actor_source/main_beautifulsoup_crawler.py b/tests/e2e/test_crawlee_crawlers/actor_source/main_beautifulsoup_crawler.py index e613a475..09170d7b 100644 --- a/tests/e2e/test_crawlee_crawlers/actor_source/main_beautifulsoup_crawler.py +++ b/tests/e2e/test_crawlee_crawlers/actor_source/main_beautifulsoup_crawler.py @@ -8,7 +8,7 @@ async def main() -> None: async with Actor: pages_visited: list[str] = [] - crawler = BeautifulSoupCrawler() + crawler = BeautifulSoupCrawler(max_crawl_depth=2) @crawler.router.default_handler async def handler(context: BeautifulSoupCrawlingContext) -> None:
diff --git a/tests/e2e/test_crawlee_crawlers/actor_source/main_http_crawler.py b/tests/e2e/test_crawlee_crawlers/actor_source/main_http_crawler.py index 5f96b6d7..c9c2bc5a 100644 --- a/tests/e2e/test_crawlee_crawlers/actor_source/main_http_crawler.py +++ b/tests/e2e/test_crawlee_crawlers/actor_source/main_http_crawler.py @@ -10,7 +10,7 @@ async def main() -> None: async with Actor: pages_visited: list[str] = [] - crawler = HttpCrawler() + crawler = HttpCrawler(max_crawl_depth=2) @crawler.router.default_handler async def handler(context: HttpCrawlingContext) -> None:
diff --git a/tests/e2e/test_crawlee_crawlers/actor_source/main_parsel_crawler.py b/tests/e2e/test_crawlee_crawlers/actor_source/main_parsel_crawler.py index 253cfa4a..38800dfb 100644 --- a/tests/e2e/test_crawlee_crawlers/actor_source/main_parsel_crawler.py +++ b/tests/e2e/test_crawlee_crawlers/actor_source/main_parsel_crawler.py @@ -8,7 +8,7 @@ async def main() -> None: async with Actor: pages_visited: list[str] = [] - crawler = ParselCrawler() + crawler = ParselCrawler(max_crawl_depth=2) @crawler.router.default_handler async def handler(context: ParselCrawlingContext) -> None:
diff --git a/tests/e2e/test_crawlee_crawlers/actor_source/main_playwright_crawler.py b/tests/e2e/test_crawlee_crawlers/actor_source/main_playwright_crawler.py index b149ad7e..650dda7d 100644 ---
a/tests/e2e/test_crawlee_crawlers/actor_source/main_playwright_crawler.py +++ b/tests/e2e/test_crawlee_crawlers/actor_source/main_playwright_crawler.py @@ -8,7 +8,7 @@ async def main() -> None: async with Actor: pages_visited: list[str] = [] - crawler = PlaywrightCrawler() + crawler = PlaywrightCrawler(max_crawl_depth=2) @crawler.router.default_handler async def handler(context: PlaywrightCrawlingContext) -> None:
diff --git a/tests/e2e/test_crawlee_crawlers/actor_source/server.py b/tests/e2e/test_crawlee_crawlers/actor_source/server.py deleted file mode 100644 index 20aff81a..00000000 --- a/tests/e2e/test_crawlee_crawlers/actor_source/server.py +++ /dev/null @@ -1,86 +0,0 @@ -from __future__ import annotations - -import asyncio -import logging -from collections.abc import Awaitable, Callable, Coroutine -from typing import Any - -from uvicorn import Config -from uvicorn.server import Server - -Receive = Callable[[], Awaitable[dict[str, Any]]] -Send = Callable[[dict[str, Any]], Coroutine[None, None, None]] - -_PRODUCTS = { - '1': {'name': 'Widget A', 'price': '$19.99', 'description': 'A basic widget for everyday use'}, - '2': {'name': 'Widget B', 'price': '$29.99', 'description': 'An advanced widget with extra features'}, - '3': {'name': 'Widget C', 'price': '$39.99', 'description': 'A premium widget for professionals'}, -} - - -async def _send_html(send: Send, html: str, status: int = 200) -> None: - await send( - { - 'type': 'http.response.start', - 'status': status, - 'headers': [[b'content-type', b'text/html; charset=utf-8']], - } - ) - await send({'type': 'http.response.body', 'body': html.encode()}) - - -async def app(scope: dict[str, Any], _receive: Receive, send: Send) -> None: - assert scope['type'] == 'http' - path = scope['path'] -
-    if path == '/': - await _send_html( - send, - '<html><head><title>E-commerce Test Store</title></head>' - '<body><h1>Welcome to Test Store</h1>' - '<a href="/products/1">Widget A</a>' - '<a href="/products/2">Widget B</a>' - '<a href="/products/3">Widget C</a>' - '<a href="/about">About Us</a>' - '</body></html>', - )
-    elif path.startswith('/products/'): - product = _PRODUCTS.get(path.split('/')[-1]) - if product: - await _send_html( - send, - f'<html><head><title>{product["name"]}</title></head>' - f'<body><h1>{product["name"]}</h1>' - f'<span class="price">{product["price"]}</span>' - f'<p class="description">{product["description"]}</p>' - f'<a href="/">Back to Home</a>' - f'</body></html>', - ) - else: - await _send_html(send, 'Not Found', 404)
-    elif path == '/about': - await _send_html( - send, - '<html><head><title>About Us</title></head>' - '<body><h1>About Test Store</h1>' - '<p>We sell the best widgets in the world.</p>' - '<a href="/">Back to Home</a>' - '</body></html>', - )
-    else: - await _send_html(send, 'Not Found', 404) - - -if __name__ == '__main__': - asyncio.run( - Server( - config=Config( - app=app, - lifespan='off', - loop='asyncio', - port=8080, - log_config=None, - log_level=logging.CRITICAL, - ) - ).serve() - )
diff --git a/tests/e2e/test_crawlee_crawlers/conftest.py b/tests/e2e/test_crawlee_crawlers/conftest.py index b9b7ef8e..9965e5cc 100644 --- a/tests/e2e/test_crawlee_crawlers/conftest.py +++ b/tests/e2e/test_crawlee_crawlers/conftest.py @@ -58,4 +58,7 @@ async def verify_crawler_results( assert kvs_record is not None result = kvs_record['value'] assert result['crawler_type'] == expected_crawler_type + # With max_crawl_depth=2, the server has 9 pages reachable (homepage, 2 categories, about, /deep/1, + # 3 products, /deep/2). The crawler should visit most of them but not go beyond /deep/2. assert result['pages_visited_count'] >= 5 + assert result['pages_visited_count'] <= 15
diff --git a/tests/e2e/test_crawlee_crawlers/test_adaptive_playwright_crawler.py b/tests/e2e/test_crawlee_crawlers/test_adaptive_playwright_crawler.py index 9be75a57..bc84a50a 100644 --- a/tests/e2e/test_crawlee_crawlers/test_adaptive_playwright_crawler.py +++ b/tests/e2e/test_crawlee_crawlers/test_adaptive_playwright_crawler.py @@ -12,7 +12,6 @@ async def test_adaptive_playwright_crawler(make_actor: MakeActorFunction, run_ac actor = await make_actor( label='crawl-adaptive', source_files={ - 'server.py': read_actor_source('server.py'), 'src/main.py': read_actor_source('main_adaptive_playwright_crawler.py'), 'Dockerfile': get_playwright_dockerfile(), },
diff --git a/tests/e2e/test_crawlee_crawlers/test_basic_crawler.py b/tests/e2e/test_crawlee_crawlers/test_basic_crawler.py index 58dc44f5..8667f393 100644 --- a/tests/e2e/test_crawlee_crawlers/test_basic_crawler.py +++ b/tests/e2e/test_crawlee_crawlers/test_basic_crawler.py @@ -12,7 +12,6 @@ async def test_basic_crawler(make_actor: MakeActorFunction, run_actor: RunActorF actor = await make_actor( label='crawl-basic', source_files={ - 'server.py': read_actor_source('server.py'), 'src/main.py': read_actor_source('main_basic_crawler.py'), }, )
diff --git a/tests/e2e/test_crawlee_crawlers/test_beautifulsoup_crawler.py b/tests/e2e/test_crawlee_crawlers/test_beautifulsoup_crawler.py index d3f0c7a0..56d0a497 100644 --- a/tests/e2e/test_crawlee_crawlers/test_beautifulsoup_crawler.py +++ b/tests/e2e/test_crawlee_crawlers/test_beautifulsoup_crawler.py @@ -12,7 +12,6 @@ async def test_beautifulsoup_crawler(make_actor: MakeActorFunction, run_actor: R actor = await make_actor( label='crawl-bsoup', source_files={ - 'server.py': read_actor_source('server.py'), 'src/main.py': read_actor_source('main_beautifulsoup_crawler.py'), }, additional_requirements=['crawlee[beautifulsoup]'], )
diff --git a/tests/e2e/test_crawlee_crawlers/test_http_crawler.py b/tests/e2e/test_crawlee_crawlers/test_http_crawler.py index 69821139..86dc0e37 100644 --- a/tests/e2e/test_crawlee_crawlers/test_http_crawler.py +++ b/tests/e2e/test_crawlee_crawlers/test_http_crawler.py @@ -12,7 +12,6 @@ async def test_http_crawler(make_actor: MakeActorFunction, run_actor: RunActorFu actor = await make_actor( label='crawl-http', source_files={ - 'server.py': read_actor_source('server.py'), 'src/main.py': read_actor_source('main_http_crawler.py'), }, )
diff --git a/tests/e2e/test_crawlee_crawlers/test_parsel_crawler.py b/tests/e2e/test_crawlee_crawlers/test_parsel_crawler.py index c4f002ec..0815c9f2 100644 ---
a/tests/e2e/test_crawlee_crawlers/test_parsel_crawler.py +++ b/tests/e2e/test_crawlee_crawlers/test_parsel_crawler.py @@ -12,7 +12,6 @@ async def test_parsel_crawler(make_actor: MakeActorFunction, run_actor: RunActor actor = await make_actor( label='crawl-parsel', source_files={ - 'server.py': read_actor_source('server.py'), 'src/main.py': read_actor_source('main_parsel_crawler.py'), }, )
diff --git a/tests/e2e/test_crawlee_crawlers/test_playwright_crawler.py b/tests/e2e/test_crawlee_crawlers/test_playwright_crawler.py index 6f4ff22a..5d7d8244 100644 --- a/tests/e2e/test_crawlee_crawlers/test_playwright_crawler.py +++ b/tests/e2e/test_crawlee_crawlers/test_playwright_crawler.py @@ -12,7 +12,6 @@ async def test_playwright_crawler(make_actor: MakeActorFunction, run_actor: RunA actor = await make_actor( label='crawl-playwright', source_files={ - 'server.py': read_actor_source('server.py'), 'src/main.py': read_actor_source('main_playwright_crawler.py'), 'Dockerfile': get_playwright_dockerfile(), },

From 8dd9a169ca0d04b476092458f88644e8ffb2aeec Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 13 Feb 2026 16:08:22 +0100 Subject: [PATCH 8/9] test: merge Scrapy server.py into the shared base server

Add direct product links to the base server homepage so Scrapy spiders (which look for /products/ links on the start page) work without their own server.py. Now all e2e tests share a single server.

Co-Authored-By: Claude Opus 4.6 --- tests/e2e/actor_source_base/server.py | 15 ++-- tests/e2e/test_scrapy/actor_source/server.py | 86 -------------------- tests/e2e/test_scrapy/conftest.py | 1 - 3 files changed, 10 insertions(+), 92 deletions(-) delete mode 100644 tests/e2e/test_scrapy/actor_source/server.py
diff --git a/tests/e2e/actor_source_base/server.py b/tests/e2e/actor_source_base/server.py index a632bc7f..fd5d1f38 100644 --- a/tests/e2e/actor_source_base/server.py +++ b/tests/e2e/actor_source_base/server.py @@ -2,16 +2,18 @@ Serves an e-commerce test website with a category-based structure for testing crawl depth: - / (depth 0) - Homepage with links to categories, about page, and deep chain + / (depth 0) - Homepage with links to products, categories, about page, and deep chain /categories/electronics (depth 1) - Links to products 1 and 2 /categories/home (depth 1) - Links to product 3 /about (depth 1) - About page /deep/1 (depth 1) -> /deep/2 (depth 2) -> /deep/3 (depth 3) -> ... (infinite chain) - /products/1 (depth 2) - Widget A - /products/2 (depth 2) - Widget B - /products/3 (depth 2) - Widget C + /products/1 (depth 1 or 2) - Widget A + /products/2 (depth 1 or 2) - Widget B + /products/3 (depth 1 or 2) - Widget C -With max_crawl_depth=2, the crawler reaches all products but does not go beyond /deep/2. +The homepage includes both direct product links (for Scrapy spiders that look for /products/ links +on the start page) and category links (for testing crawl depth with Crawlee crawlers). +With max_crawl_depth=2, the crawler reaches all products and categories but does not go beyond /deep/2. """ from __future__ import annotations
@@ -54,6 +56,9 @@ async def app(scope: dict[str, Any], _receive: Receive, send: Send) -> None: send, '<html><head><title>E-commerce Test Store</title></head>' '<body><h1>Welcome to Test Store</h1>' + '<a href="/products/1">Widget A</a>' + '<a href="/products/2">Widget B</a>' + '<a href="/products/3">Widget C</a>' '<a href="/categories/electronics">Electronics</a>' '<a href="/categories/home">Home & Garden</a>' '<a href="/about">About Us</a>'
diff --git a/tests/e2e/test_scrapy/actor_source/server.py b/tests/e2e/test_scrapy/actor_source/server.py deleted file mode 100644 index 20aff81a..00000000 --- a/tests/e2e/test_scrapy/actor_source/server.py +++ /dev/null @@ -1,86 +0,0 @@ -from __future__ import annotations - -import asyncio - -import logging - -from collections.abc import Awaitable, Callable, Coroutine -from typing import Any - -from uvicorn import Config -from uvicorn.server import Server - -Receive = Callable[[], Awaitable[dict[str, Any]]] -Send = Callable[[dict[str, Any]], Coroutine[None, None, None]] - -_PRODUCTS = { - '1': {'name': 'Widget A', 'price': '$19.99', 'description': 'A basic widget for everyday use'}, - '2': {'name': 'Widget B', 'price': '$29.99', 'description': 'An advanced widget with extra features'}, - '3': {'name': 'Widget C', 'price': '$39.99', 'description': 'A premium widget for professionals'}, -} - - -async def _send_html(send: Send, html: str, status: int = 200) -> None: - await send( - { - 'type': 'http.response.start', - 'status': status, - 'headers': [[b'content-type', b'text/html; charset=utf-8']], - } - ) - await send({'type': 'http.response.body', 'body': html.encode()}) - - -async def app(scope: dict[str, Any], _receive: Receive, send: Send) -> None: - assert scope['type'] == 'http' - path = scope['path'] -
-    if path == '/': - await _send_html( - send, - '<html><head><title>E-commerce Test Store</title></head>' - '<body><h1>Welcome to Test Store</h1>' - '<a href="/products/1">Widget A</a>' - '<a href="/products/2">Widget B</a>' - '<a href="/products/3">Widget C</a>' - '<a href="/about">About Us</a>' - '</body></html>', - )
-    elif path.startswith('/products/'): - product = _PRODUCTS.get(path.split('/')[-1]) - if product: - await _send_html( - send, - f'<html><head><title>{product["name"]}</title></head>' - f'<body><h1>{product["name"]}</h1>' - f'<span class="price">{product["price"]}</span>' - f'<p class="description">{product["description"]}</p>' - f'<a href="/">Back to Home</a>' - f'</body></html>', - ) - else: - await _send_html(send, 'Not Found', 404)
-    elif path == '/about': - await _send_html( - send, - '<html><head><title>About Us</title></head>' - '<body><h1>About Test Store</h1>' - '<p>We sell the best widgets in the world.</p>' - '<a href="/">Back to Home</a>' - '</body></html>', - )
-    else: - await _send_html(send, 'Not Found', 404) - - -if __name__ == '__main__': - asyncio.run( - Server( - config=Config( - app=app, - lifespan='off', - loop='asyncio', - port=8080, - log_config=None, - log_level=logging.CRITICAL, - ) - ).serve() - )
diff --git a/tests/e2e/test_scrapy/conftest.py b/tests/e2e/test_scrapy/conftest.py index f5c0cc10..e19f6c36 100644 --- a/tests/e2e/test_scrapy/conftest.py +++ b/tests/e2e/test_scrapy/conftest.py @@ -22,7 +22,6 @@ def get_scrapy_source_files( extra_source_files: dict[str, str] | None = None, ) -> dict[str, str]: source_files: dict[str, str] = { - 'server.py': read_actor_source('server.py'), 'src/__main__.py': read_actor_source('__main__.py'), 'src/main.py': read_actor_source('main.py'), 'src/settings.py': read_actor_source('settings.py'),

From 5277d2506e42ed371fdf097f04bb9a2e756065e3 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 13 Feb 2026 16:27:50 +0100 Subject: [PATCH 9/9] test: fix max_crawl_depth test to use /deep/ chain from shared server

Update test_actor_on_platform_max_crawl_depth to use the /deep/N URL pattern from the shared server instead of the old infinite pagination URLs that no longer exist.

Co-Authored-By: Claude Opus 4.6 --- tests/e2e/test_crawlers_with_storages.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/tests/e2e/test_crawlers_with_storages.py b/tests/e2e/test_crawlers_with_storages.py index cd0d6941..89ff85bb 100644 --- a/tests/e2e/test_crawlers_with_storages.py +++ b/tests/e2e/test_crawlers_with_storages.py @@ -23,7 +23,7 @@ async def main() -> None: async with Actor: crawler = ParselCrawler(max_crawl_depth=2) finished = [] - enqueue_pattern = re.compile(r'http://localhost:8080/2+$') + enqueue_pattern = re.compile(r'http://localhost:8080/deep/\d+$') @crawler.router.default_handler async def default_handler(context: ParselCrawlingContext) -> None: @@ -33,7 +33,11 @@ async def default_handler(context: ParselCrawlingContext) -> None: finished.append(context.request.url) await crawler.run(['http://localhost:8080/']) - assert finished == ['http://localhost:8080/', 'http://localhost:8080/2', 'http://localhost:8080/22'] + assert finished == [ + 'http://localhost:8080/', + 'http://localhost:8080/deep/1', + 'http://localhost:8080/deep/2', + ] actor = await make_actor(label='crawler-max-depth', main_func=main) run_result = await run_actor(actor)
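The handler body is not shown in full in this hunk, but the compiled pattern suggests that enqueued links are filtered down to the /deep/N chain, which is what makes the exact visit order assertable. A sketch of how such a handler could look, assuming enqueue_links accepts compiled patterns via its include parameter:

    import asyncio
    import re

    from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


    async def main() -> None:
        crawler = ParselCrawler(max_crawl_depth=2)
        finished: list[str] = []
        enqueue_pattern = re.compile(r'http://localhost:8080/deep/\d+$')

        @crawler.router.default_handler
        async def default_handler(context: ParselCrawlingContext) -> None:
            # Follow only the /deep/N chain so the visit order is deterministic:
            # / (depth 0) -> /deep/1 (depth 1) -> /deep/2 (depth 2).
            await context.enqueue_links(include=[enqueue_pattern])
            finished.append(context.request.url)

        await crawler.run(['http://localhost:8080/'])
        assert finished == [
            'http://localhost:8080/',
            'http://localhost:8080/deep/1',
            'http://localhost:8080/deep/2',
        ]


    if __name__ == '__main__':
        asyncio.run(main())

With max_crawl_depth=2, the /deep/3 link found on /deep/2 is dropped, so the run ends after exactly three pages.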