From 8995c1bbd6a02f9883104208eb288a8f60a241e2 Mon Sep 17 00:00:00 2001 From: hafezparast Date: Mon, 23 Mar 2026 09:56:53 +0800 Subject: [PATCH] feat: expose arun_many config-list support in Docker API (#1837) The /crawl endpoint now accepts an optional crawler_configs field (list of CrawlerRunConfig dicts) alongside the existing crawler_config. When provided with multiple URLs, each config is deserialized and passed as a list to arun_many(), enabling per-URL configuration with url_matcher patterns. Single-URL requests and requests without crawler_configs are unchanged (backward compatible). Co-Authored-By: Claude Opus 4.6 (1M context) --- deploy/docker/api.py | 36 ++++--- deploy/docker/schemas.py | 8 ++ deploy/docker/server.py | 3 +- tests/test_issue_1837_config_list.py | 140 +++++++++++++++++++++++++++ 4 files changed, 175 insertions(+), 12 deletions(-) create mode 100644 tests/test_issue_1837_config_list.py diff --git a/deploy/docker/api.py b/deploy/docker/api.py index 1ecc9e0b4..93c04b127 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -542,7 +542,8 @@ async def handle_crawl_request( browser_config: dict, crawler_config: dict, config: dict, - hooks_config: Optional[dict] = None + hooks_config: Optional[dict] = None, + crawler_configs: Optional[List[dict]] = None, ) -> dict: """Handle non-streaming crawl requests with optional hooks.""" # Track request start @@ -591,19 +592,32 @@ async def handle_crawl_request( logger.info(f"Hooks attachment status: {hooks_status['status']}") base_config = config["crawler"]["base_config"] - # Iterate on key-value pairs in global_config then use hasattr to set them - for key, value in base_config.items(): - if hasattr(crawler_config, key): - current_value = getattr(crawler_config, key) - # Only set base config if user didn't provide a value - if current_value is None or current_value == "": - setattr(crawler_config, key, value) + + # Build the config(s) to pass to arun/arun_many + if crawler_configs and len(urls) > 1: + # Per-URL config list: deserialize each and apply base_config + config_list = [CrawlerRunConfig.load(cc) for cc in crawler_configs] + for cfg in config_list: + for key, value in base_config.items(): + if hasattr(cfg, key): + current_value = getattr(cfg, key) + if current_value is None or current_value == "": + setattr(cfg, key, value) + effective_config = config_list + else: + # Single config (original behavior) + for key, value in base_config.items(): + if hasattr(crawler_config, key): + current_value = getattr(crawler_config, key) + if current_value is None or current_value == "": + setattr(crawler_config, key, value) + effective_config = crawler_config results = [] func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many") - partial_func = partial(func, - urls[0] if len(urls) == 1 else urls, - config=crawler_config, + partial_func = partial(func, + urls[0] if len(urls) == 1 else urls, + config=effective_config, dispatcher=dispatcher) results = await partial_func() diff --git a/deploy/docker/schemas.py b/deploy/docker/schemas.py index ef75ce8bc..b5153ef9b 100644 --- a/deploy/docker/schemas.py +++ b/deploy/docker/schemas.py @@ -8,6 +8,14 @@ class CrawlRequest(BaseModel): urls: List[str] = Field(min_length=1, max_length=100) browser_config: Optional[Dict] = Field(default_factory=dict) crawler_config: Optional[Dict] = Field(default_factory=dict) + crawler_configs: Optional[List[Dict]] = Field( + default=None, + description=( + "List of per-URL CrawlerRunConfig dicts for arun_many(). " + "When provided, each config can include a 'url_matcher' pattern " + "to match against specific URLs. Takes precedence over crawler_config." + ), + ) class HookConfig(BaseModel): diff --git a/deploy/docker/server.py b/deploy/docker/server.py index 7b3a8d964..ad2cbd659 100644 --- a/deploy/docker/server.py +++ b/deploy/docker/server.py @@ -682,7 +682,8 @@ async def crawl( browser_config=crawl_request.browser_config, crawler_config=crawl_request.crawler_config, config=config, - hooks_config=hooks_config + hooks_config=hooks_config, + crawler_configs=crawl_request.crawler_configs, ) # check if all of the results are not successful if all(not result["success"] for result in results["results"]): diff --git a/tests/test_issue_1837_config_list.py b/tests/test_issue_1837_config_list.py new file mode 100644 index 000000000..45150e589 --- /dev/null +++ b/tests/test_issue_1837_config_list.py @@ -0,0 +1,140 @@ +""" +Tests for issue #1837: Docker API arun_many config-list support. + +Verifies that the /crawl endpoint accepts crawler_configs (list of dicts) +alongside the existing crawler_config (single dict), and that the list +is correctly passed through to arun_many(). +""" + +import pytest +from unittest.mock import AsyncMock, patch, MagicMock +from crawl4ai import CrawlerRunConfig, CacheMode + + +# -- Schema tests -- + +class TestCrawlRequestSchema: + """Verify CrawlRequest schema accepts crawler_configs.""" + + def test_schema_has_crawler_configs_field(self): + """CrawlRequest should have an optional crawler_configs field.""" + import importlib.util + with open("deploy/docker/schemas.py") as f: + source = f.read() + assert "crawler_configs" in source + assert "Optional[List[Dict]]" in source + + def test_schema_backward_compatible(self): + """crawler_config (singular) should still work.""" + with open("deploy/docker/schemas.py") as f: + source = f.read() + assert "crawler_config: Optional[Dict]" in source + + def test_crawler_configs_default_none(self): + """crawler_configs should default to None.""" + with open("deploy/docker/schemas.py") as f: + source = f.read() + assert "default=None" in source + + +# -- API handler tests -- + +class TestHandleCrawlRequestSignature: + """Verify handle_crawl_request accepts crawler_configs parameter.""" + + def test_handler_accepts_crawler_configs(self): + """handle_crawl_request should have crawler_configs parameter.""" + with open("deploy/docker/api.py") as f: + source = f.read() + assert "crawler_configs: Optional[List[dict]]" in source + + def test_handler_defaults_crawler_configs_none(self): + """crawler_configs should default to None.""" + with open("deploy/docker/api.py") as f: + source = f.read() + assert "crawler_configs: Optional[List[dict]] = None" in source + + +# -- Config list deserialization -- + +class TestConfigListDeserialization: + """Verify that a list of config dicts can be deserialized.""" + + def test_single_config_loads(self): + """Single config dict should deserialize as before.""" + data = {"type": "CrawlerRunConfig", "params": {"verbose": False}} + config = CrawlerRunConfig.load(data) + assert isinstance(config, CrawlerRunConfig) + + def test_multiple_configs_load(self): + """Multiple config dicts should each deserialize independently.""" + configs_data = [ + {"type": "CrawlerRunConfig", "params": {"verbose": False}}, + {"type": "CrawlerRunConfig", "params": {"cache_mode": {"type": "CacheMode", "params": "bypass"}}}, + ] + configs = [CrawlerRunConfig.load(c) for c in configs_data] + assert len(configs) == 2 + assert all(isinstance(c, CrawlerRunConfig) for c in configs) + + def test_empty_config_list(self): + """Empty config list should produce empty list.""" + configs = [CrawlerRunConfig.load(c) for c in []] + assert configs == [] + + +# -- Integration: config list logic in api.py -- + +class TestConfigListLogic: + """Verify the branching logic for single vs list configs.""" + + def test_api_uses_config_list_when_provided(self): + """When crawler_configs is provided with multiple URLs, it should be used.""" + with open("deploy/docker/api.py") as f: + source = f.read() + # Should check crawler_configs and build a list + assert "if crawler_configs and len(urls) > 1:" in source + assert "config_list" in source + + def test_api_falls_back_to_single_config(self): + """When crawler_configs is None, original single-config path is used.""" + with open("deploy/docker/api.py") as f: + source = f.read() + assert "effective_config = crawler_config" in source + + def test_api_applies_base_config_to_each(self): + """Base config should be applied to each config in the list.""" + with open("deploy/docker/api.py") as f: + source = f.read() + assert "for cfg in config_list:" in source + + +# -- Server endpoint passes crawler_configs -- + +class TestServerEndpoint: + """Verify the /crawl endpoint passes crawler_configs through.""" + + def test_server_passes_crawler_configs(self): + """The crawl endpoint should pass crawler_configs to handle_crawl_request.""" + with open("deploy/docker/server.py") as f: + source = f.read() + assert "crawler_configs=crawl_request.crawler_configs" in source + + +# -- Backward compatibility -- + +class TestBackwardCompatibility: + """Ensure existing single-config requests still work.""" + + def test_single_url_ignores_crawler_configs(self): + """With a single URL, crawler_configs should be ignored (uses arun, not arun_many).""" + with open("deploy/docker/api.py") as f: + source = f.read() + # Single URL uses arun which only takes one config + assert '"arun" if len(urls) == 1 else "arun_many"' in source + + def test_no_crawler_configs_uses_single(self): + """When crawler_configs is None, the original single config path is used.""" + with open("deploy/docker/api.py") as f: + source = f.read() + # The else branch uses the original crawler_config + assert "effective_config = crawler_config" in source