diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py
index aa5745fb0..8cd57c50d 100644
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -117,6 +117,15 @@ def to_serializable_dict(obj: Any, ignore_default_value : bool = False):
     return str(obj)
 
 
+# Allowlist for from_serializable_dict. Only type names listed here (or in
+# the CRAWL4AI_DESERIALIZE_ALLOW env var) may be instantiated during
+# deserialization. Empty/unset env var = deny all typed deserialization.
+_DESERIALIZE_ALLOW_ENV = os.environ.get("CRAWL4AI_DESERIALIZE_ALLOW", "")
+ALLOWED_DESERIALIZE_TYPES: set = {
+    t.strip() for t in _DESERIALIZE_ALLOW_ENV.split(",") if t.strip()
+}
+
+
 def from_serializable_dict(data: Any) -> Any:
     """
     Recursively convert a serializable dictionary back to an object instance.
@@ -134,6 +143,18 @@ def from_serializable_dict(data: Any) -> Any:
         if data["type"] == "dict" and "value" in data:
             return {k: from_serializable_dict(v) for k, v in data["value"].items()}
 
+        # "type" may come from request JSON and can be a non-string (a list or
+        # dict is unhashable). Reject it with ValueError up front so the set
+        # lookup cannot raise TypeError past callers that catch ValueError.
+        if (
+            not isinstance(data["type"], str)
+            or data["type"] not in ALLOWED_DESERIALIZE_TYPES
+        ):
+            raise ValueError(
+                f"Disallowed type for deserialization: {data['type']}. "
+                "Add it to CRAWL4AI_DESERIALIZE_ALLOW to permit."
+            )
+
         cls = None
         # If you are receiving an error while trying to convert a dict to an object:
         # Either add a module to `modules_paths` list, or add the `data["type"]` to the crawl4ai __init__.py file
diff --git a/deploy/docker/api.py b/deploy/docker/api.py
index 81cd312ab..369aeddd7 100644
--- a/deploy/docker/api.py
+++ b/deploy/docker/api.py
@@ -539,8 +539,11 @@ async def handle_crawl_request(
 
     try:
         urls = [('https://' + url) if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")) else url for url in urls]
-        browser_config = BrowserConfig.load(browser_config)
-        crawler_config = CrawlerRunConfig.load(crawler_config)
+        try:
+            browser_config = BrowserConfig.load(browser_config)
+            crawler_config = CrawlerRunConfig.load(crawler_config)
+        except ValueError as e:
+            raise HTTPException(status_code=400, detail=str(e)) from e
 
         dispatcher = MemoryAdaptiveDispatcher(
             memory_threshold_percent=config["crawler"]["memory_threshold_percent"],
@@ -548,7 +551,7 @@ async def handle_crawl_request(
                 base_delay=tuple(config["crawler"]["rate_limiter"]["base_delay"])
             ) if config["crawler"]["rate_limiter"]["enabled"] else None
         )
-        
+
         from crawler_pool import get_crawler
         crawler = await get_crawler(browser_config)
 
@@ -720,10 +723,13 @@ async def handle_stream_crawl_request(
     """Handle streaming crawl requests with optional hooks."""
     hooks_info = None
     try:
-        browser_config = BrowserConfig.load(browser_config)
+        try:
+            browser_config = BrowserConfig.load(browser_config)
+            crawler_config = CrawlerRunConfig.load(crawler_config)
+        except ValueError as e:
+            raise HTTPException(status_code=400, detail=str(e)) from e
         # browser_config.verbose = True # Set to False or remove for production stress testing
         browser_config.verbose = False
-        crawler_config = CrawlerRunConfig.load(crawler_config)
         crawler_config.scraping_strategy = LXMLWebScrapingStrategy()
         crawler_config.stream = True
 
diff --git a/deploy/docker/server.py b/deploy/docker/server.py
index 7ae1adb8b..469aefb00 100644
--- a/deploy/docker/server.py
+++ b/deploy/docker/server.py
@@ -624,7 +624,10 @@ async def crawl(
     if crawl_request.hooks and not HOOKS_ENABLED:
         raise HTTPException(403, "Hooks are disabled. Set CRAWL4AI_HOOKS_ENABLED=true to enable.")
     # Check whether it is a redirection for a streaming request
-    crawler_config = CrawlerRunConfig.load(crawl_request.crawler_config)
+    try:
+        crawler_config = CrawlerRunConfig.load(crawl_request.crawler_config)
+    except ValueError as e:
+        raise HTTPException(400, detail=str(e)) from e
     if crawler_config.stream:
         return await stream_process(crawl_request=crawl_request)
 
diff --git a/deploy/docker/tests/test_security_fixes.py b/deploy/docker/tests/test_security_fixes.py
index 9321d111e..38e91d311 100644
--- a/deploy/docker/tests/test_security_fixes.py
+++ b/deploy/docker/tests/test_security_fixes.py
@@ -160,6 +160,102 @@ def test_hooks_disabled_when_false(self):
                 os.environ.pop("CRAWL4AI_HOOKS_ENABLED", None)
 
 
+class TestDeserializationAllowlist(unittest.TestCase):
+    """Test the allowlist gate logic for from_serializable_dict.
+
+    Replicates the gate logic locally to avoid importing crawl4ai (which has
+    heavy dependencies). The logic under test is:
+        if not isinstance(data["type"], str) or data["type"] not in
+        ALLOWED_DESERIALIZE_TYPES: raise ValueError
+    """
+
+    @staticmethod
+    def _check_allowlist(data, allowed_types):
+        """Minimal replica of the allowlist gate in from_serializable_dict."""
+        if data is None:
+            return data
+        if isinstance(data, (str, int, float, bool)):
+            return data
+        if isinstance(data, dict) and "type" in data:
+            if data["type"] == "dict" and "value" in data:
+                return {k: v for k, v in data["value"].items()}
+            if (
+                not isinstance(data["type"], str)
+                or data["type"] not in allowed_types
+            ):
+                raise ValueError(
+                    f"Disallowed type for deserialization: {data['type']}"
+                )
+            return {"_allowed": True, "type": data["type"]}
+        return data
+
+    def test_disallowed_type_rejected(self):
+        """Types not in the allowlist must be rejected."""
+        allowed = {"BrowserConfig"}
+        with self.assertRaises(ValueError) as ctx:
+            self._check_allowlist({"type": "AsyncWebCrawler", "params": {}}, allowed)
+        self.assertIn("Disallowed type", str(ctx.exception))
+
+    def test_arbitrary_class_rejected(self):
+        """Arbitrary class names must be rejected."""
+        allowed = {"BrowserConfig"}
+        with self.assertRaises(ValueError):
+            self._check_allowlist({"type": "Crawl4aiDockerClient", "params": {}}, allowed)
+
+    def test_allowed_type_passes_gate(self):
+        """Types in the allowlist must pass the gate check."""
+        allowed = {"BrowserConfig", "CrawlerRunConfig"}
+        result = self._check_allowlist({"type": "BrowserConfig", "params": {}}, allowed)
+        self.assertEqual(result["type"], "BrowserConfig")
+
+    def test_dict_type_bypasses_allowlist(self):
+        """The special 'dict' type must still work (not subject to allowlist)."""
+        result = self._check_allowlist({"type": "dict", "value": {"k": "v"}}, set())
+        self.assertEqual(result, {"k": "v"})
+
+    def test_basic_types_pass_through(self):
+        """Strings, ints, etc. must pass through unchanged."""
+        self.assertEqual(self._check_allowlist("hello", set()), "hello")
+        self.assertEqual(self._check_allowlist(42, set()), 42)
+        self.assertIsNone(self._check_allowlist(None, set()))
+
+    def test_empty_allowlist_denies_all(self):
+        """With empty allowlist, all typed deserialization must be denied."""
+        with self.assertRaises(ValueError):
+            self._check_allowlist({"type": "BrowserConfig", "params": {}}, set())
+
+    def test_non_string_type_rejected(self):
+        """Non-string (e.g. unhashable) type values must raise ValueError."""
+        for bad_type in (["BrowserConfig"], {"k": "v"}, 42, None):
+            with self.assertRaises(ValueError):
+                self._check_allowlist({"type": bad_type, "params": {}}, {"BrowserConfig"})
+
+    def test_env_var_parsing(self):
+        """CRAWL4AI_DESERIALIZE_ALLOW env var must be parsed as comma-separated set."""
+        original = os.environ.get("CRAWL4AI_DESERIALIZE_ALLOW")
+        try:
+            os.environ["CRAWL4AI_DESERIALIZE_ALLOW"] = "BrowserConfig,CrawlerRunConfig,CacheMode"
+            env_val = os.environ.get("CRAWL4AI_DESERIALIZE_ALLOW", "")
+            allowed = {t.strip() for t in env_val.split(",") if t.strip()}
+            self.assertEqual(allowed, {"BrowserConfig", "CrawlerRunConfig", "CacheMode"})
+        finally:
+            if original is not None:
+                os.environ["CRAWL4AI_DESERIALIZE_ALLOW"] = original
+            else:
+                os.environ.pop("CRAWL4AI_DESERIALIZE_ALLOW", None)
+
+    def test_empty_env_var_means_deny_all(self):
+        """Unset or empty CRAWL4AI_DESERIALIZE_ALLOW must produce empty set."""
+        original = os.environ.pop("CRAWL4AI_DESERIALIZE_ALLOW", None)
+        try:
+            env_val = os.environ.get("CRAWL4AI_DESERIALIZE_ALLOW", "")
+            allowed = {t.strip() for t in env_val.split(",") if t.strip()}
+            self.assertEqual(allowed, set())
+        finally:
+            if original is not None:
+                os.environ["CRAWL4AI_DESERIALIZE_ALLOW"] = original
+
+
 if __name__ == '__main__':
     print("=" * 60)
     print("Crawl4AI Security Fixes - Unit Tests")