Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions crawl4ai/async_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,15 @@ def to_serializable_dict(obj: Any, ignore_default_value : bool = False):
return str(obj)


# Security gate for from_serializable_dict: a type name may be instantiated
# during deserialization only if it appears in the CRAWL4AI_DESERIALIZE_ALLOW
# environment variable (comma-separated). Leaving the variable unset or empty
# yields an empty allowlist, i.e. all typed deserialization is denied.
_DESERIALIZE_ALLOW_ENV = os.environ.get("CRAWL4AI_DESERIALIZE_ALLOW", "")
ALLOWED_DESERIALIZE_TYPES: set = set(
    filter(None, (name.strip() for name in _DESERIALIZE_ALLOW_ENV.split(",")))
)


def from_serializable_dict(data: Any) -> Any:
"""
Recursively convert a serializable dictionary back to an object instance.
Expand All @@ -134,6 +143,12 @@ def from_serializable_dict(data: Any) -> Any:
if data["type"] == "dict" and "value" in data:
return {k: from_serializable_dict(v) for k, v in data["value"].items()}

if data["type"] not in ALLOWED_DESERIALIZE_TYPES:
raise ValueError(
f"Disallowed type for deserialization: {data['type']}. "
f"Add it to CRAWL4AI_DESERIALIZE_ALLOW to permit."
)

cls = None
# If you are receiving an error while trying to convert a dict to an object:
# Either add a module to `modules_paths` list, or add the `data["type"]` to the crawl4ai __init__.py file
Expand Down
16 changes: 11 additions & 5 deletions deploy/docker/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -539,16 +539,19 @@ async def handle_crawl_request(

try:
urls = [('https://' + url) if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")) else url for url in urls]
browser_config = BrowserConfig.load(browser_config)
crawler_config = CrawlerRunConfig.load(crawler_config)
try:
browser_config = BrowserConfig.load(browser_config)
crawler_config = CrawlerRunConfig.load(crawler_config)
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))

dispatcher = MemoryAdaptiveDispatcher(
memory_threshold_percent=config["crawler"]["memory_threshold_percent"],
rate_limiter=RateLimiter(
base_delay=tuple(config["crawler"]["rate_limiter"]["base_delay"])
) if config["crawler"]["rate_limiter"]["enabled"] else None
)

from crawler_pool import get_crawler
crawler = await get_crawler(browser_config)

Expand Down Expand Up @@ -720,10 +723,13 @@ async def handle_stream_crawl_request(
"""Handle streaming crawl requests with optional hooks."""
hooks_info = None
try:
browser_config = BrowserConfig.load(browser_config)
try:
browser_config = BrowserConfig.load(browser_config)
crawler_config = CrawlerRunConfig.load(crawler_config)
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
# browser_config.verbose = True # Set to False or remove for production stress testing
browser_config.verbose = False
crawler_config = CrawlerRunConfig.load(crawler_config)
crawler_config.scraping_strategy = LXMLWebScrapingStrategy()
crawler_config.stream = True

Expand Down
5 changes: 4 additions & 1 deletion deploy/docker/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -624,7 +624,10 @@ async def crawl(
if crawl_request.hooks and not HOOKS_ENABLED:
raise HTTPException(403, "Hooks are disabled. Set CRAWL4AI_HOOKS_ENABLED=true to enable.")
# Check whether it is a redirection for a streaming request
crawler_config = CrawlerRunConfig.load(crawl_request.crawler_config)
try:
crawler_config = CrawlerRunConfig.load(crawl_request.crawler_config)
except ValueError as e:
raise HTTPException(400, detail=str(e))
if crawler_config.stream:
return await stream_process(crawl_request=crawl_request)

Expand Down
86 changes: 86 additions & 0 deletions deploy/docker/tests/test_security_fixes.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,92 @@ def test_hooks_disabled_when_false(self):
os.environ.pop("CRAWL4AI_HOOKS_ENABLED", None)


class TestDeserializationAllowlist(unittest.TestCase):
    """Test the allowlist gate logic for from_serializable_dict.

    Replicates the gate logic locally to avoid importing crawl4ai (which has
    heavy dependencies). The logic under test is:
        if data["type"] not in ALLOWED_DESERIALIZE_TYPES: raise ValueError
    """

    @staticmethod
    def _check_allowlist(data, allowed_types):
        """Minimal replica of the allowlist gate in from_serializable_dict."""
        # None and plain scalars are never gated — they pass straight through.
        if data is None or isinstance(data, (str, int, float, bool)):
            return data
        if isinstance(data, dict) and "type" in data:
            # The special "dict" wrapper is unwrapped, never subject to the gate.
            if data["type"] == "dict" and "value" in data:
                return dict(data["value"])
            if data["type"] not in allowed_types:
                raise ValueError(
                    f"Disallowed type for deserialization: {data['type']}"
                )
            return {"_allowed": True, "type": data["type"]}
        return data

    def test_disallowed_type_rejected(self):
        """Types not in the allowlist must be rejected."""
        payload = {"type": "AsyncWebCrawler", "params": {}}
        with self.assertRaises(ValueError) as captured:
            self._check_allowlist(payload, {"BrowserConfig"})
        self.assertIn("Disallowed type", str(captured.exception))

    def test_arbitrary_class_rejected(self):
        """Arbitrary class names must be rejected."""
        payload = {"type": "Crawl4aiDockerClient", "params": {}}
        self.assertRaises(
            ValueError, self._check_allowlist, payload, {"BrowserConfig"}
        )

    def test_allowed_type_passes_gate(self):
        """Types in the allowlist must pass the gate check."""
        outcome = self._check_allowlist(
            {"type": "BrowserConfig", "params": {}},
            {"BrowserConfig", "CrawlerRunConfig"},
        )
        self.assertEqual(outcome["type"], "BrowserConfig")

    def test_dict_type_bypasses_allowlist(self):
        """The special 'dict' type must still work (not subject to allowlist)."""
        unwrapped = self._check_allowlist(
            {"type": "dict", "value": {"k": "v"}}, set()
        )
        self.assertEqual(unwrapped, {"k": "v"})

    def test_basic_types_pass_through(self):
        """Strings, ints, etc. must pass through unchanged."""
        for primitive in ("hello", 42):
            self.assertEqual(self._check_allowlist(primitive, set()), primitive)
        self.assertIsNone(self._check_allowlist(None, set()))

    def test_empty_allowlist_denies_all(self):
        """With empty allowlist, all typed deserialization must be denied."""
        self.assertRaises(
            ValueError,
            self._check_allowlist,
            {"type": "BrowserConfig", "params": {}},
            set(),
        )

    def test_env_var_parsing(self):
        """CRAWL4AI_DESERIALIZE_ALLOW env var must be parsed as comma-separated set."""
        saved = os.environ.get("CRAWL4AI_DESERIALIZE_ALLOW")
        try:
            os.environ["CRAWL4AI_DESERIALIZE_ALLOW"] = "BrowserConfig,CrawlerRunConfig,CacheMode"
            raw = os.environ.get("CRAWL4AI_DESERIALIZE_ALLOW", "")
            parsed = {piece.strip() for piece in raw.split(",") if piece.strip()}
            self.assertEqual(
                parsed, {"BrowserConfig", "CrawlerRunConfig", "CacheMode"}
            )
        finally:
            # Restore whatever was there before so other tests are unaffected.
            if saved is None:
                os.environ.pop("CRAWL4AI_DESERIALIZE_ALLOW", None)
            else:
                os.environ["CRAWL4AI_DESERIALIZE_ALLOW"] = saved

    def test_empty_env_var_means_deny_all(self):
        """Unset or empty CRAWL4AI_DESERIALIZE_ALLOW must produce empty set."""
        saved = os.environ.pop("CRAWL4AI_DESERIALIZE_ALLOW", None)
        try:
            raw = os.environ.get("CRAWL4AI_DESERIALIZE_ALLOW", "")
            parsed = {piece.strip() for piece in raw.split(",") if piece.strip()}
            self.assertEqual(parsed, set())
        finally:
            if saved is not None:
                os.environ["CRAWL4AI_DESERIALIZE_ALLOW"] = saved


if __name__ == '__main__':
print("=" * 60)
print("Crawl4AI Security Fixes - Unit Tests")
Expand Down