Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions crawl4ai/async_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,15 @@ def to_serializable_dict(obj: Any, ignore_default_value : bool = False):
return str(obj)


# Security gate for from_serializable_dict: a type name may be instantiated
# during deserialization only if it appears in the CRAWL4AI_DESERIALIZE_ALLOW
# environment variable (comma-separated). Leaving the variable unset or empty
# yields an empty allowlist, i.e. all typed deserialization is denied.
_DESERIALIZE_ALLOW_ENV = os.environ.get("CRAWL4AI_DESERIALIZE_ALLOW", "")
ALLOWED_DESERIALIZE_TYPES: set = set(
    filter(None, (name.strip() for name in _DESERIALIZE_ALLOW_ENV.split(",")))
)


def from_serializable_dict(data: Any) -> Any:
"""
Recursively convert a serializable dictionary back to an object instance.
Expand All @@ -134,6 +143,12 @@ def from_serializable_dict(data: Any) -> Any:
if data["type"] == "dict" and "value" in data:
return {k: from_serializable_dict(v) for k, v in data["value"].items()}

if data["type"] not in ALLOWED_DESERIALIZE_TYPES:
raise ValueError(
f"Disallowed type for deserialization: {data['type']}. "
f"Add it to CRAWL4AI_DESERIALIZE_ALLOW to permit."
)

cls = None
# If you are receiving an error while trying to convert a dict to an object:
# Either add a module to `modules_paths` list, or add the `data["type"]` to the crawl4ai __init__.py file
Expand Down
16 changes: 11 additions & 5 deletions deploy/docker/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -539,16 +539,19 @@ async def handle_crawl_request(

try:
urls = [('https://' + url) if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")) else url for url in urls]
browser_config = BrowserConfig.load(browser_config)
crawler_config = CrawlerRunConfig.load(crawler_config)
try:
browser_config = BrowserConfig.load(browser_config)
crawler_config = CrawlerRunConfig.load(crawler_config)
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))

dispatcher = MemoryAdaptiveDispatcher(
memory_threshold_percent=config["crawler"]["memory_threshold_percent"],
rate_limiter=RateLimiter(
base_delay=tuple(config["crawler"]["rate_limiter"]["base_delay"])
) if config["crawler"]["rate_limiter"]["enabled"] else None
)

from crawler_pool import get_crawler
crawler = await get_crawler(browser_config)

Expand Down Expand Up @@ -720,10 +723,13 @@ async def handle_stream_crawl_request(
"""Handle streaming crawl requests with optional hooks."""
hooks_info = None
try:
browser_config = BrowserConfig.load(browser_config)
try:
browser_config = BrowserConfig.load(browser_config)
crawler_config = CrawlerRunConfig.load(crawler_config)
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
# browser_config.verbose = True # Set to False or remove for production stress testing
browser_config.verbose = False
crawler_config = CrawlerRunConfig.load(crawler_config)
crawler_config.scraping_strategy = LXMLWebScrapingStrategy()
crawler_config.stream = True

Expand Down
5 changes: 4 additions & 1 deletion deploy/docker/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -624,7 +624,10 @@ async def crawl(
if crawl_request.hooks and not HOOKS_ENABLED:
raise HTTPException(403, "Hooks are disabled. Set CRAWL4AI_HOOKS_ENABLED=true to enable.")
# Check whether it is a redirection for a streaming request
crawler_config = CrawlerRunConfig.load(crawl_request.crawler_config)
try:
crawler_config = CrawlerRunConfig.load(crawl_request.crawler_config)
except ValueError as e:
raise HTTPException(400, detail=str(e))
if crawler_config.stream:
return await stream_process(crawl_request=crawl_request)

Expand Down
86 changes: 86 additions & 0 deletions deploy/docker/tests/test_security_fixes.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,92 @@ def test_hooks_disabled_when_false(self):
os.environ.pop("CRAWL4AI_HOOKS_ENABLED", None)


class TestDeserializationAllowlist(unittest.TestCase):
    """Test the allowlist gate logic for from_serializable_dict.

    Replicates the gate logic locally to avoid importing crawl4ai (which has
    heavy dependencies). The logic under test is:
        if data["type"] not in ALLOWED_DESERIALIZE_TYPES: raise ValueError
    """

    @staticmethod
    def _check_allowlist(data, allowed_types):
        """Minimal replica of the allowlist gate in from_serializable_dict."""
        # None and plain scalars are never gated — they pass straight through.
        if data is None or isinstance(data, (str, int, float, bool)):
            return data
        if isinstance(data, dict) and "type" in data:
            # The special "dict" wrapper is unwrapped, never subject to the gate.
            if data["type"] == "dict" and "value" in data:
                return dict(data["value"])
            if data["type"] not in allowed_types:
                raise ValueError(
                    f"Disallowed type for deserialization: {data['type']}"
                )
            return {"_allowed": True, "type": data["type"]}
        return data

    def test_disallowed_type_rejected(self):
        """Types not in the allowlist must be rejected."""
        payload = {"type": "AsyncWebCrawler", "params": {}}
        with self.assertRaises(ValueError) as captured:
            self._check_allowlist(payload, {"BrowserConfig"})
        self.assertIn("Disallowed type", str(captured.exception))

    def test_arbitrary_class_rejected(self):
        """Arbitrary class names must be rejected."""
        payload = {"type": "Crawl4aiDockerClient", "params": {}}
        self.assertRaises(
            ValueError, self._check_allowlist, payload, {"BrowserConfig"}
        )

    def test_allowed_type_passes_gate(self):
        """Types in the allowlist must pass the gate check."""
        outcome = self._check_allowlist(
            {"type": "BrowserConfig", "params": {}},
            {"BrowserConfig", "CrawlerRunConfig"},
        )
        self.assertEqual(outcome["type"], "BrowserConfig")

    def test_dict_type_bypasses_allowlist(self):
        """The special 'dict' type must still work (not subject to allowlist)."""
        unwrapped = self._check_allowlist(
            {"type": "dict", "value": {"k": "v"}}, set()
        )
        self.assertEqual(unwrapped, {"k": "v"})

    def test_basic_types_pass_through(self):
        """Strings, ints, etc. must pass through unchanged."""
        for primitive in ("hello", 42):
            self.assertEqual(self._check_allowlist(primitive, set()), primitive)
        self.assertIsNone(self._check_allowlist(None, set()))

    def test_empty_allowlist_denies_all(self):
        """With empty allowlist, all typed deserialization must be denied."""
        self.assertRaises(
            ValueError,
            self._check_allowlist,
            {"type": "BrowserConfig", "params": {}},
            set(),
        )

    def test_env_var_parsing(self):
        """CRAWL4AI_DESERIALIZE_ALLOW env var must be parsed as comma-separated set."""
        saved = os.environ.get("CRAWL4AI_DESERIALIZE_ALLOW")
        try:
            os.environ["CRAWL4AI_DESERIALIZE_ALLOW"] = "BrowserConfig,CrawlerRunConfig,CacheMode"
            raw = os.environ.get("CRAWL4AI_DESERIALIZE_ALLOW", "")
            parsed = {piece.strip() for piece in raw.split(",") if piece.strip()}
            self.assertEqual(
                parsed, {"BrowserConfig", "CrawlerRunConfig", "CacheMode"}
            )
        finally:
            # Restore whatever was there before so other tests are unaffected.
            if saved is None:
                os.environ.pop("CRAWL4AI_DESERIALIZE_ALLOW", None)
            else:
                os.environ["CRAWL4AI_DESERIALIZE_ALLOW"] = saved

    def test_empty_env_var_means_deny_all(self):
        """Unset or empty CRAWL4AI_DESERIALIZE_ALLOW must produce empty set."""
        saved = os.environ.pop("CRAWL4AI_DESERIALIZE_ALLOW", None)
        try:
            raw = os.environ.get("CRAWL4AI_DESERIALIZE_ALLOW", "")
            parsed = {piece.strip() for piece in raw.split(",") if piece.strip()}
            self.assertEqual(parsed, set())
        finally:
            if saved is not None:
                os.environ["CRAWL4AI_DESERIALIZE_ALLOW"] = saved


if __name__ == '__main__':
print("=" * 60)
print("Crawl4AI Security Fixes - Unit Tests")
Expand Down