From 3fd46aa2f2d81d1509930a03ec5b5807b5795555 Mon Sep 17 00:00:00 2001 From: Janna Hopp <201101176+jannahopp@users.noreply.github.com> Date: Mon, 23 Mar 2026 12:33:24 +0100 Subject: [PATCH] fix: block loopback and link-local addresses in crawl targets All crawl endpoints accept user-supplied URLs fetched by a headless browser. Without validation, an attacker can target loopback addresses (127.0.0.1, ::1), link-local addresses (169.254.169.254 cloud metadata), or unspecified addresses (0.0.0.0). Adds validate_url_target() which resolves the hostname via DNS and checks the resolved IPs against blocked ranges before passing the URL to Playwright. Configurable via CRAWL4AI_SSRF_PROTECTION env var (default: true). Note: This is a best-effort check. DNS rebinding can bypass it, and RFC 1918 addresses are intentionally not blocked (legitimate crawl targets in many deployments). Full SSRF protection requires network policies at the infrastructure level. Co-Authored-By: Claude Opus 4.6 (1M context) --- deploy/docker/api.py | 41 ++++++++++++++++++++++++++++++++++++- deploy/docker/server.py | 45 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 1 deletion(-) diff --git a/deploy/docker/api.py b/deploy/docker/api.py index 81cd312ab..c8deeb5a3 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -1,6 +1,8 @@ import os import json import asyncio +import socket +import ipaddress from typing import List, Tuple, Dict from functools import partial from uuid import uuid4 @@ -9,7 +11,7 @@ import logging from typing import Optional, AsyncGenerator -from urllib.parse import unquote +from urllib.parse import unquote, urlparse from fastapi import HTTPException, Request, status from fastapi.background import BackgroundTasks from fastapi.responses import JSONResponse @@ -513,6 +515,39 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator) # logger.error(f"Crawler cleanup error: {e}") pass +SSRF_PROTECTION = 
os.environ.get("CRAWL4AI_SSRF_PROTECTION", "true").lower() == "true"
+
+
+def validate_url_target(url: str) -> None:
+    """Reject URLs that resolve to loopback, link-local, or unspecified IPs.
+
+    Best-effort only: DNS rebinding and URL-parser differences can bypass
+    it. Raises HTTPException(400) for a blocked target; URLs that cannot be
+    parsed or resolved are let through for Playwright to reject.
+    """
+    if not SSRF_PROTECTION or url.startswith("raw:"):
+        return
+    try:
+        # Scheme-less input ("127.0.0.1/x") has no netloc; retry as https.
+        hostname = urlparse(url).hostname or urlparse("https://" + url).hostname
+        if not hostname:
+            return
+        addrs = socket.getaddrinfo(hostname, None, socket.AF_UNSPEC, socket.SOCK_STREAM)
+    except (OSError, ValueError):
+        return  # DNS failure or malformed URL — not a security issue here
+    for *_, sockaddr in addrs:
+        # Strip any IPv6 zone id and unwrap ::ffff:a.b.c.d so the IPv4 rules
+        # apply; on older Pythons http://[::ffff:127.0.0.1]/ would slip through.
+        ip = ipaddress.ip_address(sockaddr[0].split("%", 1)[0])
+        ip = getattr(ip, "ipv4_mapped", None) or ip
+        if ip.is_loopback or ip.is_link_local or ip.is_unspecified:
+            raise HTTPException(
+                400,
+                f"URL targets a blocked address ({ip}). "
+                "Loopback, link-local, and unspecified addresses are not allowed."
+            )
+
+
 async def handle_crawl_request( urls: List[str], browser_config: dict, @@ -539,6 +574,8 @@ async def handle_crawl_request( try: urls = [('https://' + url) if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")) else url for url in urls] + for url in urls: + validate_url_target(url) browser_config = BrowserConfig.load(browser_config) crawler_config = CrawlerRunConfig.load(crawler_config) @@ -720,6 +757,8 @@ async def handle_stream_crawl_request( """Handle streaming crawl requests with optional hooks.""" hooks_info = None try: + for url in urls: + validate_url_target(url) browser_config = BrowserConfig.load(browser_config) # browser_config.verbose = True # Set to False or remove for production stress testing browser_config.verbose = False diff --git a/deploy/docker/server.py b/deploy/docker/server.py index 7ae1adb8b..9f67cfc75 100644 --- a/deploy/docker/server.py +++ b/deploy/docker/server.py @@ -253,6 +253,47 @@ def validate_url_scheme(url: str, 
allow_raw: bool = False) -> None: raise HTTPException(400, f"URL must start with {schemes}")
+# ── SSRF protection: block loopback and link-local targets ──
+import socket
+import ipaddress
+from urllib.parse import urlparse
+
+SSRF_PROTECTION = os.environ.get("CRAWL4AI_SSRF_PROTECTION", "true").lower() == "true"
+
+
+def validate_url_target(url: str) -> None:
+    """Reject URLs that resolve to loopback, link-local, or unspecified IPs.
+
+    This is a best-effort check — it resolves the hostname before Playwright
+    connects, so DNS rebinding (and URL-parser differences) can bypass it.
+    Raises HTTPException(400) for a blocked target; URLs that cannot be
+    parsed or resolved are let through for Playwright to reject.
+    """
+    if not SSRF_PROTECTION or url.startswith("raw:"):
+        return
+
+    try:
+        # Scheme-less input ("127.0.0.1/x") has no netloc; retry as https.
+        hostname = urlparse(url).hostname or urlparse("https://" + url).hostname
+        if not hostname:
+            return
+
+        addrs = socket.getaddrinfo(hostname, None, socket.AF_UNSPEC, socket.SOCK_STREAM)
+    except (OSError, ValueError):
+        return  # DNS failure or malformed URL — not a security issue here
+    for *_, sockaddr in addrs:
+        # Strip any IPv6 zone id and unwrap ::ffff:a.b.c.d so the IPv4 rules
+        # apply; on older Pythons http://[::ffff:127.0.0.1]/ would slip through.
+        ip = ipaddress.ip_address(sockaddr[0].split("%", 1)[0])
+        ip = getattr(ip, "ipv4_mapped", None) or ip
+        if ip.is_loopback or ip.is_link_local or ip.is_unspecified:
+            raise HTTPException(
+                400,
+                f"URL targets a blocked address ({ip}). "
+                "Loopback, link-local, and unspecified addresses are not allowed."
+            )
+
+
 # ───────────────── safe config‑dump helper ───────────────── ALLOWED_TYPES = { "CrawlerRunConfig": CrawlerRunConfig, @@ -328,6 +369,7 @@ async def get_markdown( if not body.url.startswith(("http://", "https://")) and not body.url.startswith(("raw:", "raw://")): raise HTTPException( 400, "Invalid URL format. 
Must start with http://, https://, or for raw HTML (raw:, raw://)") + validate_url_target(body.url) markdown = await handle_markdown_request( body.url, body.f, body.q, body.c, config, body.provider, body.temperature, body.base_url @@ -387,6 +429,7 @@ async def generate_screenshot( Then in result instead of the screenshot you will get a path to the saved file. """ validate_url_scheme(body.url) + validate_url_target(body.url) from crawler_pool import get_crawler try: cfg = CrawlerRunConfig(screenshot=True, screenshot_wait_for=body.screenshot_wait_for) @@ -422,6 +465,7 @@ async def generate_pdf( Then in result instead of the PDF you will get a path to the saved file. """ validate_url_scheme(body.url) + validate_url_target(body.url) from crawler_pool import get_crawler try: cfg = CrawlerRunConfig(pdf=True) @@ -495,6 +539,7 @@ class MarkdownGenerationResult(BaseModel): """ validate_url_scheme(body.url) + validate_url_target(body.url) from crawler_pool import get_crawler try: cfg = CrawlerRunConfig(js_code=body.scripts)