Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 40 additions & 1 deletion deploy/docker/api.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import os
import json
import asyncio
import socket
import ipaddress
from typing import List, Tuple, Dict
from functools import partial
from uuid import uuid4
Expand All @@ -9,7 +11,7 @@

import logging
from typing import Optional, AsyncGenerator
from urllib.parse import unquote
from urllib.parse import unquote, urlparse
from fastapi import HTTPException, Request, status
from fastapi.background import BackgroundTasks
from fastapi.responses import JSONResponse
Expand Down Expand Up @@ -513,6 +515,39 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator)
# logger.error(f"Crawler cleanup error: {e}")
pass

# SSRF guard is on by default; set CRAWL4AI_SSRF_PROTECTION=false to disable.
SSRF_PROTECTION = os.environ.get("CRAWL4AI_SSRF_PROTECTION", "true").lower() == "true"


def validate_url_target(url: str) -> None:
    """Block requests to loopback, link-local, and metadata addresses.

    Best-effort check — DNS rebinding can bypass it. Full SSRF protection
    requires network policies at the infrastructure level.

    Args:
        url: URL to validate (callers normalize schemes before this point).

    Raises:
        HTTPException: 400 when the URL resolves to a blocked address.
    """
    if not SSRF_PROTECTION:
        return
    # raw: URLs carry inline HTML — no network request is ever made for them.
    if url.startswith(("raw:", "raw://")):
        return
    try:
        hostname = urlparse(url).hostname
        if not hostname:
            return
        # Numeric literals resolve locally; hostnames cost one DNS lookup here.
        addrs = socket.getaddrinfo(hostname, None, socket.AF_UNSPEC, socket.SOCK_STREAM)
        for _family, _type, _proto, _canonname, sockaddr in addrs:
            ip = ipaddress.ip_address(sockaddr[0])
            # Unwrap IPv4-mapped IPv6 (e.g. ::ffff:127.0.0.1). On Python < 3.13
            # such addresses report is_loopback/is_link_local as False, which
            # would let "http://[::ffff:127.0.0.1]/" bypass this check.
            if ip.version == 6 and ip.ipv4_mapped is not None:
                ip = ip.ipv4_mapped
            if ip.is_loopback or ip.is_link_local or ip.is_unspecified:
                raise HTTPException(
                    400,
                    f"URL targets a blocked address ({ip}). "
                    "Loopback, link-local, and unspecified addresses are not allowed."
                )
    except HTTPException:
        raise
    except Exception:
        pass  # DNS failures are not a security issue — let Playwright handle them


async def handle_crawl_request(
urls: List[str],
browser_config: dict,
Expand All @@ -539,6 +574,8 @@ async def handle_crawl_request(

try:
urls = [('https://' + url) if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")) else url for url in urls]
for url in urls:
validate_url_target(url)
browser_config = BrowserConfig.load(browser_config)
crawler_config = CrawlerRunConfig.load(crawler_config)

Expand Down Expand Up @@ -720,6 +757,8 @@ async def handle_stream_crawl_request(
"""Handle streaming crawl requests with optional hooks."""
hooks_info = None
try:
for url in urls:
validate_url_target(url)
browser_config = BrowserConfig.load(browser_config)
# browser_config.verbose = True # Set to False or remove for production stress testing
browser_config.verbose = False
Expand Down
45 changes: 45 additions & 0 deletions deploy/docker/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,47 @@ def validate_url_scheme(url: str, allow_raw: bool = False) -> None:
raise HTTPException(400, f"URL must start with {schemes}")


# ── SSRF protection: block loopback and link-local targets ──
import socket
import ipaddress
from urllib.parse import urlparse

# SSRF guard is on by default; set CRAWL4AI_SSRF_PROTECTION=false to disable.
SSRF_PROTECTION = os.environ.get("CRAWL4AI_SSRF_PROTECTION", "true").lower() == "true"


def validate_url_target(url: str) -> None:
    """Block requests to loopback, link-local, and metadata addresses.

    This is a best-effort check — it resolves the hostname before Playwright
    connects, so DNS rebinding can bypass it. Full SSRF protection requires
    network policies at the infrastructure level.

    Args:
        url: URL to validate (callers normalize schemes before this point).

    Raises:
        HTTPException: 400 when the URL resolves to a blocked address.
    """
    if not SSRF_PROTECTION:
        return
    # raw: URLs carry inline HTML — no network request is ever made for them.
    if url.startswith(("raw:", "raw://")):
        return

    try:
        hostname = urlparse(url).hostname
        if not hostname:
            return

        # Numeric literals resolve locally; hostnames cost one DNS lookup here.
        addrs = socket.getaddrinfo(hostname, None, socket.AF_UNSPEC, socket.SOCK_STREAM)
        for _family, _type, _proto, _canonname, sockaddr in addrs:
            ip = ipaddress.ip_address(sockaddr[0])
            # Unwrap IPv4-mapped IPv6 (e.g. ::ffff:127.0.0.1). On Python < 3.13
            # such addresses report is_loopback/is_link_local as False, which
            # would let "http://[::ffff:127.0.0.1]/" bypass this check.
            if ip.version == 6 and ip.ipv4_mapped is not None:
                ip = ip.ipv4_mapped
            if ip.is_loopback or ip.is_link_local or ip.is_unspecified:
                raise HTTPException(
                    400,
                    f"URL targets a blocked address ({ip}). "
                    "Loopback, link-local, and unspecified addresses are not allowed."
                )
    except HTTPException:
        raise
    except Exception:
        pass  # DNS resolution failures are not a security issue — let Playwright handle them


# ───────────────── safe config‑dump helper ─────────────────
ALLOWED_TYPES = {
"CrawlerRunConfig": CrawlerRunConfig,
Expand Down Expand Up @@ -328,6 +369,7 @@ async def get_markdown(
if not body.url.startswith(("http://", "https://")) and not body.url.startswith(("raw:", "raw://")):
raise HTTPException(
400, "Invalid URL format. Must start with http://, https://, or for raw HTML (raw:, raw://)")
validate_url_target(body.url)
markdown = await handle_markdown_request(
body.url, body.f, body.q, body.c, config, body.provider,
body.temperature, body.base_url
Expand Down Expand Up @@ -387,6 +429,7 @@ async def generate_screenshot(
Then in result instead of the screenshot you will get a path to the saved file.
"""
validate_url_scheme(body.url)
validate_url_target(body.url)
from crawler_pool import get_crawler
try:
cfg = CrawlerRunConfig(screenshot=True, screenshot_wait_for=body.screenshot_wait_for)
Expand Down Expand Up @@ -422,6 +465,7 @@ async def generate_pdf(
Then in result instead of the PDF you will get a path to the saved file.
"""
validate_url_scheme(body.url)
validate_url_target(body.url)
from crawler_pool import get_crawler
try:
cfg = CrawlerRunConfig(pdf=True)
Expand Down Expand Up @@ -495,6 +539,7 @@ class MarkdownGenerationResult(BaseModel):

"""
validate_url_scheme(body.url)
validate_url_target(body.url)
from crawler_pool import get_crawler
try:
cfg = CrawlerRunConfig(js_code=body.scripts)
Expand Down