From 19f36a7c7f4b741bfd2ba57e7174c078a5c069a2 Mon Sep 17 00:00:00 2001
From: Janna Hopp <201101176+jannahopp@users.noreply.github.com>
Date: Mon, 23 Mar 2026 11:57:52 +0100
Subject: [PATCH] fix: validate URL schemes in /crawl and /crawl/stream endpoints

The /crawl and /crawl/stream endpoints normalize URLs by prepending
https:// to scheme-less URLs, but silently pass through dangerous
schemes like file:// and javascript:. Other endpoints (/screenshot,
/pdf, /html, /execute_js) already reject these via validate_url_scheme().

Add scheme validation in both handle_crawl_request and
handle_stream_crawl_request: if a URL has an explicit scheme, it must
be http://, https://, or raw:. Scheme-less URLs (e.g. example.com)
pass through and get https:// prepended as before.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 deploy/docker/api.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/deploy/docker/api.py b/deploy/docker/api.py
index 81cd312ab..a9698137f 100644
--- a/deploy/docker/api.py
+++ b/deploy/docker/api.py
@@ -538,6 +538,12 @@ async def handle_crawl_request(
     hook_manager = None
 
     try:
+        _allowed_schemes = ('http://', 'https://', 'raw:', 'raw://')
+        for url in urls:
+            # Detect explicit scheme: contains :// or starts with a known no-// scheme
+            has_scheme = '://' in url or url.lower().startswith(('javascript:', 'data:', 'vbscript:'))
+            if has_scheme and not url.startswith(_allowed_schemes):
+                raise HTTPException(400, f"URL scheme not allowed: {url[:50]}")
         urls = [('https://' + url) if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")) else url for url in urls]
         browser_config = BrowserConfig.load(browser_config)
         crawler_config = CrawlerRunConfig.load(crawler_config)
@@ -720,6 +726,12 @@ async def handle_stream_crawl_request(
     """Handle streaming crawl requests with optional hooks."""
     hooks_info = None
     try:
+        _allowed_schemes = ('http://', 'https://', 'raw:', 'raw://')
+        for url in urls:
+            # Detect explicit scheme: contains :// or starts with a known no-// scheme
+            has_scheme = '://' in url or url.lower().startswith(('javascript:', 'data:', 'vbscript:'))
+            if has_scheme and not url.startswith(_allowed_schemes):
+                raise HTTPException(400, f"URL scheme not allowed: {url[:50]}")
         browser_config = BrowserConfig.load(browser_config)
         # browser_config.verbose = True # Set to False or remove for production stress testing
         browser_config.verbose = False