diff --git a/deploy/docker/config.yml b/deploy/docker/config.yml index 5790d5be..db3193a6 100644 --- a/deploy/docker/config.yml +++ b/deploy/docker/config.yml @@ -37,6 +37,10 @@ rate_limiting: storage_uri: "memory://" # Use "redis://localhost:6379" for production # Security Configuration +# WARNING: For production deployments, enable security and use proper SECRET_KEY: +# - Set jwt_enabled: true for authentication +# - Set SECRET_KEY environment variable to a secure random value +# - Set CRAWL4AI_HOOKS_ENABLED=true only if you need hooks (RCE risk) security: enabled: false jwt_enabled: false diff --git a/deploy/docker/hook_manager.py b/deploy/docker/hook_manager.py index 41c4f25d..f3e9bb39 100644 --- a/deploy/docker/hook_manager.py +++ b/deploy/docker/hook_manager.py @@ -117,18 +117,18 @@ class UserHookManager: """ try: # Create a safe namespace for the hook - # Use a more complete builtins that includes __import__ + # SECURITY: No __import__ to prevent arbitrary module imports (RCE risk) import builtins safe_builtins = {} - - # Add safe built-in functions + + # Add safe built-in functions (no __import__ for security) allowed_builtins = [ 'print', 'len', 'str', 'int', 'float', 'bool', 'list', 'dict', 'set', 'tuple', 'range', 'enumerate', 'zip', 'map', 'filter', 'any', 'all', 'sum', 'min', 'max', 'sorted', 'reversed', 'abs', 'round', 'isinstance', 'type', 'getattr', 'hasattr', 'setattr', 'callable', 'iter', 'next', - '__import__', '__build_class__' # Required for exec + '__build_class__' # Required for class definitions in exec ] for name in allowed_builtins: diff --git a/deploy/docker/server.py b/deploy/docker/server.py index 62e4e441..7ae1adb8 100644 --- a/deploy/docker/server.py +++ b/deploy/docker/server.py @@ -79,6 +79,10 @@ __version__ = "0.5.1-d1" MAX_PAGES = config["crawler"]["pool"].get("max_pages", 30) GLOBAL_SEM = asyncio.Semaphore(MAX_PAGES) +# ── security feature flags ─────────────────────────────────── +# Hooks are disabled by default for security (RCE risk). Set to "true" to enable. +HOOKS_ENABLED = os.environ.get("CRAWL4AI_HOOKS_ENABLED", "false").lower() == "true" + # ── default browser config helper ───────────────────────────── def get_default_browser_config() -> BrowserConfig: """Get default BrowserConfig from config.yml.""" @@ -236,6 +240,19 @@ async def add_security_headers(request: Request, call_next): resp.headers.update(config["security"]["headers"]) return resp +# ───────────────── URL validation helper ───────────────── +ALLOWED_URL_SCHEMES = ("http://", "https://") +ALLOWED_URL_SCHEMES_WITH_RAW = ("http://", "https://", "raw:", "raw://") + + +def validate_url_scheme(url: str, allow_raw: bool = False) -> None: + """Validate URL scheme to prevent file:// LFI attacks.""" + allowed = ALLOWED_URL_SCHEMES_WITH_RAW if allow_raw else ALLOWED_URL_SCHEMES + if not url.startswith(allowed): + schemes = ", ".join(allowed) + raise HTTPException(400, f"URL must start with {schemes}") + + # ───────────────── safe config‑dump helper ───────────────── ALLOWED_TYPES = { "CrawlerRunConfig": CrawlerRunConfig, @@ -337,6 +354,7 @@ async def generate_html( Crawls the URL, preprocesses the raw HTML for schema extraction, and returns the processed HTML. Use when you need sanitized HTML structures for building schemas or further processing. """ + validate_url_scheme(body.url, allow_raw=True) from crawler_pool import get_crawler cfg = CrawlerRunConfig() try: @@ -368,6 +386,7 @@ async def generate_screenshot( Use when you need an image snapshot of the rendered page. 
     It is recommended to provide an output path to save the screenshot. Then in result instead of the screenshot you will get a path to the saved file.
     """
+    validate_url_scheme(body.url)
     from crawler_pool import get_crawler
     try:
         cfg = CrawlerRunConfig(screenshot=True, screenshot_wait_for=body.screenshot_wait_for)
@@ -402,6 +421,7 @@ async def generate_pdf(
     Use when you need a printable or archivable snapshot of the page.
     It is recommended to provide an output path to save the PDF. Then in result instead of the PDF you will get a path to the saved file.
     """
+    validate_url_scheme(body.url)
     from crawler_pool import get_crawler
     try:
         cfg = CrawlerRunConfig(pdf=True)
@@ -474,6 +494,7 @@ async def execute_js(
     ```
     """
+    validate_url_scheme(body.url)
     from crawler_pool import get_crawler
     try:
         cfg = CrawlerRunConfig(js_code=body.scripts)
@@ -600,6 +621,8 @@ async def crawl(
     """
     if not crawl_request.urls:
         raise HTTPException(400, "At least one URL required")
+    if crawl_request.hooks and not HOOKS_ENABLED:
+        raise HTTPException(403, "Hooks are disabled. Set CRAWL4AI_HOOKS_ENABLED=true to enable.")
     # Check whether it is a redirection for a streaming request
     crawler_config = CrawlerRunConfig.load(crawl_request.crawler_config)
     if crawler_config.stream:
@@ -635,6 +658,8 @@ async def crawl_stream(
 ):
     if not crawl_request.urls:
         raise HTTPException(400, "At least one URL required")
+    if crawl_request.hooks and not HOOKS_ENABLED:
+        raise HTTPException(403, "Hooks are disabled. Set CRAWL4AI_HOOKS_ENABLED=true to enable.")
     return await stream_process(crawl_request=crawl_request)
diff --git a/deploy/docker/tests/run_security_tests.py b/deploy/docker/tests/run_security_tests.py
new file mode 100755
index 00000000..bba8bb9f
--- /dev/null
+++ b/deploy/docker/tests/run_security_tests.py
@@ -0,0 +1,196 @@
+#!/usr/bin/env python3
+"""
+Security Integration Tests for Crawl4AI Docker API.
+Tests that security fixes are working correctly against a running server.
+ +Usage: + python run_security_tests.py [base_url] + +Example: + python run_security_tests.py http://localhost:11235 +""" + +import subprocess +import sys +import re + +# Colors for terminal output +GREEN = '\033[0;32m' +RED = '\033[0;31m' +YELLOW = '\033[1;33m' +NC = '\033[0m' # No Color + +PASSED = 0 +FAILED = 0 + + +def run_curl(args: list) -> str: + """Run curl command and return output.""" + try: + result = subprocess.run( + ['curl', '-s'] + args, + capture_output=True, + text=True, + timeout=30 + ) + return result.stdout + result.stderr + except subprocess.TimeoutExpired: + return "TIMEOUT" + except Exception as e: + return str(e) + + +def test_expect(name: str, expect_pattern: str, curl_args: list) -> bool: + """Run a test and check if output matches expected pattern.""" + global PASSED, FAILED + + result = run_curl(curl_args) + + if re.search(expect_pattern, result, re.IGNORECASE): + print(f"{GREEN}✓{NC} {name}") + PASSED += 1 + return True + else: + print(f"{RED}✗{NC} {name}") + print(f" Expected pattern: {expect_pattern}") + print(f" Got: {result[:200]}") + FAILED += 1 + return False + + +def main(): + global PASSED, FAILED + + base_url = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:11235" + + print("=" * 60) + print("Crawl4AI Security Integration Tests") + print(f"Target: {base_url}") + print("=" * 60) + print() + + # Check server availability + print("Checking server availability...") + result = run_curl(['-o', '/dev/null', '-w', '%{http_code}', f'{base_url}/health']) + if '200' not in result: + print(f"{RED}ERROR: Server not reachable at {base_url}{NC}") + print("Please start the server first.") + sys.exit(1) + print(f"{GREEN}Server is running{NC}") + print() + + # === Part A: Security Tests === + print("=== Part A: Security Tests ===") + print("(Vulnerabilities must be BLOCKED)") + print() + + test_expect( + "A1: Hooks disabled by default (403)", + r"403|disabled|Hooks are disabled", + ['-X', 'POST', f'{base_url}/crawl', + '-H', 'Content-Type: application/json', + '-d', '{"urls":["https://example.com"],"hooks":{"code":{"on_page_context_created":"async def hook(page, context, **kwargs): return page"}}}'] + ) + + test_expect( + "A2: file:// blocked on /execute_js (400)", + r"400|must start with", + ['-X', 'POST', f'{base_url}/execute_js', + '-H', 'Content-Type: application/json', + '-d', '{"url":"file:///etc/passwd","scripts":["1"]}'] + ) + + test_expect( + "A3: file:// blocked on /screenshot (400)", + r"400|must start with", + ['-X', 'POST', f'{base_url}/screenshot', + '-H', 'Content-Type: application/json', + '-d', '{"url":"file:///etc/passwd"}'] + ) + + test_expect( + "A4: file:// blocked on /pdf (400)", + r"400|must start with", + ['-X', 'POST', f'{base_url}/pdf', + '-H', 'Content-Type: application/json', + '-d', '{"url":"file:///etc/passwd"}'] + ) + + test_expect( + "A5: file:// blocked on /html (400)", + r"400|must start with", + ['-X', 'POST', f'{base_url}/html', + '-H', 'Content-Type: application/json', + '-d', '{"url":"file:///etc/passwd"}'] + ) + + print() + + # === Part B: Functionality Tests === + print("=== Part B: Functionality Tests ===") + print("(Normal operations must WORK)") + print() + + test_expect( + "B1: Basic crawl works", + r"success.*true|results", + ['-X', 'POST', f'{base_url}/crawl', + '-H', 'Content-Type: application/json', + '-d', '{"urls":["https://example.com"]}'] + ) + + test_expect( + "B2: /md works with https://", + r"success.*true|markdown", + ['-X', 'POST', f'{base_url}/md', + '-H', 'Content-Type: application/json', + '-d', 
'{"url":"https://example.com"}'] + ) + + test_expect( + "B3: Health endpoint works", + r"ok", + [f'{base_url}/health'] + ) + + print() + + # === Part C: Edge Cases === + print("=== Part C: Edge Cases ===") + print("(Malformed input must be REJECTED)") + print() + + test_expect( + "C1: javascript: URL rejected (400)", + r"400|must start with", + ['-X', 'POST', f'{base_url}/execute_js', + '-H', 'Content-Type: application/json', + '-d', '{"url":"javascript:alert(1)","scripts":["1"]}'] + ) + + test_expect( + "C2: data: URL rejected (400)", + r"400|must start with", + ['-X', 'POST', f'{base_url}/execute_js', + '-H', 'Content-Type: application/json', + '-d', '{"url":"data:text/html,