Fix critical RCE and LFI vulnerabilities in Docker API deployment

Security fixes for vulnerabilities reported by ProjectDiscovery: 1. Remote Code Execution via Hooks (CVE pending) - Remove __import__ from allowed_builtins in hook_manager.py - Prevents arbitrary module imports (os, subprocess, etc.) - Hooks now disabled by default via CRAWL4AI_HOOKS_ENABLED env var 2. Local File Inclusion via file:// URLs (CVE pending) - Add URL scheme validation to /execute_js, /screenshot, /pdf, /html - Block file://, javascript:, data: and other dangerous schemes - Only allow http://, https://, and raw: (where appropriate) 3. Security hardening - Add CRAWL4AI_HOOKS_ENABLED=false as default (opt-in for hooks) - Add security warning comments in config.yml - Add validate_url_scheme() helper for consistent validation Testing: - Add unit tests (test_security_fixes.py) - 16 tests - Add integration tests (run_security_tests.py) for live server Affected endpoints: - POST /crawl (hooks disabled by default) - POST /crawl/stream (hooks disabled by default) - POST /execute_js (URL validation added) - POST /screenshot (URL validation added) - POST /pdf (URL validation added) - POST /html (URL validation added) Breaking changes: - Hooks require CRAWL4AI_HOOKS_ENABLED=true to function - file:// URLs no longer work on API endpoints (use library directly)
2026-01-12 04:14:37 +00:00
parent 6b2dca76c3
commit f24396c23e
5 changed files with 399 additions and 4 deletions
--- a/deploy/docker/tests/run_security_tests.py
+++ b/deploy/docker/tests/run_security_tests.py
@@ -0,0 +1,196 @@
+#!/usr/bin/env python3
+"""
+Security Integration Tests for Crawl4AI Docker API.
+Tests that security fixes are working correctly against a running server.
+
+Usage:
+    python run_security_tests.py [base_url]
+
+Example:
+    python run_security_tests.py http://localhost:11235
+"""
+
+import subprocess
+import sys
+import re
+
+# Colors for terminal output
+GREEN = '\033[0;32m'
+RED = '\033[0;31m'
+YELLOW = '\033[1;33m'
+NC = '\033[0m'  # No Color
+
+PASSED = 0
+FAILED = 0
+
+
+def run_curl(args: list) -> str:
+    """Run curl command and return output."""
+    try:
+        result = subprocess.run(
+            ['curl', '-s'] + args,
+            capture_output=True,
+            text=True,
+            timeout=30
+        )
+        return result.stdout + result.stderr
+    except subprocess.TimeoutExpired:
+        return "TIMEOUT"
+    except Exception as e:
+        return str(e)
+
+
+def test_expect(name: str, expect_pattern: str, curl_args: list) -> bool:
+    """Run a test and check if output matches expected pattern."""
+    global PASSED, FAILED
+
+    result = run_curl(curl_args)
+
+    if re.search(expect_pattern, result, re.IGNORECASE):
+        print(f"{GREEN}✓{NC} {name}")
+        PASSED += 1
+        return True
+    else:
+        print(f"{RED}✗{NC} {name}")
+        print(f"  Expected pattern: {expect_pattern}")
+        print(f"  Got: {result[:200]}")
+        FAILED += 1
+        return False
+
+
+def main():
+    global PASSED, FAILED
+
+    base_url = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:11235"
+
+    print("=" * 60)
+    print("Crawl4AI Security Integration Tests")
+    print(f"Target: {base_url}")
+    print("=" * 60)
+    print()
+
+    # Check server availability
+    print("Checking server availability...")
+    result = run_curl(['-o', '/dev/null', '-w', '%{http_code}', f'{base_url}/health'])
+    if '200' not in result:
+        print(f"{RED}ERROR: Server not reachable at {base_url}{NC}")
+        print("Please start the server first.")
+        sys.exit(1)
+    print(f"{GREEN}Server is running{NC}")
+    print()
+
+    # === Part A: Security Tests ===
+    print("=== Part A: Security Tests ===")
+    print("(Vulnerabilities must be BLOCKED)")
+    print()
+
+    test_expect(
+        "A1: Hooks disabled by default (403)",
+        r"403|disabled|Hooks are disabled",
+        ['-X', 'POST', f'{base_url}/crawl',
+         '-H', 'Content-Type: application/json',
+         '-d', '{"urls":["https://example.com"],"hooks":{"code":{"on_page_context_created":"async def hook(page, context, **kwargs): return page"}}}']
+    )
+
+    test_expect(
+        "A2: file:// blocked on /execute_js (400)",
+        r"400|must start with",
+        ['-X', 'POST', f'{base_url}/execute_js',
+         '-H', 'Content-Type: application/json',
+         '-d', '{"url":"file:///etc/passwd","scripts":["1"]}']
+    )
+
+    test_expect(
+        "A3: file:// blocked on /screenshot (400)",
+        r"400|must start with",
+        ['-X', 'POST', f'{base_url}/screenshot',
+         '-H', 'Content-Type: application/json',
+         '-d', '{"url":"file:///etc/passwd"}']
+    )
+
+    test_expect(
+        "A4: file:// blocked on /pdf (400)",
+        r"400|must start with",
+        ['-X', 'POST', f'{base_url}/pdf',
+         '-H', 'Content-Type: application/json',
+         '-d', '{"url":"file:///etc/passwd"}']
+    )
+
+    test_expect(
+        "A5: file:// blocked on /html (400)",
+        r"400|must start with",
+        ['-X', 'POST', f'{base_url}/html',
+         '-H', 'Content-Type: application/json',
+         '-d', '{"url":"file:///etc/passwd"}']
+    )
+
+    print()
+
+    # === Part B: Functionality Tests ===
+    print("=== Part B: Functionality Tests ===")
+    print("(Normal operations must WORK)")
+    print()
+
+    test_expect(
+        "B1: Basic crawl works",
+        r"success.*true|results",
+        ['-X', 'POST', f'{base_url}/crawl',
+         '-H', 'Content-Type: application/json',
+         '-d', '{"urls":["https://example.com"]}']
+    )
+
+    test_expect(
+        "B2: /md works with https://",
+        r"success.*true|markdown",
+        ['-X', 'POST', f'{base_url}/md',
+         '-H', 'Content-Type: application/json',
+         '-d', '{"url":"https://example.com"}']
+    )
+
+    test_expect(
+        "B3: Health endpoint works",
+        r"ok",
+        [f'{base_url}/health']
+    )
+
+    print()
+
+    # === Part C: Edge Cases ===
+    print("=== Part C: Edge Cases ===")
+    print("(Malformed input must be REJECTED)")
+    print()
+
+    test_expect(
+        "C1: javascript: URL rejected (400)",
+        r"400|must start with",
+        ['-X', 'POST', f'{base_url}/execute_js',
+         '-H', 'Content-Type: application/json',
+         '-d', '{"url":"javascript:alert(1)","scripts":["1"]}']
+    )
+
+    test_expect(
+        "C2: data: URL rejected (400)",
+        r"400|must start with",
+        ['-X', 'POST', f'{base_url}/execute_js',
+         '-H', 'Content-Type: application/json',
+         '-d', '{"url":"data:text/html,<h1>test</h1>","scripts":["1"]}']
+    )
+
+    print()
+    print("=" * 60)
+    print("Results")
+    print("=" * 60)
+    print(f"Passed: {GREEN}{PASSED}{NC}")
+    print(f"Failed: {RED}{FAILED}{NC}")
+    print()
+
+    if FAILED > 0:
+        print(f"{RED}SOME TESTS FAILED{NC}")
+        sys.exit(1)
+    else:
+        print(f"{GREEN}ALL TESTS PASSED{NC}")
+        sys.exit(0)
+
+
+if __name__ == '__main__':
+    main()