Files
crawl4ai/deploy/docker/tests/run_security_tests.py
unclecode f24396c23e Fix critical RCE and LFI vulnerabilities in Docker API deployment
Security fixes for vulnerabilities reported by ProjectDiscovery:

1. Remote Code Execution via Hooks (CVE pending)
   - Remove __import__ from allowed_builtins in hook_manager.py
   - Prevents arbitrary module imports (os, subprocess, etc.)
   - Hooks now disabled by default via CRAWL4AI_HOOKS_ENABLED env var

2. Local File Inclusion via file:// URLs (CVE pending)
   - Add URL scheme validation to /execute_js, /screenshot, /pdf, /html
   - Block file://, javascript:, data: and other dangerous schemes
   - Only allow http://, https://, and raw: (where appropriate)

3. Security hardening
   - Add CRAWL4AI_HOOKS_ENABLED=false as default (opt-in for hooks)
   - Add security warning comments in config.yml
   - Add validate_url_scheme() helper for consistent validation

Testing:
   - Add unit tests (test_security_fixes.py) - 16 tests
   - Add integration tests (run_security_tests.py) for live server

Affected endpoints:
   - POST /crawl (hooks disabled by default)
   - POST /crawl/stream (hooks disabled by default)
   - POST /execute_js (URL validation added)
   - POST /screenshot (URL validation added)
   - POST /pdf (URL validation added)
   - POST /html (URL validation added)

Breaking changes:
   - Hooks require CRAWL4AI_HOOKS_ENABLED=true to function
   - file:// URLs no longer work on API endpoints (use library directly)
2026-01-12 04:14:37 +00:00

197 lines
5.2 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Security Integration Tests for Crawl4AI Docker API.
Tests that security fixes are working correctly against a running server.
Usage:
python run_security_tests.py [base_url]
Example:
python run_security_tests.py http://localhost:11235
"""
import subprocess
import sys
import re
# ANSI escape sequences used to colorize terminal output
GREEN = '\033[0;32m'
RED = '\033[0;31m'
YELLOW = '\033[1;33m'
NC = '\033[0m' # No Color
# Module-level tallies: incremented by test_expect() and reported by main()
PASSED = 0
FAILED = 0
def run_curl(args: list) -> str:
    """Invoke ``curl -s`` with extra arguments and return its text output.

    Args:
        args: Command-line arguments appended after ``curl -s``.

    Returns:
        curl's stdout followed by its stderr. If the process runs longer
        than 30 seconds the literal string ``"TIMEOUT"`` is returned; any
        other exception is returned as its message instead of being raised.
    """
    try:
        # The command is assembled inside the try so that a bad ``args``
        # value is reported as a string, like every other failure.
        command = ['curl', '-s'] + args
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=30,
        )
    except subprocess.TimeoutExpired:
        return "TIMEOUT"
    except Exception as exc:
        return str(exc)
    else:
        return completed.stdout + completed.stderr
def test_expect(name: str, expect_pattern: str, curl_args: list) -> bool:
    """Run one curl-based check and record whether its output matches.

    Args:
        name: Human-readable label printed alongside the result.
        expect_pattern: Regular expression searched case-insensitively in
            the output returned by run_curl().
        curl_args: Arguments forwarded unchanged to run_curl().

    Returns:
        True if the pattern was found in the output, False otherwise. The
        module-level PASSED / FAILED counters are incremented accordingly.
    """
    global PASSED, FAILED
    result = run_curl(curl_args)
    if re.search(expect_pattern, result, re.IGNORECASE):
        # Explicit text marker: the outcome must stay readable when ANSI
        # colors are not rendered (CI logs, output piped to a file).
        print(f"{GREEN}[PASS]{NC} {name}")
        PASSED += 1
        return True

    print(f"{RED}[FAIL]{NC} {name}")
    print(f" Expected pattern: {expect_pattern}")
    # Only the first 200 characters are shown to keep failure output short.
    print(f" Got: {result[:200]}")
    FAILED += 1
    return False
def main():
    """Run the security integration suite against a live server.

    The target base URL comes from the first command-line argument and
    defaults to http://localhost:11235. After confirming that ``/health``
    answers with 200, three groups of checks are run through test_expect():
    attack vectors that must be blocked, normal operations that must work,
    and dangerous URL schemes that must be rejected.

    The function always terminates the process: exit status 1 when the
    server is unreachable or any check failed, 0 otherwise.
    """
    target = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:11235"
    rule = "=" * 60

    def post_json(path, payload):
        """Build the curl arguments for a JSON POST of ``payload`` to ``path``."""
        return ['-X', 'POST', f'{target}{path}',
                '-H', 'Content-Type: application/json',
                '-d', payload]

    print(rule)
    print("Crawl4AI Security Integration Tests")
    print(f"Target: {target}")
    print(rule)
    print()

    # Check server availability: ask curl to print only the HTTP status code.
    print("Checking server availability...")
    status = run_curl(['-o', '/dev/null', '-w', '%{http_code}', f'{target}/health'])
    if '200' not in status:
        print(f"{RED}ERROR: Server not reachable at {target}{NC}")
        print("Please start the server first.")
        sys.exit(1)
    print(f"{GREEN}Server is running{NC}")
    print()

    # Each section is (heading, expectation note, [(name, pattern, curl args), ...]).
    sections = [
        (
            "=== Part A: Security Tests ===",
            "(Vulnerabilities must be BLOCKED)",
            [
                (
                    "A1: Hooks disabled by default (403)",
                    r"403|disabled|Hooks are disabled",
                    post_json(
                        '/crawl',
                        '{"urls":["https://example.com"],"hooks":{"code":{"on_page_context_created":"async def hook(page, context, **kwargs): return page"}}}',
                    ),
                ),
                (
                    "A2: file:// blocked on /execute_js (400)",
                    r"400|must start with",
                    post_json('/execute_js', '{"url":"file:///etc/passwd","scripts":["1"]}'),
                ),
                (
                    "A3: file:// blocked on /screenshot (400)",
                    r"400|must start with",
                    post_json('/screenshot', '{"url":"file:///etc/passwd"}'),
                ),
                (
                    "A4: file:// blocked on /pdf (400)",
                    r"400|must start with",
                    post_json('/pdf', '{"url":"file:///etc/passwd"}'),
                ),
                (
                    "A5: file:// blocked on /html (400)",
                    r"400|must start with",
                    post_json('/html', '{"url":"file:///etc/passwd"}'),
                ),
            ],
        ),
        (
            "=== Part B: Functionality Tests ===",
            "(Normal operations must WORK)",
            [
                (
                    "B1: Basic crawl works",
                    r"success.*true|results",
                    post_json('/crawl', '{"urls":["https://example.com"]}'),
                ),
                (
                    "B2: /md works with https://",
                    r"success.*true|markdown",
                    post_json('/md', '{"url":"https://example.com"}'),
                ),
                (
                    "B3: Health endpoint works",
                    r"ok",
                    [f'{target}/health'],
                ),
            ],
        ),
        (
            "=== Part C: Edge Cases ===",
            "(Malformed input must be REJECTED)",
            [
                (
                    "C1: javascript: URL rejected (400)",
                    r"400|must start with",
                    post_json('/execute_js', '{"url":"javascript:alert(1)","scripts":["1"]}'),
                ),
                (
                    "C2: data: URL rejected (400)",
                    r"400|must start with",
                    post_json('/execute_js', '{"url":"data:text/html,<h1>test</h1>","scripts":["1"]}'),
                ),
            ],
        ),
    ]

    for heading, note, cases in sections:
        print(heading)
        print(note)
        print()
        for case_name, pattern, curl_args in cases:
            test_expect(case_name, pattern, curl_args)
        print()

    # Summary: the counters were updated by test_expect() during the runs above.
    print(rule)
    print("Results")
    print(rule)
    print(f"Passed: {GREEN}{PASSED}{NC}")
    print(f"Failed: {RED}{FAILED}{NC}")
    print()

    if FAILED > 0:
        print(f"{RED}SOME TESTS FAILED{NC}")
        sys.exit(1)
    print(f"{GREEN}ALL TESTS PASSED{NC}")
    sys.exit(0)
# Run the suite only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()