Fix critical RCE and LFI vulnerabilities in Docker API deployment

Security fixes for vulnerabilities reported by ProjectDiscovery:

1. Remote Code Execution via Hooks (CVE pending)
   - Remove __import__ from allowed_builtins in hook_manager.py (see the sketch after this list)
   - Prevents arbitrary module imports (os, subprocess, etc.)
   - Hooks are now disabled by default via the CRAWL4AI_HOOKS_ENABLED env var

2. Local File Inclusion via file:// URLs (CVE pending)
   - Add URL scheme validation to /execute_js, /screenshot, /pdf, /html
   - Block file://, javascript:, data: and other dangerous schemes
   - Only allow http://, https://, and raw: (where appropriate)

3. Security hardening
   - Add CRAWL4AI_HOOKS_ENABLED=false as default (opt-in for hooks)
   - Add security warning comments in config.yml
   - Add validate_url_scheme() helper for consistent validation
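Since hook_manager.py itself is not part of the hunks shown below, here is a minimal sketch of the kind of restriction the first fix describes: hook code executed against a builtins table that no longer contains __import__. The ALLOWED_BUILTINS contents and the run_hook helper are illustrative assumptions, not the actual hook_manager.py code.

```python
# Minimal sketch of sandboxed hook execution with a restricted builtins table.
# ALLOWED_BUILTINS and run_hook are illustrative names, not the real hook_manager.py API.
ALLOWED_BUILTINS = {
    "len": len,
    "range": range,
    "print": print,
    # "__import__" is deliberately absent, so `import os` inside a hook
    # raises ImportError instead of exposing os/subprocess to user-supplied code.
}

def run_hook(source: str) -> None:
    """Execute user-supplied hook code with a restricted __builtins__ mapping."""
    exec(compile(source, "<hook>", "exec"), {"__builtins__": ALLOWED_BUILTINS})

# `import` relies on __import__ under the hood, so this would raise ImportError:
# run_hook("import os; os.system('id')")
run_hook("print(len('safe hook'))")  # allowed builtins still work
```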

Testing:
   - Add unit tests (test_security_fixes.py) - 16 tests
   - Add integration tests (run_security_tests.py) for live server
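test_security_fixes.py itself is not included in the hunks below; the following pytest sketch only illustrates the kind of assertions the URL-scheme fix implies, and the import path for validate_url_scheme is a guess.

```python
# Illustrative tests only; test_security_fixes.py is not shown in this diff.
# The import path below is an assumption about the deployment layout.
import pytest
from fastapi import HTTPException

from server import validate_url_scheme  # hypothetical import path


def test_http_and_https_are_allowed():
    # Should not raise for normal web URLs.
    validate_url_scheme("https://example.com")
    validate_url_scheme("http://example.com")


@pytest.mark.parametrize("url", ["file:///etc/passwd", "javascript:alert(1)", "data:text/html,x"])
def test_dangerous_schemes_are_rejected(url):
    with pytest.raises(HTTPException):
        validate_url_scheme(url)


def test_raw_scheme_only_allowed_when_opted_in():
    validate_url_scheme("raw://<html></html>", allow_raw=True)
    with pytest.raises(HTTPException):
        validate_url_scheme("raw://<html></html>")  # raw: rejected by default
```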

Affected endpoints:
   - POST /crawl (hooks disabled by default)
   - POST /crawl/stream (hooks disabled by default)
   - POST /execute_js (URL validation added)
   - POST /screenshot (URL validation added)
   - POST /pdf (URL validation added)
   - POST /html (URL validation added)
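From a client's point of view, the list above boils down to: dangerous URL schemes are rejected with 400 before any crawl starts, and hook payloads are rejected with 403 while hooks are disabled. A rough httpx sketch, assuming a local Docker deployment on the default port and guessing at the request payload shapes:

```python
# Rough client-side sketch of the new behaviour; the base URL, port, and payload
# shapes are assumptions about a local Docker deployment, not part of this diff.
import httpx

BASE = "http://localhost:11235"  # assumed default port of the Docker API

# file:// URLs are rejected with 400 before any crawling happens.
r = httpx.post(f"{BASE}/screenshot", json={"url": "file:///etc/passwd"})
print(r.status_code)  # expected: 400

# Hook payloads are rejected with 403 unless CRAWL4AI_HOOKS_ENABLED=true.
r = httpx.post(
    f"{BASE}/crawl",
    json={"urls": ["https://example.com"], "hooks": {"before_goto": "..."}},
)
print(r.status_code)  # expected: 403 while hooks are disabled
```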

Breaking changes:
   - Hooks require CRAWL4AI_HOOKS_ENABLED=true to function
   - file:// URLs no longer work on API endpoints (use library directly)
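For the file:// change, local files can still be crawled by calling the library in-process rather than the HTTP API. A minimal sketch, assuming the standard AsyncWebCrawler entry point and a placeholder file path:

```python
# Minimal sketch of crawling a local file via the library instead of the API.
# The file path is a placeholder; file:// support here relies on the library's
# own URL handling rather than the (now restricted) HTTP endpoints.
import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig


async def main() -> None:
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="file:///path/to/local/page.html",  # placeholder path
            config=CrawlerRunConfig(),
        )
        if result.success:
            print(result.html[:200])  # raw HTML of the local file
        else:
            print(result.error_message)


if __name__ == "__main__":
    asyncio.run(main())
```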
commit f24396c23e
parent 6b2dca76c3
Author: unclecode
Date:   2026-01-12 04:14:37 +00:00

5 changed files with 399 additions and 4 deletions


@@ -79,6 +79,10 @@ __version__ = "0.5.1-d1"
MAX_PAGES = config["crawler"]["pool"].get("max_pages", 30)
GLOBAL_SEM = asyncio.Semaphore(MAX_PAGES)
# ── security feature flags ───────────────────────────────────
# Hooks are disabled by default for security (RCE risk). Set to "true" to enable.
HOOKS_ENABLED = os.environ.get("CRAWL4AI_HOOKS_ENABLED", "false").lower() == "true"
# ── default browser config helper ─────────────────────────────
def get_default_browser_config() -> BrowserConfig:
"""Get default BrowserConfig from config.yml."""
@@ -236,6 +240,19 @@ async def add_security_headers(request: Request, call_next):
resp.headers.update(config["security"]["headers"])
return resp
# ───────────────── URL validation helper ─────────────────
ALLOWED_URL_SCHEMES = ("http://", "https://")
ALLOWED_URL_SCHEMES_WITH_RAW = ("http://", "https://", "raw:", "raw://")
def validate_url_scheme(url: str, allow_raw: bool = False) -> None:
"""Validate URL scheme to prevent file:// LFI attacks."""
allowed = ALLOWED_URL_SCHEMES_WITH_RAW if allow_raw else ALLOWED_URL_SCHEMES
if not url.startswith(allowed):
schemes = ", ".join(allowed)
raise HTTPException(400, f"URL must start with {schemes}")
# ───────────────── safe configdump helper ─────────────────
ALLOWED_TYPES = {
"CrawlerRunConfig": CrawlerRunConfig,
@@ -337,6 +354,7 @@ async def generate_html(
Crawls the URL, preprocesses the raw HTML for schema extraction, and returns the processed HTML.
Use when you need sanitized HTML structures for building schemas or further processing.
"""
validate_url_scheme(body.url, allow_raw=True)
from crawler_pool import get_crawler
cfg = CrawlerRunConfig()
try:
@@ -368,6 +386,7 @@ async def generate_screenshot(
Use when you need an image snapshot of the rendered page. It's recommended to provide an output path to save the screenshot.
Then in result instead of the screenshot you will get a path to the saved file.
"""
validate_url_scheme(body.url)
from crawler_pool import get_crawler
try:
cfg = CrawlerRunConfig(screenshot=True, screenshot_wait_for=body.screenshot_wait_for)
@@ -402,6 +421,7 @@ async def generate_pdf(
Use when you need a printable or archivable snapshot of the page. It is recommended to provide an output path to save the PDF.
Then in result instead of the PDF you will get a path to the saved file.
"""
validate_url_scheme(body.url)
from crawler_pool import get_crawler
try:
cfg = CrawlerRunConfig(pdf=True)
@@ -474,6 +494,7 @@ async def execute_js(
```
"""
validate_url_scheme(body.url)
from crawler_pool import get_crawler
try:
cfg = CrawlerRunConfig(js_code=body.scripts)
@@ -600,6 +621,8 @@ async def crawl(
"""
if not crawl_request.urls:
raise HTTPException(400, "At least one URL required")
if crawl_request.hooks and not HOOKS_ENABLED:
raise HTTPException(403, "Hooks are disabled. Set CRAWL4AI_HOOKS_ENABLED=true to enable.")
# Check whether it is a redirection for a streaming request
crawler_config = CrawlerRunConfig.load(crawl_request.crawler_config)
if crawler_config.stream:
@@ -635,6 +658,8 @@ async def crawl_stream(
):
if not crawl_request.urls:
raise HTTPException(400, "At least one URL required")
if crawl_request.hooks and not HOOKS_ENABLED:
raise HTTPException(403, "Hooks are disabled. Set CRAWL4AI_HOOKS_ENABLED=true to enable.")
return await stream_process(crawl_request=crawl_request)