Fix critical RCE via deserialization and eval() in /crawl endpoint

- Replace raw eval() in _compute_field() with AST-validated _safe_eval_expression() that blocks __import__, dunder attribute access, and import statements while preserving safe transforms
- Add ALLOWED_DESERIALIZE_TYPES allowlist to from_serializable_dict() preventing arbitrary class instantiation from API input
- Update security contact email and add v0.8.1 security fixes to SECURITY.md with researcher acknowledgment
- Add 17 security tests covering both fixes
2026-01-30 08:46:01 +00:00
parent ad5ebf166a
commit 0104db6de2
4 changed files with 288 additions and 7 deletions
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -1,4 +1,5 @@
 from abc import ABC, abstractmethod
+import ast
 import inspect
 from typing import Any, List, Dict, Optional, Tuple, Pattern, Union
 from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -1001,6 +1002,69 @@ class LLMExtractionStrategy(ExtractionStrategy):
 #######################################################
 # New extraction strategies for JSON-based extraction #
 #######################################################
+
+# Allowlist of builtins exposed to computed field expressions (installed as __builtins__ for eval)
+_SAFE_EVAL_BUILTINS = {
+    "str": str, "int": int, "float": float, "bool": bool,
+    "len": len, "round": round, "abs": abs, "min": min, "max": max,
+    "sum": sum, "sorted": sorted, "reversed": reversed,
+    "list": list, "dict": dict, "tuple": tuple, "set": set,
+    "enumerate": enumerate, "zip": zip, "map": map, "filter": filter,
+    "any": any, "all": all, "range": range,
+    "True": True, "False": False, "None": None,
+    "isinstance": isinstance, "type": type,
+}
+
+
+def _safe_eval_expression(expression: str, local_vars: dict) -> Any:
+    """
+    Evaluate a computed field expression after validating its AST.
+
+    Rejects names/attributes starting with "_" and the introspection
+    attributes of generators, frames, tracebacks and code objects (gi_frame,
+    f_back, f_globals, ...) that reach the real builtins without underscores.
+    NOTE(security): denylist only; str.format() field lookups are not checked.
+
+    Args:
+        expression: The Python expression string to evaluate.
+        local_vars: The local variables (extracted item fields) available to the expression.
+
+    Returns:
+        The result of evaluating the expression.
+
+    Raises:
+        ValueError: If the expression fails to parse (statements such as
+            imports always do in "eval" mode) or uses a disallowed construct.
+    """
+    try:
+        tree = ast.parse(expression, mode="eval")
+    except SyntaxError as e:
+        raise ValueError(f"Invalid expression syntax: {e}") from e
+
+    introspection = ("gi_", "cr_", "ag_", "f_", "tb_", "co_")  # frame/code attribute prefixes
+    for node in ast.walk(tree):
+        if isinstance(node, ast.Attribute):
+            # Block private/dunder attributes (e.g., __class__, __globals__)
+            if node.attr.startswith("_"):
+                raise ValueError(
+                    f"Access to private/dunder attribute '{node.attr}' is not allowed"
+                )
+            if node.attr.startswith(introspection):
+                raise ValueError(
+                    f"Access to introspection attribute '{node.attr}' is not allowed"
+                )
+        elif isinstance(node, ast.Call):
+            # Block calls to __import__ or any other "_"-prefixed callable
+            func = node.func
+            if isinstance(func, (ast.Name, ast.Attribute)):
+                called = func.id if isinstance(func, ast.Name) else func.attr
+                if called.startswith("_"):
+                    raise ValueError(f"Calling '{called}' is not allowed in expressions")
+
+    safe_globals = {"__builtins__": _SAFE_EVAL_BUILTINS}
+    return eval(compile(tree, "<expression>", "eval"), safe_globals, local_vars)
+
+
 class JsonElementExtractionStrategy(ExtractionStrategy):
    """
    Abstract base class for extracting structured JSON from HTML content.
@@ -1236,7 +1300,7 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
    def _compute_field(self, item, field):
        try:
            if "expression" in field:
-                return eval(field["expression"], {}, item)
+                return _safe_eval_expression(field["expression"], item)
            elif "function" in field:
                return field["function"](item)
        except Exception as e: