Fix critical RCE via deserialization and eval() in /crawl endpoint

- Replace raw eval() in _compute_field() with AST-validated
  _safe_eval_expression() that blocks __import__, dunder attribute
  access, and import statements while preserving safe transforms
- Add ALLOWED_DESERIALIZE_TYPES allowlist to from_serializable_dict()
  preventing arbitrary class instantiation from API input
- Update security contact email and add v0.8.1 security fixes to
  SECURITY.md with researcher acknowledgment
- Add 17 security tests covering both fixes
This commit is contained in:
unclecode
2026-01-30 08:46:01 +00:00
parent ad5ebf166a
commit 0104db6de2
4 changed files with 288 additions and 7 deletions

View File

@@ -1,4 +1,5 @@
from abc import ABC, abstractmethod
import ast
import inspect
from typing import Any, List, Dict, Optional, Tuple, Pattern, Union
from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -1001,6 +1002,69 @@ class LLMExtractionStrategy(ExtractionStrategy):
#######################################################
# New extraction strategies for JSON-based extraction #
#######################################################
# Safe builtins allowed in computed field expressions.
# This dict is the allowlist template; each evaluation receives its own copy
# so an expression can never alter what later evaluations are allowed to use.
_SAFE_EVAL_BUILTINS = {
    "str": str, "int": int, "float": float, "bool": bool,
    "len": len, "round": round, "abs": abs, "min": min, "max": max,
    "sum": sum, "sorted": sorted, "reversed": reversed,
    "list": list, "dict": dict, "tuple": tuple, "set": set,
    "enumerate": enumerate, "zip": zip, "map": map, "filter": filter,
    "any": any, "all": all, "range": range,
    "True": True, "False": False, "None": None,
    "isinstance": isinstance, "type": type,
}


def _safe_eval_expression(expression: str, local_vars: dict) -> Any:
    """
    Evaluate a computed field expression safely using AST validation.

    Allows simple transforms (math, string methods, attribute access on data)
    while blocking dangerous operations (__import__, underscore-prefixed
    attribute access, dunder names, etc.).

    Args:
        expression: The Python expression string to evaluate.
        local_vars: The local variables (extracted item fields) available to
            the expression.

    Returns:
        The result of evaluating the expression.

    Raises:
        ValueError: If the expression has invalid syntax or contains
            disallowed constructs.
    """
    try:
        tree = ast.parse(expression, mode="eval")
    except SyntaxError as e:
        raise ValueError(f"Invalid expression syntax: {e}") from e

    for node in ast.walk(tree):
        # Block import statements. "eval" mode should never yield these
        # nodes; the check is kept as defense in depth.
        if isinstance(node, (ast.Import, ast.ImportFrom)):
            raise ValueError("Import statements are not allowed in expressions")

        # Block attribute access to any underscore-prefixed attribute
        # (e.g. __class__, __globals__, _private).
        if isinstance(node, ast.Attribute) and node.attr.startswith("_"):
            raise ValueError(
                f"Access to private/dunder attribute '{node.attr}' is not allowed"
            )

        # Block calls to __import__ or any name/attribute starting with "_".
        if isinstance(node, ast.Call):
            func = node.func
            if isinstance(func, ast.Name) and func.id.startswith("_"):
                raise ValueError(
                    f"Calling '{func.id}' is not allowed in expressions"
                )
            if isinstance(func, ast.Attribute) and func.attr.startswith("_"):
                raise ValueError(
                    f"Calling '{func.attr}' is not allowed in expressions"
                )

        # Block bare references to double-underscore names. Without this,
        # `__builtins__` resolves to the builtins mapping handed to eval()
        # and can be mutated via ordinary dict methods (clear/pop/update).
        # Single-underscore names stay readable so fields such as `_id`
        # keep working.
        if isinstance(node, ast.Name) and node.id.startswith("__"):
            raise ValueError(
                f"Access to dunder name '{node.id}' is not allowed in expressions"
            )

    # Hand eval() a per-call copy so the module-level allowlist cannot be
    # modified from inside an expression.
    safe_globals = {"__builtins__": dict(_SAFE_EVAL_BUILTINS)}

    # SECURITY: this still runs eval() on externally supplied text. The AST
    # checks above are a denylist, not a full sandbox.
    # NOTE(review): str.format()/format_map() field syntax ("{0.__class__}")
    # performs attribute lookups that are invisible to the AST walk; the
    # result is only a string, but confirm that is acceptable for this API.
    return eval(compile(tree, "<expression>", "eval"), safe_globals, local_vars)
class JsonElementExtractionStrategy(ExtractionStrategy):
"""
Abstract base class for extracting structured JSON from HTML content.
@@ -1236,7 +1300,7 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
def _compute_field(self, item, field):
try:
if "expression" in field:
return eval(field["expression"], {}, item)
return _safe_eval_expression(field["expression"], item)
elif "function" in field:
return field["function"](item)
except Exception as e: