crawl4ai/deploy/docker/hook_manager.py

"""
Hook Manager for User-Provided Hook Functions
Handles validation, compilation, and safe execution of user-provided hook code
"""

import ast
import asyncio
import traceback
from typing import Dict, Callable, Optional, Tuple, List, Any
import logging

logger = logging.getLogger(__name__)


class UserHookManager:
    """Manages user-provided hook functions with error isolation"""

    # Expected signatures for each hook point
    HOOK_SIGNATURES = {
        "on_browser_created": ["browser"],
        "on_page_context_created": ["page", "context"],
        "before_goto": ["page", "context", "url"],
        "after_goto": ["page", "context", "url", "response"],
        "on_user_agent_updated": ["page", "context", "user_agent"],
        "on_execution_started": ["page", "context"],
        "before_retrieve_html": ["page", "context"],
        "before_return_html": ["page", "context", "html"]
    }

    # Default timeout for hook execution (in seconds)
    DEFAULT_TIMEOUT = 30

    def __init__(self, timeout: int = DEFAULT_TIMEOUT):
        self.timeout = timeout
        self.errors: List[Dict[str, Any]] = []
        self.compiled_hooks: Dict[str, Callable] = {}
        self.execution_log: List[Dict[str, Any]] = []

    def validate_hook_structure(self, hook_code: str, hook_point: str) -> Tuple[bool, str]:
        """
        Validate the structure of user-provided hook code

        Args:
            hook_code: The Python code string containing the hook function
            hook_point: The hook point name (e.g., 'on_page_context_created')

        Returns:
            Tuple of (is_valid, error_message)
        """
        try:
            # Parse the code
            tree = ast.parse(hook_code)

            # Check if it's empty
            if not tree.body:
                return False, "Hook code is empty"

            # Find the function definition
            func_def = None
            for node in tree.body:
                if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                    func_def = node
                    break

            if not func_def:
                return False, "Hook must contain a function definition (def or async def)"

            # Check if it's async (all hooks should be async)
            if not isinstance(func_def, ast.AsyncFunctionDef):
                return False, f"Hook function must be async (use 'async def' instead of 'def')"

            # Get function name for better error messages
            func_name = func_def.name

            # Validate parameters
            expected_params = self.HOOK_SIGNATURES.get(hook_point, [])
            if not expected_params:
                return False, f"Unknown hook point: {hook_point}"

            func_params = [arg.arg for arg in func_def.args.args]

            # Check if it has **kwargs for flexibility
            has_kwargs = func_def.args.kwarg is not None

            # Must have at least the expected parameters
            missing_params = []
            for expected in expected_params:
                if expected not in func_params:
                    missing_params.append(expected)

            if missing_params and not has_kwargs:
                return False, f"Hook function '{func_name}' must accept parameters: {', '.join(expected_params)} (missing: {', '.join(missing_params)})"

            # Check if it returns something (should return page or browser)
            has_return = any(isinstance(node, ast.Return) for node in ast.walk(func_def))
            if not has_return:
                # Warning, not error - we'll handle this
                logger.warning(f"Hook function '{func_name}' should return the {expected_params[0]} object")

            return True, "Valid"

        except SyntaxError as e:
            return False, f"Syntax error at line {e.lineno}: {str(e)}"
        except Exception as e:
            return False, f"Failed to parse hook code: {str(e)}"

    def compile_hook(self, hook_code: str, hook_point: str) -> Optional[Callable]:
        """
        Compile user-provided hook code into a callable function

        Args:
            hook_code: The Python code string
            hook_point: The hook point name

        Returns:
            Compiled function or None if compilation failed
        """
        try:
            # Create a safe namespace for the hook
            # SECURITY: No __import__ to prevent arbitrary module imports (RCE risk)
            import builtins
            safe_builtins = {}

            # Add safe built-in functions (no __import__ for security)
            allowed_builtins = [
                'print', 'len', 'str', 'int', 'float', 'bool',
                'list', 'dict', 'set', 'tuple', 'range', 'enumerate',
                'zip', 'map', 'filter', 'any', 'all', 'sum', 'min', 'max',
                'sorted', 'reversed', 'abs', 'round', 'isinstance', 'type',
                'getattr', 'hasattr', 'setattr', 'callable', 'iter', 'next',
                '__build_class__'  # Required for class definitions in exec
            ]

            for name in allowed_builtins:
                if hasattr(builtins, name):
                    safe_builtins[name] = getattr(builtins, name)

            namespace = {
                '__name__': f'user_hook_{hook_point}',
                '__builtins__': safe_builtins
            }

            # Add commonly needed imports
            exec("import asyncio", namespace)
            exec("import json", namespace)
            exec("import re", namespace)
            exec("from typing import Dict, List, Optional", namespace)

            # Execute the code to define the function
            exec(hook_code, namespace)

            # Find the async function in the namespace
            for name, obj in namespace.items():
                if callable(obj) and not name.startswith('_') and asyncio.iscoroutinefunction(obj):
                    return obj

            # If no async function found, look for any function
            for name, obj in namespace.items():
                if callable(obj) and not name.startswith('_'):
                    logger.warning(f"Found non-async function '{name}' - wrapping it")
                    # Wrap sync function in async
                    async def async_wrapper(*args, **kwargs):
                        return obj(*args, **kwargs)
                    return async_wrapper

            raise ValueError("No callable function found in hook code")

        except Exception as e:
            error = {
                'hook_point': hook_point,
                'error': f"Failed to compile hook: {str(e)}",
                'type': 'compilation_error',
                'traceback': traceback.format_exc()
            }
            self.errors.append(error)
            logger.error(f"Hook compilation failed for {hook_point}: {str(e)}")
            return None

    async def execute_hook_safely(
        self,
        hook_func: Callable,
        hook_point: str,
        *args,
        **kwargs
    ) -> Tuple[Any, Optional[Dict]]:
        """
        Execute a user hook with error isolation and timeout

        Args:
            hook_func: The compiled hook function
            hook_point: The hook point name
            *args, **kwargs: Arguments to pass to the hook

        Returns:
            Tuple of (result, error_dict)
        """
        start_time = asyncio.get_event_loop().time()

        try:
            # Add timeout to prevent infinite loops
            result = await asyncio.wait_for(
                hook_func(*args, **kwargs),
                timeout=self.timeout
            )

            # Log successful execution
            execution_time = asyncio.get_event_loop().time() - start_time
            self.execution_log.append({
                'hook_point': hook_point,
                'status': 'success',
                'execution_time': execution_time,
                'timestamp': start_time
            })

            return result, None

        except asyncio.TimeoutError:
            error = {
                'hook_point': hook_point,
                'error': f'Hook execution timed out ({self.timeout}s limit)',
                'type': 'timeout',
                'execution_time': self.timeout
            }
            self.errors.append(error)
            self.execution_log.append({
                'hook_point': hook_point,
                'status': 'timeout',
                'error': error['error'],
                'execution_time': self.timeout,
                'timestamp': start_time
            })
            # Return the first argument (usually page/browser) to continue
            return args[0] if args else None, error

        except Exception as e:
            execution_time = asyncio.get_event_loop().time() - start_time
            error = {
                'hook_point': hook_point,
                'error': str(e),
                'type': type(e).__name__,
                'traceback': traceback.format_exc(),
                'execution_time': execution_time
            }
            self.errors.append(error)
            self.execution_log.append({
                'hook_point': hook_point,
                'status': 'failed',
                'error': str(e),
                'error_type': type(e).__name__,
                'execution_time': execution_time,
                'timestamp': start_time
            })
            # Return the first argument (usually page/browser) to continue
            return args[0] if args else None, error

    def get_summary(self) -> Dict[str, Any]:
        """Get a summary of hook execution"""
        total_hooks = len(self.execution_log)
        successful = sum(1 for log in self.execution_log if log['status'] == 'success')
        failed = sum(1 for log in self.execution_log if log['status'] == 'failed')
        timed_out = sum(1 for log in self.execution_log if log['status'] == 'timeout')

        return {
            'total_executions': total_hooks,
            'successful': successful,
            'failed': failed,
            'timed_out': timed_out,
            'success_rate': (successful / total_hooks * 100) if total_hooks > 0 else 0,
            'total_errors': len(self.errors)
        }


class IsolatedHookWrapper:
    """Wraps user hooks with error isolation and reporting"""

    def __init__(self, hook_manager: UserHookManager):
        self.hook_manager = hook_manager

    def create_hook_wrapper(self, user_hook: Callable, hook_point: str) -> Callable:
        """
        Create a wrapper that isolates hook errors from main process

        Args:
            user_hook: The compiled user hook function
            hook_point: The hook point name

        Returns:
            Wrapped async function that handles errors gracefully
        """

        async def wrapped_hook(*args, **kwargs):
            """Wrapped hook with error isolation"""
            # Get the main return object (page/browser)
            # This ensures we always have something to return
            return_obj = None
            if args:
                return_obj = args[0]
            elif 'page' in kwargs:
                return_obj = kwargs['page']
            elif 'browser' in kwargs:
                return_obj = kwargs['browser']

            try:
                # Execute user hook with safety
                result, error = await self.hook_manager.execute_hook_safely(
                    user_hook,
                    hook_point,
                    *args,
                    **kwargs
                )

                if error:
                    # Hook failed but we continue with original object
                    logger.warning(f"User hook failed at {hook_point}: {error['error']}")
                    return return_obj

                # Hook succeeded - return its result or the original object
                if result is None:
                    logger.debug(f"Hook at {hook_point} returned None, using original object")
                    return return_obj

                return result

            except Exception as e:
                # This should rarely happen due to execute_hook_safely
                logger.error(f"Unexpected error in hook wrapper for {hook_point}: {e}")
                return return_obj

        # Set function name for debugging
        wrapped_hook.__name__ = f"wrapped_{hook_point}"
        return wrapped_hook


async def process_user_hooks(
    hooks_input: Dict[str, str],
    timeout: int = 30
) -> Tuple[Dict[str, Callable], List[Dict], UserHookManager]:
    """
    Process and compile user-provided hook functions

    Args:
        hooks_input: Dictionary mapping hook points to code strings
        timeout: Timeout for each hook execution

    Returns:
        Tuple of (compiled_hooks, validation_errors, hook_manager)
    """

    hook_manager = UserHookManager(timeout=timeout)
    wrapper = IsolatedHookWrapper(hook_manager)
    compiled_hooks = {}
    validation_errors = []

    for hook_point, hook_code in hooks_input.items():
        # Skip empty hooks
        if not hook_code or not hook_code.strip():
            continue

        # Validate hook point
        if hook_point not in UserHookManager.HOOK_SIGNATURES:
            validation_errors.append({
                'hook_point': hook_point,
                'error': f'Unknown hook point. Valid points: {", ".join(UserHookManager.HOOK_SIGNATURES.keys())}',
                'code_preview': hook_code[:100] + '...' if len(hook_code) > 100 else hook_code
            })
            continue

        # Validate structure
        is_valid, message = hook_manager.validate_hook_structure(hook_code, hook_point)
        if not is_valid:
            validation_errors.append({
                'hook_point': hook_point,
                'error': message,
                'code_preview': hook_code[:100] + '...' if len(hook_code) > 100 else hook_code
            })
            continue

        # Compile the hook
        hook_func = hook_manager.compile_hook(hook_code, hook_point)
        if hook_func:
            # Wrap with error isolation
            wrapped_hook = wrapper.create_hook_wrapper(hook_func, hook_point)
            compiled_hooks[hook_point] = wrapped_hook
            logger.info(f"Successfully compiled hook for {hook_point}")
        else:
            validation_errors.append({
                'hook_point': hook_point,
                'error': 'Failed to compile hook function - check syntax and structure',
                'code_preview': hook_code[:100] + '...' if len(hook_code) > 100 else hook_code
            })

    return compiled_hooks, validation_errors, hook_manager


async def process_user_hooks_with_manager(
    hooks_input: Dict[str, str],
    hook_manager: UserHookManager
) -> Tuple[Dict[str, Callable], List[Dict]]:
    """
    Process and compile user-provided hook functions with existing manager

    Args:
        hooks_input: Dictionary mapping hook points to code strings
        hook_manager: Existing UserHookManager instance

    Returns:
        Tuple of (compiled_hooks, validation_errors)
    """

    wrapper = IsolatedHookWrapper(hook_manager)
    compiled_hooks = {}
    validation_errors = []

    for hook_point, hook_code in hooks_input.items():
        # Skip empty hooks
        if not hook_code or not hook_code.strip():
            continue

        # Validate hook point
        if hook_point not in UserHookManager.HOOK_SIGNATURES:
            validation_errors.append({
                'hook_point': hook_point,
                'error': f'Unknown hook point. Valid points: {", ".join(UserHookManager.HOOK_SIGNATURES.keys())}',
                'code_preview': hook_code[:100] + '...' if len(hook_code) > 100 else hook_code
            })
            continue

        # Validate structure
        is_valid, message = hook_manager.validate_hook_structure(hook_code, hook_point)
        if not is_valid:
            validation_errors.append({
                'hook_point': hook_point,
                'error': message,
                'code_preview': hook_code[:100] + '...' if len(hook_code) > 100 else hook_code
            })
            continue

        # Compile the hook
        hook_func = hook_manager.compile_hook(hook_code, hook_point)
        if hook_func:
            # Wrap with error isolation
            wrapped_hook = wrapper.create_hook_wrapper(hook_func, hook_point)
            compiled_hooks[hook_point] = wrapped_hook
            logger.info(f"Successfully compiled hook for {hook_point}")
        else:
            validation_errors.append({
                'hook_point': hook_point,
                'error': 'Failed to compile hook function - check syntax and structure',
                'code_preview': hook_code[:100] + '...' if len(hook_code) > 100 else hook_code
            })

    return compiled_hooks, validation_errors


async def attach_user_hooks_to_crawler(
    crawler,  # AsyncWebCrawler instance
    user_hooks: Dict[str, str],
    timeout: int = 30,
    hook_manager: Optional[UserHookManager] = None
) -> Tuple[Dict[str, Any], UserHookManager]:
    """
    Attach user-provided hooks to crawler with full error reporting

    Args:
        crawler: AsyncWebCrawler instance
        user_hooks: Dictionary mapping hook points to code strings
        timeout: Timeout for each hook execution
        hook_manager: Optional existing UserHookManager instance

    Returns:
        Tuple of (status_dict, hook_manager)
    """

    # Use provided hook_manager or create a new one
    if hook_manager is None:
        hook_manager = UserHookManager(timeout=timeout)

    # Process hooks with the hook_manager
    compiled_hooks, validation_errors = await process_user_hooks_with_manager(
        user_hooks, hook_manager
    )

    # Log validation errors
    if validation_errors:
        logger.warning(f"Hook validation errors: {validation_errors}")

    # Attach successfully compiled hooks
    attached_hooks = []
    for hook_point, wrapped_hook in compiled_hooks.items():
        try:
            crawler.crawler_strategy.set_hook(hook_point, wrapped_hook)
            attached_hooks.append(hook_point)
            logger.info(f"Attached hook to {hook_point}")
        except Exception as e:
            logger.error(f"Failed to attach hook to {hook_point}: {e}")
            validation_errors.append({
                'hook_point': hook_point,
                'error': f'Failed to attach hook: {str(e)}'
            })

    status = 'success' if not validation_errors else ('partial' if attached_hooks else 'failed')

    status_dict = {
        'status': status,
        'attached_hooks': attached_hooks,
        'validation_errors': validation_errors,
        'total_hooks_provided': len(user_hooks),
        'successfully_attached': len(attached_hooks),
        'failed_validation': len(validation_errors)
    }

    return status_dict, hook_manager