diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 2306a0a6..c7f9e739 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -28,6 +28,10 @@ from typing import Any, Dict, Optional from enum import Enum from .proxy_strategy import ProxyConfig +try: + from .browser.docker_config import DockerConfig +except ImportError: + DockerConfig = None def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict: @@ -173,6 +177,7 @@ class BrowserConfig: "builtin" - use the builtin CDP browser running in background "dedicated" - create a new dedicated browser instance each time "custom" - use explicit CDP settings provided in cdp_url + "docker" - run browser in Docker container with isolation Default: "dedicated" use_managed_browser (bool): Launch the browser using a managed approach (e.g., via CDP), allowing advanced manipulation. Default: False. @@ -190,6 +195,8 @@ class BrowserConfig: Default: None. proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}. If None, no additional proxy config. Default: None. + docker_config (DockerConfig or dict or None): Configuration for Docker-based browser automation. + Contains settings for Docker container operation. Default: None. viewport_width (int): Default viewport width for pages. Default: 1080. viewport_height (int): Default viewport height for pages. Default: 600. viewport (dict): Default viewport dimensions for pages. If set, overrides viewport_width and viewport_height. 
@@ -235,6 +242,7 @@ class BrowserConfig: channel: str = "chromium", proxy: str = None, proxy_config: Union[ProxyConfig, dict, None] = None, + docker_config: Union["DockerConfig", dict, None] = None, viewport_width: int = 1080, viewport_height: int = 600, viewport: dict = None, @@ -275,6 +283,12 @@ class BrowserConfig: self.chrome_channel = "" self.proxy = proxy self.proxy_config = proxy_config + + # Handle docker configuration + if isinstance(docker_config, dict) and DockerConfig is not None: + self.docker_config = DockerConfig.from_kwargs(docker_config) + else: + self.docker_config = docker_config self.viewport_width = viewport_width self.viewport_height = viewport_height self.viewport = viewport @@ -315,6 +329,10 @@ class BrowserConfig: # Builtin mode uses managed browser connecting to builtin CDP endpoint self.use_managed_browser = True # cdp_url will be set later by browser_manager + elif self.browser_mode == "docker": + # Docker mode uses managed browser with CDP to connect to browser in container + self.use_managed_browser = True + # cdp_url will be set later by docker browser strategy elif self.browser_mode == "custom" and self.cdp_url: # Custom mode with explicit CDP URL self.use_managed_browser = True @@ -340,6 +358,7 @@ class BrowserConfig: channel=kwargs.get("channel", "chromium"), proxy=kwargs.get("proxy"), proxy_config=kwargs.get("proxy_config", None), + docker_config=kwargs.get("docker_config", None), viewport_width=kwargs.get("viewport_width", 1080), viewport_height=kwargs.get("viewport_height", 600), accept_downloads=kwargs.get("accept_downloads", False), @@ -364,7 +383,7 @@ class BrowserConfig: ) def to_dict(self): - return { + result = { "browser_type": self.browser_type, "headless": self.headless, "browser_mode": self.browser_mode, @@ -396,6 +415,15 @@ class BrowserConfig: "debugging_port": self.debugging_port, "host": self.host, } + + # Include docker_config if it exists + if hasattr(self, "docker_config") and self.docker_config is not None: + 
if hasattr(self.docker_config, "to_dict"): + result["docker_config"] = self.docker_config.to_dict() + else: + result["docker_config"] = self.docker_config + + return result def clone(self, **kwargs): """Create a copy of this configuration with updated values. diff --git a/crawl4ai/browser/docker/connect.Dockerfile b/crawl4ai/browser/docker/connect.Dockerfile new file mode 100644 index 00000000..d2d955b6 --- /dev/null +++ b/crawl4ai/browser/docker/connect.Dockerfile @@ -0,0 +1,61 @@ +FROM ubuntu:22.04 + +# Install dependencies with comprehensive Chromium support +RUN apt-get update && apt-get install -y --no-install-recommends \ + wget \ + gnupg \ + ca-certificates \ + fonts-liberation \ + # Sound support + libasound2 \ + # Accessibility support + libatspi2.0-0 \ + libatk1.0-0 \ + libatk-bridge2.0-0 \ + # Graphics and rendering + libdrm2 \ + libgbm1 \ + libgtk-3-0 \ + libxcomposite1 \ + libxdamage1 \ + libxext6 \ + libxfixes3 \ + libxrandr2 \ + # X11 and window system + libx11-6 \ + libxcb1 \ + libxkbcommon0 \ + # Text and internationalization + libpango-1.0-0 \ + libcairo2 \ + # Printing support + libcups2 \ + # System libraries + libdbus-1-3 \ + libnss3 \ + libnspr4 \ + libglib2.0-0 \ + # Utilities + xdg-utils \ + socat \ + # Process management + procps \ + # Clean up + && rm -rf /var/lib/apt/lists/* + +# Install Chrome +RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \ + echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list && \ + apt-get update && \ + apt-get install -y google-chrome-stable && \ + rm -rf /var/lib/apt/lists/* + +# Create data directory for user data +RUN mkdir -p /data && chmod 777 /data + +# Add a startup script +COPY start.sh /start.sh +RUN chmod +x /start.sh + +# Set entrypoint +ENTRYPOINT ["/start.sh"] \ No newline at end of file diff --git a/crawl4ai/browser/docker/launch.Dockerfile b/crawl4ai/browser/docker/launch.Dockerfile new file mode 
100644 index 00000000..042f724d --- /dev/null +++ b/crawl4ai/browser/docker/launch.Dockerfile @@ -0,0 +1,57 @@ +FROM ubuntu:22.04 + +# Install dependencies with comprehensive Chromium support +RUN apt-get update && apt-get install -y --no-install-recommends \ + wget \ + gnupg \ + ca-certificates \ + fonts-liberation \ + # Sound support + libasound2 \ + # Accessibility support + libatspi2.0-0 \ + libatk1.0-0 \ + libatk-bridge2.0-0 \ + # Graphics and rendering + libdrm2 \ + libgbm1 \ + libgtk-3-0 \ + libxcomposite1 \ + libxdamage1 \ + libxext6 \ + libxfixes3 \ + libxrandr2 \ + # X11 and window system + libx11-6 \ + libxcb1 \ + libxkbcommon0 \ + # Text and internationalization + libpango-1.0-0 \ + libcairo2 \ + # Printing support + libcups2 \ + # System libraries + libdbus-1-3 \ + libnss3 \ + libnspr4 \ + libglib2.0-0 \ + # Utilities + xdg-utils \ + socat \ + # Process management + procps \ + # Clean up + && rm -rf /var/lib/apt/lists/* + +# Install Chrome +RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \ + echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list && \ + apt-get update && \ + apt-get install -y google-chrome-stable && \ + rm -rf /var/lib/apt/lists/* + +# Create data directory for user data +RUN mkdir -p /data && chmod 777 /data + +# Keep container running without starting Chrome +CMD ["tail", "-f", "/dev/null"] \ No newline at end of file diff --git a/crawl4ai/browser/docker_config.py b/crawl4ai/browser/docker_config.py new file mode 100644 index 00000000..a63c480c --- /dev/null +++ b/crawl4ai/browser/docker_config.py @@ -0,0 +1,133 @@ +"""Docker configuration module for Crawl4AI browser automation. + +This module provides configuration classes for Docker-based browser automation, +allowing flexible configuration of Docker containers for browsing. 
+""" + +from typing import Dict, List, Optional, Union + + +class DockerConfig: + """Configuration for Docker-based browser automation. + + This class contains Docker-specific settings to avoid cluttering BrowserConfig. + + Attributes: + mode (str): Docker operation mode - "connect" or "launch". + - "connect": Uses a container with Chrome already running + - "launch": Dynamically configures and starts Chrome in container + image (str): Docker image to use. If None, defaults from DockerUtils are used. + registry_file (str): Path to container registry file for persistence. + persistent (bool): Keep container running after browser closes. + remove_on_exit (bool): Remove container on exit when not persistent. + network (str): Docker network to use. + volumes (List[str]): Volume mappings (e.g., ["host_path:container_path"]). + env_vars (Dict[str, str]): Environment variables to set in container. + extra_args (List[str]): Additional docker run arguments. + host_port (int): Host port to map to container's 9223 port. + user_data_dir (str): Path to user data directory on host. + container_user_data_dir (str): Path to user data directory in container. 
+ """ + + def __init__( + self, + mode: str = "connect", # "connect" or "launch" + image: Optional[str] = None, # Docker image to use + registry_file: Optional[str] = None, # Path to registry file + persistent: bool = False, # Keep container running after browser closes + remove_on_exit: bool = True, # Remove container on exit when not persistent + network: Optional[str] = None, # Docker network to use + volumes: List[str] = None, # Volume mappings + env_vars: Dict[str, str] = None, # Environment variables + extra_args: List[str] = None, # Additional docker run arguments + host_port: Optional[int] = None, # Host port to map to container's 9223 + user_data_dir: Optional[str] = None, # Path to user data directory on host + container_user_data_dir: str = "/data", # Path to user data directory in container + ): + """Initialize Docker configuration. + + Args: + mode: Docker operation mode ("connect" or "launch") + image: Docker image to use + registry_file: Path to container registry file + persistent: Whether to keep container running after browser closes + remove_on_exit: Whether to remove container on exit when not persistent + network: Docker network to use + volumes: Volume mappings as list of strings + env_vars: Environment variables as dictionary + extra_args: Additional docker run arguments + host_port: Host port to map to container's 9223 + user_data_dir: Path to user data directory on host + container_user_data_dir: Path to user data directory in container + """ + self.mode = mode + self.image = image # If None, defaults will be used from DockerUtils + self.registry_file = registry_file + self.persistent = persistent + self.remove_on_exit = remove_on_exit + self.network = network + self.volumes = volumes or [] + self.env_vars = env_vars or {} + self.extra_args = extra_args or [] + self.host_port = host_port + self.user_data_dir = user_data_dir + self.container_user_data_dir = container_user_data_dir + + def to_dict(self) -> Dict: + """Convert this 
configuration to a dictionary. + + Returns: + Dictionary representation of this configuration + """ + return { + "mode": self.mode, + "image": self.image, + "registry_file": self.registry_file, + "persistent": self.persistent, + "remove_on_exit": self.remove_on_exit, + "network": self.network, + "volumes": self.volumes, + "env_vars": self.env_vars, + "extra_args": self.extra_args, + "host_port": self.host_port, + "user_data_dir": self.user_data_dir, + "container_user_data_dir": self.container_user_data_dir + } + + @staticmethod + def from_kwargs(kwargs: Dict) -> "DockerConfig": + """Create a DockerConfig from a dictionary of keyword arguments. + + Args: + kwargs: Dictionary of configuration options + + Returns: + New DockerConfig instance + """ + return DockerConfig( + mode=kwargs.get("mode", "connect"), + image=kwargs.get("image"), + registry_file=kwargs.get("registry_file"), + persistent=kwargs.get("persistent", False), + remove_on_exit=kwargs.get("remove_on_exit", True), + network=kwargs.get("network"), + volumes=kwargs.get("volumes"), + env_vars=kwargs.get("env_vars"), + extra_args=kwargs.get("extra_args"), + host_port=kwargs.get("host_port"), + user_data_dir=kwargs.get("user_data_dir"), + container_user_data_dir=kwargs.get("container_user_data_dir", "/data") + ) + + def clone(self, **kwargs) -> "DockerConfig": + """Create a copy of this configuration with updated values. + + Args: + **kwargs: Key-value pairs of configuration options to update + + Returns: + DockerConfig: A new instance with the specified updates + """ + config_dict = self.to_dict() + config_dict.update(kwargs) + return DockerConfig.from_kwargs(config_dict) \ No newline at end of file diff --git a/crawl4ai/browser/docker_registry.py b/crawl4ai/browser/docker_registry.py new file mode 100644 index 00000000..91f81c5e --- /dev/null +++ b/crawl4ai/browser/docker_registry.py @@ -0,0 +1,174 @@ +"""Docker registry module for Crawl4AI. 
+ +This module provides a registry system for tracking and reusing Docker containers +across browser sessions, improving performance and resource utilization. +""" + +import os +import json +import time +from typing import Dict, Optional + +from ..utils import get_home_folder + + +class DockerRegistry: + """Manages a registry of Docker containers used for browser automation. + + This registry tracks containers by configuration hash, allowing reuse of appropriately + configured containers instead of creating new ones for each session. + + Attributes: + registry_file (str): Path to the registry file + containers (dict): Dictionary of container information + port_map (dict): Map of host ports to container IDs + last_port (int): Last port assigned + """ + + def __init__(self, registry_file: Optional[str] = None): + """Initialize the registry with an optional path to the registry file. + + Args: + registry_file: Path to the registry file. If None, uses default path. + """ + self.registry_file = registry_file or os.path.join(get_home_folder(), "docker_browser_registry.json") + self.containers = {} + self.port_map = {} + self.last_port = 9222 + self.load() + + def load(self): + """Load container registry from file.""" + if os.path.exists(self.registry_file): + try: + with open(self.registry_file, 'r') as f: + registry_data = json.load(f) + self.containers = registry_data.get("containers", {}) + self.port_map = registry_data.get("ports", {}) + self.last_port = registry_data.get("last_port", 9222) + except Exception: + # Reset to defaults on error + self.containers = {} + self.port_map = {} + self.last_port = 9222 + else: + # Initialize with defaults if file doesn't exist + self.containers = {} + self.port_map = {} + self.last_port = 9222 + + def save(self): + """Save container registry to file.""" + os.makedirs(os.path.dirname(self.registry_file), exist_ok=True) + with open(self.registry_file, 'w') as f: + json.dump({ + "containers": self.containers, + "ports": 
self.port_map, + "last_port": self.last_port + }, f, indent=2) + + def register_container(self, container_id: str, host_port: int, config_hash: str): + """Register a container with its configuration hash and port mapping. + + Args: + container_id: Docker container ID + host_port: Host port mapped to container + config_hash: Hash of configuration used to create container + """ + self.containers[container_id] = { + "host_port": host_port, + "config_hash": config_hash, + "created_at": time.time() + } + self.port_map[str(host_port)] = container_id + self.save() + + def unregister_container(self, container_id: str): + """Unregister a container. + + Args: + container_id: Docker container ID to unregister + """ + if container_id in self.containers: + host_port = self.containers[container_id]["host_port"] + if str(host_port) in self.port_map: + del self.port_map[str(host_port)] + del self.containers[container_id] + self.save() + + def find_container_by_config(self, config_hash: str, docker_utils) -> Optional[str]: + """Find a container that matches the given configuration hash. + + Args: + config_hash: Hash of configuration to match + docker_utils: DockerUtils instance to check running containers + + Returns: + Container ID if found, None otherwise + """ + for container_id, data in self.containers.items(): + if data["config_hash"] == config_hash and docker_utils.is_container_running(container_id): + return container_id + return None + + def get_container_host_port(self, container_id: str) -> Optional[int]: + """Get the host port mapped to the container. + + Args: + container_id: Docker container ID + + Returns: + Host port if container is registered, None otherwise + """ + if container_id in self.containers: + return self.containers[container_id]["host_port"] + return None + + def get_next_available_port(self, docker_utils) -> int: + """Get the next available host port for Docker mapping. 
+
+        Args:
+            docker_utils: DockerUtils instance to check port availability
+
+        Returns:
+            Available port number
+        """
+        # Start from last port + 1
+        port = self.last_port + 1
+
+        # Check if port is in use (either in our registry or system-wide)
+        # NOTE: port_map keys are stored as strings (see register_container
+        # and the JSON round-trip in load()), so the membership test must
+        # use str(port); an int is never found in port_map.
+        while str(port) in self.port_map or docker_utils.is_port_in_use(port):
+            port += 1
+
+        # Update last port
+        self.last_port = port
+        self.save()
+
+        return port
+
+    def get_container_config_hash(self, container_id: str) -> Optional[str]:
+        """Get the configuration hash for a container.
+
+        Args:
+            container_id: Docker container ID
+
+        Returns:
+            Configuration hash if container is registered, None otherwise
+        """
+        if container_id in self.containers:
+            return self.containers[container_id]["config_hash"]
+        return None
+
+    def cleanup_stale_containers(self, docker_utils):
+        """Clean up containers that are no longer running.
+
+        Args:
+            docker_utils: DockerUtils instance to check container status
+        """
+        to_remove = []
+        for container_id in self.containers:
+            if not docker_utils.is_container_running(container_id):
+                to_remove.append(container_id)
+
+        for container_id in to_remove:
+            self.unregister_container(container_id)
\ No newline at end of file
diff --git a/crawl4ai/browser/docker_strategy.py b/crawl4ai/browser/docker_strategy.py
new file mode 100644
index 00000000..639abd84
--- /dev/null
+++ b/crawl4ai/browser/docker_strategy.py
@@ -0,0 +1,286 @@
+"""Docker browser strategy module for Crawl4AI.
+
+This module provides browser strategies for running browsers in Docker containers,
+which offers better isolation, consistency across platforms, and easy scaling.
+""" + +import os +import uuid +import asyncio +from typing import Dict, List, Optional, Tuple, Union +from pathlib import Path + +from playwright.async_api import Page, BrowserContext + +from ..async_logger import AsyncLogger +from ..async_configs import BrowserConfig, CrawlerRunConfig +from .docker_config import DockerConfig +from .docker_registry import DockerRegistry +from .docker_utils import DockerUtils +from .strategies import BuiltinBrowserStrategy + + +class DockerBrowserStrategy(BuiltinBrowserStrategy): + """Docker-based browser strategy. + + Extends the BuiltinBrowserStrategy to run browsers in Docker containers. + Supports two modes: + 1. "connect" - Uses a Docker image with Chrome already running + 2. "launch" - Starts Chrome within the container with custom settings + + Attributes: + docker_config: Docker-specific configuration options + container_id: ID of current Docker container + container_name: Name assigned to the container + registry: Registry for tracking and reusing containers + docker_utils: Utilities for Docker operations + chrome_process_id: Process ID of Chrome within container + socat_process_id: Process ID of socat within container + internal_cdp_port: Chrome's internal CDP port + internal_mapped_port: Port that socat maps to internally + """ + + def __init__(self, config: BrowserConfig, logger: Optional[AsyncLogger] = None): + """Initialize the Docker browser strategy. 
+ + Args: + config: Browser configuration including Docker-specific settings + logger: Logger for recording events and errors + """ + super().__init__(config, logger) + + # Initialize Docker-specific attributes + self.docker_config = self.config.docker_config or DockerConfig() + self.container_id = None + self.container_name = f"crawl4ai-browser-{uuid.uuid4().hex[:8]}" + self.registry = DockerRegistry(self.docker_config.registry_file) + self.docker_utils = DockerUtils(logger) + self.chrome_process_id = None + self.socat_process_id = None + self.internal_cdp_port = 9222 # Chrome's internal CDP port + self.internal_mapped_port = 9223 # Port that socat maps to internally + self.shutting_down = False + + async def _generate_config_hash(self) -> str: + """Generate a hash of the configuration for container matching. + + Returns: + Hash string uniquely identifying this configuration + """ + # Create a dict with the relevant parts of the config + config_dict = { + "image": self.docker_config.image, + "mode": self.docker_config.mode, + "browser_type": self.config.browser_type, + "headless": self.config.headless, + } + + # Add browser-specific config if in launch mode + if self.docker_config.mode == "launch": + config_dict.update({ + "text_mode": self.config.text_mode, + "light_mode": self.config.light_mode, + "viewport_width": self.config.viewport_width, + "viewport_height": self.config.viewport_height, + }) + + # Use the utility method to generate the hash + return self.docker_utils.generate_config_hash(config_dict) + + async def _get_or_create_cdp_url(self) -> str: + """Get CDP URL by either creating a new container or using an existing one. 
+ + Returns: + CDP URL for connecting to the browser + + Raises: + Exception: If container creation or browser launch fails + """ + # If CDP URL is explicitly provided, use it + if self.config.cdp_url: + return self.config.cdp_url + + # Ensure Docker image exists (will build if needed) + image_name = await self.docker_utils.ensure_docker_image_exists( + self.docker_config.image, + self.docker_config.mode + ) + + # Generate config hash for container matching + config_hash = await self._generate_config_hash() + + # Look for existing container with matching config + container_id = self.registry.find_container_by_config(config_hash, self.docker_utils) + + if container_id: + # Use existing container + self.container_id = container_id + host_port = self.registry.get_container_host_port(container_id) + if self.logger: + self.logger.info(f"Using existing Docker container: {container_id[:12]}", tag="DOCKER") + else: + # Get a port for the new container + host_port = self.docker_config.host_port or self.registry.get_next_available_port(self.docker_utils) + + # Prepare volumes list + volumes = list(self.docker_config.volumes) + + # Add user data directory if specified + if self.docker_config.user_data_dir: + # Ensure user data directory exists + os.makedirs(self.docker_config.user_data_dir, exist_ok=True) + volumes.append(f"{self.docker_config.user_data_dir}:{self.docker_config.container_user_data_dir}") + + # Update config user_data_dir to point to container path + self.config.user_data_dir = self.docker_config.container_user_data_dir + + # Create a new container + container_id = await self.docker_utils.create_container( + image_name=image_name, + host_port=host_port, + container_name=self.container_name, + volumes=volumes, + network=self.docker_config.network, + env_vars=self.docker_config.env_vars, + extra_args=self.docker_config.extra_args + ) + + if not container_id: + raise Exception("Failed to create Docker container") + + self.container_id = container_id + + # 
Register the container + self.registry.register_container(container_id, host_port, config_hash) + + # Wait for container to be ready + await self.docker_utils.wait_for_container_ready(container_id) + + # Handle specific setup based on mode + if self.docker_config.mode == "launch": + # In launch mode, we need to start socat and Chrome + await self.docker_utils.start_socat_in_container(container_id) + + # Build browser arguments + browser_args = self._build_browser_args() + + # Launch Chrome + await self.docker_utils.launch_chrome_in_container(container_id, browser_args) + + # Get PIDs for later cleanup + self.chrome_process_id = await self.docker_utils.get_process_id_in_container( + container_id, "chrome" + ) + self.socat_process_id = await self.docker_utils.get_process_id_in_container( + container_id, "socat" + ) + + # Wait for CDP to be ready + await self.docker_utils.wait_for_cdp_ready(host_port) + + if self.logger: + self.logger.success(f"Docker container ready: {container_id[:12]} on port {host_port}", tag="DOCKER") + + # Return CDP URL + return f"http://localhost:{host_port}" + + def _build_browser_args(self) -> List[str]: + """Build Chrome command line arguments based on BrowserConfig. 
+ + Returns: + List of command line arguments for Chrome + """ + args = [ + "--no-sandbox", + "--disable-gpu", + f"--remote-debugging-port={self.internal_cdp_port}", + "--remote-debugging-address=0.0.0.0", # Allow external connections + "--disable-dev-shm-usage", + ] + + if self.config.headless: + args.append("--headless=new") + + if self.config.viewport_width and self.config.viewport_height: + args.append(f"--window-size={self.config.viewport_width},{self.config.viewport_height}") + + if self.config.user_agent: + args.append(f"--user-agent={self.config.user_agent}") + + if self.config.text_mode: + args.extend([ + "--blink-settings=imagesEnabled=false", + "--disable-remote-fonts", + "--disable-images", + "--disable-javascript", + ]) + + if self.config.light_mode: + # Import here to avoid circular import + from .utils import get_browser_disable_options + args.extend(get_browser_disable_options()) + + if self.config.user_data_dir: + args.append(f"--user-data-dir={self.config.user_data_dir}") + + if self.config.extra_args: + args.extend(self.config.extra_args) + + return args + + async def close(self): + """Close the browser and clean up Docker container if needed.""" + # Set shutting_down flag to prevent race conditions + self.shutting_down = True + + # Store state if needed before closing + if self.browser and self.docker_config.user_data_dir and self.docker_config.persistent: + for context in self.browser.contexts: + try: + storage_path = os.path.join(self.docker_config.user_data_dir, "storage_state.json") + await context.storage_state(path=storage_path) + if self.logger: + self.logger.debug("Persisted storage state before closing browser", tag="DOCKER") + except Exception as e: + if self.logger: + self.logger.warning( + message="Failed to persist storage state: {error}", + tag="DOCKER", + params={"error": str(e)} + ) + + # Close browser connection (but not container) + if self.browser: + await self.browser.close() + self.browser = None + + # Only clean up 
container if not persistent + if self.container_id and not self.docker_config.persistent: + # Stop Chrome process in "launch" mode + if self.docker_config.mode == "launch" and self.chrome_process_id: + await self.docker_utils.stop_process_in_container( + self.container_id, self.chrome_process_id + ) + + # Stop socat process in "launch" mode + if self.docker_config.mode == "launch" and self.socat_process_id: + await self.docker_utils.stop_process_in_container( + self.container_id, self.socat_process_id + ) + + # Remove or stop container based on configuration + if self.docker_config.remove_on_exit: + await self.docker_utils.remove_container(self.container_id) + # Unregister from registry + self.registry.unregister_container(self.container_id) + else: + await self.docker_utils.stop_container(self.container_id) + + self.container_id = None + + # Close Playwright + if self.playwright: + await self.playwright.stop() + self.playwright = None + + self.shutting_down = False \ No newline at end of file diff --git a/crawl4ai/browser/docker_utils.py b/crawl4ai/browser/docker_utils.py new file mode 100644 index 00000000..0597c2d5 --- /dev/null +++ b/crawl4ai/browser/docker_utils.py @@ -0,0 +1,582 @@ +import os +import json +import asyncio +import hashlib +import tempfile +import shutil +import socket +import subprocess +from typing import Dict, List, Optional, Tuple, Union + +class DockerUtils: + """Utility class for Docker operations in browser automation. + + This class provides methods for managing Docker images, containers, + and related operations needed for browser automation. It handles + image building, container lifecycle, port management, and registry operations. 
+ + Attributes: + DOCKER_FOLDER (str): Path to folder containing Docker files + DOCKER_CONNECT_FILE (str): Path to Dockerfile for connect mode + DOCKER_LAUNCH_FILE (str): Path to Dockerfile for launch mode + DOCKER_START_SCRIPT (str): Path to startup script for connect mode + DEFAULT_CONNECT_IMAGE (str): Default image name for connect mode + DEFAULT_LAUNCH_IMAGE (str): Default image name for launch mode + logger: Optional logger instance + """ + + # File paths for Docker resources + DOCKER_FOLDER = os.path.join(os.path.dirname(__file__), "docker") + DOCKER_CONNECT_FILE = os.path.join(DOCKER_FOLDER, "connect.Dockerfile") + DOCKER_LAUNCH_FILE = os.path.join(DOCKER_FOLDER, "launch.Dockerfile") + DOCKER_START_SCRIPT = os.path.join(DOCKER_FOLDER, "start.sh") + + # Default image names + DEFAULT_CONNECT_IMAGE = "crawl4ai/browser-connect:latest" + DEFAULT_LAUNCH_IMAGE = "crawl4ai/browser-launch:latest" + + def __init__(self, logger=None): + """Initialize Docker utilities. + + Args: + logger: Optional logger for recording operations + """ + self.logger = logger + + # Image Management Methods + + async def check_image_exists(self, image_name: str) -> bool: + """Check if a Docker image exists. + + Args: + image_name: Name of the Docker image to check + + Returns: + bool: True if the image exists, False otherwise + """ + cmd = ["docker", "image", "inspect", image_name] + + try: + process = await asyncio.create_subprocess_exec( + *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE + ) + _, _ = await process.communicate() + return process.returncode == 0 + except Exception as e: + if self.logger: + self.logger.debug(f"Error checking if image exists: {str(e)}", tag="DOCKER") + return False + + async def build_docker_image(self, image_name: str, dockerfile_path: str, + files_to_copy: Dict[str, str] = None) -> bool: + """Build a Docker image from a Dockerfile. 
async def ensure_docker_image_exists(self, image_name: str, mode: str = "connect") -> str:
    """Ensure the required Docker image exists, creating it if necessary.

    Args:
        image_name: Name of the Docker image; when falsy, the default image
            for the given mode is used.
        mode: Either "connect" or "launch" to determine which image to build

    Returns:
        str: Name of the available Docker image

    Raises:
        RuntimeError: If the image doesn't exist and can't be built
    """
    # If image name is not specified, use default based on mode
    if not image_name:
        image_name = self.DEFAULT_CONNECT_IMAGE if mode == "connect" else self.DEFAULT_LAUNCH_IMAGE

    # Check if the image already exists
    if await self.check_image_exists(image_name):
        if self.logger:
            self.logger.debug(f"Docker image {image_name} already exists", tag="DOCKER")
        return image_name

    # A missing *custom* image cannot be synthesized automatically; fail loudly.
    if image_name not in (self.DEFAULT_CONNECT_IMAGE, self.DEFAULT_LAUNCH_IMAGE):
        if self.logger:
            self.logger.warning(
                f"Custom Docker image {image_name} not found and cannot be automatically created",
                tag="DOCKER"
            )
        # RuntimeError instead of bare Exception: still caught by callers
        # handling Exception, but no longer indistinguishable from any error.
        raise RuntimeError(f"Docker image {image_name} not found")

    # Build the appropriate default image
    if self.logger:
        self.logger.info(f"Docker image {image_name} not found, creating it now...", tag="DOCKER")

    if mode == "connect":
        success = await self.build_docker_image(
            image_name,
            self.DOCKER_CONNECT_FILE,
            {"start.sh": self.DOCKER_START_SCRIPT}
        )
    else:
        success = await self.build_docker_image(
            image_name,
            self.DOCKER_LAUNCH_FILE
        )

    if not success:
        raise RuntimeError(f"Failed to create Docker image {image_name}")

    return image_name

# Container Management Methods

async def create_container(self, image_name: str, host_port: int,
                           container_name: Optional[str] = None,
                           volumes: Optional[List[str]] = None,
                           network: Optional[str] = None,
                           env_vars: Optional[Dict[str, str]] = None,
                           extra_args: Optional[List[str]] = None) -> Optional[str]:
    """Create a new Docker container.

    Args:
        image_name: Docker image to use
        host_port: Port on host to map to container port 9223
        container_name: Optional name for the container
        volumes: List of volume mappings (e.g., ["host_path:container_path"])
        network: Optional Docker network to use
        env_vars: Dictionary of environment variables
        extra_args: Additional docker run arguments

    Returns:
        str: Container ID if successful, None otherwise
    """
    # Prepare container command
    cmd = [
        "docker", "run",
        "--detach",
    ]

    if container_name:
        cmd.extend(["--name", container_name])

    # Map the host port to the fixed in-container CDP relay port 9223.
    cmd.extend(["-p", f"{host_port}:9223"])

    if volumes:
        for volume in volumes:
            cmd.extend(["-v", volume])

    if network:
        cmd.extend(["--network", network])

    if env_vars:
        for key, value in env_vars.items():
            cmd.extend(["-e", f"{key}={value}"])

    if extra_args:
        cmd.extend(extra_args)

    # Image name must come after all options.
    cmd.append(image_name)

    if self.logger:
        self.logger.debug(f"Creating Docker container with command: {' '.join(cmd)}", tag="DOCKER")

    try:
        process = await asyncio.create_subprocess_exec(
            *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
        )
        stdout, stderr = await process.communicate()

        if process.returncode != 0:
            if self.logger:
                self.logger.error(
                    message="Failed to create Docker container: {error}",
                    tag="DOCKER",
                    params={"error": stderr.decode()}
                )
            return None

        # `docker run --detach` echoes the new container's ID on stdout.
        container_id = stdout.decode().strip()

        if self.logger:
            self.logger.success(f"Created Docker container: {container_id[:12]}", tag="DOCKER")

        return container_id

    except Exception as e:
        if self.logger:
            self.logger.error(
                message="Error creating Docker container: {error}",
                tag="DOCKER",
                params={"error": str(e)}
            )
        return None
async def is_container_running(self, container_id: str) -> bool:
    """Check if a container is running.

    Args:
        container_id: ID of the container to check

    Returns:
        bool: True if the container is running, False otherwise
    """
    cmd = ["docker", "inspect", "--format", "{{.State.Running}}", container_id]

    try:
        process = await asyncio.create_subprocess_exec(
            *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
        )
        stdout, _ = await process.communicate()

        # `docker inspect` prints the literal string "true" for a running container.
        return process.returncode == 0 and stdout.decode().strip() == "true"
    except Exception as e:
        if self.logger:
            self.logger.debug(f"Error checking if container is running: {str(e)}", tag="DOCKER")
        return False

async def wait_for_container_ready(self, container_id: str, timeout: int = 30) -> bool:
    """Wait for the container to be in running state.

    Args:
        container_id: ID of the container to wait for
        timeout: Maximum time to wait in seconds

    Returns:
        bool: True if container is ready, False if timeout occurred
    """
    # Polls once per second; each inspect call adds a little overhead, so the
    # actual wall-clock wait can slightly exceed `timeout`.
    for _ in range(timeout):
        if await self.is_container_running(container_id):
            return True
        await asyncio.sleep(1)

    if self.logger:
        self.logger.warning(f"Container {container_id[:12]} not ready after {timeout}s timeout", tag="DOCKER")
    return False

async def stop_container(self, container_id: str) -> bool:
    """Stop a Docker container.

    Args:
        container_id: ID of the container to stop

    Returns:
        bool: True if stopped successfully, False otherwise
    """
    cmd = ["docker", "stop", container_id]

    try:
        # Capture output (consistent with the other docker helpers) rather
        # than inheriting the parent process's stdio.
        process = await asyncio.create_subprocess_exec(
            *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
        )
        await process.communicate()

        stopped = process.returncode == 0
        # Bug fix: only log success after verifying the exit code; previously
        # "Stopped container" was logged even when `docker stop` failed.
        if stopped and self.logger:
            self.logger.debug(f"Stopped container: {container_id[:12]}", tag="DOCKER")

        return stopped
    except Exception as e:
        if self.logger:
            self.logger.warning(
                message="Failed to stop container: {error}",
                tag="DOCKER",
                params={"error": str(e)}
            )
        return False

async def remove_container(self, container_id: str, force: bool = True) -> bool:
    """Remove a Docker container.

    Args:
        container_id: ID of the container to remove
        force: Whether to force removal

    Returns:
        bool: True if removed successfully, False otherwise
    """
    cmd = ["docker", "rm"]
    if force:
        cmd.append("-f")
    cmd.append(container_id)

    try:
        process = await asyncio.create_subprocess_exec(
            *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
        )
        await process.communicate()

        removed = process.returncode == 0
        # Bug fix: success was previously logged regardless of the exit code.
        if removed and self.logger:
            self.logger.debug(f"Removed container: {container_id[:12]}", tag="DOCKER")

        return removed
    except Exception as e:
        if self.logger:
            self.logger.warning(
                message="Failed to remove container: {error}",
                tag="DOCKER",
                params={"error": str(e)}
            )
        return False
async def exec_in_container(self, container_id: str, command: List[str],
                            detach: bool = False) -> Tuple[int, str, str]:
    """Execute a command in a running container.

    Args:
        container_id: ID of the container
        command: Command to execute as a list of strings
        detach: Whether to run the command in detached mode

    Returns:
        Tuple of (return_code, stdout, stderr)
    """
    # Assemble: docker exec [-d] <container> <command...>
    docker_cmd = ["docker", "exec"]
    if detach:
        docker_cmd.append("-d")
    docker_cmd.append(container_id)
    docker_cmd.extend(command)

    try:
        proc = await asyncio.create_subprocess_exec(
            *docker_cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
        )
        out, err = await proc.communicate()
        return proc.returncode, out.decode(), err.decode()
    except Exception as e:
        if self.logger:
            self.logger.error(
                message="Error executing command in container: {error}",
                tag="DOCKER",
                params={"error": str(e)}
            )
        # Sentinel return code mirrors a failed exec.
        return -1, "", str(e)

async def start_socat_in_container(self, container_id: str) -> bool:
    """Start socat in the container to map port 9222 to 9223.

    Args:
        container_id: ID of the container

    Returns:
        bool: True if socat started successfully, False otherwise
    """
    # socat runs detached, forwarding the exposed port to Chrome's CDP port.
    relay_cmd = ["socat", "TCP-LISTEN:9223,fork", "TCP:localhost:9222"]

    rc, _, err = await self.exec_in_container(container_id, relay_cmd, detach=True)

    if rc != 0:
        if self.logger:
            self.logger.error(
                message="Failed to start socat in container: {error}",
                tag="DOCKER",
                params={"error": err}
            )
        return False

    if self.logger:
        self.logger.debug(f"Started socat in container: {container_id[:12]}", tag="DOCKER")

    # Give the relay a moment to begin listening.
    await asyncio.sleep(1)
    return True

async def launch_chrome_in_container(self, container_id: str, browser_args: List[str]) -> bool:
    """Launch Chrome inside the container with specified arguments.

    Args:
        container_id: ID of the container
        browser_args: Chrome command line arguments

    Returns:
        bool: True if Chrome started successfully, False otherwise
    """
    launch_cmd = ["google-chrome", *browser_args]

    rc, _, err = await self.exec_in_container(container_id, launch_cmd, detach=True)

    if rc != 0:
        if self.logger:
            self.logger.error(
                message="Failed to launch Chrome in container: {error}",
                tag="DOCKER",
                params={"error": err}
            )
        return False

    if self.logger:
        self.logger.debug(f"Launched Chrome in container: {container_id[:12]}", tag="DOCKER")

    return True

async def get_process_id_in_container(self, container_id: str, process_name: str) -> Optional[int]:
    """Get the process ID for a process in the container.

    Args:
        container_id: ID of the container
        process_name: Name pattern to search for

    Returns:
        int: Process ID if found, None otherwise
    """
    rc, out, _ = await self.exec_in_container(container_id, ["pgrep", "-f", process_name])

    # pgrep exits 0 with one PID per line when matches exist; take the first.
    if rc == 0 and out.strip():
        return int(out.strip().split("\n")[0])

    return None
async def stop_process_in_container(self, container_id: str, pid: int) -> bool:
    """Stop a process in the container by PID.

    Args:
        container_id: ID of the container
        pid: Process ID to stop

    Returns:
        bool: True if process was stopped, False otherwise
    """
    # SIGTERM allows the process to shut down cleanly.
    cmd = ["kill", "-TERM", str(pid)]

    returncode, _, stderr = await self.exec_in_container(container_id, cmd)

    if returncode != 0:
        if self.logger:
            self.logger.warning(
                message="Failed to stop process in container: {error}",
                tag="DOCKER",
                params={"error": stderr}
            )
        return False

    if self.logger:
        self.logger.debug(f"Stopped process {pid} in container: {container_id[:12]}", tag="DOCKER")

    return True

# Network and Port Methods

async def wait_for_cdp_ready(self, host_port: int, timeout: int = 30) -> bool:
    """Wait for the CDP endpoint to be ready.

    Args:
        host_port: Port to check for CDP endpoint
        timeout: Maximum time to wait in seconds

    Returns:
        bool: True if CDP endpoint is ready, False if timeout occurred
    """
    import aiohttp

    url = f"http://localhost:{host_port}/json/version"

    # Poll once per second until the endpoint answers or the timeout elapses.
    for _ in range(timeout):
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url, timeout=1) as response:
                    if response.status == 200:
                        if self.logger:
                            self.logger.debug(f"CDP endpoint ready on port {host_port}", tag="DOCKER")
                        return True
        except Exception:
            # Connection refused / timeout while the browser is still booting.
            pass
        await asyncio.sleep(1)

    if self.logger:
        self.logger.warning(f"CDP endpoint not ready on port {host_port} after {timeout}s timeout", tag="DOCKER")
    return False

def is_port_in_use(self, port: int) -> bool:
    """Check if a port is already in use on the host.

    Args:
        port: Port number to check

    Returns:
        bool: True if port is in use, False otherwise
    """
    # connect_ex returns 0 when something is listening on the port.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        return s.connect_ex(('localhost', port)) == 0

def get_next_available_port(self, start_port: int = 9223) -> int:
    """Get the next available port starting from a given port.

    Args:
        start_port: Port number to start checking from

    Returns:
        int: First available port number

    Raises:
        RuntimeError: If no port is free in [start_port, 65535]
    """
    # Bug fix: the search is bounded to valid TCP ports; the previous
    # unbounded `while` loop could run past 65535 forever.
    for port in range(start_port, 65536):
        if not self.is_port_in_use(port):
            return port
    raise RuntimeError(f"No available port found in range {start_port}-65535")

# Configuration Hash Methods

def generate_config_hash(self, config_dict: Dict) -> str:
    """Generate a hash of the configuration for container matching.

    Args:
        config_dict: Dictionary of configuration parameters

    Returns:
        str: Hash string uniquely identifying this configuration
    """
    # Canonical JSON (sorted keys) makes the hash order-independent.
    config_json = json.dumps(config_dict, sort_keys=True)
    return hashlib.sha256(config_json.encode()).hexdigest()
async def test_docker_components():
    """Test Docker utilities, registry, and image building.

    Validates DockerRegistry persistence, DockerUtils helpers, Docker
    availability, default image builds, and basic container lifecycle
    before the browser tests run.
    """
    logger.info("Testing Docker components", tag="SETUP")

    # Dedicated scratch directory for the registry under test.
    registry_dir = os.path.join(os.path.dirname(__file__), "test_registry")
    registry_file = os.path.join(registry_dir, "test_registry.json")
    os.makedirs(registry_dir, exist_ok=True)

    try:
        # --- DockerRegistry: save, reload, lookup, unregister ---
        logger.info("Testing DockerRegistry...", tag="SETUP")
        registry = DockerRegistry(registry_file)

        test_container_id = "test-container-123"
        registry.register_container(test_container_id, 9876, "test-hash-123")
        registry.save()

        # A fresh instance must observe the persisted state.
        registry2 = DockerRegistry(registry_file)
        port = registry2.get_container_host_port(test_container_id)
        hash_value = registry2.get_container_config_hash(test_container_id)

        if port != 9876 or hash_value != "test-hash-123":
            logger.error("DockerRegistry persistence failed", tag="SETUP")
            return False

        registry2.unregister_container(test_container_id)
        logger.success("DockerRegistry works correctly", tag="SETUP")

        # --- DockerUtils: port helpers and config hashing ---
        logger.info("Testing DockerUtils...", tag="SETUP")

        in_use = docker_utils.is_port_in_use(22)  # SSH port is usually in use
        logger.info(f"Port 22 in use: {in_use}", tag="SETUP")

        available_port = docker_utils.get_next_available_port(9000)
        logger.info(f"Next available port: {available_port}", tag="SETUP")

        config_dict = {"mode": "connect", "headless": True}
        config_hash = docker_utils.generate_config_hash(config_dict)
        logger.info(f"Generated config hash: {config_hash[:8]}...", tag="SETUP")

        # --- Docker daemon availability ---
        logger.info("Checking Docker availability...", tag="SETUP")
        if not await check_docker_available():
            logger.error("Docker is not available - cannot continue tests", tag="SETUP")
            return False

        # --- Default image builds (connect + launch modes) ---
        logger.info("Building connect mode Docker image...", tag="SETUP")
        connect_image = await docker_utils.ensure_docker_image_exists(None, "connect")
        if not connect_image:
            logger.error("Failed to build connect mode image", tag="SETUP")
            return False
        logger.success(f"Successfully built connect image: {connect_image}", tag="SETUP")

        logger.info("Building launch mode Docker image...", tag="SETUP")
        launch_image = await docker_utils.ensure_docker_image_exists(None, "launch")
        if not launch_image:
            logger.error("Failed to build launch mode image", tag="SETUP")
            return False
        logger.success(f"Successfully built launch image: {launch_image}", tag="SETUP")

        # --- Container lifecycle: create, inspect, exec, remove ---
        logger.info("Testing container creation and removal...", tag="SETUP")
        container_id = await docker_utils.create_container(
            image_name=launch_image,
            host_port=available_port,
            container_name="crawl4ai-test-container"
        )
        if not container_id:
            logger.error("Failed to create test container", tag="SETUP")
            return False
        logger.info(f"Created test container: {container_id[:12]}", tag="SETUP")

        if not await docker_utils.is_container_running(container_id):
            logger.error("Test container is not running", tag="SETUP")
            await docker_utils.remove_container(container_id)
            return False

        logger.info("Testing command execution in container...", tag="SETUP")
        returncode, stdout, stderr = await docker_utils.exec_in_container(
            container_id, ["ls", "-la", "/"]
        )
        if returncode != 0:
            logger.error(f"Command execution failed: {stderr}", tag="SETUP")
            await docker_utils.remove_container(container_id)
            return False

        # Chrome must be baked into the image for the browser strategy to work.
        returncode, stdout, stderr = await docker_utils.exec_in_container(
            container_id, ["which", "google-chrome"]
        )
        if returncode != 0:
            logger.error("Chrome not found in container", tag="SETUP")
            await docker_utils.remove_container(container_id)
            return False

        chrome_path = stdout.strip()
        logger.info(f"Chrome found at: {chrome_path}", tag="SETUP")

        returncode, stdout, stderr = await docker_utils.exec_in_container(
            container_id, ["google-chrome", "--version"]
        )
        if returncode != 0:
            logger.error(f"Failed to get Chrome version: {stderr}", tag="SETUP")
            await docker_utils.remove_container(container_id)
            return False

        logger.info(f"Chrome version: {stdout.strip()}", tag="SETUP")

        if not await docker_utils.remove_container(container_id):
            logger.error("Failed to remove test container", tag="SETUP")
            return False

        logger.success("Test container removed successfully", tag="SETUP")
        logger.success("All Docker components tested successfully", tag="SETUP")
        return True

    except Exception as e:
        logger.error(f"Docker component tests failed: {str(e)}", tag="SETUP")
        return False
    finally:
        # Always drop the scratch registry directory.
        if os.path.exists(registry_dir):
            shutil.rmtree(registry_dir)
async def test_docker_connect_mode():
    """Test Docker browser in connect mode.

    This tests the basic functionality of creating a browser in Docker
    connect mode and using it for navigation.
    """
    logger.info("Testing Docker browser in connect mode", tag="TEST")

    # Create temp directory for user data
    temp_dir = os.path.join(os.path.dirname(__file__), "tmp_user_data")
    os.makedirs(temp_dir, exist_ok=True)

    # Bug fix: initialize before the try block so the except-path cleanup
    # cannot raise NameError when construction itself fails.
    manager = None
    try:
        docker_config = DockerConfig(
            mode="connect",
            persistent=False,
            remove_on_exit=True,
            user_data_dir=temp_dir
        )

        browser_config = BrowserConfig(
            browser_mode="docker",
            headless=True,
            docker_config=docker_config
        )

        manager = BrowserManager(browser_config=browser_config, logger=logger)

        await manager.start()
        logger.info("Browser started successfully", tag="TEST")

        crawler_config = CrawlerRunConfig(url="https://example.com")

        page, context = await manager.get_page(crawler_config)
        logger.info("Got page successfully", tag="TEST")

        await page.goto("https://example.com")
        logger.info("Navigated to example.com", tag="TEST")

        title = await page.title()
        logger.info(f"Page title: {title}", tag="TEST")

        await manager.close()
        logger.info("Browser closed successfully", tag="TEST")

        return True
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Best-effort cleanup; bare `except:` replaced with Exception so
        # KeyboardInterrupt/SystemExit are not swallowed.
        if manager is not None:
            try:
                await manager.close()
            except Exception:
                pass
        return False
    finally:
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
async def test_docker_launch_mode():
    """Test Docker browser in launch mode.

    This tests launching a Chrome browser within a Docker container
    on demand with custom settings.
    """
    logger.info("Testing Docker browser in launch mode", tag="TEST")

    # Create temp directory for user data
    temp_dir = os.path.join(os.path.dirname(__file__), "tmp_user_data_launch")
    os.makedirs(temp_dir, exist_ok=True)

    # Bug fix: initialize before the try block so the except-path cleanup
    # cannot raise NameError when construction itself fails.
    manager = None
    try:
        docker_config = DockerConfig(
            mode="launch",
            persistent=False,
            remove_on_exit=True,
            user_data_dir=temp_dir
        )

        browser_config = BrowserConfig(
            browser_mode="docker",
            headless=True,
            text_mode=True,  # Enable text mode for faster operation
            docker_config=docker_config
        )

        manager = BrowserManager(browser_config=browser_config, logger=logger)

        await manager.start()
        logger.info("Browser started successfully", tag="TEST")

        crawler_config = CrawlerRunConfig(url="https://example.com")

        page, context = await manager.get_page(crawler_config)
        logger.info("Got page successfully", tag="TEST")

        await page.goto("https://example.com")
        logger.info("Navigated to example.com", tag="TEST")

        title = await page.title()
        logger.info(f"Page title: {title}", tag="TEST")

        await manager.close()
        logger.info("Browser closed successfully", tag="TEST")

        return True
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Best-effort cleanup; bare `except:` replaced with Exception so
        # KeyboardInterrupt/SystemExit are not swallowed.
        if manager is not None:
            try:
                await manager.close()
            except Exception:
                pass
        return False
    finally:
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
async def test_docker_persistent_storage():
    """Test Docker browser with persistent storage.

    This tests creating localStorage data in one session and verifying
    it persists to another session when using persistent storage.
    """
    logger.info("Testing Docker browser with persistent storage", tag="TEST")

    # Unique temp directory so parallel/repeated runs cannot collide.
    test_id = uuid.uuid4().hex[:8]
    temp_dir = os.path.join(os.path.dirname(__file__), f"tmp_user_data_persist_{test_id}")
    os.makedirs(temp_dir, exist_ok=True)

    manager1 = None
    manager2 = None

    try:
        docker_config = DockerConfig(
            mode="connect",
            persistent=True,  # Keep container running between sessions
            user_data_dir=temp_dir,
            container_user_data_dir="/data"
        )

        browser_config = BrowserConfig(
            browser_mode="docker",
            headless=True,
            docker_config=docker_config
        )

        # --- First session: write a localStorage value ---
        manager1 = BrowserManager(browser_config=browser_config, logger=logger)
        await manager1.start()
        logger.info("First browser started successfully", tag="TEST")

        crawler_config = CrawlerRunConfig()
        page1, context1 = await manager1.get_page(crawler_config)
        await page1.goto("https://example.com")

        test_value = f"test_value_{test_id}"
        await page1.evaluate(f"localStorage.setItem('test_key', '{test_value}')")
        logger.info(f"Set localStorage test_key = {test_value}", tag="TEST")

        await manager1.close()
        logger.info("First browser closed", tag="TEST")

        # --- Second session: same config, read the value back ---
        manager2 = BrowserManager(browser_config=browser_config, logger=logger)
        await manager2.start()
        logger.info("Second browser started successfully", tag="TEST")

        page2, context2 = await manager2.get_page(crawler_config)
        await page2.goto("https://example.com")

        value = await page2.evaluate("localStorage.getItem('test_key')")
        logger.info(f"Retrieved localStorage test_key = {value}", tag="TEST")

        if value == test_value:
            logger.success("Storage persistence verified!", tag="TEST")
        else:
            logger.error(f"Storage persistence failed! Expected {test_value}, got {value}", tag="TEST")

        await manager2.close()
        logger.info("Second browser closed successfully", tag="TEST")

        return value == test_value
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Best-effort cleanup of whichever managers were created; bare
        # `except:` replaced with Exception.
        for mgr in (manager1, manager2):
            if mgr is not None:
                try:
                    await mgr.close()
                except Exception:
                    pass
        return False
    finally:
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)

async def test_docker_parallel_pages():
    """Test Docker browser with parallel page creation.

    This tests the ability to create and use multiple pages in parallel
    from a single Docker browser instance.
    """
    logger.info("Testing Docker browser with parallel pages", tag="TEST")

    # Bug fix: initialize before the try block so the except-path cleanup
    # cannot raise NameError when construction itself fails.
    manager = None
    try:
        docker_config = DockerConfig(
            mode="connect",
            persistent=False,
            remove_on_exit=True
        )

        browser_config = BrowserConfig(
            browser_mode="docker",
            headless=True,
            docker_config=docker_config
        )

        manager = BrowserManager(browser_config=browser_config, logger=logger)

        await manager.start()
        logger.info("Browser started successfully", tag="TEST")

        crawler_config = CrawlerRunConfig()

        page_count = 3
        pages = await manager.get_pages(crawler_config, count=page_count)
        logger.info(f"Got {len(pages)} pages successfully", tag="TEST")

        if len(pages) != page_count:
            logger.error(f"Expected {page_count} pages, got {len(pages)}", tag="TEST")
            await manager.close()
            return False

        # Fan out navigations concurrently, one URL per page.
        tasks = []
        for i, (page, _) in enumerate(pages):
            tasks.append(page.goto(f"https://example.com?page={i}"))
        await asyncio.gather(*tasks)
        logger.info("All pages navigated successfully", tag="TEST")

        titles = []
        for i, (page, _) in enumerate(pages):
            title = await page.title()
            titles.append(title)
            logger.info(f"Page {i+1} title: {title}", tag="TEST")

        await manager.close()
        logger.info("Browser closed successfully", tag="TEST")

        return True
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        if manager is not None:
            try:
                await manager.close()
            except Exception:
                pass
        return False
async def test_docker_registry_reuse():
    """Test Docker container reuse via registry.

    This tests that containers with matching configurations
    are reused rather than creating new ones.
    """
    logger.info("Testing Docker container reuse via registry", tag="TEST")

    # Isolated registry file so this test cannot interfere with a real one.
    registry_dir = os.path.join(os.path.dirname(__file__), "registry_reuse_test")
    registry_file = os.path.join(registry_dir, "registry.json")
    os.makedirs(registry_dir, exist_ok=True)

    manager1 = None
    manager2 = None
    container_id1 = None

    try:
        docker_config1 = DockerConfig(
            mode="connect",
            persistent=True,  # Keep container running after closing
            registry_file=registry_file
        )

        browser_config1 = BrowserConfig(
            browser_mode="docker",
            headless=True,
            docker_config=docker_config1
        )

        manager1 = BrowserManager(browser_config=browser_config1, logger=logger)
        await manager1.start()
        logger.info("First browser started successfully", tag="TEST")

        docker_strategy1 = manager1._strategy
        container_id1 = docker_strategy1.container_id
        logger.info(f"First browser container ID: {container_id1[:12]}", tag="TEST")

        # Close the first manager but keep the container alive (persistent).
        await manager1.close()
        logger.info("First browser closed", tag="TEST")

        # Second config is identical, so the registry should hand back the
        # same container instead of creating a new one.
        docker_config2 = DockerConfig(
            mode="connect",
            persistent=True,
            registry_file=registry_file
        )

        browser_config2 = BrowserConfig(
            browser_mode="docker",
            headless=True,
            docker_config=docker_config2
        )

        manager2 = BrowserManager(browser_config=browser_config2, logger=logger)
        await manager2.start()
        logger.info("Second browser started successfully", tag="TEST")

        docker_strategy2 = manager2._strategy
        container_id2 = docker_strategy2.container_id
        logger.info(f"Second browser container ID: {container_id2[:12]}", tag="TEST")

        if container_id1 == container_id2:
            logger.success("Container reuse successful - using same container!", tag="TEST")
        else:
            logger.error("Container reuse failed - new container created!", tag="TEST")

        # Flip persistence off so closing the second manager tears down the container.
        docker_strategy2.docker_config.persistent = False
        docker_strategy2.docker_config.remove_on_exit = True
        await manager2.close()
        logger.info("Second browser closed and container removed", tag="TEST")

        return container_id1 == container_id2
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Best-effort cleanup; bare `except:` replaced with Exception so
        # KeyboardInterrupt/SystemExit are not swallowed.
        try:
            if manager1:
                await manager1.close()
            if manager2:
                await manager2.close()
            # Make sure the container itself is gone.
            if container_id1:
                await docker_utils.remove_container(container_id1, force=True)
        except Exception:
            pass
        return False
    finally:
        if os.path.exists(registry_dir):
            shutil.rmtree(registry_dir)
async def run_tests():
    """Run all tests sequentially and log a pass/fail summary."""
    results = []

    logger.info("Starting Docker Browser Strategy tests", tag="TEST")

    # Check if Docker is available
    if not await check_docker_available():
        logger.error("Docker is not available - skipping tests", tag="TEST")
        return

    # Component tests gate the (much slower) browser tests.
    setup_result = await test_docker_components()
    if not setup_result:
        logger.error("Docker component tests failed - skipping browser tests", tag="TEST")
        return

    # Run browser tests
    results.append(await test_docker_connect_mode())
    results.append(await test_docker_launch_mode())
    results.append(await test_docker_persistent_storage())
    results.append(await test_docker_parallel_pages())
    results.append(await test_docker_registry_reuse())

    # Print summary
    total = len(results)
    passed = sum(1 for r in results if r)
    logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")

    if passed == total:
        logger.success("All tests passed!", tag="SUMMARY")
    else:
        logger.error(f"{total - passed} tests failed", tag="SUMMARY")

async def check_docker_available() -> bool:
    """Check if Docker is available on the system.

    Returns:
        bool: True if Docker is available, False otherwise
    """
    try:
        proc = await asyncio.create_subprocess_exec(
            "docker", "--version",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )
        stdout, _ = await proc.communicate()
        # Bug fix: coerce to bool — `returncode == 0 and stdout` returned the
        # raw bytes object despite the declared bool return type.
        return proc.returncode == 0 and bool(stdout)
    except Exception:
        # Typically FileNotFoundError when the docker CLI is not installed;
        # bare `except:` replaced so KeyboardInterrupt is not swallowed.
        return False

if __name__ == "__main__":
    asyncio.run(run_tests())