feat(browser): add Docker-based browser automation strategy

Implements a new browser strategy that runs Chrome in Docker containers, providing better isolation and cross-platform consistency.

Features include:
- Connect and launch modes for different container configurations
- Persistent storage support for maintaining browser state
- Container registry for efficient reuse
- Comprehensive test suite for Docker browser functionality

This addition allows users to run browser automation workloads in isolated containers, improving security and resource management.
2025-03-24 21:36:58 +08:00
parent 462d5765e2
commit 8c08521301
10 changed files with 1995 additions and 1 deletions
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -28,6 +28,10 @@ from typing import Any, Dict, Optional
 from enum import Enum

 from .proxy_strategy import ProxyConfig
+try:
+    from .browser.docker_config import DockerConfig
+except ImportError:
+    DockerConfig = None


 def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
@@ -173,6 +177,7 @@ class BrowserConfig:
                           "builtin" - use the builtin CDP browser running in background
                           "dedicated" - create a new dedicated browser instance each time
                           "custom" - use explicit CDP settings provided in cdp_url
+                           "docker" - run browser in Docker container with isolation
                           Default: "dedicated"
        use_managed_browser (bool): Launch the browser using a managed approach (e.g., via CDP), allowing
                                    advanced manipulation. Default: False.
@@ -190,6 +195,8 @@ class BrowserConfig:
                             Default: None.
        proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
                                     If None, no additional proxy config. Default: None.
+        docker_config (DockerConfig or dict or None): Configuration for Docker-based browser automation.
+                                     Contains settings for Docker container operation. Default: None.
        viewport_width (int): Default viewport width for pages. Default: 1080.
        viewport_height (int): Default viewport height for pages. Default: 600.
        viewport (dict): Default viewport dimensions for pages. If set, overrides viewport_width and viewport_height.
@@ -235,6 +242,7 @@ class BrowserConfig:
        channel: str = "chromium",
        proxy: str = None,
        proxy_config: Union[ProxyConfig, dict, None] = None,
+        docker_config: Union["DockerConfig", dict, None] = None,
        viewport_width: int = 1080,
        viewport_height: int = 600,
        viewport: dict = None,
@@ -275,6 +283,12 @@ class BrowserConfig:
            self.chrome_channel = ""
        self.proxy = proxy
        self.proxy_config = proxy_config
+        
+        # Handle docker configuration
+        if isinstance(docker_config, dict) and DockerConfig is not None:
+            self.docker_config = DockerConfig.from_kwargs(docker_config)
+        else:
+            self.docker_config = docker_config
        self.viewport_width = viewport_width
        self.viewport_height = viewport_height
        self.viewport = viewport
@@ -315,6 +329,10 @@ class BrowserConfig:
            # Builtin mode uses managed browser connecting to builtin CDP endpoint
            self.use_managed_browser = True
            # cdp_url will be set later by browser_manager
+        elif self.browser_mode == "docker":
+            # Docker mode uses managed browser with CDP to connect to browser in container
+            self.use_managed_browser = True
+            # cdp_url will be set later by docker browser strategy
        elif self.browser_mode == "custom" and self.cdp_url:
            # Custom mode with explicit CDP URL
            self.use_managed_browser = True
@@ -340,6 +358,7 @@ class BrowserConfig:
            channel=kwargs.get("channel", "chromium"),
            proxy=kwargs.get("proxy"),
            proxy_config=kwargs.get("proxy_config", None),
+            docker_config=kwargs.get("docker_config", None),
            viewport_width=kwargs.get("viewport_width", 1080),
            viewport_height=kwargs.get("viewport_height", 600),
            accept_downloads=kwargs.get("accept_downloads", False),
@@ -364,7 +383,7 @@ class BrowserConfig:
        )

    def to_dict(self):
-        return {
+        result = {
            "browser_type": self.browser_type,
            "headless": self.headless,
            "browser_mode": self.browser_mode,
@@ -396,6 +415,15 @@ class BrowserConfig:
            "debugging_port": self.debugging_port,
            "host": self.host,
        }
+        
+        # Include docker_config if it exists
+        if hasattr(self, "docker_config") and self.docker_config is not None:
+            if hasattr(self.docker_config, "to_dict"):
+                result["docker_config"] = self.docker_config.to_dict()
+            else:
+                result["docker_config"] = self.docker_config
+                
+        return result

    def clone(self, **kwargs):
        """Create a copy of this configuration with updated values.
--- a/crawl4ai/browser/docker/connect.Dockerfile
+++ b/crawl4ai/browser/docker/connect.Dockerfile
@@ -0,0 +1,61 @@
+# Image for "connect" mode: Ubuntu 22.04 with Google Chrome and its runtime
+# libraries installed. The container's entrypoint is /start.sh, which is copied
+# from the build context (its contents are not part of this Dockerfile).
+FROM ubuntu:22.04
+
+# Install dependencies with comprehensive Chromium support
+# (single layer; the apt package lists are removed at the end to keep it small)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    wget \
+    gnupg \
+    ca-certificates \
+    fonts-liberation \
+    # Sound support
+    libasound2 \
+    # Accessibility support
+    libatspi2.0-0 \
+    libatk1.0-0 \
+    libatk-bridge2.0-0 \
+    # Graphics and rendering
+    libdrm2 \
+    libgbm1 \
+    libgtk-3-0 \
+    libxcomposite1 \
+    libxdamage1 \
+    libxext6 \
+    libxfixes3 \
+    libxrandr2 \
+    # X11 and window system
+    libx11-6 \
+    libxcb1 \
+    libxkbcommon0 \
+    # Text and internationalization
+    libpango-1.0-0 \
+    libcairo2 \
+    # Printing support
+    libcups2 \
+    # System libraries
+    libdbus-1-3 \
+    libnss3 \
+    libnspr4 \
+    libglib2.0-0 \
+    # Utilities
+    xdg-utils \
+    socat \
+    # Process management
+    procps \
+    # Clean up
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Chrome from Google's apt repository (amd64 only, per the [arch=amd64] option).
+# NOTE(review): `apt-key add` is deprecated in current apt releases and the repo URL
+# uses plain http; consider a keyring file referenced via `signed-by=` and an https
+# URL - confirm against the target base image before changing.
+RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \
+    echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list && \
+    apt-get update && \
+    apt-get install -y google-chrome-stable && \
+    rm -rf /var/lib/apt/lists/*
+
+# Create data directory for user data
+# (mode 777 makes /data writable by any user inside the container)
+RUN mkdir -p /data && chmod 777 /data
+
+# Add a startup script
+COPY start.sh /start.sh
+RUN chmod +x /start.sh
+
+# Set entrypoint
+ENTRYPOINT ["/start.sh"]
--- a/crawl4ai/browser/docker/launch.Dockerfile
+++ b/crawl4ai/browser/docker/launch.Dockerfile
@@ -0,0 +1,57 @@
+# Image for "launch" mode: Ubuntu 22.04 with Google Chrome and its runtime
+# libraries installed, but no startup script. The container only idles; Chrome
+# (and socat) are expected to be started inside it afterwards by the caller.
+FROM ubuntu:22.04
+
+# Install dependencies with comprehensive Chromium support
+# (single layer; the apt package lists are removed at the end to keep it small)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    wget \
+    gnupg \
+    ca-certificates \
+    fonts-liberation \
+    # Sound support
+    libasound2 \
+    # Accessibility support
+    libatspi2.0-0 \
+    libatk1.0-0 \
+    libatk-bridge2.0-0 \
+    # Graphics and rendering
+    libdrm2 \
+    libgbm1 \
+    libgtk-3-0 \
+    libxcomposite1 \
+    libxdamage1 \
+    libxext6 \
+    libxfixes3 \
+    libxrandr2 \
+    # X11 and window system
+    libx11-6 \
+    libxcb1 \
+    libxkbcommon0 \
+    # Text and internationalization
+    libpango-1.0-0 \
+    libcairo2 \
+    # Printing support
+    libcups2 \
+    # System libraries
+    libdbus-1-3 \
+    libnss3 \
+    libnspr4 \
+    libglib2.0-0 \
+    # Utilities
+    xdg-utils \
+    socat \
+    # Process management
+    procps \
+    # Clean up
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Chrome from Google's apt repository (amd64 only, per the [arch=amd64] option).
+# NOTE(review): `apt-key add` is deprecated in current apt releases and the repo URL
+# uses plain http; consider a keyring file referenced via `signed-by=` and an https
+# URL - confirm against the target base image before changing.
+RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \
+    echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list && \
+    apt-get update && \
+    apt-get install -y google-chrome-stable && \
+    rm -rf /var/lib/apt/lists/*
+
+# Create data directory for user data
+# (mode 777 makes /data writable by any user inside the container)
+RUN mkdir -p /data && chmod 777 /data
+
+# Keep container running without starting Chrome
+# (`tail -f /dev/null` blocks forever, so the container stays up until it is stopped)
+CMD ["tail", "-f", "/dev/null"]
--- a/crawl4ai/browser/docker_config.py
+++ b/crawl4ai/browser/docker_config.py
@@ -0,0 +1,133 @@
+"""Docker configuration module for Crawl4AI browser automation.
+
+This module provides configuration classes for Docker-based browser automation,
+allowing flexible configuration of Docker containers for browsing.
+"""
+
+from typing import Dict, List, Optional, Union
+
+
+class DockerConfig:
+    """Configuration for Docker-based browser automation.
+
+    This class contains Docker-specific settings to avoid cluttering BrowserConfig.
+
+    The list/dict settings (``volumes``, ``env_vars``, ``extra_args``) are copied
+    on construction and on export, so a config never shares mutable state with
+    the caller's arguments, with its ``to_dict()`` output, or with its clones.
+
+    Attributes:
+        mode (str): Docker operation mode - "connect" or "launch".
+            - "connect": Uses a container with Chrome already running
+            - "launch": Dynamically configures and starts Chrome in container
+        image (str): Docker image to use. If None, defaults from DockerUtils are used.
+        registry_file (str): Path to container registry file for persistence.
+        persistent (bool): Keep container running after browser closes.
+        remove_on_exit (bool): Remove container on exit when not persistent.
+        network (str): Docker network to use.
+        volumes (List[str]): Volume mappings (e.g., ["host_path:container_path"]).
+        env_vars (Dict[str, str]): Environment variables to set in container.
+        extra_args (List[str]): Additional docker run arguments.
+        host_port (int): Host port to map to container's 9223 port.
+        user_data_dir (str): Path to user data directory on host.
+        container_user_data_dir (str): Path to user data directory in container.
+    """
+
+    def __init__(
+        self,
+        mode: str = "connect",                       # "connect" or "launch"
+        image: Optional[str] = None,                 # Docker image to use
+        registry_file: Optional[str] = None,         # Path to registry file
+        persistent: bool = False,                    # Keep container running after browser closes
+        remove_on_exit: bool = True,                 # Remove container on exit when not persistent
+        network: Optional[str] = None,               # Docker network to use
+        volumes: Optional[List[str]] = None,         # Volume mappings
+        env_vars: Optional[Dict[str, str]] = None,   # Environment variables
+        extra_args: Optional[List[str]] = None,      # Additional docker run arguments
+        host_port: Optional[int] = None,             # Host port to map to container's 9223
+        user_data_dir: Optional[str] = None,         # Path to user data directory on host
+        container_user_data_dir: str = "/data",      # Path to user data directory in container
+    ):
+        """Initialize Docker configuration.
+
+        Args:
+            mode: Docker operation mode ("connect" or "launch")
+            image: Docker image to use
+            registry_file: Path to container registry file
+            persistent: Whether to keep container running after browser closes
+            remove_on_exit: Whether to remove container on exit when not persistent
+            network: Docker network to use
+            volumes: Volume mappings as list of strings (None or empty -> [])
+            env_vars: Environment variables as dictionary (None or empty -> {})
+            extra_args: Additional docker run arguments (None or empty -> [])
+            host_port: Host port to map to container's 9223
+            user_data_dir: Path to user data directory on host
+            container_user_data_dir: Path to user data directory in container
+        """
+        self.mode = mode
+        self.image = image  # If None, defaults will be used from DockerUtils
+        self.registry_file = registry_file
+        self.persistent = persistent
+        self.remove_on_exit = remove_on_exit
+        self.network = network
+        # Copy the containers so later mutation of this config cannot leak into
+        # the caller's objects (or into the config this one was cloned from).
+        self.volumes = list(volumes) if volumes else []
+        self.env_vars = dict(env_vars) if env_vars else {}
+        self.extra_args = list(extra_args) if extra_args else []
+        self.host_port = host_port
+        self.user_data_dir = user_data_dir
+        self.container_user_data_dir = container_user_data_dir
+
+    def to_dict(self) -> Dict:
+        """Convert this configuration to a dictionary.
+
+        Returns:
+            Dictionary representation of this configuration. List and dict
+            values are copies, so editing the result does not alter this config.
+        """
+        return {
+            "mode": self.mode,
+            "image": self.image,
+            "registry_file": self.registry_file,
+            "persistent": self.persistent,
+            "remove_on_exit": self.remove_on_exit,
+            "network": self.network,
+            "volumes": list(self.volumes),
+            "env_vars": dict(self.env_vars),
+            "extra_args": list(self.extra_args),
+            "host_port": self.host_port,
+            "user_data_dir": self.user_data_dir,
+            "container_user_data_dir": self.container_user_data_dir,
+        }
+
+    @classmethod
+    def from_kwargs(cls, kwargs: Dict) -> "DockerConfig":
+        """Create a DockerConfig from a dictionary of keyword arguments.
+
+        Keys that are missing fall back to the constructor defaults; keys that
+        are not configuration options are ignored.
+
+        Args:
+            kwargs: Dictionary of configuration options
+
+        Returns:
+            New DockerConfig instance
+        """
+        return cls(
+            mode=kwargs.get("mode", "connect"),
+            image=kwargs.get("image"),
+            registry_file=kwargs.get("registry_file"),
+            persistent=kwargs.get("persistent", False),
+            remove_on_exit=kwargs.get("remove_on_exit", True),
+            network=kwargs.get("network"),
+            volumes=kwargs.get("volumes"),
+            env_vars=kwargs.get("env_vars"),
+            extra_args=kwargs.get("extra_args"),
+            host_port=kwargs.get("host_port"),
+            user_data_dir=kwargs.get("user_data_dir"),
+            container_user_data_dir=kwargs.get("container_user_data_dir", "/data"),
+        )
+
+    def clone(self, **kwargs) -> "DockerConfig":
+        """Create a copy of this configuration with updated values.
+
+        Args:
+            **kwargs: Key-value pairs of configuration options to update
+
+        Returns:
+            DockerConfig: A new, independent instance with the specified updates
+        """
+        config_dict = self.to_dict()
+        config_dict.update(kwargs)
+        return type(self).from_kwargs(config_dict)
--- a/crawl4ai/browser/docker_registry.py
+++ b/crawl4ai/browser/docker_registry.py
@@ -0,0 +1,174 @@
+"""Docker registry module for Crawl4AI.
+
+This module provides a registry system for tracking and reusing Docker containers
+across browser sessions, improving performance and resource utilization.
+"""
+
+import os
+import json
+import time
+from typing import Dict, Optional
+
+from ..utils import get_home_folder
+
+
+class DockerRegistry:
+    """Manages a registry of Docker containers used for browser automation.
+
+    This registry tracks containers by configuration hash, allowing reuse of appropriately
+    configured containers instead of creating new ones for each session. The registry is
+    persisted as JSON; every mutating method writes the file back immediately.
+
+    Attributes:
+        registry_file (str): Path to the registry file
+        containers (dict): Container ID -> {"host_port", "config_hash", "created_at"}
+        port_map (dict): Host port (stored as a string key) -> container ID
+        last_port (int): Last port assigned
+    """
+
+    def __init__(self, registry_file: Optional[str] = None):
+        """Initialize the registry with an optional path to the registry file.
+
+        Args:
+            registry_file: Path to the registry file. If None, uses default path.
+        """
+        self.registry_file = registry_file or os.path.join(get_home_folder(), "docker_browser_registry.json")
+        self.containers = {}
+        self.port_map = {}
+        self.last_port = 9222
+        self.load()
+
+    def load(self):
+        """Load container registry from file.
+
+        A missing, unreadable, or malformed registry file is not an error: the
+        registry simply starts from its defaults (no containers, last port 9222).
+        """
+        # Start from defaults so every failure path below leaves a clean state.
+        self.containers = {}
+        self.port_map = {}
+        self.last_port = 9222
+
+        try:
+            with open(self.registry_file, 'r') as f:
+                registry_data = json.load(f)
+        except (OSError, ValueError):
+            # OSError: file missing/unreadable; ValueError: invalid JSON or encoding.
+            return
+
+        if not isinstance(registry_data, dict):
+            # Valid JSON but not the expected object layout.
+            return
+
+        self.containers = registry_data.get("containers", {})
+        self.port_map = registry_data.get("ports", {})
+        self.last_port = registry_data.get("last_port", 9222)
+
+    def save(self):
+        """Save container registry to file, creating its directory if needed."""
+        directory = os.path.dirname(self.registry_file)
+        # A bare filename has no directory part, and os.makedirs("") would raise.
+        if directory:
+            os.makedirs(directory, exist_ok=True)
+        with open(self.registry_file, 'w') as f:
+            json.dump({
+                "containers": self.containers,
+                "ports": self.port_map,
+                "last_port": self.last_port
+            }, f, indent=2)
+
+    def register_container(self, container_id: str, host_port: int, config_hash: str):
+        """Register a container with its configuration hash and port mapping.
+
+        Args:
+            container_id: Docker container ID
+            host_port: Host port mapped to container
+            config_hash: Hash of configuration used to create container
+        """
+        self.containers[container_id] = {
+            "host_port": host_port,
+            "config_hash": config_hash,
+            "created_at": time.time()
+        }
+        # Keys are strings because JSON object keys are always strings after a reload.
+        self.port_map[str(host_port)] = container_id
+        self.save()
+
+    def unregister_container(self, container_id: str):
+        """Unregister a container and release its port mapping.
+
+        Unknown container IDs are ignored (nothing is written).
+
+        Args:
+            container_id: Docker container ID to unregister
+        """
+        if container_id not in self.containers:
+            return
+        entry = self.containers.pop(container_id)
+        self.port_map.pop(str(entry["host_port"]), None)
+        self.save()
+
+    def find_container_by_config(self, config_hash: str, docker_utils) -> Optional[str]:
+        """Find a running container that matches the given configuration hash.
+
+        Args:
+            config_hash: Hash of configuration to match
+            docker_utils: DockerUtils instance to check running containers
+
+        Returns:
+            Container ID if found, None otherwise
+        """
+        for container_id, data in self.containers.items():
+            if data["config_hash"] == config_hash and docker_utils.is_container_running(container_id):
+                return container_id
+        return None
+
+    def get_container_host_port(self, container_id: str) -> Optional[int]:
+        """Get the host port mapped to the container.
+
+        Args:
+            container_id: Docker container ID
+
+        Returns:
+            Host port if container is registered, None otherwise
+        """
+        if container_id in self.containers:
+            return self.containers[container_id]["host_port"]
+        return None
+
+    def get_next_available_port(self, docker_utils) -> int:
+        """Get the next available host port for Docker mapping.
+
+        The search starts just above the last assigned port and skips ports that
+        are already mapped in this registry or reported busy by ``docker_utils``.
+        The chosen port is recorded as ``last_port`` and the registry is saved.
+
+        Args:
+            docker_utils: DockerUtils instance to check port availability
+
+        Returns:
+            Available port number
+        """
+        # Start from last port + 1
+        port = self.last_port + 1
+
+        # port_map is keyed by the *string* form of the port, so compare with str(port);
+        # comparing the int directly would never match a registered port.
+        while str(port) in self.port_map or docker_utils.is_port_in_use(port):
+            port += 1
+
+        # Update last port
+        self.last_port = port
+        self.save()
+
+        return port
+
+    def get_container_config_hash(self, container_id: str) -> Optional[str]:
+        """Get the configuration hash for a container.
+
+        Args:
+            container_id: Docker container ID
+
+        Returns:
+            Configuration hash if container is registered, None otherwise
+        """
+        if container_id in self.containers:
+            return self.containers[container_id]["config_hash"]
+        return None
+
+    def cleanup_stale_containers(self, docker_utils):
+        """Clean up containers that are no longer running.
+
+        Args:
+            docker_utils: DockerUtils instance to check container status
+        """
+        # Collect first: unregistering mutates self.containers, which must not
+        # happen while iterating over it.
+        stale = [
+            container_id
+            for container_id in self.containers
+            if not docker_utils.is_container_running(container_id)
+        ]
+        for container_id in stale:
+            self.unregister_container(container_id)
--- a/crawl4ai/browser/docker_strategy.py
+++ b/crawl4ai/browser/docker_strategy.py
@@ -0,0 +1,286 @@
+"""Docker browser strategy module for Crawl4AI.
+
+This module provides browser strategies for running browsers in Docker containers,
+which offers better isolation, consistency across platforms, and easy scaling.
+"""
+
+import os
+import uuid
+import asyncio
+from typing import Dict, List, Optional, Tuple, Union
+from pathlib import Path
+
+from playwright.async_api import Page, BrowserContext
+
+from ..async_logger import AsyncLogger
+from ..async_configs import BrowserConfig, CrawlerRunConfig
+from .docker_config import DockerConfig
+from .docker_registry import DockerRegistry
+from .docker_utils import DockerUtils
+from .strategies import BuiltinBrowserStrategy
+
+
+class DockerBrowserStrategy(BuiltinBrowserStrategy):
+    """Docker-based browser strategy.
+
+    Extends the BuiltinBrowserStrategy to run browsers in Docker containers.
+    Supports two modes:
+    1. "connect" - Uses a Docker image with Chrome already running
+    2. "launch" - Starts Chrome within the container with custom settings
+
+    Attributes:
+        docker_config: Docker-specific configuration options
+        container_id: ID of current Docker container
+        container_name: Name assigned to the container
+        registry: Registry for tracking and reusing containers
+        docker_utils: Utilities for Docker operations
+        chrome_process_id: Process ID of Chrome within container
+        socat_process_id: Process ID of socat within container
+        internal_cdp_port: Chrome's internal CDP port
+        internal_mapped_port: Port that socat maps to internally
+        shutting_down: True while close() is running
+    """
+
+    def __init__(self, config: BrowserConfig, logger: Optional[AsyncLogger] = None):
+        """Initialize the Docker browser strategy.
+
+        Args:
+            config: Browser configuration including Docker-specific settings
+            logger: Logger for recording events and errors
+        """
+        super().__init__(config, logger)
+
+        # Initialize Docker-specific attributes
+        # (fall back to a default DockerConfig when the browser config carries none)
+        self.docker_config = self.config.docker_config or DockerConfig()
+        self.container_id = None
+        # Random suffix keeps names of containers created by different strategies distinct.
+        self.container_name = f"crawl4ai-browser-{uuid.uuid4().hex[:8]}"
+        self.registry = DockerRegistry(self.docker_config.registry_file)
+        self.docker_utils = DockerUtils(logger)
+        self.chrome_process_id = None
+        self.socat_process_id = None
+        self.internal_cdp_port = 9222  # Chrome's internal CDP port
+        self.internal_mapped_port = 9223  # Port that socat maps to internally
+        self.shutting_down = False
+
+    async def _generate_config_hash(self) -> str:
+        """Generate a hash of the configuration for container matching.
+
+        Only the image, mode, browser type and headless flag are hashed, plus
+        text/light mode and viewport size in "launch" mode.
+
+        NOTE(review): volumes, network, env_vars and user_data_dir are not part of
+        the hash, so configs differing only in those settings map to the same
+        container - confirm this is intended.
+
+        Returns:
+            Hash string identifying this configuration
+        """
+        # Create a dict with the relevant parts of the config
+        config_dict = {
+            "image": self.docker_config.image,
+            "mode": self.docker_config.mode,
+            "browser_type": self.config.browser_type,
+            "headless": self.config.headless,
+        }
+
+        # Add browser-specific config if in launch mode
+        if self.docker_config.mode == "launch":
+            config_dict.update({
+                "text_mode": self.config.text_mode,
+                "light_mode": self.config.light_mode,
+                "viewport_width": self.config.viewport_width,
+                "viewport_height": self.config.viewport_height,
+            })
+
+        # Use the utility method to generate the hash
+        return self.docker_utils.generate_config_hash(config_dict)
+
+    async def _get_or_create_cdp_url(self) -> str:
+        """Get CDP URL by either creating a new container or using an existing one.
+
+        An explicit ``config.cdp_url`` short-circuits everything. Otherwise a
+        running container registered under the same configuration hash is reused;
+        if none exists, a new container is created, registered, and (in "launch"
+        mode) socat and Chrome are started inside it.
+
+        NOTE(review): if a step after container creation fails, the new container
+        stays running and registered; nothing here rolls it back.
+
+        Returns:
+            CDP URL for connecting to the browser
+
+        Raises:
+            Exception: If container creation or browser launch fails
+        """
+        # If CDP URL is explicitly provided, use it
+        if self.config.cdp_url:
+            return self.config.cdp_url
+
+        # Ensure Docker image exists (will build if needed)
+        image_name = await self.docker_utils.ensure_docker_image_exists(
+            self.docker_config.image,
+            self.docker_config.mode
+        )
+
+        # Generate config hash for container matching
+        config_hash = await self._generate_config_hash()
+
+        # Look for existing container with matching config
+        container_id = self.registry.find_container_by_config(config_hash, self.docker_utils)
+
+        if container_id:
+            # Use existing container
+            self.container_id = container_id
+            host_port = self.registry.get_container_host_port(container_id)
+            if self.logger:
+                self.logger.info(f"Using existing Docker container: {container_id[:12]}", tag="DOCKER")
+        else:
+            # Get a port for the new container
+            host_port = self.docker_config.host_port or self.registry.get_next_available_port(self.docker_utils)
+
+            # Prepare volumes list (copied so the config's own list is not modified)
+            volumes = list(self.docker_config.volumes)
+
+            # Add user data directory if specified
+            if self.docker_config.user_data_dir:
+                # Ensure user data directory exists
+                os.makedirs(self.docker_config.user_data_dir, exist_ok=True)
+                volumes.append(f"{self.docker_config.user_data_dir}:{self.docker_config.container_user_data_dir}")
+
+                # Update config user_data_dir to point to container path
+                self.config.user_data_dir = self.docker_config.container_user_data_dir
+
+            # Create a new container
+            container_id = await self.docker_utils.create_container(
+                image_name=image_name,
+                host_port=host_port,
+                container_name=self.container_name,
+                volumes=volumes,
+                network=self.docker_config.network,
+                env_vars=self.docker_config.env_vars,
+                extra_args=self.docker_config.extra_args
+            )
+
+            if not container_id:
+                raise Exception("Failed to create Docker container")
+
+            self.container_id = container_id
+
+            # Register the container
+            self.registry.register_container(container_id, host_port, config_hash)
+
+            # Wait for container to be ready
+            await self.docker_utils.wait_for_container_ready(container_id)
+
+            # Handle specific setup based on mode
+            if self.docker_config.mode == "launch":
+                # In launch mode, we need to start socat and Chrome
+                await self.docker_utils.start_socat_in_container(container_id)
+
+                # Build browser arguments
+                browser_args = self._build_browser_args()
+
+                # Launch Chrome
+                await self.docker_utils.launch_chrome_in_container(container_id, browser_args)
+
+                # Get PIDs for later cleanup
+                self.chrome_process_id = await self.docker_utils.get_process_id_in_container(
+                    container_id, "chrome"
+                )
+                self.socat_process_id = await self.docker_utils.get_process_id_in_container(
+                    container_id, "socat"
+                )
+
+            # Wait for CDP to be ready
+            await self.docker_utils.wait_for_cdp_ready(host_port)
+
+            if self.logger:
+                self.logger.success(f"Docker container ready: {container_id[:12]} on port {host_port}", tag="DOCKER")
+
+        # Return CDP URL
+        return f"http://localhost:{host_port}"
+
+    def _build_browser_args(self) -> List[str]:
+        """Build Chrome command line arguments based on BrowserConfig.
+
+        Returns:
+            List of command line arguments for Chrome
+        """
+        args = [
+            "--no-sandbox",
+            "--disable-gpu",
+            f"--remote-debugging-port={self.internal_cdp_port}",
+            "--remote-debugging-address=0.0.0.0",  # Allow external connections
+            "--disable-dev-shm-usage",
+        ]
+
+        if self.config.headless:
+            args.append("--headless=new")
+
+        if self.config.viewport_width and self.config.viewport_height:
+            args.append(f"--window-size={self.config.viewport_width},{self.config.viewport_height}")
+
+        if self.config.user_agent:
+            args.append(f"--user-agent={self.config.user_agent}")
+
+        if self.config.text_mode:
+            args.extend([
+                "--blink-settings=imagesEnabled=false",
+                "--disable-remote-fonts",
+                "--disable-images",
+                "--disable-javascript",
+            ])
+
+        if self.config.light_mode:
+            # Import here to avoid circular import
+            from .utils import get_browser_disable_options
+            args.extend(get_browser_disable_options())
+
+        if self.config.user_data_dir:
+            args.append(f"--user-data-dir={self.config.user_data_dir}")
+
+        # User-supplied extra arguments go last.
+        if self.config.extra_args:
+            args.extend(self.config.extra_args)
+
+        return args
+
+    async def _save_docker_storage_state(self):
+        """Write each browser context's storage state into the host user data dir.
+
+        Only runs for persistent containers with a user data directory. Failures
+        for an individual context are logged as warnings and do not abort closing.
+        """
+        if not (self.browser and self.docker_config.user_data_dir and self.docker_config.persistent):
+            return
+
+        for context in self.browser.contexts:
+            try:
+                storage_path = os.path.join(self.docker_config.user_data_dir, "storage_state.json")
+                await context.storage_state(path=storage_path)
+                if self.logger:
+                    self.logger.debug("Persisted storage state before closing browser", tag="DOCKER")
+            except Exception as e:
+                if self.logger:
+                    self.logger.warning(
+                        message="Failed to persist storage state: {error}",
+                        tag="DOCKER",
+                        params={"error": str(e)}
+                    )
+
+    async def _teardown_docker_container(self):
+        """Stop in-container processes and remove or stop a non-persistent container."""
+        # Only clean up container if not persistent
+        if not self.container_id or self.docker_config.persistent:
+            return
+
+        # In "launch" mode this strategy started Chrome and socat itself, so it
+        # stops them (Chrome first) before touching the container.
+        if self.docker_config.mode == "launch":
+            for process_id in (self.chrome_process_id, self.socat_process_id):
+                if process_id:
+                    await self.docker_utils.stop_process_in_container(
+                        self.container_id, process_id
+                    )
+
+        # Remove or stop container based on configuration
+        if self.docker_config.remove_on_exit:
+            await self.docker_utils.remove_container(self.container_id)
+            # Unregister from registry
+            self.registry.unregister_container(self.container_id)
+        else:
+            await self.docker_utils.stop_container(self.container_id)
+
+        self.container_id = None
+
+    async def close(self):
+        """Close the browser and clean up Docker container if needed.
+
+        Steps run in order: persist storage state, close the browser connection,
+        tear down the container (unless persistent), stop Playwright. Each later
+        step still runs if an earlier one raises, so a failing browser close does
+        not leak the container or the Playwright driver; the first exception is
+        re-raised once cleanup has been attempted.
+        """
+        # Set shutting_down flag to prevent race conditions
+        self.shutting_down = True
+        try:
+            try:
+                # Store state if needed before closing
+                await self._save_docker_storage_state()
+
+                # Close browser connection (but not container)
+                if self.browser:
+                    await self.browser.close()
+                    self.browser = None
+            finally:
+                try:
+                    await self._teardown_docker_container()
+                finally:
+                    # Close Playwright
+                    if self.playwright:
+                        await self.playwright.stop()
+                        self.playwright = None
+        finally:
+            # Always clear the flag, even when cleanup raised.
+            self.shutting_down = False
--- a/crawl4ai/browser/docker_utils.py
+++ b/crawl4ai/browser/docker_utils.py
@@ -0,0 +1,582 @@
+import os
+import json
+import asyncio
+import hashlib
+import tempfile
+import shutil
+import socket
+import subprocess
+from typing import Dict, List, Optional, Tuple, Union
+
+class DockerUtils:
+    """Utility class for Docker operations in browser automation.
+    
+    This class provides methods for managing Docker images, containers,
+    and related operations needed for browser automation. It handles
+    image building, container lifecycle, port management, and registry operations.
+    
+    Attributes:
+        DOCKER_FOLDER (str): Path to folder containing Docker files
+        DOCKER_CONNECT_FILE (str): Path to Dockerfile for connect mode
+        DOCKER_LAUNCH_FILE (str): Path to Dockerfile for launch mode
+        DOCKER_START_SCRIPT (str): Path to startup script for connect mode
+        DEFAULT_CONNECT_IMAGE (str): Default image name for connect mode
+        DEFAULT_LAUNCH_IMAGE (str): Default image name for launch mode
+        logger: Optional logger instance
+    """
+    
+    # File paths for Docker resources
+    DOCKER_FOLDER = os.path.join(os.path.dirname(__file__), "docker")
+    DOCKER_CONNECT_FILE = os.path.join(DOCKER_FOLDER, "connect.Dockerfile")
+    DOCKER_LAUNCH_FILE = os.path.join(DOCKER_FOLDER, "launch.Dockerfile")
+    DOCKER_START_SCRIPT = os.path.join(DOCKER_FOLDER, "start.sh")
+    
+    # Default image names
+    DEFAULT_CONNECT_IMAGE = "crawl4ai/browser-connect:latest"
+    DEFAULT_LAUNCH_IMAGE = "crawl4ai/browser-launch:latest"
+    
+    def __init__(self, logger=None):
+        """Initialize Docker utilities.
+        
+        Args:
+            logger: Optional logger for recording operations
+        """
+        self.logger = logger
+    
+    # Image Management Methods
+    
+    async def check_image_exists(self, image_name: str) -> bool:
+        """Check if a Docker image exists.
+        
+        Args:
+            image_name: Name of the Docker image to check
+            
+        Returns:
+            bool: True if the image exists, False otherwise
+        """
+        cmd = ["docker", "image", "inspect", image_name]
+        
+        try:
+            process = await asyncio.create_subprocess_exec(
+                *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
+            )
+            _, _ = await process.communicate()
+            return process.returncode == 0
+        except Exception as e:
+            if self.logger:
+                self.logger.debug(f"Error checking if image exists: {str(e)}", tag="DOCKER")
+            return False
+    
+    async def build_docker_image(self, image_name: str, dockerfile_path: str, 
+                              files_to_copy: Dict[str, str] = None) -> bool:
+        """Build a Docker image from a Dockerfile.
+        
+        Args:
+            image_name: Name to give the built image
+            dockerfile_path: Path to the Dockerfile
+            files_to_copy: Dict of {dest_name: source_path} for files to copy to build context
+            
+        Returns:
+            bool: True if image was built successfully, False otherwise
+        """
+        # Create a temporary build context
+        with tempfile.TemporaryDirectory() as temp_dir:
+            # Copy the Dockerfile
+            shutil.copy(dockerfile_path, os.path.join(temp_dir, "Dockerfile"))
+            
+            # Copy any additional files needed
+            if files_to_copy:
+                for dest_name, source_path in files_to_copy.items():
+                    shutil.copy(source_path, os.path.join(temp_dir, dest_name))
+            
+            # Build the image
+            cmd = [
+                "docker", "build",
+                "-t", image_name,
+                temp_dir
+            ]
+            
+            if self.logger:
+                self.logger.debug(f"Building Docker image with command: {' '.join(cmd)}", tag="DOCKER")
+            
+            process = await asyncio.create_subprocess_exec(
+                *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
+            )
+            stdout, stderr = await process.communicate()
+            
+            if process.returncode != 0:
+                if self.logger:
+                    self.logger.error(
+                        message="Failed to build Docker image: {error}",
+                        tag="DOCKER",
+                        params={"error": stderr.decode()}
+                    )
+                return False
+            
+            if self.logger:
+                self.logger.success(f"Successfully built Docker image: {image_name}", tag="DOCKER")
+            return True
+    
+    async def ensure_docker_image_exists(self, image_name: str, mode: str = "connect") -> str:
+        """Ensure the required Docker image exists, creating it if necessary.
+        
+        Args:
+            image_name: Name of the Docker image
+            mode: Either "connect" or "launch" to determine which image to build
+            
+        Returns:
+            str: Name of the available Docker image
+            
+        Raises:
+            Exception: If image doesn't exist and can't be built
+        """
+        # If image name is not specified, use default based on mode
+        if not image_name:
+            image_name = self.DEFAULT_CONNECT_IMAGE if mode == "connect" else self.DEFAULT_LAUNCH_IMAGE
+        
+        # Check if the image already exists
+        if await self.check_image_exists(image_name):
+            if self.logger:
+                self.logger.debug(f"Docker image {image_name} already exists", tag="DOCKER")
+            return image_name
+        
+        # If we're using a custom image that doesn't exist, warn and fail
+        if (image_name != self.DEFAULT_CONNECT_IMAGE and image_name != self.DEFAULT_LAUNCH_IMAGE):
+            if self.logger:
+                self.logger.warning(
+                    f"Custom Docker image {image_name} not found and cannot be automatically created",
+                    tag="DOCKER"
+                )
+            raise Exception(f"Docker image {image_name} not found")
+        
+        # Build the appropriate default image
+        if self.logger:
+            self.logger.info(f"Docker image {image_name} not found, creating it now...", tag="DOCKER")
+        
+        if mode == "connect":
+            success = await self.build_docker_image(
+                image_name, 
+                self.DOCKER_CONNECT_FILE, 
+                {"start.sh": self.DOCKER_START_SCRIPT}
+            )
+        else:
+            success = await self.build_docker_image(
+                image_name, 
+                self.DOCKER_LAUNCH_FILE
+            )
+        
+        if not success:
+            raise Exception(f"Failed to create Docker image {image_name}")
+        
+        return image_name
+    
+    # Container Management Methods
+    
+    async def create_container(self, image_name: str, host_port: int, 
+                            container_name: Optional[str] = None,
+                            volumes: List[str] = None,
+                            network: Optional[str] = None,
+                            env_vars: Dict[str, str] = None,
+                            extra_args: List[str] = None) -> Optional[str]:
+        """Create a new Docker container.
+        
+        Args:
+            image_name: Docker image to use
+            host_port: Port on host to map to container port 9223
+            container_name: Optional name for the container
+            volumes: List of volume mappings (e.g., ["host_path:container_path"])
+            network: Optional Docker network to use
+            env_vars: Dictionary of environment variables
+            extra_args: Additional docker run arguments
+            
+        Returns:
+            str: Container ID if successful, None otherwise
+        """
+        # Prepare container command
+        cmd = [
+            "docker", "run",
+            "--detach",
+        ]
+        
+        # Add container name if specified
+        if container_name:
+            cmd.extend(["--name", container_name])
+        
+        # Add port mapping
+        cmd.extend(["-p", f"{host_port}:9223"])
+        
+        # Add volumes
+        if volumes:
+            for volume in volumes:
+                cmd.extend(["-v", volume])
+        
+        # Add network if specified
+        if network:
+            cmd.extend(["--network", network])
+        
+        # Add environment variables
+        if env_vars:
+            for key, value in env_vars.items():
+                cmd.extend(["-e", f"{key}={value}"])
+        
+        # Add extra args
+        if extra_args:
+            cmd.extend(extra_args)
+        
+        # Add image
+        cmd.append(image_name)
+        
+        if self.logger:
+            self.logger.debug(f"Creating Docker container with command: {' '.join(cmd)}", tag="DOCKER")
+        
+        # Run docker command
+        try:
+            process = await asyncio.create_subprocess_exec(
+                *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
+            )
+            stdout, stderr = await process.communicate()
+            
+            if process.returncode != 0:
+                if self.logger:
+                    self.logger.error(
+                        message="Failed to create Docker container: {error}",
+                        tag="DOCKER",
+                        params={"error": stderr.decode()}
+                    )
+                return None
+            
+            # Get container ID
+            container_id = stdout.decode().strip()
+            
+            if self.logger:
+                self.logger.success(f"Created Docker container: {container_id[:12]}", tag="DOCKER")
+            
+            return container_id
+            
+        except Exception as e:
+            if self.logger:
+                self.logger.error(
+                    message="Error creating Docker container: {error}",
+                    tag="DOCKER",
+                    params={"error": str(e)}
+                )
+            return None
+    
+    async def is_container_running(self, container_id: str) -> bool:
+        """Check if a container is running.
+        
+        Args:
+            container_id: ID of the container to check
+            
+        Returns:
+            bool: True if the container is running, False otherwise
+        """
+        cmd = ["docker", "inspect", "--format", "{{.State.Running}}", container_id]
+        
+        try:
+            process = await asyncio.create_subprocess_exec(
+                *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
+            )
+            stdout, _ = await process.communicate()
+            
+            return process.returncode == 0 and stdout.decode().strip() == "true"
+        except Exception as e:
+            if self.logger:
+                self.logger.debug(f"Error checking if container is running: {str(e)}", tag="DOCKER")
+            return False
+    
+    async def wait_for_container_ready(self, container_id: str, timeout: int = 30) -> bool:
+        """Wait for the container to be in running state.
+        
+        Args:
+            container_id: ID of the container to wait for
+            timeout: Maximum time to wait in seconds
+            
+        Returns:
+            bool: True if container is ready, False if timeout occurred
+        """
+        for _ in range(timeout):
+            if await self.is_container_running(container_id):
+                return True
+            await asyncio.sleep(1)
+        
+        if self.logger:
+            self.logger.warning(f"Container {container_id[:12]} not ready after {timeout}s timeout", tag="DOCKER")
+        return False
+    
+    async def stop_container(self, container_id: str) -> bool:
+        """Stop a Docker container.
+        
+        Args:
+            container_id: ID of the container to stop
+            
+        Returns:
+            bool: True if stopped successfully, False otherwise
+        """
+        cmd = ["docker", "stop", container_id]
+        
+        try:
+            process = await asyncio.create_subprocess_exec(*cmd)
+            await process.communicate()
+            
+            if self.logger:
+                self.logger.debug(f"Stopped container: {container_id[:12]}", tag="DOCKER")
+                
+            return process.returncode == 0
+        except Exception as e:
+            if self.logger:
+                self.logger.warning(
+                    message="Failed to stop container: {error}",
+                    tag="DOCKER",
+                    params={"error": str(e)}
+                )
+            return False
+    
+    async def remove_container(self, container_id: str, force: bool = True) -> bool:
+        """Remove a Docker container.
+        
+        Args:
+            container_id: ID of the container to remove
+            force: Whether to force removal
+            
+        Returns:
+            bool: True if removed successfully, False otherwise
+        """
+        cmd = ["docker", "rm"]
+        if force:
+            cmd.append("-f")
+        cmd.append(container_id)
+        
+        try:
+            process = await asyncio.create_subprocess_exec(*cmd)
+            await process.communicate()
+            
+            if self.logger:
+                self.logger.debug(f"Removed container: {container_id[:12]}", tag="DOCKER")
+                
+            return process.returncode == 0
+        except Exception as e:
+            if self.logger:
+                self.logger.warning(
+                    message="Failed to remove container: {error}",
+                    tag="DOCKER",
+                    params={"error": str(e)}
+                )
+            return False
+    
+    # Container Command Execution Methods
+    
+    async def exec_in_container(self, container_id: str, command: List[str], 
+                             detach: bool = False) -> Tuple[int, str, str]:
+        """Execute a command in a running container.
+        
+        Args:
+            container_id: ID of the container
+            command: Command to execute as a list of strings
+            detach: Whether to run the command in detached mode
+            
+        Returns:
+            Tuple of (return_code, stdout, stderr)
+        """
+        cmd = ["docker", "exec"]
+        if detach:
+            cmd.append("-d")
+        cmd.append(container_id)
+        cmd.extend(command)
+        
+        try:
+            process = await asyncio.create_subprocess_exec(
+                *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
+            )
+            stdout, stderr = await process.communicate()
+            
+            return process.returncode, stdout.decode(), stderr.decode()
+        except Exception as e:
+            if self.logger:
+                self.logger.error(
+                    message="Error executing command in container: {error}",
+                    tag="DOCKER",
+                    params={"error": str(e)}
+                )
+            return -1, "", str(e)
+    
+    async def start_socat_in_container(self, container_id: str) -> bool:
+        """Start socat in the container to map port 9222 to 9223.
+        
+        Args:
+            container_id: ID of the container
+            
+        Returns:
+            bool: True if socat started successfully, False otherwise
+        """
+        # Command to run socat as a background process
+        cmd = ["socat", "TCP-LISTEN:9223,fork", "TCP:localhost:9222"]
+        
+        returncode, _, stderr = await self.exec_in_container(container_id, cmd, detach=True)
+        
+        if returncode != 0:
+            if self.logger:
+                self.logger.error(
+                    message="Failed to start socat in container: {error}",
+                    tag="DOCKER",
+                    params={"error": stderr}
+                )
+            return False
+            
+        if self.logger:
+            self.logger.debug(f"Started socat in container: {container_id[:12]}", tag="DOCKER")
+        
+        # Wait a moment for socat to start
+        await asyncio.sleep(1)
+        return True
+    
+    async def launch_chrome_in_container(self, container_id: str, browser_args: List[str]) -> bool:
+        """Launch Chrome inside the container with specified arguments.
+        
+        Args:
+            container_id: ID of the container
+            browser_args: Chrome command line arguments
+            
+        Returns:
+            bool: True if Chrome started successfully, False otherwise
+        """
+        # Build Chrome command
+        chrome_cmd = ["google-chrome"]
+        chrome_cmd.extend(browser_args)
+        
+        returncode, _, stderr = await self.exec_in_container(container_id, chrome_cmd, detach=True)
+        
+        if returncode != 0:
+            if self.logger:
+                self.logger.error(
+                    message="Failed to launch Chrome in container: {error}",
+                    tag="DOCKER",
+                    params={"error": stderr}
+                )
+            return False
+            
+        if self.logger:
+            self.logger.debug(f"Launched Chrome in container: {container_id[:12]}", tag="DOCKER")
+        
+        return True
+    
+    async def get_process_id_in_container(self, container_id: str, process_name: str) -> Optional[int]:
+        """Get the process ID for a process in the container.
+        
+        Args:
+            container_id: ID of the container
+            process_name: Name pattern to search for
+            
+        Returns:
+            int: Process ID if found, None otherwise
+        """
+        cmd = ["pgrep", "-f", process_name]
+        
+        returncode, stdout, _ = await self.exec_in_container(container_id, cmd)
+        
+        if returncode == 0 and stdout.strip():
+            pid = int(stdout.strip().split("\n")[0])
+            return pid
+        
+        return None
+    
+    async def stop_process_in_container(self, container_id: str, pid: int) -> bool:
+        """Stop a process in the container by PID.
+        
+        Args:
+            container_id: ID of the container
+            pid: Process ID to stop
+            
+        Returns:
+            bool: True if process was stopped, False otherwise
+        """
+        cmd = ["kill", "-TERM", str(pid)]
+        
+        returncode, _, stderr = await self.exec_in_container(container_id, cmd)
+        
+        if returncode != 0:
+            if self.logger:
+                self.logger.warning(
+                    message="Failed to stop process in container: {error}",
+                    tag="DOCKER",
+                    params={"error": stderr}
+                )
+            return False
+            
+        if self.logger:
+            self.logger.debug(f"Stopped process {pid} in container: {container_id[:12]}", tag="DOCKER")
+        
+        return True
+    
+    # Network and Port Methods
+    
+    async def wait_for_cdp_ready(self, host_port: int, timeout: int = 30) -> bool:
+        """Wait for the CDP endpoint to be ready.
+        
+        Args:
+            host_port: Port to check for CDP endpoint
+            timeout: Maximum time to wait in seconds
+            
+        Returns:
+            bool: True if CDP endpoint is ready, False if timeout occurred
+        """
+        import aiohttp
+        
+        url = f"http://localhost:{host_port}/json/version"
+        
+        for _ in range(timeout):
+            try:
+                async with aiohttp.ClientSession() as session:
+                    async with session.get(url, timeout=1) as response:
+                        if response.status == 200:
+                            if self.logger:
+                                self.logger.debug(f"CDP endpoint ready on port {host_port}", tag="DOCKER")
+                            return True
+            except Exception:
+                pass
+            await asyncio.sleep(1)
+        
+        if self.logger:
+            self.logger.warning(f"CDP endpoint not ready on port {host_port} after {timeout}s timeout", tag="DOCKER")
+        return False
+    
+    def is_port_in_use(self, port: int) -> bool:
+        """Check if a port is already in use on the host.
+        
+        Args:
+            port: Port number to check
+            
+        Returns:
+            bool: True if port is in use, False otherwise
+        """
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            return s.connect_ex(('localhost', port)) == 0
+    
+    def get_next_available_port(self, start_port: int = 9223) -> int:
+        """Get the next available port starting from a given port.
+        
+        Args:
+            start_port: Port number to start checking from
+            
+        Returns:
+            int: First available port number
+        """
+        port = start_port
+        while self.is_port_in_use(port):
+            port += 1
+        return port
+    
+    # Configuration Hash Methods
+    
+    def generate_config_hash(self, config_dict: Dict) -> str:
+        """Generate a hash of the configuration for container matching.
+        
+        Args:
+            config_dict: Dictionary of configuration parameters
+            
+        Returns:
+            str: Hash string uniquely identifying this configuration
+        """
+        # Convert to canonical JSON string and hash
+        config_json = json.dumps(config_dict, sort_keys=True)
+        return hashlib.sha256(config_json.encode()).hexdigest()
--- a/crawl4ai/browser/manager.py
+++ b/crawl4ai/browser/manager.py
@@ -21,6 +21,12 @@ from .strategies import (
    BuiltinBrowserStrategy
 )

+# Import DockerBrowserStrategy if available
+try:
+    from .docker_strategy import DockerBrowserStrategy
+except ImportError:
+    DockerBrowserStrategy = None
+
 class BrowserManager:
    """Main interface for browser management in Crawl4AI.
    
@@ -69,6 +75,16 @@ class BrowserManager:
        """
        if self.config.browser_mode == "builtin":
            return BuiltinBrowserStrategy(self.config, self.logger)
+        elif self.config.browser_mode == "docker":
+            if DockerBrowserStrategy is None:
+                if self.logger:
+                    self.logger.error(
+                        "Docker browser strategy requested but not available. "
+                        "Falling back to PlaywrightBrowserStrategy.",
+                        tag="BROWSER"
+                    )
+                return PlaywrightBrowserStrategy(self.config, self.logger)
+            return DockerBrowserStrategy(self.config, self.logger)
        elif self.config.cdp_url or self.config.use_managed_browser:
            return CDPBrowserStrategy(self.config, self.logger)
        else: