# crawl4ai/crawl4ai/browser/strategies/docker_strategy.py

"""Docker browser strategy module for Crawl4AI.

This module provides browser strategies for running browsers in Docker containers,
which offers better isolation, consistency across platforms, and easy scaling.
"""

import os
import uuid
from typing import List, Optional


from ...async_logger import AsyncLogger
from ...async_configs import BrowserConfig
from ..models import DockerConfig
from ..docker_registry import DockerRegistry
from ..docker_utils import DockerUtils
from .builtin import CDPBrowserStrategy


class DockerBrowserStrategy(CDPBrowserStrategy):
    """Docker-based browser strategy.

    Extends the CDPBrowserStrategy to run browsers in Docker containers.
    Supports two modes:
    1. "connect" - Uses a Docker image with Chrome already running
    2. "launch" - Starts Chrome within the container with custom settings

    Attributes:
        docker_config: Docker-specific configuration options
        container_id: ID of current Docker container
        container_name: Name assigned to the container
        registry: Registry for tracking and reusing containers
        docker_utils: Utilities for Docker operations
        chrome_process_id: Process ID of Chrome within container
        socat_process_id: Process ID of socat within container
        internal_cdp_port: Chrome's internal CDP port
        internal_mapped_port: Port that socat maps to internally
    """

    def __init__(self, config: BrowserConfig, logger: Optional[AsyncLogger] = None):
        """Initialize the Docker browser strategy.

        Args:
            config: Browser configuration including Docker-specific settings
            logger: Logger for recording events and errors
        """
        super().__init__(config, logger)

        # Fall back to a default DockerConfig when the browser config has none
        self.docker_config = self.config.docker_config or DockerConfig()
        self.container_id = None
        # Short random suffix keeps concurrently created containers from colliding
        self.container_name = f"crawl4ai-browser-{uuid.uuid4().hex[:8]}"

        # Resolve the registry file: an explicit DockerConfig value wins; otherwise,
        # when a user data dir is configured, use "browser_config.json" next to it
        # (the shared registry location, for consistency with BuiltinBrowserStrategy).
        registry_file = self.docker_config.registry_file
        if registry_file is None and self.config.user_data_dir:
            registry_file = os.path.join(
                os.path.dirname(self.config.user_data_dir), "browser_config.json"
            )

        # Pass the *resolved* path. Previously the raw DockerConfig value was passed,
        # which silently discarded the fallback computed above.
        self.registry = DockerRegistry(registry_file)
        self.docker_utils = DockerUtils(logger)
        self.chrome_process_id = None
        self.socat_process_id = None
        self.internal_cdp_port = 9222  # Chrome's internal CDP port
        self.internal_mapped_port = 9223  # Port that socat maps to internally
        self.shutting_down = False

    async def start(self):
        """Bring up Playwright and attach to a browser living in a Docker container.

        The CDP endpoint is obtained from ``_get_or_create_cdp_url``; Playwright
        then connects to it over CDP. The first existing browser context is
        adopted as the default context, or a new one is created and set up when
        the browser has none.

        If anything fails, a non-persistent container recorded on this instance is
        removed and unregistered, Playwright is stopped, the error is logged, and
        the original exception is re-raised.

        Returns:
            self: For method chaining
        """
        # Imported lazily, exactly where it is needed
        from ..utils import get_playwright

        log = self.logger
        self.playwright = await get_playwright()

        if log:
            log.info(
                f"Starting Docker browser strategy in {self.docker_config.mode} mode",
                tag="DOCKER",
            )

        try:
            # Container lookup/creation and browser startup happen in here
            endpoint = await self._get_or_create_cdp_url()
            if not endpoint:
                raise Exception(
                    "Failed to establish CDP connection to Docker container"
                )

            if log:
                log.info(
                    f"Connecting to browser in Docker via CDP: {endpoint}", tag="DOCKER"
                )

            self.browser = await self.playwright.chromium.connect_over_cdp(endpoint)

            open_contexts = self.browser.contexts
            if open_contexts:
                # Adopt the context the browser already has
                self.default_context = open_contexts[0]
                if log:
                    log.debug("Using existing browser context", tag="DOCKER")
                return self

            # No context yet: create one and apply the standard setup
            if log:
                log.debug("Creating new browser context", tag="DOCKER")
            self.default_context = await self.create_browser_context()
            await self.setup_context(self.default_context)
            return self

        except Exception as exc:
            # Roll back whatever was provisioned before the failure
            failed_id = self.container_id
            if failed_id and not self.docker_config.persistent:
                if log:
                    log.warning(
                        f"Cleaning up container after failed start: {failed_id[:12]}",
                        tag="DOCKER",
                    )
                await self.docker_utils.remove_container(failed_id)
                self.registry.unregister_container(failed_id)
                self.container_id = None

            if self.playwright:
                await self.playwright.stop()
                self.playwright = None

            if log:
                log.error(
                    f"Failed to start Docker browser: {str(exc)}", tag="DOCKER"
                )
            # Propagate the original error to the caller
            raise

    async def _generate_config_hash(self) -> str:
        """Fingerprint the settings that decide whether a container can be reused.

        The image, Docker mode, browser type and headless flag are always part of
        the fingerprint. In "launch" mode the text/light mode flags and viewport
        size are included as well.

        Returns:
            Hash string produced by ``DockerUtils.generate_config_hash``
        """
        docker_cfg = self.docker_config
        browser_cfg = self.config

        # Settings relevant in every mode (insertion order kept stable)
        fingerprint = dict(
            image=docker_cfg.image,
            mode=docker_cfg.mode,
            browser_type=browser_cfg.browser_type,
            headless=browser_cfg.headless,
        )

        # Launch mode starts Chrome itself, so its launch settings matter too
        if docker_cfg.mode == "launch":
            fingerprint["text_mode"] = browser_cfg.text_mode
            fingerprint["light_mode"] = browser_cfg.light_mode
            fingerprint["viewport_width"] = browser_cfg.viewport_width
            fingerprint["viewport_height"] = browser_cfg.viewport_height

        return self.docker_utils.generate_config_hash(fingerprint)

    async def _get_or_create_cdp_url1(self) -> str:
        """Deprecated alias of ``_get_or_create_cdp_url``.

        This method used to be an older copy of the container-provisioning
        routine. That copy called ``registry.find_container_by_config`` without
        awaiting it, so the "existing container" branch received a coroutine
        object instead of a container ID. It had also drifted from the maintained
        implementation: no CPU/memory limits were passed, and the container was
        registered before CDP was confirmed ready. To keep a single source of
        truth, the call is now forwarded to ``_get_or_create_cdp_url``.

        Returns:
            CDP URL for connecting to the browser

        Raises:
            Exception: If container creation or browser launch fails
        """
        # Kept only for backward compatibility with any caller using the old name
        return await self._get_or_create_cdp_url()

    async def _get_or_create_cdp_url(self) -> str:
        """Resolve the CDP endpoint, reusing or provisioning a container as needed.

        Resolution order:
        1. An explicit ``cdp_url`` on the browser config is returned as-is.
        2. A registered container whose config hash matches is reused.
        3. Otherwise a new container is created; in "launch" mode socat and
           Chrome are started inside it. The container is registered once CDP
           answers.

        Returns:
            CDP URL for connecting to the browser

        Raises:
            Exception: If the container cannot be created or no CDP JSON config
                is obtained from it
        """
        explicit_url = self.config.cdp_url
        if explicit_url:
            return explicit_url

        docker_cfg = self.docker_config
        utils = self.docker_utils
        registry = self.registry

        # Make sure the image is available (the helper may build it)
        image_name = await utils.ensure_docker_image_exists(
            docker_cfg.image, docker_cfg.mode
        )

        # The hash is the key used to match containers in the registry
        config_hash = await self._generate_config_hash()

        reusable_id = await registry.find_container_by_config(config_hash, utils)
        if reusable_id:
            # Reuse path: adopt the container and its recorded host port
            self.container_id = reusable_id
            port = registry.get_container_host_port(reusable_id)
            if self.logger:
                self.logger.info(
                    f"Using existing Docker container: {reusable_id[:12]}",
                    tag="DOCKER",
                )
            return f"http://localhost:{port}"

        # Creation path: a configured host port wins over an auto-assigned one
        port = docker_cfg.host_port or registry.get_next_available_port(utils)

        # Copy so the configured volume list is never mutated
        mounts = list(docker_cfg.volumes)
        host_data_dir = docker_cfg.user_data_dir
        if host_data_dir:
            # Mount the host user data dir (created on demand) into the container.
            # The browser config's own user_data_dir is left untouched here.
            os.makedirs(host_data_dir, exist_ok=True)
            mounts.append(f"{host_data_dir}:{docker_cfg.container_user_data_dir}")

        new_id = await utils.create_container(
            image_name=image_name,
            host_port=port,
            container_name=self.container_name,
            volumes=mounts,
            network=docker_cfg.network,
            env_vars=docker_cfg.env_vars,
            cpu_limit=docker_cfg.cpu_limit,
            memory_limit=docker_cfg.memory_limit,
            extra_args=docker_cfg.extra_args,
        )
        if not new_id:
            raise Exception("Failed to create Docker container")

        # Record the ID right away so a failed start can clean the container up
        self.container_id = new_id

        await utils.wait_for_container_ready(new_id)

        if docker_cfg.mode == "launch":
            # Launch mode: start socat first, then Chrome with our arguments
            await utils.start_socat_in_container(new_id)
            await utils.launch_chrome_in_container(
                new_id, self._build_browser_args()
            )

            # Remember both PIDs so close() can stop the processes later
            self.chrome_process_id = await utils.get_process_id_in_container(
                new_id, "chromium"
            )
            self.socat_process_id = await utils.get_process_id_in_container(
                new_id, "socat"
            )

        # Only register the container once CDP has actually responded
        cdp_info = await utils.wait_for_cdp_ready(port)
        if not cdp_info:
            raise Exception("Failed to get CDP JSON config from Docker container")
        registry.register_container(new_id, port, config_hash, cdp_info)

        if self.logger:
            self.logger.success(
                f"Docker container ready: {new_id[:12]} on port {port}",
                tag="DOCKER",
            )

        return f"http://localhost:{port}"

    def _build_browser_args(self) -> List[str]:
        """Assemble the Chrome command line used inside the container.

        Starts from the ``"args"`` entry of the parent strategy's argument
        mapping and appends the Docker-specific flags.

        Returns:
            List of command line arguments for Chrome
        """
        # NOTE(review): an earlier hand-rolled list (window size, user agent,
        # text/light mode, user-data-dir, extra args) was replaced by this parent
        # call - presumably the parent now supplies those flags; confirm.
        inherited_args = super()._build_browser_args()["args"]

        docker_flags = [
            f"--remote-debugging-port={self.internal_cdp_port}",
            "--remote-debugging-address=0.0.0.0",  # Allow external connections
            "--disable-dev-shm-usage",
            "--headless=new",  # Always added, whatever config.headless says
        ]
        return inherited_args + docker_flags

    async def close(self):
        """Close the browser and clean up Docker container if needed.

        Teardown runs in a fixed order:

        1. If a browser is connected and the Docker config is persistent with a
           ``user_data_dir``, each context's storage state is written to
           ``storage_state.json`` in that directory. This is best effort:
           failures are logged as warnings and do not abort the shutdown.
        2. The parent ``close()`` performs the common browser cleanup.
        3. If this call initiated the shutdown and the container is not
           persistent, the Chrome and socat processes are stopped ("launch" mode
           only). The container is then removed and unregistered when
           ``remove_on_exit`` is set, or just stopped otherwise.
        """
        # True when ``shutting_down`` was not already set on entry.
        # NOTE(review): the visible part of this class only ever sets
        # ``shutting_down`` to False (in __init__); presumably the parent class or
        # a caller flips it - confirm.
        initiated_shutdown = not getattr(self, "shutting_down", False)

        # Storage persistence for Docker needs special handling
        # We need to store state before calling super().close() which will close the browser
        if (
            self.browser
            and self.docker_config.user_data_dir
            and self.docker_config.persistent
        ):
            for context in self.browser.contexts:
                try:
                    # Ensure directory exists
                    os.makedirs(self.docker_config.user_data_dir, exist_ok=True)

                    # Save storage state to user data directory.
                    # NOTE: the target path is the same for every context, so with
                    # several contexts each save goes to the same file.
                    storage_path = os.path.join(
                        self.docker_config.user_data_dir, "storage_state.json"
                    )
                    await context.storage_state(path=storage_path)
                    if self.logger:
                        self.logger.debug(
                            "Persisted Docker-specific storage state", tag="DOCKER"
                        )
                except Exception as e:
                    # Best effort: a failed save must not block the shutdown
                    if self.logger:
                        self.logger.warning(
                            message="Failed to persist Docker storage state: {error}",
                            tag="DOCKER",
                            params={"error": str(e)},
                        )

        # Call parent method to handle common cleanup
        await super().close()

        # Only perform container cleanup if we initiated shutdown
        # and we need to handle Docker-specific resources
        if initiated_shutdown:
            # Only clean up container if not persistent
            if self.container_id and not self.docker_config.persistent:
                # Stop Chrome process in "launch" mode (PID recorded at startup)
                if self.docker_config.mode == "launch" and self.chrome_process_id:
                    await self.docker_utils.stop_process_in_container(
                        self.container_id, self.chrome_process_id
                    )
                    if self.logger:
                        self.logger.debug(
                            f"Stopped Chrome process {self.chrome_process_id} in container",
                            tag="DOCKER",
                        )

                # Stop socat process in "launch" mode (PID recorded at startup)
                if self.docker_config.mode == "launch" and self.socat_process_id:
                    await self.docker_utils.stop_process_in_container(
                        self.container_id, self.socat_process_id
                    )
                    if self.logger:
                        self.logger.debug(
                            f"Stopped socat process {self.socat_process_id} in container",
                            tag="DOCKER",
                        )

                # Remove or stop container based on configuration
                if self.docker_config.remove_on_exit:
                    await self.docker_utils.remove_container(self.container_id)
                    # Unregister from registry
                    if hasattr(self, "registry") and self.registry:
                        self.registry.unregister_container(self.container_id)
                    if self.logger:
                        self.logger.debug(
                            f"Removed Docker container {self.container_id}",
                            tag="DOCKER",
                        )
                else:
                    # Stop only: this branch does not unregister the container
                    await self.docker_utils.stop_container(self.container_id)
                    if self.logger:
                        self.logger.debug(
                            f"Stopped Docker container {self.container_id}",
                            tag="DOCKER",
                        )

                # Forget the container so a repeated close() skips this cleanup
                self.container_id = None