From 7f93e88379afa432c6b963aae10cbbf5b94d8365 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 26 Mar 2025 15:19:29 +0800 Subject: [PATCH 1/7] refactor(tests): remove unused imports in test_docker_browser.py --- tests/browser/docker/test_docker_browser.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/browser/docker/test_docker_browser.py b/tests/browser/docker/test_docker_browser.py index 65f0b649..a3901d8d 100644 --- a/tests/browser/docker/test_docker_browser.py +++ b/tests/browser/docker/test_docker_browser.py @@ -9,8 +9,6 @@ import os import sys import shutil import uuid -import json -from typing import List, Dict, Any, Optional, Tuple # Add the project root to Python path if running directly if __name__ == "__main__": From c635f6b9a2ba9c63ad5465be48bc6436202d43b7 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 27 Mar 2025 21:35:13 +0800 Subject: [PATCH 2/7] refactor(browser): reorganize browser strategies and improve Docker implementation Reorganize browser strategy code into separate modules for better maintainability and separation of concerns. Improve Docker implementation with: - Add Alpine and Debian-based Dockerfiles for better container options - Enhance Docker registry to share configuration with BuiltinBrowserStrategy - Add CPU and memory limits to container configuration - Improve error handling and logging - Update documentation and examples BREAKING CHANGE: DockerConfig, DockerRegistry, and DockerUtils have been moved to new locations and their APIs have been updated. 
--- crawl4ai/browser/__init__.py | 14 +- .../browser/docker/alpine/connect.Dockerfile | 34 + .../browser/docker/alpine/launch.Dockerfile | 23 + crawl4ai/browser/docker/connect.Dockerfile | 32 +- .../browser/docker/debian/connect.Dockerfile | 23 + crawl4ai/browser/docker/launch.Dockerfile | 6 +- crawl4ai/browser/docker_config.py | 133 -- crawl4ai/browser/docker_registry.py | 140 +- crawl4ai/browser/docker_utils.py | 443 +++--- crawl4ai/browser/manager.py | 9 +- crawl4ai/browser/models.py | 143 ++ crawl4ai/browser/strategies.py | 1256 ----------------- crawl4ai/browser/strategies/__init__.py | 13 + crawl4ai/browser/strategies/base.py | 270 ++++ crawl4ai/browser/strategies/builtin.py | 394 ++++++ crawl4ai/browser/strategies/cdp.py | 359 +++++ .../{ => strategies}/docker_strategy.py | 205 ++- crawl4ai/browser/strategies/playwright.py | 284 ++++ docs/examples/crypto_analysis_example.py | 487 +++++-- tests/browser/docker/test_docker_browser.py | 20 +- 20 files changed, 2502 insertions(+), 1786 deletions(-) create mode 100644 crawl4ai/browser/docker/alpine/connect.Dockerfile create mode 100644 crawl4ai/browser/docker/alpine/launch.Dockerfile create mode 100644 crawl4ai/browser/docker/debian/connect.Dockerfile delete mode 100644 crawl4ai/browser/docker_config.py delete mode 100644 crawl4ai/browser/strategies.py create mode 100644 crawl4ai/browser/strategies/__init__.py create mode 100644 crawl4ai/browser/strategies/base.py create mode 100644 crawl4ai/browser/strategies/builtin.py create mode 100644 crawl4ai/browser/strategies/cdp.py rename crawl4ai/browser/{ => strategies}/docker_strategy.py (58%) create mode 100644 crawl4ai/browser/strategies/playwright.py diff --git a/crawl4ai/browser/__init__.py b/crawl4ai/browser/__init__.py index fb14b59d..af4d74c7 100644 --- a/crawl4ai/browser/__init__.py +++ b/crawl4ai/browser/__init__.py @@ -6,5 +6,17 @@ for browser creation and interaction. 
from .manager import BrowserManager from .profiles import BrowserProfileManager +from .models import DockerConfig +from .docker_registry import DockerRegistry +from .docker_utils import DockerUtils +from .strategies import ( + BaseBrowserStrategy, + PlaywrightBrowserStrategy, + CDPBrowserStrategy, + BuiltinBrowserStrategy, + DockerBrowserStrategy +) -__all__ = ['BrowserManager', 'BrowserProfileManager'] \ No newline at end of file +__all__ = ['BrowserManager', 'BrowserProfileManager', 'DockerConfig', 'DockerRegistry', 'DockerUtils', 'BaseBrowserStrategy', + 'PlaywrightBrowserStrategy', 'CDPBrowserStrategy', 'BuiltinBrowserStrategy', + 'DockerBrowserStrategy'] \ No newline at end of file diff --git a/crawl4ai/browser/docker/alpine/connect.Dockerfile b/crawl4ai/browser/docker/alpine/connect.Dockerfile new file mode 100644 index 00000000..96f77cef --- /dev/null +++ b/crawl4ai/browser/docker/alpine/connect.Dockerfile @@ -0,0 +1,34 @@ +# ---------- Dockerfile ---------- + FROM alpine:latest + + # Combine everything in one RUN to keep layers minimal. + RUN apk update && apk upgrade && \ + apk add --no-cache \ + chromium \ + nss \ + freetype \ + harfbuzz \ + ca-certificates \ + ttf-freefont \ + socat \ + curl && \ + addgroup -S chromium && adduser -S chromium -G chromium && \ + mkdir -p /data && chown chromium:chromium /data && \ + rm -rf /var/cache/apk/* + + # Copy start script, then chown/chmod in one step + COPY start.sh /home/chromium/start.sh + RUN chown chromium:chromium /home/chromium/start.sh && \ + chmod +x /home/chromium/start.sh + + USER chromium + WORKDIR /home/chromium + + # Expose port used by socat (mapping 9222→9223 or whichever you prefer) + EXPOSE 9223 + + # Simple healthcheck: is the remote debug endpoint responding? 
+ HEALTHCHECK --interval=30s --timeout=5s --retries=3 CMD curl -f http://localhost:9222/json/version || exit 1 + + CMD ["./start.sh"] + \ No newline at end of file diff --git a/crawl4ai/browser/docker/alpine/launch.Dockerfile b/crawl4ai/browser/docker/alpine/launch.Dockerfile new file mode 100644 index 00000000..60b20539 --- /dev/null +++ b/crawl4ai/browser/docker/alpine/launch.Dockerfile @@ -0,0 +1,23 @@ +# ---------- Dockerfile (Idle Version) ---------- + FROM alpine:latest + + # Install only Chromium and its dependencies in a single layer + RUN apk update && apk upgrade && \ + apk add --no-cache \ + chromium \ + nss \ + freetype \ + harfbuzz \ + ca-certificates \ + ttf-freefont && \ + addgroup -S chromium && adduser -S chromium -G chromium && \ + mkdir -p /data && chown chromium:chromium /data && \ + rm -rf /var/cache/apk/* + + # Switch to a non-root user for security + USER chromium + WORKDIR /home/chromium + + # Idle: container does nothing except stay alive + CMD ["tail", "-f", "/dev/null"] + \ No newline at end of file diff --git a/crawl4ai/browser/docker/connect.Dockerfile b/crawl4ai/browser/docker/connect.Dockerfile index d2d955b6..c83fedb8 100644 --- a/crawl4ai/browser/docker/connect.Dockerfile +++ b/crawl4ai/browser/docker/connect.Dockerfile @@ -3,16 +3,14 @@ FROM ubuntu:22.04 # Install dependencies with comprehensive Chromium support RUN apt-get update && apt-get install -y --no-install-recommends \ wget \ + curl \ gnupg \ ca-certificates \ fonts-liberation \ - # Sound support + # Core dependencies libasound2 \ - # Accessibility support - libatspi2.0-0 \ libatk1.0-0 \ libatk-bridge2.0-0 \ - # Graphics and rendering libdrm2 \ libgbm1 \ libgtk-3-0 \ @@ -21,16 +19,12 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ libxext6 \ libxfixes3 \ libxrandr2 \ - # X11 and window system libx11-6 \ libxcb1 \ libxkbcommon0 \ - # Text and internationalization libpango-1.0-0 \ libcairo2 \ - # Printing support libcups2 \ - # System libraries 
libdbus-1-3 \ libnss3 \ libnspr4 \ @@ -38,24 +32,24 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ # Utilities xdg-utils \ socat \ - # Process management - procps \ # Clean up && rm -rf /var/lib/apt/lists/* -# Install Chrome -RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \ - echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list && \ - apt-get update && \ - apt-get install -y google-chrome-stable && \ - rm -rf /var/lib/apt/lists/* +# Install Chromium with codecs +RUN apt-get update && \ + apt-get install -y \ + chromium-browser \ + chromium-codecs-ffmpeg-extra \ + && rm -rf /var/lib/apt/lists/* -# Create data directory for user data +# Create Chrome alias for compatibility +RUN ln -s /usr/bin/chromium-browser /usr/bin/google-chrome + +# Create data directory RUN mkdir -p /data && chmod 777 /data -# Add a startup script +# Add startup script COPY start.sh /start.sh RUN chmod +x /start.sh -# Set entrypoint ENTRYPOINT ["/start.sh"] \ No newline at end of file diff --git a/crawl4ai/browser/docker/debian/connect.Dockerfile b/crawl4ai/browser/docker/debian/connect.Dockerfile new file mode 100644 index 00000000..ee0f25b4 --- /dev/null +++ b/crawl4ai/browser/docker/debian/connect.Dockerfile @@ -0,0 +1,23 @@ +# Use Debian 12 (Bookworm) slim for a small, stable base image +FROM debian:bookworm-slim + +ENV DEBIAN_FRONTEND=noninteractive + +# Install Chromium, socat, and basic fonts +RUN apt-get update && apt-get install -y --no-install-recommends \ + chromium \ + wget \ + curl \ + socat \ + fonts-freefont-ttf \ + fonts-noto-color-emoji && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +# Copy start.sh and make it executable +COPY start.sh /start.sh +RUN chmod +x /start.sh + +# Expose socat port (use host mapping, e.g. 
-p 9225:9223) +EXPOSE 9223 + +ENTRYPOINT ["/start.sh"] diff --git a/crawl4ai/browser/docker/launch.Dockerfile b/crawl4ai/browser/docker/launch.Dockerfile index 042f724d..63d2cee2 100644 --- a/crawl4ai/browser/docker/launch.Dockerfile +++ b/crawl4ai/browser/docker/launch.Dockerfile @@ -43,9 +43,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ # Clean up && rm -rf /var/lib/apt/lists/* -# Install Chrome -RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \ - echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list && \ +# Install Chrome (new method) +RUN curl -fsSL https://dl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /usr/share/keyrings/googlechrome-linux-keyring.gpg && \ + echo "deb [arch=amd64 signed-by=/usr/share/keyrings/googlechrome-linux-keyring.gpg] http://dl.google.com/linux/chrome/deb/ stable main" | tee /etc/apt/sources.list.d/google-chrome.list && \ apt-get update && \ apt-get install -y google-chrome-stable && \ rm -rf /var/lib/apt/lists/* diff --git a/crawl4ai/browser/docker_config.py b/crawl4ai/browser/docker_config.py deleted file mode 100644 index a63c480c..00000000 --- a/crawl4ai/browser/docker_config.py +++ /dev/null @@ -1,133 +0,0 @@ -"""Docker configuration module for Crawl4AI browser automation. - -This module provides configuration classes for Docker-based browser automation, -allowing flexible configuration of Docker containers for browsing. -""" - -from typing import Dict, List, Optional, Union - - -class DockerConfig: - """Configuration for Docker-based browser automation. - - This class contains Docker-specific settings to avoid cluttering BrowserConfig. - - Attributes: - mode (str): Docker operation mode - "connect" or "launch". - - "connect": Uses a container with Chrome already running - - "launch": Dynamically configures and starts Chrome in container - image (str): Docker image to use. 
If None, defaults from DockerUtils are used. - registry_file (str): Path to container registry file for persistence. - persistent (bool): Keep container running after browser closes. - remove_on_exit (bool): Remove container on exit when not persistent. - network (str): Docker network to use. - volumes (List[str]): Volume mappings (e.g., ["host_path:container_path"]). - env_vars (Dict[str, str]): Environment variables to set in container. - extra_args (List[str]): Additional docker run arguments. - host_port (int): Host port to map to container's 9223 port. - user_data_dir (str): Path to user data directory on host. - container_user_data_dir (str): Path to user data directory in container. - """ - - def __init__( - self, - mode: str = "connect", # "connect" or "launch" - image: Optional[str] = None, # Docker image to use - registry_file: Optional[str] = None, # Path to registry file - persistent: bool = False, # Keep container running after browser closes - remove_on_exit: bool = True, # Remove container on exit when not persistent - network: Optional[str] = None, # Docker network to use - volumes: List[str] = None, # Volume mappings - env_vars: Dict[str, str] = None, # Environment variables - extra_args: List[str] = None, # Additional docker run arguments - host_port: Optional[int] = None, # Host port to map to container's 9223 - user_data_dir: Optional[str] = None, # Path to user data directory on host - container_user_data_dir: str = "/data", # Path to user data directory in container - ): - """Initialize Docker configuration. 
- - Args: - mode: Docker operation mode ("connect" or "launch") - image: Docker image to use - registry_file: Path to container registry file - persistent: Whether to keep container running after browser closes - remove_on_exit: Whether to remove container on exit when not persistent - network: Docker network to use - volumes: Volume mappings as list of strings - env_vars: Environment variables as dictionary - extra_args: Additional docker run arguments - host_port: Host port to map to container's 9223 - user_data_dir: Path to user data directory on host - container_user_data_dir: Path to user data directory in container - """ - self.mode = mode - self.image = image # If None, defaults will be used from DockerUtils - self.registry_file = registry_file - self.persistent = persistent - self.remove_on_exit = remove_on_exit - self.network = network - self.volumes = volumes or [] - self.env_vars = env_vars or {} - self.extra_args = extra_args or [] - self.host_port = host_port - self.user_data_dir = user_data_dir - self.container_user_data_dir = container_user_data_dir - - def to_dict(self) -> Dict: - """Convert this configuration to a dictionary. - - Returns: - Dictionary representation of this configuration - """ - return { - "mode": self.mode, - "image": self.image, - "registry_file": self.registry_file, - "persistent": self.persistent, - "remove_on_exit": self.remove_on_exit, - "network": self.network, - "volumes": self.volumes, - "env_vars": self.env_vars, - "extra_args": self.extra_args, - "host_port": self.host_port, - "user_data_dir": self.user_data_dir, - "container_user_data_dir": self.container_user_data_dir - } - - @staticmethod - def from_kwargs(kwargs: Dict) -> "DockerConfig": - """Create a DockerConfig from a dictionary of keyword arguments. 
- - Args: - kwargs: Dictionary of configuration options - - Returns: - New DockerConfig instance - """ - return DockerConfig( - mode=kwargs.get("mode", "connect"), - image=kwargs.get("image"), - registry_file=kwargs.get("registry_file"), - persistent=kwargs.get("persistent", False), - remove_on_exit=kwargs.get("remove_on_exit", True), - network=kwargs.get("network"), - volumes=kwargs.get("volumes"), - env_vars=kwargs.get("env_vars"), - extra_args=kwargs.get("extra_args"), - host_port=kwargs.get("host_port"), - user_data_dir=kwargs.get("user_data_dir"), - container_user_data_dir=kwargs.get("container_user_data_dir", "/data") - ) - - def clone(self, **kwargs) -> "DockerConfig": - """Create a copy of this configuration with updated values. - - Args: - **kwargs: Key-value pairs of configuration options to update - - Returns: - DockerConfig: A new instance with the specified updates - """ - config_dict = self.to_dict() - config_dict.update(kwargs) - return DockerConfig.from_kwargs(config_dict) \ No newline at end of file diff --git a/crawl4ai/browser/docker_registry.py b/crawl4ai/browser/docker_registry.py index 91f81c5e..03594e2e 100644 --- a/crawl4ai/browser/docker_registry.py +++ b/crawl4ai/browser/docker_registry.py @@ -31,9 +31,10 @@ class DockerRegistry: Args: registry_file: Path to the registry file. If None, uses default path. 
""" - self.registry_file = registry_file or os.path.join(get_home_folder(), "docker_browser_registry.json") - self.containers = {} - self.port_map = {} + # Use the same file path as BuiltinBrowserStrategy by default + self.registry_file = registry_file or os.path.join(get_home_folder(), "builtin-browser", "browser_config.json") + self.containers = {} # Still maintain this for backward compatibility + self.port_map = {} # Will be populated from the shared file self.last_port = 9222 self.load() @@ -43,11 +44,35 @@ class DockerRegistry: try: with open(self.registry_file, 'r') as f: registry_data = json.load(f) - self.containers = registry_data.get("containers", {}) - self.port_map = registry_data.get("ports", {}) - self.last_port = registry_data.get("last_port", 9222) - except Exception: + + # Initialize port_map if not present + if "port_map" not in registry_data: + registry_data["port_map"] = {} + + self.port_map = registry_data.get("port_map", {}) + + # Extract container information from port_map entries of type "docker" + self.containers = {} + for port_str, browser_info in self.port_map.items(): + if browser_info.get("browser_type") == "docker" and "container_id" in browser_info: + container_id = browser_info["container_id"] + self.containers[container_id] = { + "host_port": int(port_str), + "config_hash": browser_info.get("config_hash", ""), + "created_at": browser_info.get("created_at", time.time()) + } + + # Get last port if available + if "last_port" in registry_data: + self.last_port = registry_data["last_port"] + else: + # Find highest port in port_map + ports = [int(p) for p in self.port_map.keys() if p.isdigit()] + self.last_port = max(ports + [9222]) + + except Exception as e: # Reset to defaults on error + print(f"Error loading registry: {e}") self.containers = {} self.port_map = {} self.last_port = 9222 @@ -59,28 +84,75 @@ class DockerRegistry: def save(self): """Save container registry to file.""" + # First load the current file to avoid overwriting 
other browser types + current_data = {"port_map": {}, "last_port": self.last_port} + if os.path.exists(self.registry_file): + try: + with open(self.registry_file, 'r') as f: + current_data = json.load(f) + except Exception: + pass + + # Create a new port_map dictionary + updated_port_map = {} + + # First, copy all non-docker entries from the existing port_map + for port_str, browser_info in current_data.get("port_map", {}).items(): + if browser_info.get("browser_type") != "docker": + updated_port_map[port_str] = browser_info + + # Then add all current docker container entries + for container_id, container_info in self.containers.items(): + port_str = str(container_info["host_port"]) + updated_port_map[port_str] = { + "browser_type": "docker", + "container_id": container_id, + "cdp_url": f"http://localhost:{port_str}", + "config_hash": container_info["config_hash"], + "created_at": container_info["created_at"] + } + + # Replace the port_map with our updated version + current_data["port_map"] = updated_port_map + + # Update last_port + current_data["last_port"] = self.last_port + + # Ensure directory exists os.makedirs(os.path.dirname(self.registry_file), exist_ok=True) + + # Save the updated data with open(self.registry_file, 'w') as f: - json.dump({ - "containers": self.containers, - "ports": self.port_map, - "last_port": self.last_port - }, f, indent=2) + json.dump(current_data, f, indent=2) - def register_container(self, container_id: str, host_port: int, config_hash: str): + def register_container(self, container_id: str, host_port: int, config_hash: str, cdp_json_config: Optional[str] = None): """Register a container with its configuration hash and port mapping. 
Args: container_id: Docker container ID host_port: Host port mapped to container config_hash: Hash of configuration used to create container + cdp_json_config: CDP JSON configuration if available """ self.containers[container_id] = { "host_port": host_port, "config_hash": config_hash, "created_at": time.time() } - self.port_map[str(host_port)] = container_id + + # Update port_map to maintain compatibility with BuiltinBrowserStrategy + port_str = str(host_port) + self.port_map[port_str] = { + "browser_type": "docker", + "container_id": container_id, + "cdp_url": f"http://localhost:{port_str}", + "config_hash": config_hash, + "created_at": time.time() + } + + if cdp_json_config: + self.port_map[port_str]["cdp_json_config"] = cdp_json_config + self.save() def unregister_container(self, container_id: str): @@ -91,12 +163,18 @@ class DockerRegistry: """ if container_id in self.containers: host_port = self.containers[container_id]["host_port"] - if str(host_port) in self.port_map: - del self.port_map[str(host_port)] + port_str = str(host_port) + + # Remove from port_map + if port_str in self.port_map: + del self.port_map[port_str] + + # Remove from containers del self.containers[container_id] + self.save() - def find_container_by_config(self, config_hash: str, docker_utils) -> Optional[str]: + async def find_container_by_config(self, config_hash: str, docker_utils) -> Optional[str]: """Find a container that matches the given configuration hash. 
Args: @@ -106,9 +184,16 @@ class DockerRegistry: Returns: Container ID if found, None otherwise """ - for container_id, data in self.containers.items(): - if data["config_hash"] == config_hash and docker_utils.is_container_running(container_id): - return container_id + # Search through port_map for entries with matching config_hash + for port_str, browser_info in self.port_map.items(): + if (browser_info.get("browser_type") == "docker" and + browser_info.get("config_hash") == config_hash and + "container_id" in browser_info): + + container_id = browser_info["container_id"] + if await docker_utils.is_container_running(container_id): + return container_id + return None def get_container_host_port(self, container_id: str) -> Optional[int]: @@ -137,7 +222,7 @@ class DockerRegistry: port = self.last_port + 1 # Check if port is in use (either in our registry or system-wide) - while port in self.port_map or docker_utils.is_port_in_use(port): + while str(port) in self.port_map or docker_utils.is_port_in_use(port): port += 1 # Update last port @@ -166,9 +251,14 @@ class DockerRegistry: docker_utils: DockerUtils instance to check container status """ to_remove = [] - for container_id in self.containers: - if not docker_utils.is_container_running(container_id): - to_remove.append(container_id) - + + # Find containers that are no longer running + for port_str, browser_info in self.port_map.items(): + if browser_info.get("browser_type") == "docker" and "container_id" in browser_info: + container_id = browser_info["container_id"] + if not docker_utils.is_container_running(container_id): + to_remove.append(container_id) + + # Remove stale containers for container_id in to_remove: self.unregister_container(container_id) \ No newline at end of file diff --git a/crawl4ai/browser/docker_utils.py b/crawl4ai/browser/docker_utils.py index 0597c2d5..7ba48534 100644 --- a/crawl4ai/browser/docker_utils.py +++ b/crawl4ai/browser/docker_utils.py @@ -8,13 +8,14 @@ import socket import 
subprocess from typing import Dict, List, Optional, Tuple, Union + class DockerUtils: """Utility class for Docker operations in browser automation. - + This class provides methods for managing Docker images, containers, and related operations needed for browser automation. It handles image building, container lifecycle, port management, and registry operations. - + Attributes: DOCKER_FOLDER (str): Path to folder containing Docker files DOCKER_CONNECT_FILE (str): Path to Dockerfile for connect mode @@ -24,38 +25,38 @@ class DockerUtils: DEFAULT_LAUNCH_IMAGE (str): Default image name for launch mode logger: Optional logger instance """ - + # File paths for Docker resources DOCKER_FOLDER = os.path.join(os.path.dirname(__file__), "docker") DOCKER_CONNECT_FILE = os.path.join(DOCKER_FOLDER, "connect.Dockerfile") DOCKER_LAUNCH_FILE = os.path.join(DOCKER_FOLDER, "launch.Dockerfile") DOCKER_START_SCRIPT = os.path.join(DOCKER_FOLDER, "start.sh") - + # Default image names DEFAULT_CONNECT_IMAGE = "crawl4ai/browser-connect:latest" DEFAULT_LAUNCH_IMAGE = "crawl4ai/browser-launch:latest" - + def __init__(self, logger=None): """Initialize Docker utilities. - + Args: logger: Optional logger for recording operations """ self.logger = logger - + # Image Management Methods - + async def check_image_exists(self, image_name: str) -> bool: """Check if a Docker image exists. 
- + Args: image_name: Name of the Docker image to check - + Returns: bool: True if the image exists, False otherwise """ cmd = ["docker", "image", "inspect", image_name] - + try: process = await asyncio.create_subprocess_exec( *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE @@ -64,18 +65,24 @@ class DockerUtils: return process.returncode == 0 except Exception as e: if self.logger: - self.logger.debug(f"Error checking if image exists: {str(e)}", tag="DOCKER") + self.logger.debug( + f"Error checking if image exists: {str(e)}", tag="DOCKER" + ) return False - - async def build_docker_image(self, image_name: str, dockerfile_path: str, - files_to_copy: Dict[str, str] = None) -> bool: + + async def build_docker_image( + self, + image_name: str, + dockerfile_path: str, + files_to_copy: Dict[str, str] = None, + ) -> bool: """Build a Docker image from a Dockerfile. - + Args: image_name: Name to give the built image dockerfile_path: Path to the Dockerfile files_to_copy: Dict of {dest_name: source_path} for files to copy to build context - + Returns: bool: True if image was built successfully, False otherwise """ @@ -83,103 +90,119 @@ class DockerUtils: with tempfile.TemporaryDirectory() as temp_dir: # Copy the Dockerfile shutil.copy(dockerfile_path, os.path.join(temp_dir, "Dockerfile")) - + # Copy any additional files needed if files_to_copy: for dest_name, source_path in files_to_copy.items(): shutil.copy(source_path, os.path.join(temp_dir, dest_name)) - + # Build the image - cmd = [ - "docker", "build", - "-t", image_name, - temp_dir - ] - + cmd = ["docker", "build", "-t", image_name, temp_dir] + if self.logger: - self.logger.debug(f"Building Docker image with command: {' '.join(cmd)}", tag="DOCKER") - + self.logger.debug( + f"Building Docker image with command: {' '.join(cmd)}", tag="DOCKER" + ) + process = await asyncio.create_subprocess_exec( *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE ) stdout, stderr = await 
process.communicate() - + if process.returncode != 0: if self.logger: self.logger.error( message="Failed to build Docker image: {error}", tag="DOCKER", - params={"error": stderr.decode()} + params={"error": stderr.decode()}, ) return False - + if self.logger: - self.logger.success(f"Successfully built Docker image: {image_name}", tag="DOCKER") + self.logger.success( + f"Successfully built Docker image: {image_name}", tag="DOCKER" + ) return True - - async def ensure_docker_image_exists(self, image_name: str, mode: str = "connect") -> str: + + async def ensure_docker_image_exists( + self, image_name: str, mode: str = "connect" + ) -> str: """Ensure the required Docker image exists, creating it if necessary. - + Args: image_name: Name of the Docker image mode: Either "connect" or "launch" to determine which image to build - + Returns: str: Name of the available Docker image - + Raises: Exception: If image doesn't exist and can't be built """ # If image name is not specified, use default based on mode if not image_name: - image_name = self.DEFAULT_CONNECT_IMAGE if mode == "connect" else self.DEFAULT_LAUNCH_IMAGE - + image_name = ( + self.DEFAULT_CONNECT_IMAGE + if mode == "connect" + else self.DEFAULT_LAUNCH_IMAGE + ) + # Check if the image already exists if await self.check_image_exists(image_name): if self.logger: - self.logger.debug(f"Docker image {image_name} already exists", tag="DOCKER") + self.logger.debug( + f"Docker image {image_name} already exists", tag="DOCKER" + ) return image_name - + # If we're using a custom image that doesn't exist, warn and fail - if (image_name != self.DEFAULT_CONNECT_IMAGE and image_name != self.DEFAULT_LAUNCH_IMAGE): + if ( + image_name != self.DEFAULT_CONNECT_IMAGE + and image_name != self.DEFAULT_LAUNCH_IMAGE + ): if self.logger: self.logger.warning( f"Custom Docker image {image_name} not found and cannot be automatically created", - tag="DOCKER" + tag="DOCKER", ) raise Exception(f"Docker image {image_name} not found") - + # 
Build the appropriate default image if self.logger: - self.logger.info(f"Docker image {image_name} not found, creating it now...", tag="DOCKER") - + self.logger.info( + f"Docker image {image_name} not found, creating it now...", tag="DOCKER" + ) + if mode == "connect": success = await self.build_docker_image( - image_name, - self.DOCKER_CONNECT_FILE, - {"start.sh": self.DOCKER_START_SCRIPT} + image_name, + self.DOCKER_CONNECT_FILE, + {"start.sh": self.DOCKER_START_SCRIPT}, ) else: - success = await self.build_docker_image( - image_name, - self.DOCKER_LAUNCH_FILE - ) - + success = await self.build_docker_image(image_name, self.DOCKER_LAUNCH_FILE) + if not success: raise Exception(f"Failed to create Docker image {image_name}") - + return image_name - + # Container Management Methods - - async def create_container(self, image_name: str, host_port: int, - container_name: Optional[str] = None, - volumes: List[str] = None, - network: Optional[str] = None, - env_vars: Dict[str, str] = None, - extra_args: List[str] = None) -> Optional[str]: + + async def create_container( + self, + image_name: str, + host_port: int, + container_name: Optional[str] = None, + volumes: List[str] = None, + network: Optional[str] = None, + env_vars: Dict[str, str] = None, + cpu_limit: float = 1.0, + memory_limit: str = "1.5g", + extra_args: List[str] = None, + ) -> Optional[str]: """Create a new Docker container. 
- + Args: image_name: Docker image to use host_port: Port on host to map to container port 9223 @@ -187,111 +210,134 @@ class DockerUtils: volumes: List of volume mappings (e.g., ["host_path:container_path"]) network: Optional Docker network to use env_vars: Dictionary of environment variables + cpu_limit: CPU limit for the container + memory_limit: Memory limit for the container extra_args: Additional docker run arguments - + Returns: str: Container ID if successful, None otherwise """ # Prepare container command cmd = [ - "docker", "run", + "docker", + "run", "--detach", ] - + # Add container name if specified if container_name: cmd.extend(["--name", container_name]) - + # Add port mapping cmd.extend(["-p", f"{host_port}:9223"]) - + # Add volumes if volumes: for volume in volumes: cmd.extend(["-v", volume]) - + # Add network if specified if network: cmd.extend(["--network", network]) - + # Add environment variables if env_vars: for key, value in env_vars.items(): cmd.extend(["-e", f"{key}={value}"]) - + + # Add CPU and memory limits + if cpu_limit: + cmd.extend(["--cpus", str(cpu_limit)]) + if memory_limit: + cmd.extend(["--memory", memory_limit]) + cmd.extend(["--memory-swap", memory_limit]) + if self.logger: + self.logger.debug( + f"Setting CPU limit: {cpu_limit}, Memory limit: {memory_limit}", + tag="DOCKER", + ) + # Add extra args if extra_args: cmd.extend(extra_args) - + # Add image cmd.append(image_name) - + if self.logger: - self.logger.debug(f"Creating Docker container with command: {' '.join(cmd)}", tag="DOCKER") - + self.logger.debug( + f"Creating Docker container with command: {' '.join(cmd)}", tag="DOCKER" + ) + # Run docker command try: process = await asyncio.create_subprocess_exec( *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE ) stdout, stderr = await process.communicate() - + if process.returncode != 0: if self.logger: self.logger.error( message="Failed to create Docker container: {error}", tag="DOCKER", - params={"error": 
stderr.decode()} + params={"error": stderr.decode()}, ) return None - + # Get container ID container_id = stdout.decode().strip() - + if self.logger: - self.logger.success(f"Created Docker container: {container_id[:12]}", tag="DOCKER") - + self.logger.success( + f"Created Docker container: {container_id[:12]}", tag="DOCKER" + ) + return container_id - + except Exception as e: if self.logger: self.logger.error( message="Error creating Docker container: {error}", tag="DOCKER", - params={"error": str(e)} + params={"error": str(e)}, ) return None - + async def is_container_running(self, container_id: str) -> bool: """Check if a container is running. - + Args: container_id: ID of the container to check - + Returns: bool: True if the container is running, False otherwise """ cmd = ["docker", "inspect", "--format", "{{.State.Running}}", container_id] - + try: process = await asyncio.create_subprocess_exec( *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE ) stdout, _ = await process.communicate() - + return process.returncode == 0 and stdout.decode().strip() == "true" except Exception as e: if self.logger: - self.logger.debug(f"Error checking if container is running: {str(e)}", tag="DOCKER") + self.logger.debug( + f"Error checking if container is running: {str(e)}", tag="DOCKER" + ) return False - - async def wait_for_container_ready(self, container_id: str, timeout: int = 30) -> bool: + + async def wait_for_container_ready( + self, container_id: str, timeout: int = 30 + ) -> bool: """Wait for the container to be in running state. 
- + Args: container_id: ID of the container to wait for timeout: Maximum time to wait in seconds - + Returns: bool: True if container is ready, False if timeout occurred """ @@ -299,46 +345,51 @@ class DockerUtils: if await self.is_container_running(container_id): return True await asyncio.sleep(1) - + if self.logger: - self.logger.warning(f"Container {container_id[:12]} not ready after {timeout}s timeout", tag="DOCKER") + self.logger.warning( + f"Container {container_id[:12]} not ready after {timeout}s timeout", + tag="DOCKER", + ) return False - + async def stop_container(self, container_id: str) -> bool: """Stop a Docker container. - + Args: container_id: ID of the container to stop - + Returns: bool: True if stopped successfully, False otherwise """ cmd = ["docker", "stop", container_id] - + try: process = await asyncio.create_subprocess_exec(*cmd) await process.communicate() - + if self.logger: - self.logger.debug(f"Stopped container: {container_id[:12]}", tag="DOCKER") - + self.logger.debug( + f"Stopped container: {container_id[:12]}", tag="DOCKER" + ) + return process.returncode == 0 except Exception as e: if self.logger: self.logger.warning( message="Failed to stop container: {error}", tag="DOCKER", - params={"error": str(e)} + params={"error": str(e)}, ) return False - + async def remove_container(self, container_id: str, force: bool = True) -> bool: """Remove a Docker container. 
- + Args: container_id: ID of the container to remove force: Whether to force removal - + Returns: bool: True if removed successfully, False otherwise """ @@ -346,35 +397,38 @@ class DockerUtils: if force: cmd.append("-f") cmd.append(container_id) - + try: process = await asyncio.create_subprocess_exec(*cmd) await process.communicate() - + if self.logger: - self.logger.debug(f"Removed container: {container_id[:12]}", tag="DOCKER") - + self.logger.debug( + f"Removed container: {container_id[:12]}", tag="DOCKER" + ) + return process.returncode == 0 except Exception as e: if self.logger: self.logger.warning( message="Failed to remove container: {error}", tag="DOCKER", - params={"error": str(e)} + params={"error": str(e)}, ) return False - + # Container Command Execution Methods - - async def exec_in_container(self, container_id: str, command: List[str], - detach: bool = False) -> Tuple[int, str, str]: + + async def exec_in_container( + self, container_id: str, command: List[str], detach: bool = False + ) -> Tuple[int, str, str]: """Execute a command in a running container. - + Args: container_id: ID of the container command: Command to execute as a list of strings detach: Whether to run the command in detached mode - + Returns: Tuple of (return_code, stdout, stderr) """ @@ -383,181 +437,206 @@ class DockerUtils: cmd.append("-d") cmd.append(container_id) cmd.extend(command) - + try: process = await asyncio.create_subprocess_exec( *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE ) stdout, stderr = await process.communicate() - + return process.returncode, stdout.decode(), stderr.decode() except Exception as e: if self.logger: self.logger.error( message="Error executing command in container: {error}", tag="DOCKER", - params={"error": str(e)} + params={"error": str(e)}, ) return -1, "", str(e) - + async def start_socat_in_container(self, container_id: str) -> bool: """Start socat in the container to map port 9222 to 9223. 
- + Args: container_id: ID of the container - + Returns: bool: True if socat started successfully, False otherwise """ # Command to run socat as a background process cmd = ["socat", "TCP-LISTEN:9223,fork", "TCP:localhost:9222"] - - returncode, _, stderr = await self.exec_in_container(container_id, cmd, detach=True) - + + returncode, _, stderr = await self.exec_in_container( + container_id, cmd, detach=True + ) + if returncode != 0: if self.logger: self.logger.error( message="Failed to start socat in container: {error}", tag="DOCKER", - params={"error": stderr} + params={"error": stderr}, ) return False - + if self.logger: - self.logger.debug(f"Started socat in container: {container_id[:12]}", tag="DOCKER") - + self.logger.debug( + f"Started socat in container: {container_id[:12]}", tag="DOCKER" + ) + # Wait a moment for socat to start await asyncio.sleep(1) return True - - async def launch_chrome_in_container(self, container_id: str, browser_args: List[str]) -> bool: + + async def launch_chrome_in_container( + self, container_id: str, browser_args: List[str] + ) -> bool: """Launch Chrome inside the container with specified arguments. 
- + Args: container_id: ID of the container browser_args: Chrome command line arguments - + Returns: bool: True if Chrome started successfully, False otherwise """ # Build Chrome command chrome_cmd = ["google-chrome"] chrome_cmd.extend(browser_args) - - returncode, _, stderr = await self.exec_in_container(container_id, chrome_cmd, detach=True) - + + returncode, _, stderr = await self.exec_in_container( + container_id, chrome_cmd, detach=True + ) + if returncode != 0: if self.logger: self.logger.error( message="Failed to launch Chrome in container: {error}", tag="DOCKER", - params={"error": stderr} + params={"error": stderr}, ) return False - + if self.logger: - self.logger.debug(f"Launched Chrome in container: {container_id[:12]}", tag="DOCKER") - + self.logger.debug( + f"Launched Chrome in container: {container_id[:12]}", tag="DOCKER" + ) + return True - - async def get_process_id_in_container(self, container_id: str, process_name: str) -> Optional[int]: + + async def get_process_id_in_container( + self, container_id: str, process_name: str + ) -> Optional[int]: """Get the process ID for a process in the container. - + Args: container_id: ID of the container process_name: Name pattern to search for - + Returns: int: Process ID if found, None otherwise """ cmd = ["pgrep", "-f", process_name] - + returncode, stdout, _ = await self.exec_in_container(container_id, cmd) - + if returncode == 0 and stdout.strip(): pid = int(stdout.strip().split("\n")[0]) return pid - + return None - + async def stop_process_in_container(self, container_id: str, pid: int) -> bool: """Stop a process in the container by PID. 
- + Args: container_id: ID of the container pid: Process ID to stop - + Returns: bool: True if process was stopped, False otherwise """ cmd = ["kill", "-TERM", str(pid)] - + returncode, _, stderr = await self.exec_in_container(container_id, cmd) - + if returncode != 0: if self.logger: self.logger.warning( message="Failed to stop process in container: {error}", tag="DOCKER", - params={"error": stderr} + params={"error": stderr}, ) return False - + if self.logger: - self.logger.debug(f"Stopped process {pid} in container: {container_id[:12]}", tag="DOCKER") - + self.logger.debug( + f"Stopped process {pid} in container: {container_id[:12]}", tag="DOCKER" + ) + return True - + # Network and Port Methods - - async def wait_for_cdp_ready(self, host_port: int, timeout: int = 30) -> bool: + + async def wait_for_cdp_ready(self, host_port: int, timeout: int = 10) -> dict: """Wait for the CDP endpoint to be ready. - + Args: host_port: Port to check for CDP endpoint timeout: Maximum time to wait in seconds - + Returns: - bool: True if CDP endpoint is ready, False if timeout occurred + dict: CDP JSON config if ready, None if timeout occurred """ import aiohttp - + url = f"http://localhost:{host_port}/json/version" - + for _ in range(timeout): try: async with aiohttp.ClientSession() as session: async with session.get(url, timeout=1) as response: if response.status == 200: if self.logger: - self.logger.debug(f"CDP endpoint ready on port {host_port}", tag="DOCKER") - return True + self.logger.debug( + f"CDP endpoint ready on port {host_port}", + tag="DOCKER", + ) + cdp_json_config = await response.json() + if self.logger: + self.logger.debug( + f"CDP JSON config: {cdp_json_config}", tag="DOCKER" + ) + return cdp_json_config except Exception: pass await asyncio.sleep(1) - + if self.logger: - self.logger.warning(f"CDP endpoint not ready on port {host_port} after {timeout}s timeout", tag="DOCKER") - return False - + self.logger.warning( + f"CDP endpoint not ready on port {host_port} 
after {timeout}s timeout", + tag="DOCKER", + ) + return None + def is_port_in_use(self, port: int) -> bool: """Check if a port is already in use on the host. - + Args: port: Port number to check - + Returns: bool: True if port is in use, False otherwise """ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - return s.connect_ex(('localhost', port)) == 0 - + return s.connect_ex(("localhost", port)) == 0 + def get_next_available_port(self, start_port: int = 9223) -> int: """Get the next available port starting from a given port. - + Args: start_port: Port number to start checking from - + Returns: int: First available port number """ @@ -565,18 +644,18 @@ class DockerUtils: while self.is_port_in_use(port): port += 1 return port - + # Configuration Hash Methods - + def generate_config_hash(self, config_dict: Dict) -> str: """Generate a hash of the configuration for container matching. - + Args: config_dict: Dictionary of configuration parameters - + Returns: str: Hash string uniquely identifying this configuration """ # Convert to canonical JSON string and hash config_json = json.dumps(config_dict, sort_keys=True) - return hashlib.sha256(config_json.encode()).hexdigest() \ No newline at end of file + return hashlib.sha256(config_json.encode()).hexdigest() diff --git a/crawl4ai/browser/manager.py b/crawl4ai/browser/manager.py index 31411844..3cb68021 100644 --- a/crawl4ai/browser/manager.py +++ b/crawl4ai/browser/manager.py @@ -18,15 +18,10 @@ from .strategies import ( BaseBrowserStrategy, PlaywrightBrowserStrategy, CDPBrowserStrategy, - BuiltinBrowserStrategy + BuiltinBrowserStrategy, + DockerBrowserStrategy ) -# Import DockerBrowserStrategy if available -try: - from .docker_strategy import DockerBrowserStrategy -except ImportError: - DockerBrowserStrategy = None - class BrowserManager: """Main interface for browser management in Crawl4AI. 
diff --git a/crawl4ai/browser/models.py b/crawl4ai/browser/models.py index e69de29b..e2ac2b3f 100644 --- a/crawl4ai/browser/models.py +++ b/crawl4ai/browser/models.py @@ -0,0 +1,143 @@ +"""Docker configuration module for Crawl4AI browser automation. + +This module provides configuration classes for Docker-based browser automation, +allowing flexible configuration of Docker containers for browsing. +""" + +from typing import Dict, List, Optional + + +class DockerConfig: + """Configuration for Docker-based browser automation. + + This class contains Docker-specific settings to avoid cluttering BrowserConfig. + + Attributes: + mode (str): Docker operation mode - "connect" or "launch". + - "connect": Uses a container with Chrome already running + - "launch": Dynamically configures and starts Chrome in container + image (str): Docker image to use. If None, defaults from DockerUtils are used. + registry_file (str): Path to container registry file for persistence. + persistent (bool): Keep container running after browser closes. + remove_on_exit (bool): Remove container on exit when not persistent. + network (str): Docker network to use. + volumes (List[str]): Volume mappings (e.g., ["host_path:container_path"]). + env_vars (Dict[str, str]): Environment variables to set in container. + extra_args (List[str]): Additional docker run arguments. + host_port (int): Host port to map to container's 9223 port. + user_data_dir (str): Path to user data directory on host. + container_user_data_dir (str): Path to user data directory in container. 
+ """ + + def __init__( + self, + mode: str = "connect", # "connect" or "launch" + image: Optional[str] = None, # Docker image to use + registry_file: Optional[str] = None, # Path to registry file + persistent: bool = False, # Keep container running after browser closes + remove_on_exit: bool = True, # Remove container on exit when not persistent + network: Optional[str] = None, # Docker network to use + volumes: List[str] = None, # Volume mappings + cpu_limit: float = 1.0, # CPU limit for the container + memory_limit: str = "1.5g", # Memory limit for the container + env_vars: Dict[str, str] = None, # Environment variables + host_port: Optional[int] = None, # Host port to map to container's 9223 + user_data_dir: Optional[str] = None, # Path to user data directory on host + container_user_data_dir: str = "/data", # Path to user data directory in container + extra_args: List[str] = None, # Additional docker run arguments + ): + """Initialize Docker configuration. + + Args: + mode: Docker operation mode ("connect" or "launch") + image: Docker image to use + registry_file: Path to container registry file + persistent: Whether to keep container running after browser closes + remove_on_exit: Whether to remove container on exit when not persistent + network: Docker network to use + volumes: Volume mappings as list of strings + cpu_limit: CPU limit for the container + memory_limit: Memory limit for the container + env_vars: Environment variables as dictionary + extra_args: Additional docker run arguments + host_port: Host port to map to container's 9223 + user_data_dir: Path to user data directory on host + container_user_data_dir: Path to user data directory in container + """ + self.mode = mode + self.image = image # If None, defaults will be used from DockerUtils + self.registry_file = registry_file + self.persistent = persistent + self.remove_on_exit = remove_on_exit + self.network = network + self.volumes = volumes or [] + self.cpu_limit = cpu_limit + 
self.memory_limit = memory_limit + self.env_vars = env_vars or {} + self.extra_args = extra_args or [] + self.host_port = host_port + self.user_data_dir = user_data_dir + self.container_user_data_dir = container_user_data_dir + + def to_dict(self) -> Dict: + """Convert this configuration to a dictionary. + + Returns: + Dictionary representation of this configuration + """ + return { + "mode": self.mode, + "image": self.image, + "registry_file": self.registry_file, + "persistent": self.persistent, + "remove_on_exit": self.remove_on_exit, + "network": self.network, + "volumes": self.volumes, + "cpu_limit": self.cpu_limit, + "memory_limit": self.memory_limit, + "env_vars": self.env_vars, + "extra_args": self.extra_args, + "host_port": self.host_port, + "user_data_dir": self.user_data_dir, + "container_user_data_dir": self.container_user_data_dir + } + + @staticmethod + def from_kwargs(kwargs: Dict) -> "DockerConfig": + """Create a DockerConfig from a dictionary of keyword arguments. + + Args: + kwargs: Dictionary of configuration options + + Returns: + New DockerConfig instance + """ + return DockerConfig( + mode=kwargs.get("mode", "connect"), + image=kwargs.get("image"), + registry_file=kwargs.get("registry_file"), + persistent=kwargs.get("persistent", False), + remove_on_exit=kwargs.get("remove_on_exit", True), + network=kwargs.get("network"), + volumes=kwargs.get("volumes"), + cpu_limit=kwargs.get("cpu_limit", 1.0), + memory_limit=kwargs.get("memory_limit", "1.5g"), + env_vars=kwargs.get("env_vars"), + extra_args=kwargs.get("extra_args"), + host_port=kwargs.get("host_port"), + user_data_dir=kwargs.get("user_data_dir"), + container_user_data_dir=kwargs.get("container_user_data_dir", "/data") + ) + + def clone(self, **kwargs) -> "DockerConfig": + """Create a copy of this configuration with updated values. 
+ + Args: + **kwargs: Key-value pairs of configuration options to update + + Returns: + DockerConfig: A new instance with the specified updates + """ + config_dict = self.to_dict() + config_dict.update(kwargs) + return DockerConfig.from_kwargs(config_dict) \ No newline at end of file diff --git a/crawl4ai/browser/strategies.py b/crawl4ai/browser/strategies.py deleted file mode 100644 index f2a9525e..00000000 --- a/crawl4ai/browser/strategies.py +++ /dev/null @@ -1,1256 +0,0 @@ -"""Browser strategies module for Crawl4AI. - -This module implements the browser strategy pattern for different -browser implementations, including Playwright, CDP, and builtin browsers. -""" - -from abc import ABC, abstractmethod -import asyncio -import os -import time -import json -import hashlib -import subprocess -import shutil -import signal -from typing import Optional, Dict, Tuple, List, Any - -from playwright.async_api import BrowserContext, Page, ProxySettings - -from ..async_logger import AsyncLogger -from ..async_configs import BrowserConfig, CrawlerRunConfig -from ..config import DOWNLOAD_PAGE_TIMEOUT -from ..js_snippet import load_js_script -from ..utils import get_home_folder -from .utils import get_playwright, get_browser_executable, get_browser_disable_options, create_temp_directory, is_windows, is_browser_running - -from playwright_stealth import StealthConfig - -stealth_config = StealthConfig( - webdriver=True, - chrome_app=True, - chrome_csi=True, - chrome_load_times=True, - chrome_runtime=True, - navigator_languages=True, - navigator_plugins=True, - navigator_permissions=True, - webgl_vendor=True, - outerdimensions=True, - navigator_hardware_concurrency=True, - media_codecs=True, -) - -class BaseBrowserStrategy(ABC): - """Base class for all browser strategies. - - This abstract class defines the interface that all browser strategies - must implement. It handles common functionality like context caching. 
- """ - - def __init__(self, config: BrowserConfig, logger: Optional[AsyncLogger] = None): - """Initialize the strategy with configuration and logger. - - Args: - config: Browser configuration - logger: Logger for recording events and errors - """ - self.config = config - self.logger = logger - self.browser = None - self.default_context = None - self.contexts_by_config = {} - self._contexts_lock = asyncio.Lock() - self.playwright = None - - @abstractmethod - async def start(self): - """Start the browser. - - Returns: - self: For method chaining - """ - pass - - @abstractmethod - async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: - """Get a page with specified configuration. - - Args: - crawlerRunConfig: Crawler run configuration - - Returns: - Tuple of (Page, BrowserContext) - """ - pass - - async def get_pages(self, crawlerRunConfig: CrawlerRunConfig, count: int = 1) -> List[Tuple[Page, BrowserContext]]: - """Get multiple pages with the same configuration. - - Args: - crawlerRunConfig: Configuration for the pages - count: Number of pages to create - - Returns: - List of (Page, Context) tuples - """ - pages = [] - for _ in range(count): - page, context = await self.get_page(crawlerRunConfig) - pages.append((page, context)) - return pages - - @abstractmethod - async def close(self): - """Close the browser and clean up resources.""" - pass - - def _make_config_signature(self, crawlerRunConfig: CrawlerRunConfig) -> str: - """Create a signature hash from configuration for context caching. 
- - Args: - crawlerRunConfig: Crawler run configuration - - Returns: - str: Unique hash for this configuration - """ - config_dict = crawlerRunConfig.__dict__.copy() - # Exclude items that do not affect browser-level setup - ephemeral_keys = [ - "session_id", - "js_code", - "scraping_strategy", - "extraction_strategy", - "chunking_strategy", - "cache_mode", - "content_filter", - "semaphore_count", - "url" - ] - for key in ephemeral_keys: - if key in config_dict: - del config_dict[key] - - # Convert to canonical JSON string - signature_json = json.dumps(config_dict, sort_keys=True, default=str) - - # Hash the JSON so we get a compact, unique string - signature_hash = hashlib.sha256(signature_json.encode("utf-8")).hexdigest() - return signature_hash - - async def create_browser_context(self, crawlerRunConfig: Optional[CrawlerRunConfig] = None) -> BrowserContext: - """Creates and returns a new browser context with configured settings. - - Args: - crawlerRunConfig: Configuration object for the crawler run - - Returns: - BrowserContext: Browser context object with the specified configurations - """ - if not self.browser: - raise ValueError("Browser must be initialized before creating context") - - # Base settings - user_agent = self.config.headers.get("User-Agent", self.config.user_agent) - viewport_settings = { - "width": self.config.viewport_width, - "height": self.config.viewport_height, - } - proxy_settings = {"server": self.config.proxy} if self.config.proxy else None - - # Define blocked extensions for resource optimization - blocked_extensions = [ - # Images - "jpg", "jpeg", "png", "gif", "webp", "svg", "ico", "bmp", "tiff", "psd", - # Fonts - "woff", "woff2", "ttf", "otf", "eot", - # Media - "mp4", "webm", "ogg", "avi", "mov", "wmv", "flv", "m4v", "mp3", "wav", "aac", - "m4a", "opus", "flac", - # Documents - "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx", - # Archives - "zip", "rar", "7z", "tar", "gz", - # Scripts and data - "xml", "swf", "wasm", - ] - - # 
Common context settings - context_settings = { - "user_agent": user_agent, - "viewport": viewport_settings, - "proxy": proxy_settings, - "accept_downloads": self.config.accept_downloads, - "ignore_https_errors": self.config.ignore_https_errors, - "device_scale_factor": 1.0, - "java_script_enabled": self.config.java_script_enabled, - } - - # Apply text mode settings if enabled - if self.config.text_mode: - text_mode_settings = { - "has_touch": False, - "is_mobile": False, - # Disable javascript in text mode - "java_script_enabled": False - } - # Update context settings with text mode settings - context_settings.update(text_mode_settings) - if self.logger: - self.logger.debug("Text mode enabled for browser context", tag="BROWSER") - - # Handle storage state properly - this is key for persistence - if self.config.storage_state: - context_settings["storage_state"] = self.config.storage_state - if self.logger: - if isinstance(self.config.storage_state, str): - self.logger.debug(f"Using storage state from file: {self.config.storage_state}", tag="BROWSER") - else: - self.logger.debug("Using storage state from config object", tag="BROWSER") - - # If user_data_dir is specified, browser persistence should be automatic - if self.config.user_data_dir and self.logger: - self.logger.debug(f"Using user data directory: {self.config.user_data_dir}", tag="BROWSER") - - # Apply crawler-specific configurations if provided - if crawlerRunConfig: - # Check if there is value for crawlerRunConfig.proxy_config set add that to context - if crawlerRunConfig.proxy_config: - proxy_settings = { - "server": crawlerRunConfig.proxy_config.server, - } - if crawlerRunConfig.proxy_config.username: - proxy_settings.update({ - "username": crawlerRunConfig.proxy_config.username, - "password": crawlerRunConfig.proxy_config.password, - }) - context_settings["proxy"] = proxy_settings - - # Create and return the context - try: - # Create the context with appropriate settings - context = await 
self.browser.new_context(**context_settings) - - # Apply text mode resource blocking if enabled - if self.config.text_mode: - # Create and apply route patterns for each extension - for ext in blocked_extensions: - await context.route(f"**/*.{ext}", lambda route: route.abort()) - - return context - except Exception as e: - if self.logger: - self.logger.error(f"Error creating browser context: {str(e)}", tag="BROWSER") - # Fallback to basic context creation if the advanced settings fail - return await self.browser.new_context() - - async def setup_context(self, context: BrowserContext, crawlerRunConfig: Optional[CrawlerRunConfig] = None): - """Set up a browser context with the configured options. - - Args: - context: The browser context to set up - crawlerRunConfig: Configuration object containing all browser settings - """ - if self.config.headers: - await context.set_extra_http_headers(self.config.headers) - - if self.config.cookies: - await context.add_cookies(self.config.cookies) - - if self.config.accept_downloads: - context.set_default_timeout(DOWNLOAD_PAGE_TIMEOUT) - context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT) - if self.config.downloads_path: - context._impl_obj._options["accept_downloads"] = True - context._impl_obj._options["downloads_path"] = self.config.downloads_path - - # Handle user agent and browser hints - if self.config.user_agent: - combined_headers = { - "User-Agent": self.config.user_agent, - "sec-ch-ua": self.config.browser_hint, - } - combined_headers.update(self.config.headers) - await context.set_extra_http_headers(combined_headers) - - # Add default cookie - await context.add_cookies( - [ - { - "name": "cookiesEnabled", - "value": "true", - "url": crawlerRunConfig and crawlerRunConfig.url or "https://crawl4ai.com/", - } - ] - ) - - # Handle navigator overrides - if crawlerRunConfig: - if ( - crawlerRunConfig.override_navigator - or crawlerRunConfig.simulate_user - or crawlerRunConfig.magic - ): - await 
context.add_init_script(load_js_script("navigator_overrider")) - -class PlaywrightBrowserStrategy(BaseBrowserStrategy): - """Standard Playwright browser strategy. - - This strategy launches a new browser instance using Playwright - and manages browser contexts. - """ - - def __init__(self, config: BrowserConfig, logger: Optional[AsyncLogger] = None): - """Initialize the Playwright browser strategy. - - Args: - config: Browser configuration - logger: Logger for recording events and errors - """ - super().__init__(config, logger) - # Add session management - self.sessions = {} - self.session_ttl = 1800 # 30 minutes - - async def start(self): - """Start the browser instance. - - Returns: - self: For method chaining - """ - self.playwright = await get_playwright() - browser_args = self._build_browser_args() - - # Launch appropriate browser type - if self.config.browser_type == "firefox": - self.browser = await self.playwright.firefox.launch(**browser_args) - elif self.config.browser_type == "webkit": - self.browser = await self.playwright.webkit.launch(**browser_args) - else: - self.browser = await self.playwright.chromium.launch(**browser_args) - - self.default_context = self.browser - return self - - def _build_browser_args(self) -> dict: - """Build browser launch arguments from config. 
- - Returns: - dict: Browser launch arguments - """ - args = [ - "--disable-gpu", - "--disable-gpu-compositing", - "--disable-software-rasterizer", - "--no-sandbox", - "--disable-dev-shm-usage", - "--no-first-run", - "--no-default-browser-check", - "--disable-infobars", - "--window-position=0,0", - "--ignore-certificate-errors", - "--ignore-certificate-errors-spki-list", - "--disable-blink-features=AutomationControlled", - "--window-position=400,0", - "--disable-renderer-backgrounding", - "--disable-ipc-flooding-protection", - "--force-color-profile=srgb", - "--mute-audio", - "--disable-background-timer-throttling", - f"--window-size={self.config.viewport_width},{self.config.viewport_height}", - ] - - if self.config.light_mode: - args.extend(get_browser_disable_options()) - - if self.config.text_mode: - args.extend( - [ - "--blink-settings=imagesEnabled=false", - "--disable-remote-fonts", - "--disable-images", - "--disable-javascript", - "--disable-software-rasterizer", - "--disable-dev-shm-usage", - ] - ) - - if self.config.extra_args: - args.extend(self.config.extra_args) - - browser_args = {"headless": self.config.headless, "args": args} - - if self.config.chrome_channel: - browser_args["channel"] = self.config.chrome_channel - - if self.config.accept_downloads: - browser_args["downloads_path"] = self.config.downloads_path or os.path.join( - os.getcwd(), "downloads" - ) - os.makedirs(browser_args["downloads_path"], exist_ok=True) - - if self.config.proxy or self.config.proxy_config: - proxy_settings = ( - ProxySettings(server=self.config.proxy) - if self.config.proxy - else ProxySettings( - server=self.config.proxy_config.server, - username=self.config.proxy_config.username, - password=self.config.proxy_config.password, - ) - ) - browser_args["proxy"] = proxy_settings - - return browser_args - - async def create_browser_context(self, crawlerRunConfig: Optional[CrawlerRunConfig] = None) -> BrowserContext: - """Creates and returns a new browser context with 
configured settings. - - This implementation extends the base class version to handle user_data_dir specifically. - - Args: - crawlerRunConfig: Configuration object for the crawler run - - Returns: - BrowserContext: Browser context object with the specified configurations - """ - # Handle user_data_dir explicitly to ensure storage persistence - if self.config.user_data_dir: - # Create a storage state file path if none exists - storage_path = os.path.join(self.config.user_data_dir, "Default", "storage_state.json") - - # Create the file if it doesn't exist - if not os.path.exists(storage_path): - os.makedirs(os.path.dirname(storage_path), exist_ok=True) - with open(storage_path, "w") as f: - json.dump({}, f) - - # Override storage_state with our specific path - self.config.storage_state = storage_path - if self.logger: - self.logger.debug(f"Using persistent storage state at: {storage_path}", tag="BROWSER") - - # Now call the base class implementation which handles everything else - return await super().create_browser_context(crawlerRunConfig) - - def _cleanup_expired_sessions(self): - """Clean up expired sessions based on TTL.""" - current_time = time.time() - expired_sessions = [ - sid - for sid, (_, _, last_used) in self.sessions.items() - if current_time - last_used > self.session_ttl - ] - for sid in expired_sessions: - asyncio.create_task(self._kill_session(sid)) - - async def _kill_session(self, session_id: str): - """Kill a browser session and clean up resources. - - Args: - session_id: The session ID to kill - """ - if session_id in self.sessions: - context, page, _ = self.sessions[session_id] - await page.close() - del self.sessions[session_id] - - async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: - """Get a page for the given configuration. 
- - Args: - crawlerRunConfig: Configuration object for the crawler run - - Returns: - Tuple of (Page, BrowserContext) - """ - # Clean up expired sessions first - self._cleanup_expired_sessions() - - # If a session_id is provided and we already have it, reuse that page + context - if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions: - context, page, _ = self.sessions[crawlerRunConfig.session_id] - # Update last-used timestamp - self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) - return page, context - - # Otherwise, check if we have an existing context for this config - config_signature = self._make_config_signature(crawlerRunConfig) - - async with self._contexts_lock: - if config_signature in self.contexts_by_config: - context = self.contexts_by_config[config_signature] - else: - # Create and setup a new context - context = await self.create_browser_context(crawlerRunConfig) - await self.setup_context(context, crawlerRunConfig) - self.contexts_by_config[config_signature] = context - - # Create a new page from the chosen context - page = await context.new_page() - - # If a session_id is specified, store this session so we can reuse later - if crawlerRunConfig.session_id: - self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) - - return page, context - - async def close(self): - """Close the browser and clean up resources.""" - if self.config.sleep_on_close: - await asyncio.sleep(0.5) - - # If we have a user_data_dir configured, ensure persistence of storage state - if self.config.user_data_dir and self.browser and self.default_context: - for context in self.browser.contexts: - try: - await context.storage_state(path=os.path.join(self.config.user_data_dir, "Default", "storage_state.json")) - if self.logger: - self.logger.debug("Ensuring storage state is persisted before closing browser", tag="BROWSER") - except Exception as e: - if self.logger: - self.logger.warning( - message="Failed to ensure 
storage persistence: {error}", - tag="BROWSER", - params={"error": str(e)} - ) - - # Close all sessions - session_ids = list(self.sessions.keys()) - for session_id in session_ids: - await self._kill_session(session_id) - - # Close all contexts we created - for ctx in self.contexts_by_config.values(): - try: - await ctx.close() - except Exception as e: - if self.logger: - self.logger.error( - message="Error closing context: {error}", - tag="ERROR", - params={"error": str(e)} - ) - self.contexts_by_config.clear() - - if self.browser: - await self.browser.close() - self.browser = None - - if self.playwright: - await self.playwright.stop() - self.playwright = None - -class CDPBrowserStrategy(BaseBrowserStrategy): - """CDP-based browser strategy. - - This strategy connects to an existing browser using CDP protocol or - launches and connects to a browser using CDP. - """ - - def __init__(self, config: BrowserConfig, logger: Optional[AsyncLogger] = None): - """Initialize the CDP browser strategy. - - Args: - config: Browser configuration - logger: Logger for recording events and errors - """ - super().__init__(config, logger) - self.sessions = {} - self.session_ttl = 1800 # 30 minutes - self.browser_process = None - self.temp_dir = None - self.shutting_down = False - - async def start(self): - """Start or connect to the browser using CDP. 
- - Returns: - self: For method chaining - """ - self.playwright = await get_playwright() - - # Get or create CDP URL - cdp_url = await self._get_or_create_cdp_url() - - # Connect to the browser using CDP - self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) - - # Get or create default context - contexts = self.browser.contexts - if contexts: - self.default_context = contexts[0] - else: - self.default_context = await self.create_browser_context() - - await self.setup_context(self.default_context) - return self - - async def _get_or_create_cdp_url(self) -> str: - """Get existing CDP URL or launch a browser and return its CDP URL. - - Returns: - str: CDP URL for connecting to the browser - """ - # If CDP URL is provided, just return it - if self.config.cdp_url: - return self.config.cdp_url - - # Create temp dir if needed - if not self.config.user_data_dir: - self.temp_dir = create_temp_directory() - user_data_dir = self.temp_dir - else: - user_data_dir = self.config.user_data_dir - - # Get browser args based on OS and browser type - args = await self._get_browser_args(user_data_dir) - - # Start browser process - try: - # Use DETACHED_PROCESS flag on Windows to fully detach the process - # On Unix, we'll use preexec_fn=os.setpgrp to start the process in a new process group - if is_windows(): - self.browser_process = subprocess.Popen( - args, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - creationflags=subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP - ) - else: - self.browser_process = subprocess.Popen( - args, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - preexec_fn=os.setpgrp # Start in a new process group - ) - - # Monitor for a short time to make sure it starts properly - await asyncio.sleep(0.5) # Give browser time to start - await self._initial_startup_check() - await asyncio.sleep(2) # Give browser more time to start - return f"http://localhost:{self.config.debugging_port}" - except Exception as e: - await 
self._cleanup_process() - raise Exception(f"Failed to start browser: {e}") - - async def _initial_startup_check(self): - """Perform a quick check to make sure the browser started successfully.""" - if not self.browser_process: - return - - # Check that process started without immediate termination - await asyncio.sleep(0.5) - if self.browser_process.poll() is not None: - # Process already terminated - stdout, stderr = b"", b"" - try: - stdout, stderr = self.browser_process.communicate(timeout=0.5) - except subprocess.TimeoutExpired: - pass - - if self.logger: - self.logger.error( - message="Browser process terminated during startup | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}", - tag="ERROR", - params={ - "code": self.browser_process.returncode, - "stdout": stdout.decode() if stdout else "", - "stderr": stderr.decode() if stderr else "", - }, - ) - - async def _get_browser_args(self, user_data_dir: str) -> List[str]: - """Returns browser-specific command line arguments. - - Args: - user_data_dir: Path to user data directory - - Returns: - List of command-line arguments for the browser - """ - browser_path = await get_browser_executable(self.config.browser_type) - base_args = [browser_path] - - if self.config.browser_type == "chromium": - args = [ - f"--remote-debugging-port={self.config.debugging_port}", - f"--user-data-dir={user_data_dir}", - ] - if self.config.headless: - args.append("--headless=new") - elif self.config.browser_type == "firefox": - args = [ - "--remote-debugging-port", - str(self.config.debugging_port), - "--profile", - user_data_dir, - ] - if self.config.headless: - args.append("--headless") - else: - raise NotImplementedError(f"Browser type {self.config.browser_type} not supported") - - return base_args + args - - async def _cleanup_process(self): - """Cleanup browser process and temporary directory.""" - # Set shutting_down flag BEFORE any termination actions - self.shutting_down = True - - if self.browser_process: - try: - # Only 
terminate if we have proper control over the process - if not self.browser_process.poll(): - # Process is still running - self.browser_process.terminate() - # Wait for process to end gracefully - for _ in range(10): # 10 attempts, 100ms each - if self.browser_process.poll() is not None: - break - await asyncio.sleep(0.1) - - # Force kill if still running - if self.browser_process.poll() is None: - if is_windows(): - # On Windows we might need taskkill for detached processes - try: - subprocess.run(["taskkill", "/F", "/PID", str(self.browser_process.pid)]) - except Exception: - self.browser_process.kill() - else: - self.browser_process.kill() - await asyncio.sleep(0.1) # Brief wait for kill to take effect - - except Exception as e: - if self.logger: - self.logger.error( - message="Error terminating browser: {error}", - tag="ERROR", - params={"error": str(e)}, - ) - - if self.temp_dir and os.path.exists(self.temp_dir): - try: - shutil.rmtree(self.temp_dir) - except Exception as e: - if self.logger: - self.logger.error( - message="Error removing temporary directory: {error}", - tag="ERROR", - params={"error": str(e)}, - ) - - async def create_browser_context(self, crawlerRunConfig: Optional[CrawlerRunConfig] = None) -> BrowserContext: - """Create a new browser context. - - Uses the base class implementation which handles all configurations. 
- - Args: - crawlerRunConfig: Configuration object for the crawler run - - Returns: - BrowserContext: Browser context object - """ - # Handle user_data_dir for CDP browsers - if self.config.user_data_dir: - # For CDP-based browsers, storage persistence is typically handled by the user_data_dir - # at the browser level, but we'll create a storage_state location for Playwright as well - storage_path = os.path.join(self.config.user_data_dir, "storage_state.json") - if not os.path.exists(storage_path): - # Create parent directory if it doesn't exist - os.makedirs(os.path.dirname(storage_path), exist_ok=True) - with open(storage_path, "w") as f: - json.dump({}, f) - self.config.storage_state = storage_path - - # Use the base class implementation - return await super().create_browser_context(crawlerRunConfig) - - def _cleanup_expired_sessions(self): - """Clean up expired sessions based on TTL.""" - current_time = time.time() - expired_sessions = [ - sid - for sid, (_, _, last_used) in self.sessions.items() - if current_time - last_used > self.session_ttl - ] - for sid in expired_sessions: - asyncio.create_task(self._kill_session(sid)) - - async def _kill_session(self, session_id: str): - """Kill a browser session and clean up resources. - - Args: - session_id: The session ID to kill - """ - if session_id in self.sessions: - context, page, _ = self.sessions[session_id] - await page.close() - del self.sessions[session_id] - - async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: - """Get a page for the given configuration. 
- - Args: - crawlerRunConfig: Configuration object for the crawler run - - Returns: - Tuple of (Page, BrowserContext) - """ - self._cleanup_expired_sessions() - - # If a session_id is provided and we already have it, reuse that page + context - if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions: - context, page, _ = self.sessions[crawlerRunConfig.session_id] - # Update last-used timestamp - self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) - return page, context - - # For CDP, we typically use the shared default_context - context = self.default_context - pages = context.pages - page = next((p for p in pages if p.url == crawlerRunConfig.url), None) - if not page: - page = await context.new_page() - - # If a session_id is specified, store this session so we can reuse later - if crawlerRunConfig.session_id: - self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) - - return page, context - - async def close(self): - """Close the browser and clean up resources.""" - # Skip cleanup if using external CDP URL and not launched by us - if self.config.cdp_url and not self.browser_process: - return - - if self.config.sleep_on_close: - await asyncio.sleep(0.5) - - # If we have a user_data_dir configured, ensure persistence of storage state - if self.config.user_data_dir and self.browser and self.default_context: - for context in self.browser.contexts: - try: - await context.storage_state(path=os.path.join(self.config.user_data_dir, "Default", "storage_state.json")) - if self.logger: - self.logger.debug("Ensuring storage state is persisted before closing browser", tag="BROWSER") - except Exception as e: - if self.logger: - self.logger.warning( - message="Failed to ensure storage persistence: {error}", - tag="BROWSER", - params={"error": str(e)} - ) - - # Close all sessions - session_ids = list(self.sessions.keys()) - for session_id in session_ids: - await self._kill_session(session_id) - - # Close browser - 
if self.browser: - await self.browser.close() - self.browser = None - - # Clean up managed browser if we created it - if self.browser_process: - await asyncio.sleep(0.5) - await self._cleanup_process() - self.browser_process = None - - # Close temporary directory - if self.temp_dir and os.path.exists(self.temp_dir): - try: - shutil.rmtree(self.temp_dir) - self.temp_dir = None - except Exception as e: - if self.logger: - self.logger.error( - message="Error removing temporary directory: {error}", - tag="ERROR", - params={"error": str(e)}, - ) - - # Stop playwright - if self.playwright: - await self.playwright.stop() - self.playwright = None - -class BuiltinBrowserStrategy(CDPBrowserStrategy): - """Built-in browser strategy. - - This strategy extends the CDP strategy to use the built-in browser. - """ - - def __init__(self, config: BrowserConfig, logger: Optional[AsyncLogger] = None): - """Initialize the built-in browser strategy. - - Args: - config: Browser configuration - logger: Logger for recording events and errors - """ - super().__init__(config, logger) - self.builtin_browser_dir = os.path.join(get_home_folder(), "builtin-browser") if not self.config.user_data_dir else self.config.user_data_dir - self.builtin_config_file = os.path.join(self.builtin_browser_dir, "browser_config.json") - - # Raise error if user data dir is already engaged - if self._check_user_dir_is_engaged(self.builtin_browser_dir): - raise Exception(f"User data directory {self.builtin_browser_dir} is already engaged by another browser instance.") - - os.makedirs(self.builtin_browser_dir, exist_ok=True) - - def _check_user_dir_is_engaged(self, user_data_dir: str) -> bool: - """Check if the user data directory is already in use. 
- - Returns: - bool: True if the directory is engaged, False otherwise - """ - # Load browser config file, then iterate in port_map values, check "user_data_dir" key if it matches - # the current user data directory - if os.path.exists(self.builtin_config_file): - try: - with open(self.builtin_config_file, 'r') as f: - browser_info_dict = json.load(f) - - # Check if user data dir is already engaged - for port_str, browser_info in browser_info_dict.get("port_map", {}).items(): - if browser_info.get("user_data_dir") == user_data_dir: - return True - except Exception as e: - if self.logger: - self.logger.error(f"Error reading built-in browser config: {str(e)}", tag="BUILTIN") - return False - - async def start(self): - """Start or connect to the built-in browser. - - Returns: - self: For method chaining - """ - # Check for existing built-in browser (get_browser_info already checks if running) - browser_info = self.get_browser_info() - if browser_info: - if self.logger: - self.logger.info(f"Using existing built-in browser at {browser_info.get('cdp_url')}", tag="BROWSER") - self.config.cdp_url = browser_info.get('cdp_url') - else: - if self.logger: - self.logger.info("Built-in browser not found, launching new instance...", tag="BROWSER") - cdp_url = await self.launch_builtin_browser( - browser_type=self.config.browser_type, - debugging_port=self.config.debugging_port, - headless=self.config.headless, - ) - if not cdp_url: - if self.logger: - self.logger.warning("Failed to launch built-in browser, falling back to regular CDP strategy", tag="BROWSER") - return await super().start() - self.config.cdp_url = cdp_url - - # Call parent class implementation with updated CDP URL - return await super().start() - - @classmethod - def get_builtin_browser_info(cls, debugging_port: int, config_file: str, logger: Optional[AsyncLogger] = None) -> Optional[Dict[str, Any]]: - """Get information about the built-in browser for a specific debugging port. 
- - Args: - debugging_port: The debugging port to look for - config_file: Path to the config file - logger: Optional logger for recording events - - Returns: - dict: Browser information or None if no running browser is configured for this port - """ - if not os.path.exists(config_file): - return None - - try: - with open(config_file, 'r') as f: - browser_info_dict = json.load(f) - - # Get browser info from port map - if isinstance(browser_info_dict, dict) and "port_map" in browser_info_dict: - port_str = str(debugging_port) - if port_str in browser_info_dict["port_map"]: - browser_info = browser_info_dict["port_map"][port_str] - - # Check if the browser is still running - if not is_browser_running(browser_info.get('pid')): - if logger: - logger.warning(f"Built-in browser on port {debugging_port} is not running", tag="BUILTIN") - # Remove this port from the dictionary - del browser_info_dict["port_map"][port_str] - with open(config_file, 'w') as f: - json.dump(browser_info_dict, f, indent=2) - return None - - return browser_info - - return None - - except Exception as e: - if logger: - logger.error(f"Error reading built-in browser config: {str(e)}", tag="BUILTIN") - return None - - def get_browser_info(self) -> Optional[Dict[str, Any]]: - """Get information about the current built-in browser instance. - - Returns: - dict: Browser information or None if no running browser is configured - """ - return self.get_builtin_browser_info( - debugging_port=self.config.debugging_port, - config_file=self.builtin_config_file, - logger=self.logger - ) - - - async def launch_builtin_browser(self, - browser_type: str = "chromium", - debugging_port: int = 9222, - headless: bool = True) -> Optional[str]: - """Launch a browser in the background for use as the built-in browser. 
- - Args: - browser_type: Type of browser to launch ('chromium' or 'firefox') - debugging_port: Port to use for CDP debugging - headless: Whether to run in headless mode - - Returns: - str: CDP URL for the browser, or None if launch failed - """ - # Check if there's an existing browser still running - browser_info = self.get_builtin_browser_info( - debugging_port=debugging_port, - config_file=self.builtin_config_file, - logger=self.logger - ) - if browser_info: - if self.logger: - self.logger.info(f"Built-in browser is already running on port {debugging_port}", tag="BUILTIN") - return browser_info.get('cdp_url') - - # Create a user data directory for the built-in browser - user_data_dir = os.path.join(self.builtin_browser_dir, "user_data") - # Raise error if user data dir is already engaged - if self._check_user_dir_is_engaged(user_data_dir): - raise Exception(f"User data directory {user_data_dir} is already engaged by another browser instance.") - - # Create the user data directory if it doesn't exist - os.makedirs(user_data_dir, exist_ok=True) - - # Prepare browser launch arguments - browser_path = await get_browser_executable(browser_type) - if browser_type == "chromium": - args = [ - browser_path, - f"--remote-debugging-port={debugging_port}", - f"--user-data-dir={user_data_dir}", - ] - if headless: - args.append("--headless=new") - elif browser_type == "firefox": - args = [ - browser_path, - "--remote-debugging-port", - str(debugging_port), - "--profile", - user_data_dir, - ] - if headless: - args.append("--headless") - else: - if self.logger: - self.logger.error(f"Browser type {browser_type} not supported for built-in browser", tag="BUILTIN") - return None - - try: - # Start the browser process detached - if is_windows(): - process = subprocess.Popen( - args, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - creationflags=subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP - ) - else: - process = subprocess.Popen( - args, - 
stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - preexec_fn=os.setpgrp # Start in a new process group - ) - - # Wait briefly to ensure the process starts successfully - await asyncio.sleep(2.0) - - # Check if the process is still running - if process.poll() is not None: - if self.logger: - self.logger.error(f"Browser process exited immediately with code {process.returncode}", tag="BUILTIN") - return None - - # Construct CDP URL - cdp_url = f"http://localhost:{debugging_port}" - - # Try to verify browser is responsive by fetching version info - import aiohttp - json_url = f"{cdp_url}/json/version" - config_json = None - - try: - async with aiohttp.ClientSession() as session: - for _ in range(10): # Try multiple times - try: - async with session.get(json_url) as response: - if response.status == 200: - config_json = await response.json() - break - except Exception: - pass - await asyncio.sleep(0.5) - except Exception as e: - if self.logger: - self.logger.warning(f"Could not verify browser: {str(e)}", tag="BUILTIN") - - # Create browser info - browser_info = { - 'pid': process.pid, - 'cdp_url': cdp_url, - 'user_data_dir': user_data_dir, - 'browser_type': browser_type, - 'debugging_port': debugging_port, - 'start_time': time.time(), - 'config': config_json - } - - # Read existing config file if it exists - port_map = {} - if os.path.exists(self.builtin_config_file): - try: - with open(self.builtin_config_file, 'r') as f: - existing_data = json.load(f) - - # Check if it already uses port mapping - if isinstance(existing_data, dict) and "port_map" in existing_data: - port_map = existing_data["port_map"] - # Convert legacy format to port mapping - elif isinstance(existing_data, dict) and "debugging_port" in existing_data: - old_port = str(existing_data.get("debugging_port")) - if self._is_browser_running(existing_data.get("pid")): - port_map[old_port] = existing_data - except Exception as e: - if self.logger: - self.logger.warning(f"Could not read existing config: 
{str(e)}", tag="BUILTIN") - - # Add/update this browser in the port map - port_map[str(debugging_port)] = browser_info - - # Write updated config - with open(self.builtin_config_file, 'w') as f: - json.dump({"port_map": port_map}, f, indent=2) - - # Detach from the browser process - don't keep any references - # This is important to allow the Python script to exit while the browser continues running - process = None - - if self.logger: - self.logger.success(f"Built-in browser launched at CDP URL: {cdp_url}", tag="BUILTIN") - return cdp_url - - except Exception as e: - if self.logger: - self.logger.error(f"Error launching built-in browser: {str(e)}", tag="BUILTIN") - return None - - async def kill_builtin_browser(self) -> bool: - """Kill the built-in browser if it's running. - - Returns: - bool: True if the browser was killed, False otherwise - """ - browser_info = self.get_browser_info() - if not browser_info: - if self.logger: - self.logger.warning(f"No built-in browser found on port {self.config.debugging_port}", tag="BUILTIN") - return False - - pid = browser_info.get('pid') - if not pid: - return False - - try: - if is_windows(): - subprocess.run(["taskkill", "/F", "/PID", str(pid)], check=True) - else: - os.kill(pid, signal.SIGTERM) - # Wait for termination - for _ in range(5): - if not is_browser_running(pid): - break - await asyncio.sleep(0.5) - else: - # Force kill if still running - os.kill(pid, signal.SIGKILL) - - # Update config file to remove this browser - with open(self.builtin_config_file, 'r') as f: - browser_info_dict = json.load(f) - # Remove this port from the dictionary - port_str = str(self.config.debugging_port) - if port_str in browser_info_dict.get("port_map", {}): - del browser_info_dict["port_map"][port_str] - with open(self.builtin_config_file, 'w') as f: - json.dump(browser_info_dict, f, indent=2) - # Remove user data directory if it exists - if os.path.exists(self.builtin_browser_dir): - shutil.rmtree(self.builtin_browser_dir) - # Clear 
the browser info cache - self.browser = None - self.temp_dir = None - self.shutting_down = True - - if self.logger: - self.logger.success("Built-in browser terminated", tag="BUILTIN") - return True - except Exception as e: - if self.logger: - self.logger.error(f"Error killing built-in browser: {str(e)}", tag="BUILTIN") - return False - - async def get_builtin_browser_status(self) -> Dict[str, Any]: - """Get status information about the built-in browser. - - Returns: - dict: Status information with running, cdp_url, and info fields - """ - browser_info = self.get_browser_info() - - if not browser_info: - return { - 'running': False, - 'cdp_url': None, - 'info': None, - 'port': self.config.debugging_port - } - - return { - 'running': True, - 'cdp_url': browser_info.get('cdp_url'), - 'info': browser_info, - 'port': self.config.debugging_port - } - - # Override the close method to handle built-in browser cleanup - async def close(self): - """Close the built-in browser and clean up resources.""" - # Call parent class close method - await super().close() - - # Clean up built-in browser if we created it - if self.shutting_down: - await self.kill_builtin_browser() diff --git a/crawl4ai/browser/strategies/__init__.py b/crawl4ai/browser/strategies/__init__.py new file mode 100644 index 00000000..c4f17fd9 --- /dev/null +++ b/crawl4ai/browser/strategies/__init__.py @@ -0,0 +1,13 @@ +from .base import BaseBrowserStrategy +from .cdp import CDPBrowserStrategy +from .docker_strategy import DockerBrowserStrategy +from .playwright import PlaywrightBrowserStrategy +from .builtin import BuiltinBrowserStrategy + +__all__ = [ + "BaseBrowserStrategy", + "CDPBrowserStrategy", + "DockerBrowserStrategy", + "PlaywrightBrowserStrategy", + "BuiltinBrowserStrategy", +] \ No newline at end of file diff --git a/crawl4ai/browser/strategies/base.py b/crawl4ai/browser/strategies/base.py new file mode 100644 index 00000000..75613dcd --- /dev/null +++ b/crawl4ai/browser/strategies/base.py @@ -0,0 +1,270
@@ +"""Browser strategies module for Crawl4AI. + +This module implements the browser strategy pattern for different +browser implementations, including Playwright, CDP, and builtin browsers. +""" + +from abc import ABC, abstractmethod +import asyncio +import json +import hashlib +from typing import Optional, Tuple, List + +from playwright.async_api import BrowserContext, Page + +from ...async_logger import AsyncLogger +from ...async_configs import BrowserConfig, CrawlerRunConfig +from ...config import DOWNLOAD_PAGE_TIMEOUT +from ...js_snippet import load_js_script + +class BaseBrowserStrategy(ABC): + """Base class for all browser strategies. + + This abstract class defines the interface that all browser strategies + must implement. It handles common functionality like context caching. + """ + + def __init__(self, config: BrowserConfig, logger: Optional[AsyncLogger] = None): + """Initialize the strategy with configuration and logger. + + Args: + config: Browser configuration + logger: Logger for recording events and errors + """ + self.config = config + self.logger = logger + self.browser = None + self.default_context = None + self.contexts_by_config = {} + self._contexts_lock = asyncio.Lock() + self.playwright = None + + @abstractmethod + async def start(self): + """Start the browser. + + Returns: + self: For method chaining + """ + pass + + @abstractmethod + async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: + """Get a page with specified configuration. + + Args: + crawlerRunConfig: Crawler run configuration + + Returns: + Tuple of (Page, BrowserContext) + """ + pass + + async def get_pages(self, crawlerRunConfig: CrawlerRunConfig, count: int = 1) -> List[Tuple[Page, BrowserContext]]: + """Get multiple pages with the same configuration. 
+ + Args: + crawlerRunConfig: Configuration for the pages + count: Number of pages to create + + Returns: + List of (Page, Context) tuples + """ + pages = [] + for _ in range(count): + page, context = await self.get_page(crawlerRunConfig) + pages.append((page, context)) + return pages + + @abstractmethod + async def close(self): + """Close the browser and clean up resources.""" + pass + + def _make_config_signature(self, crawlerRunConfig: CrawlerRunConfig) -> str: + """Create a signature hash from configuration for context caching. + + Args: + crawlerRunConfig: Crawler run configuration + + Returns: + str: Unique hash for this configuration + """ + config_dict = crawlerRunConfig.__dict__.copy() + # Exclude items that do not affect browser-level setup + ephemeral_keys = [ + "session_id", + "js_code", + "scraping_strategy", + "extraction_strategy", + "chunking_strategy", + "cache_mode", + "content_filter", + "semaphore_count", + "url" + ] + for key in ephemeral_keys: + if key in config_dict: + del config_dict[key] + + # Convert to canonical JSON string + signature_json = json.dumps(config_dict, sort_keys=True, default=str) + + # Hash the JSON so we get a compact, unique string + signature_hash = hashlib.sha256(signature_json.encode("utf-8")).hexdigest() + return signature_hash + + async def create_browser_context(self, crawlerRunConfig: Optional[CrawlerRunConfig] = None) -> BrowserContext: + """Creates and returns a new browser context with configured settings. 
+ + Args: + crawlerRunConfig: Configuration object for the crawler run + + Returns: + BrowserContext: Browser context object with the specified configurations + """ + if not self.browser: + raise ValueError("Browser must be initialized before creating context") + + # Base settings + user_agent = self.config.headers.get("User-Agent", self.config.user_agent) + viewport_settings = { + "width": self.config.viewport_width, + "height": self.config.viewport_height, + } + proxy_settings = {"server": self.config.proxy} if self.config.proxy else None + + # Define blocked extensions for resource optimization + blocked_extensions = [ + # Images + "jpg", "jpeg", "png", "gif", "webp", "svg", "ico", "bmp", "tiff", "psd", + # Fonts + "woff", "woff2", "ttf", "otf", "eot", + # Media + "mp4", "webm", "ogg", "avi", "mov", "wmv", "flv", "m4v", "mp3", "wav", "aac", + "m4a", "opus", "flac", + # Documents + "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx", + # Archives + "zip", "rar", "7z", "tar", "gz", + # Scripts and data + "xml", "swf", "wasm", + ] + + # Common context settings + context_settings = { + "user_agent": user_agent, + "viewport": viewport_settings, + "proxy": proxy_settings, + "accept_downloads": self.config.accept_downloads, + "ignore_https_errors": self.config.ignore_https_errors, + "device_scale_factor": 1.0, + "java_script_enabled": self.config.java_script_enabled, + } + + # Apply text mode settings if enabled + if self.config.text_mode: + text_mode_settings = { + "has_touch": False, + "is_mobile": False, + # Disable javascript in text mode + "java_script_enabled": False + } + # Update context settings with text mode settings + context_settings.update(text_mode_settings) + if self.logger: + self.logger.debug("Text mode enabled for browser context", tag="BROWSER") + + # Handle storage state properly - this is key for persistence + if self.config.storage_state: + context_settings["storage_state"] = self.config.storage_state + if self.logger: + if 
isinstance(self.config.storage_state, str): + self.logger.debug(f"Using storage state from file: {self.config.storage_state}", tag="BROWSER") + else: + self.logger.debug("Using storage state from config object", tag="BROWSER") + + # If user_data_dir is specified, browser persistence should be automatic + if self.config.user_data_dir and self.logger: + self.logger.debug(f"Using user data directory: {self.config.user_data_dir}", tag="BROWSER") + + # Apply crawler-specific configurations if provided + if crawlerRunConfig: + # Check if there is value for crawlerRunConfig.proxy_config set add that to context + if crawlerRunConfig.proxy_config: + proxy_settings = { + "server": crawlerRunConfig.proxy_config.server, + } + if crawlerRunConfig.proxy_config.username: + proxy_settings.update({ + "username": crawlerRunConfig.proxy_config.username, + "password": crawlerRunConfig.proxy_config.password, + }) + context_settings["proxy"] = proxy_settings + + # Create and return the context + try: + # Create the context with appropriate settings + context = await self.browser.new_context(**context_settings) + + # Apply text mode resource blocking if enabled + if self.config.text_mode: + # Create and apply route patterns for each extension + for ext in blocked_extensions: + await context.route(f"**/*.{ext}", lambda route: route.abort()) + + return context + except Exception as e: + if self.logger: + self.logger.error(f"Error creating browser context: {str(e)}", tag="BROWSER") + # Fallback to basic context creation if the advanced settings fail + return await self.browser.new_context() + + async def setup_context(self, context: BrowserContext, crawlerRunConfig: Optional[CrawlerRunConfig] = None): + """Set up a browser context with the configured options. 
+ + Args: + context: The browser context to set up + crawlerRunConfig: Configuration object containing all browser settings + """ + if self.config.headers: + await context.set_extra_http_headers(self.config.headers) + + if self.config.cookies: + await context.add_cookies(self.config.cookies) + + if self.config.accept_downloads: + context.set_default_timeout(DOWNLOAD_PAGE_TIMEOUT) + context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT) + if self.config.downloads_path: + context._impl_obj._options["accept_downloads"] = True + context._impl_obj._options["downloads_path"] = self.config.downloads_path + + # Handle user agent and browser hints + if self.config.user_agent: + combined_headers = { + "User-Agent": self.config.user_agent, + "sec-ch-ua": self.config.browser_hint, + } + combined_headers.update(self.config.headers) + await context.set_extra_http_headers(combined_headers) + + # Add default cookie + await context.add_cookies( + [ + { + "name": "cookiesEnabled", + "value": "true", + "url": crawlerRunConfig and crawlerRunConfig.url or "https://crawl4ai.com/", + } + ] + ) + + # Handle navigator overrides + if crawlerRunConfig: + if ( + crawlerRunConfig.override_navigator + or crawlerRunConfig.simulate_user + or crawlerRunConfig.magic + ): + await context.add_init_script(load_js_script("navigator_overrider")) diff --git a/crawl4ai/browser/strategies/builtin.py b/crawl4ai/browser/strategies/builtin.py new file mode 100644 index 00000000..fd678ca2 --- /dev/null +++ b/crawl4ai/browser/strategies/builtin.py @@ -0,0 +1,394 @@ +import asyncio +import os +import time +import json +import subprocess +import shutil +import signal +from typing import Optional, Dict, Any + + +from ...async_logger import AsyncLogger +from ...async_configs import BrowserConfig +from ...utils import get_home_folder +from ..utils import get_browser_executable, is_windows, is_browser_running + + +from .cdp import CDPBrowserStrategy + +class BuiltinBrowserStrategy(CDPBrowserStrategy): + 
"""Built-in browser strategy. + + This strategy extends the CDP strategy to use the built-in browser. + """ + + def __init__(self, config: BrowserConfig, logger: Optional[AsyncLogger] = None): + """Initialize the built-in browser strategy. + + Args: + config: Browser configuration + logger: Logger for recording events and errors + """ + super().__init__(config, logger) + self.builtin_browser_dir = os.path.join(get_home_folder(), "builtin-browser") if not self.config.user_data_dir else self.config.user_data_dir + self.builtin_config_file = os.path.join(self.builtin_browser_dir, "browser_config.json") + + # Raise error if user data dir is already engaged + if self._check_user_dir_is_engaged(self.builtin_browser_dir): + raise Exception(f"User data directory {self.builtin_browser_dir} is already engaged by another browser instance.") + + os.makedirs(self.builtin_browser_dir, exist_ok=True) + + def _check_user_dir_is_engaged(self, user_data_dir: str) -> bool: + """Check if the user data directory is already in use. + + Returns: + bool: True if the directory is engaged, False otherwise + """ + # Load browser config file, then iterate in port_map values, check "user_data_dir" key if it matches + # the current user data directory + if os.path.exists(self.builtin_config_file): + try: + with open(self.builtin_config_file, 'r') as f: + browser_info_dict = json.load(f) + + # Check if user data dir is already engaged + for port_str, browser_info in browser_info_dict.get("port_map", {}).items(): + if browser_info.get("user_data_dir") == user_data_dir: + return True + except Exception as e: + if self.logger: + self.logger.error(f"Error reading built-in browser config: {str(e)}", tag="BUILTIN") + return False + + async def start(self): + """Start or connect to the built-in browser. 
+ + Returns: + self: For method chaining + """ + # Check for existing built-in browser (get_browser_info already checks if running) + browser_info = self.get_browser_info() + if browser_info: + if self.logger: + self.logger.info(f"Using existing built-in browser at {browser_info.get('cdp_url')}", tag="BROWSER") + self.config.cdp_url = browser_info.get('cdp_url') + else: + if self.logger: + self.logger.info("Built-in browser not found, launching new instance...", tag="BROWSER") + cdp_url = await self.launch_builtin_browser( + browser_type=self.config.browser_type, + debugging_port=self.config.debugging_port, + headless=self.config.headless, + ) + if not cdp_url: + if self.logger: + self.logger.warning("Failed to launch built-in browser, falling back to regular CDP strategy", tag="BROWSER") + return await super().start() + self.config.cdp_url = cdp_url + + # Call parent class implementation with updated CDP URL + return await super().start() + + @classmethod + def get_builtin_browser_info(cls, debugging_port: int, config_file: str, logger: Optional[AsyncLogger] = None) -> Optional[Dict[str, Any]]: + """Get information about the built-in browser for a specific debugging port. 
+ + Args: + debugging_port: The debugging port to look for + config_file: Path to the config file + logger: Optional logger for recording events + + Returns: + dict: Browser information or None if no running browser is configured for this port + """ + if not os.path.exists(config_file): + return None + + try: + with open(config_file, 'r') as f: + browser_info_dict = json.load(f) + + # Get browser info from port map + if isinstance(browser_info_dict, dict) and "port_map" in browser_info_dict: + port_str = str(debugging_port) + if port_str in browser_info_dict["port_map"]: + browser_info = browser_info_dict["port_map"][port_str] + + # Check if the browser is still running + if not is_browser_running(browser_info.get('pid')): + if logger: + logger.warning(f"Built-in browser on port {debugging_port} is not running", tag="BUILTIN") + # Remove this port from the dictionary + del browser_info_dict["port_map"][port_str] + with open(config_file, 'w') as f: + json.dump(browser_info_dict, f, indent=2) + return None + + return browser_info + + return None + + except Exception as e: + if logger: + logger.error(f"Error reading built-in browser config: {str(e)}", tag="BUILTIN") + return None + + def get_browser_info(self) -> Optional[Dict[str, Any]]: + """Get information about the current built-in browser instance. + + Returns: + dict: Browser information or None if no running browser is configured + """ + return self.get_builtin_browser_info( + debugging_port=self.config.debugging_port, + config_file=self.builtin_config_file, + logger=self.logger + ) + + + async def launch_builtin_browser(self, + browser_type: str = "chromium", + debugging_port: int = 9222, + headless: bool = True) -> Optional[str]: + """Launch a browser in the background for use as the built-in browser. 
+ + Args: + browser_type: Type of browser to launch ('chromium' or 'firefox') + debugging_port: Port to use for CDP debugging + headless: Whether to run in headless mode + + Returns: + str: CDP URL for the browser, or None if launch failed + """ + # Check if there's an existing browser still running + browser_info = self.get_builtin_browser_info( + debugging_port=debugging_port, + config_file=self.builtin_config_file, + logger=self.logger + ) + if browser_info: + if self.logger: + self.logger.info(f"Built-in browser is already running on port {debugging_port}", tag="BUILTIN") + return browser_info.get('cdp_url') + + # Create a user data directory for the built-in browser + user_data_dir = os.path.join(self.builtin_browser_dir, "user_data") + # Raise error if user data dir is already engaged + if self._check_user_dir_is_engaged(user_data_dir): + raise Exception(f"User data directory {user_data_dir} is already engaged by another browser instance.") + + # Create the user data directory if it doesn't exist + os.makedirs(user_data_dir, exist_ok=True) + + # Prepare browser launch arguments + browser_path = await get_browser_executable(browser_type) + if browser_type == "chromium": + args = [ + browser_path, + f"--remote-debugging-port={debugging_port}", + f"--user-data-dir={user_data_dir}", + ] + if headless: + args.append("--headless=new") + elif browser_type == "firefox": + args = [ + browser_path, + "--remote-debugging-port", + str(debugging_port), + "--profile", + user_data_dir, + ] + if headless: + args.append("--headless") + else: + if self.logger: + self.logger.error(f"Browser type {browser_type} not supported for built-in browser", tag="BUILTIN") + return None + + try: + # Start the browser process detached + if is_windows(): + process = subprocess.Popen( + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + creationflags=subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP + ) + else: + process = subprocess.Popen( + args, + 
stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + preexec_fn=os.setpgrp # Start in a new process group + ) + + # Wait briefly to ensure the process starts successfully + await asyncio.sleep(2.0) + + # Check if the process is still running + if process.poll() is not None: + if self.logger: + self.logger.error(f"Browser process exited immediately with code {process.returncode}", tag="BUILTIN") + return None + + # Construct CDP URL + cdp_url = f"http://localhost:{debugging_port}" + + # Try to verify browser is responsive by fetching version info + import aiohttp + json_url = f"{cdp_url}/json/version" + config_json = None + + try: + async with aiohttp.ClientSession() as session: + for _ in range(10): # Try multiple times + try: + async with session.get(json_url) as response: + if response.status == 200: + config_json = await response.json() + break + except Exception: + pass + await asyncio.sleep(0.5) + except Exception as e: + if self.logger: + self.logger.warning(f"Could not verify browser: {str(e)}", tag="BUILTIN") + + # Create browser info + browser_info = { + 'pid': process.pid, + 'cdp_url': cdp_url, + 'user_data_dir': user_data_dir, + 'browser_type': browser_type, + 'debugging_port': debugging_port, + 'start_time': time.time(), + 'config': config_json + } + + # Read existing config file if it exists + port_map = {} + if os.path.exists(self.builtin_config_file): + try: + with open(self.builtin_config_file, 'r') as f: + existing_data = json.load(f) + + # Check if it already uses port mapping + if isinstance(existing_data, dict) and "port_map" in existing_data: + port_map = existing_data["port_map"] + # Convert legacy format to port mapping + elif isinstance(existing_data, dict) and "debugging_port" in existing_data: + old_port = str(existing_data.get("debugging_port")) + if self._is_browser_running(existing_data.get("pid")): + port_map[old_port] = existing_data + except Exception as e: + if self.logger: + self.logger.warning(f"Could not read existing config: 
{str(e)}", tag="BUILTIN") + + # Add/update this browser in the port map + port_map[str(debugging_port)] = browser_info + + # Write updated config + with open(self.builtin_config_file, 'w') as f: + json.dump({"port_map": port_map}, f, indent=2) + + # Detach from the browser process - don't keep any references + # This is important to allow the Python script to exit while the browser continues running + process = None + + if self.logger: + self.logger.success(f"Built-in browser launched at CDP URL: {cdp_url}", tag="BUILTIN") + return cdp_url + + except Exception as e: + if self.logger: + self.logger.error(f"Error launching built-in browser: {str(e)}", tag="BUILTIN") + return None + + async def kill_builtin_browser(self) -> bool: + """Kill the built-in browser if it's running. + + Returns: + bool: True if the browser was killed, False otherwise + """ + browser_info = self.get_browser_info() + if not browser_info: + if self.logger: + self.logger.warning(f"No built-in browser found on port {self.config.debugging_port}", tag="BUILTIN") + return False + + pid = browser_info.get('pid') + if not pid: + return False + + try: + if is_windows(): + subprocess.run(["taskkill", "/F", "/PID", str(pid)], check=True) + else: + os.kill(pid, signal.SIGTERM) + # Wait for termination + for _ in range(5): + if not is_browser_running(pid): + break + await asyncio.sleep(0.5) + else: + # Force kill if still running + os.kill(pid, signal.SIGKILL) + + # Update config file to remove this browser + with open(self.builtin_config_file, 'r') as f: + browser_info_dict = json.load(f) + # Remove this port from the dictionary + port_str = str(self.config.debugging_port) + if port_str in browser_info_dict.get("port_map", {}): + del browser_info_dict["port_map"][port_str] + with open(self.builtin_config_file, 'w') as f: + json.dump(browser_info_dict, f, indent=2) + # Remove user data directory if it exists + if os.path.exists(self.builtin_browser_dir): + shutil.rmtree(self.builtin_browser_dir) + # Clear 
the browser info cache + self.browser = None + self.temp_dir = None + self.shutting_down = True + + if self.logger: + self.logger.success("Built-in browser terminated", tag="BUILTIN") + return True + except Exception as e: + if self.logger: + self.logger.error(f"Error killing built-in browser: {str(e)}", tag="BUILTIN") + return False + + async def get_builtin_browser_status(self) -> Dict[str, Any]: + """Get status information about the built-in browser. + + Returns: + dict: Status information with running, cdp_url, and info fields + """ + browser_info = self.get_browser_info() + + if not browser_info: + return { + 'running': False, + 'cdp_url': None, + 'info': None, + 'port': self.config.debugging_port + } + + return { + 'running': True, + 'cdp_url': browser_info.get('cdp_url'), + 'info': browser_info, + 'port': self.config.debugging_port + } + + # Override the close method to handle built-in browser cleanup + async def close(self): + """Close the built-in browser and clean up resources.""" + # Call parent class close method + await super().close() + + # Clean up built-in browser if we created it + if self.shutting_down: + await self.kill_builtin_browser() diff --git a/crawl4ai/browser/strategies/cdp.py b/crawl4ai/browser/strategies/cdp.py new file mode 100644 index 00000000..d1d543dc --- /dev/null +++ b/crawl4ai/browser/strategies/cdp.py @@ -0,0 +1,359 @@ +"""Browser strategies module for Crawl4AI. + +This module implements the browser strategy pattern for different +browser implementations, including Playwright, CDP, and builtin browsers. 
+""" + +import asyncio +import os +import time +import json +import subprocess +import shutil +from typing import Optional, Tuple, List + +from playwright.async_api import BrowserContext, Page + +from ...async_logger import AsyncLogger +from ...async_configs import BrowserConfig, CrawlerRunConfig +from ..utils import get_playwright, get_browser_executable, create_temp_directory, is_windows + +from .base import BaseBrowserStrategy + +class CDPBrowserStrategy(BaseBrowserStrategy): + """CDP-based browser strategy. + + This strategy connects to an existing browser using CDP protocol or + launches and connects to a browser using CDP. + """ + + def __init__(self, config: BrowserConfig, logger: Optional[AsyncLogger] = None): + """Initialize the CDP browser strategy. + + Args: + config: Browser configuration + logger: Logger for recording events and errors + """ + super().__init__(config, logger) + self.sessions = {} + self.session_ttl = 1800 # 30 minutes + self.browser_process = None + self.temp_dir = None + self.shutting_down = False + + async def start(self): + """Start or connect to the browser using CDP. + + Returns: + self: For method chaining + """ + self.playwright = await get_playwright() + + # Get or create CDP URL + cdp_url = await self._get_or_create_cdp_url() + + # Connect to the browser using CDP + self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) + + # Get or create default context + contexts = self.browser.contexts + if contexts: + self.default_context = contexts[0] + else: + self.default_context = await self.create_browser_context() + + await self.setup_context(self.default_context) + return self + + async def _get_or_create_cdp_url(self) -> str: + """Get existing CDP URL or launch a browser and return its CDP URL. 
+ + Returns: + str: CDP URL for connecting to the browser + """ + # If CDP URL is provided, just return it + if self.config.cdp_url: + return self.config.cdp_url + + # Create temp dir if needed + if not self.config.user_data_dir: + self.temp_dir = create_temp_directory() + user_data_dir = self.temp_dir + else: + user_data_dir = self.config.user_data_dir + + # Get browser args based on OS and browser type + args = await self._get_browser_args(user_data_dir) + + # Start browser process + try: + # Use DETACHED_PROCESS flag on Windows to fully detach the process + # On Unix, we'll use preexec_fn=os.setpgrp to start the process in a new process group + if is_windows(): + self.browser_process = subprocess.Popen( + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + creationflags=subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP + ) + else: + self.browser_process = subprocess.Popen( + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + preexec_fn=os.setpgrp # Start in a new process group + ) + + # Monitor for a short time to make sure it starts properly + await asyncio.sleep(0.5) # Give browser time to start + await self._initial_startup_check() + await asyncio.sleep(2) # Give browser more time to start + return f"http://localhost:{self.config.debugging_port}" + except Exception as e: + await self._cleanup_process() + raise Exception(f"Failed to start browser: {e}") + + async def _initial_startup_check(self): + """Perform a quick check to make sure the browser started successfully.""" + if not self.browser_process: + return + + # Check that process started without immediate termination + await asyncio.sleep(0.5) + if self.browser_process.poll() is not None: + # Process already terminated + stdout, stderr = b"", b"" + try: + stdout, stderr = self.browser_process.communicate(timeout=0.5) + except subprocess.TimeoutExpired: + pass + + if self.logger: + self.logger.error( + message="Browser process terminated during startup | Code: {code} | 
STDOUT: {stdout} | STDERR: {stderr}", + tag="ERROR", + params={ + "code": self.browser_process.returncode, + "stdout": stdout.decode() if stdout else "", + "stderr": stderr.decode() if stderr else "", + }, + ) + + async def _get_browser_args(self, user_data_dir: str) -> List[str]: + """Returns browser-specific command line arguments. + + Args: + user_data_dir: Path to user data directory + + Returns: + List of command-line arguments for the browser + """ + browser_path = await get_browser_executable(self.config.browser_type) + base_args = [browser_path] + + if self.config.browser_type == "chromium": + args = [ + f"--remote-debugging-port={self.config.debugging_port}", + f"--user-data-dir={user_data_dir}", + ] + if self.config.headless: + args.append("--headless=new") + elif self.config.browser_type == "firefox": + args = [ + "--remote-debugging-port", + str(self.config.debugging_port), + "--profile", + user_data_dir, + ] + if self.config.headless: + args.append("--headless") + else: + raise NotImplementedError(f"Browser type {self.config.browser_type} not supported") + + return base_args + args + + async def _cleanup_process(self): + """Cleanup browser process and temporary directory.""" + # Set shutting_down flag BEFORE any termination actions + self.shutting_down = True + + if self.browser_process: + try: + # Only terminate if we have proper control over the process + if not self.browser_process.poll(): + # Process is still running + self.browser_process.terminate() + # Wait for process to end gracefully + for _ in range(10): # 10 attempts, 100ms each + if self.browser_process.poll() is not None: + break + await asyncio.sleep(0.1) + + # Force kill if still running + if self.browser_process.poll() is None: + if is_windows(): + # On Windows we might need taskkill for detached processes + try: + subprocess.run(["taskkill", "/F", "/PID", str(self.browser_process.pid)]) + except Exception: + self.browser_process.kill() + else: + self.browser_process.kill() + await 
asyncio.sleep(0.1) # Brief wait for kill to take effect + + except Exception as e: + if self.logger: + self.logger.error( + message="Error terminating browser: {error}", + tag="ERROR", + params={"error": str(e)}, + ) + + if self.temp_dir and os.path.exists(self.temp_dir): + try: + shutil.rmtree(self.temp_dir) + except Exception as e: + if self.logger: + self.logger.error( + message="Error removing temporary directory: {error}", + tag="ERROR", + params={"error": str(e)}, + ) + + async def create_browser_context(self, crawlerRunConfig: Optional[CrawlerRunConfig] = None) -> BrowserContext: + """Create a new browser context. + + Uses the base class implementation which handles all configurations. + + Args: + crawlerRunConfig: Configuration object for the crawler run + + Returns: + BrowserContext: Browser context object + """ + # Handle user_data_dir for CDP browsers + if self.config.user_data_dir: + # For CDP-based browsers, storage persistence is typically handled by the user_data_dir + # at the browser level, but we'll create a storage_state location for Playwright as well + storage_path = os.path.join(self.config.user_data_dir, "storage_state.json") + if not os.path.exists(storage_path): + # Create parent directory if it doesn't exist + os.makedirs(os.path.dirname(storage_path), exist_ok=True) + with open(storage_path, "w") as f: + json.dump({}, f) + self.config.storage_state = storage_path + + # Use the base class implementation + return await super().create_browser_context(crawlerRunConfig) + + def _cleanup_expired_sessions(self): + """Clean up expired sessions based on TTL.""" + current_time = time.time() + expired_sessions = [ + sid + for sid, (_, _, last_used) in self.sessions.items() + if current_time - last_used > self.session_ttl + ] + for sid in expired_sessions: + asyncio.create_task(self._kill_session(sid)) + + async def _kill_session(self, session_id: str): + """Kill a browser session and clean up resources. 
+ + Args: + session_id: The session ID to kill + """ + if session_id in self.sessions: + context, page, _ = self.sessions[session_id] + await page.close() + del self.sessions[session_id] + + async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: + """Get a page for the given configuration. + + Args: + crawlerRunConfig: Configuration object for the crawler run + + Returns: + Tuple of (Page, BrowserContext) + """ + self._cleanup_expired_sessions() + + # If a session_id is provided and we already have it, reuse that page + context + if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions: + context, page, _ = self.sessions[crawlerRunConfig.session_id] + # Update last-used timestamp + self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) + return page, context + + # For CDP, we typically use the shared default_context + context = self.default_context + pages = context.pages + page = next((p for p in pages if p.url == crawlerRunConfig.url), None) + if not page: + page = await context.new_page() + + # If a session_id is specified, store this session so we can reuse later + if crawlerRunConfig.session_id: + self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) + + return page, context + + async def close(self): + """Close the browser and clean up resources.""" + # Skip cleanup if using external CDP URL and not launched by us + if self.config.cdp_url and not self.browser_process: + return + + if self.config.sleep_on_close: + await asyncio.sleep(0.5) + + # If we have a user_data_dir configured, ensure persistence of storage state + if self.config.user_data_dir and self.browser and self.default_context: + for context in self.browser.contexts: + try: + await context.storage_state(path=os.path.join(self.config.user_data_dir, "Default", "storage_state.json")) + if self.logger: + self.logger.debug("Ensuring storage state is persisted before closing browser", tag="BROWSER") + 
except Exception as e: + if self.logger: + self.logger.warning( + message="Failed to ensure storage persistence: {error}", + tag="BROWSER", + params={"error": str(e)} + ) + + # Close all sessions + session_ids = list(self.sessions.keys()) + for session_id in session_ids: + await self._kill_session(session_id) + + # Close browser + if self.browser: + await self.browser.close() + self.browser = None + + # Clean up managed browser if we created it + if self.browser_process: + await asyncio.sleep(0.5) + await self._cleanup_process() + self.browser_process = None + + # Close temporary directory + if self.temp_dir and os.path.exists(self.temp_dir): + try: + shutil.rmtree(self.temp_dir) + self.temp_dir = None + except Exception as e: + if self.logger: + self.logger.error( + message="Error removing temporary directory: {error}", + tag="ERROR", + params={"error": str(e)}, + ) + + # Stop playwright + if self.playwright: + await self.playwright.stop() + self.playwright = None + diff --git a/crawl4ai/browser/docker_strategy.py b/crawl4ai/browser/strategies/docker_strategy.py similarity index 58% rename from crawl4ai/browser/docker_strategy.py rename to crawl4ai/browser/strategies/docker_strategy.py index 639abd84..33e581be 100644 --- a/crawl4ai/browser/docker_strategy.py +++ b/crawl4ai/browser/strategies/docker_strategy.py @@ -6,18 +6,15 @@ which offers better isolation, consistency across platforms, and easy scaling. 
import os import uuid -import asyncio -from typing import Dict, List, Optional, Tuple, Union -from pathlib import Path +from typing import List, Optional -from playwright.async_api import Page, BrowserContext -from ..async_logger import AsyncLogger -from ..async_configs import BrowserConfig, CrawlerRunConfig -from .docker_config import DockerConfig -from .docker_registry import DockerRegistry -from .docker_utils import DockerUtils -from .strategies import BuiltinBrowserStrategy +from ...async_logger import AsyncLogger +from ...async_configs import BrowserConfig +from ..models import DockerConfig +from ..docker_registry import DockerRegistry +from ..docker_utils import DockerUtils +from .builtin import BuiltinBrowserStrategy class DockerBrowserStrategy(BuiltinBrowserStrategy): @@ -53,6 +50,16 @@ class DockerBrowserStrategy(BuiltinBrowserStrategy): self.docker_config = self.config.docker_config or DockerConfig() self.container_id = None self.container_name = f"crawl4ai-browser-{uuid.uuid4().hex[:8]}" + + # Use the shared registry file path for consistency with BuiltinBrowserStrategy + registry_file = self.docker_config.registry_file + if registry_file is None and self.config.user_data_dir: + # Use the same registry file as BuiltinBrowserStrategy if possible + registry_file = os.path.join( + os.path.dirname(self.config.user_data_dir), + "browser_config.json" + ) + self.registry = DockerRegistry(self.docker_config.registry_file) self.docker_utils = DockerUtils(logger) self.chrome_process_id = None @@ -60,7 +67,77 @@ class DockerBrowserStrategy(BuiltinBrowserStrategy): self.internal_cdp_port = 9222 # Chrome's internal CDP port self.internal_mapped_port = 9223 # Port that socat maps to internally self.shutting_down = False - + + async def start(self): + """Start or connect to a browser running in a Docker container. + + This method initializes Playwright and establishes a connection to + a browser running in a Docker container. 
Depending on the configured mode: + - "connect": Connects to a container with Chrome already running + - "launch": Creates a container and launches Chrome within it + + Returns: + self: For method chaining + """ + # Initialize Playwright + from ..utils import get_playwright + self.playwright = await get_playwright() + + if self.logger: + self.logger.info( + f"Starting Docker browser strategy in {self.docker_config.mode} mode", + tag="DOCKER" + ) + + try: + # Get CDP URL by creating or reusing a Docker container + # This handles the container management and browser startup + cdp_url = await self._get_or_create_cdp_url() + + if not cdp_url: + raise Exception("Failed to establish CDP connection to Docker container") + + if self.logger: + self.logger.info(f"Connecting to browser in Docker via CDP: {cdp_url}", tag="DOCKER") + + # Connect to the browser using CDP + self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) + + # Get existing context or create default context + contexts = self.browser.contexts + if contexts: + self.default_context = contexts[0] + if self.logger: + self.logger.debug("Using existing browser context", tag="DOCKER") + else: + if self.logger: + self.logger.debug("Creating new browser context", tag="DOCKER") + self.default_context = await self.create_browser_context() + await self.setup_context(self.default_context) + + return self + + except Exception as e: + # Clean up resources if startup fails + if self.container_id and not self.docker_config.persistent: + if self.logger: + self.logger.warning( + f"Cleaning up container after failed start: {self.container_id[:12]}", + tag="DOCKER" + ) + await self.docker_utils.remove_container(self.container_id) + self.registry.unregister_container(self.container_id) + self.container_id = None + + if self.playwright: + await self.playwright.stop() + self.playwright = None + + # Re-raise the exception + if self.logger: + self.logger.error(f"Failed to start Docker browser: {str(e)}", 
tag="DOCKER") + raise + async def _generate_config_hash(self) -> str: """Generate a hash of the configuration for container matching. @@ -87,7 +164,7 @@ class DockerBrowserStrategy(BuiltinBrowserStrategy): # Use the utility method to generate the hash return self.docker_utils.generate_config_hash(config_dict) - async def _get_or_create_cdp_url(self) -> str: + async def _get_or_create_cdp_url1(self) -> str: """Get CDP URL by either creating a new container or using an existing one. Returns: @@ -183,7 +260,109 @@ class DockerBrowserStrategy(BuiltinBrowserStrategy): # Return CDP URL return f"http://localhost:{host_port}" - + + async def _get_or_create_cdp_url(self) -> str: + """Get CDP URL by either creating a new container or using an existing one. + + Returns: + CDP URL for connecting to the browser + + Raises: + Exception: If container creation or browser launch fails + """ + # If CDP URL is explicitly provided, use it + if self.config.cdp_url: + return self.config.cdp_url + + # Ensure Docker image exists (will build if needed) + image_name = await self.docker_utils.ensure_docker_image_exists( + self.docker_config.image, + self.docker_config.mode + ) + + # Generate config hash for container matching + config_hash = await self._generate_config_hash() + + # Look for existing container with matching config + container_id = await self.registry.find_container_by_config(config_hash, self.docker_utils) + + if container_id: + # Use existing container + self.container_id = container_id + host_port = self.registry.get_container_host_port(container_id) + if self.logger: + self.logger.info(f"Using existing Docker container: {container_id[:12]}", tag="DOCKER") + else: + # Get a port for the new container + host_port = self.docker_config.host_port or self.registry.get_next_available_port(self.docker_utils) + + # Prepare volumes list + volumes = list(self.docker_config.volumes) + + # Add user data directory if specified + if self.docker_config.user_data_dir: + # Ensure user data 
directory exists + os.makedirs(self.docker_config.user_data_dir, exist_ok=True) + volumes.append(f"{self.docker_config.user_data_dir}:{self.docker_config.container_user_data_dir}") + + # Update config user_data_dir to point to container path + self.config.user_data_dir = self.docker_config.container_user_data_dir + + # Create a new container + container_id = await self.docker_utils.create_container( + image_name=image_name, + host_port=host_port, + container_name=self.container_name, + volumes=volumes, + network=self.docker_config.network, + env_vars=self.docker_config.env_vars, + cpu_limit=self.docker_config.cpu_limit, + memory_limit=self.docker_config.memory_limit, + extra_args=self.docker_config.extra_args + ) + + if not container_id: + raise Exception("Failed to create Docker container") + + self.container_id = container_id + + # Wait for container to be ready + await self.docker_utils.wait_for_container_ready(container_id) + + # Handle specific setup based on mode + if self.docker_config.mode == "launch": + # In launch mode, we need to start socat and Chrome + await self.docker_utils.start_socat_in_container(container_id) + + # Build browser arguments + browser_args = self._build_browser_args() + + # Launch Chrome + await self.docker_utils.launch_chrome_in_container(container_id, browser_args) + + # Get PIDs for later cleanup + self.chrome_process_id = await self.docker_utils.get_process_id_in_container( + container_id, "chrome" + ) + self.socat_process_id = await self.docker_utils.get_process_id_in_container( + container_id, "socat" + ) + + # Wait for CDP to be ready + cdp_json_config = await self.docker_utils.wait_for_cdp_ready(host_port) + + if cdp_json_config: + # Register the container in the shared registry + self.registry.register_container(container_id, host_port, config_hash, cdp_json_config) + else: + raise Exception("Failed to get CDP JSON config from Docker container") + + if self.logger: + self.logger.success(f"Docker container ready: 
{container_id[:12]} on port {host_port}", tag="DOCKER") + + # Return CDP URL + return f"http://localhost:{host_port}" + def _build_browser_args(self) -> List[str]: """Build Chrome command line arguments based on BrowserConfig. diff --git a/crawl4ai/browser/strategies/playwright.py b/crawl4ai/browser/strategies/playwright.py new file mode 100644 index 00000000..817603ca --- /dev/null +++ b/crawl4ai/browser/strategies/playwright.py @@ -0,0 +1,284 @@ +"""Browser strategies module for Crawl4AI. + +This module implements the browser strategy pattern for different +browser implementations, including Playwright, CDP, and builtin browsers. +""" + +import asyncio +import os +import time +import json +from typing import Optional, Tuple + +from playwright.async_api import BrowserContext, Page, ProxySettings + +from ...async_logger import AsyncLogger +from ...async_configs import BrowserConfig, CrawlerRunConfig +from ..utils import get_playwright, get_browser_disable_options + +from playwright_stealth import StealthConfig + +from .base import BaseBrowserStrategy + +stealth_config = StealthConfig( + webdriver=True, + chrome_app=True, + chrome_csi=True, + chrome_load_times=True, + chrome_runtime=True, + navigator_languages=True, + navigator_plugins=True, + navigator_permissions=True, + webgl_vendor=True, + outerdimensions=True, + navigator_hardware_concurrency=True, + media_codecs=True, +) + +class PlaywrightBrowserStrategy(BaseBrowserStrategy): + """Standard Playwright browser strategy. + + This strategy launches a new browser instance using Playwright + and manages browser contexts. + """ + + def __init__(self, config: BrowserConfig, logger: Optional[AsyncLogger] = None): + """Initialize the Playwright browser strategy. 
+ + Args: + config: Browser configuration + logger: Logger for recording events and errors + """ + super().__init__(config, logger) + # Add session management + self.sessions = {} + self.session_ttl = 1800 # 30 minutes + + async def start(self): + """Start the browser instance. + + Returns: + self: For method chaining + """ + self.playwright = await get_playwright() + browser_args = self._build_browser_args() + + # Launch appropriate browser type + if self.config.browser_type == "firefox": + self.browser = await self.playwright.firefox.launch(**browser_args) + elif self.config.browser_type == "webkit": + self.browser = await self.playwright.webkit.launch(**browser_args) + else: + self.browser = await self.playwright.chromium.launch(**browser_args) + + self.default_context = self.browser + return self + + def _build_browser_args(self) -> dict: + """Build browser launch arguments from config. + + Returns: + dict: Browser launch arguments + """ + args = [ + "--disable-gpu", + "--disable-gpu-compositing", + "--disable-software-rasterizer", + "--no-sandbox", + "--disable-dev-shm-usage", + "--no-first-run", + "--no-default-browser-check", + "--disable-infobars", + "--window-position=0,0", + "--ignore-certificate-errors", + "--ignore-certificate-errors-spki-list", + "--disable-blink-features=AutomationControlled", + "--window-position=400,0", + "--disable-renderer-backgrounding", + "--disable-ipc-flooding-protection", + "--force-color-profile=srgb", + "--mute-audio", + "--disable-background-timer-throttling", + f"--window-size={self.config.viewport_width},{self.config.viewport_height}", + ] + + if self.config.light_mode: + args.extend(get_browser_disable_options()) + + if self.config.text_mode: + args.extend( + [ + "--blink-settings=imagesEnabled=false", + "--disable-remote-fonts", + "--disable-images", + "--disable-javascript", + "--disable-software-rasterizer", + "--disable-dev-shm-usage", + ] + ) + + if self.config.extra_args: + args.extend(self.config.extra_args) + + 
browser_args = {"headless": self.config.headless, "args": args} + + if self.config.chrome_channel: + browser_args["channel"] = self.config.chrome_channel + + if self.config.accept_downloads: + browser_args["downloads_path"] = self.config.downloads_path or os.path.join( + os.getcwd(), "downloads" + ) + os.makedirs(browser_args["downloads_path"], exist_ok=True) + + if self.config.proxy or self.config.proxy_config: + proxy_settings = ( + ProxySettings(server=self.config.proxy) + if self.config.proxy + else ProxySettings( + server=self.config.proxy_config.server, + username=self.config.proxy_config.username, + password=self.config.proxy_config.password, + ) + ) + browser_args["proxy"] = proxy_settings + + return browser_args + + async def create_browser_context(self, crawlerRunConfig: Optional[CrawlerRunConfig] = None) -> BrowserContext: + """Creates and returns a new browser context with configured settings. + + This implementation extends the base class version to handle user_data_dir specifically. 
+ + Args: + crawlerRunConfig: Configuration object for the crawler run + + Returns: + BrowserContext: Browser context object with the specified configurations + """ + # Handle user_data_dir explicitly to ensure storage persistence + if self.config.user_data_dir: + # Create a storage state file path if none exists + storage_path = os.path.join(self.config.user_data_dir, "Default", "storage_state.json") + + # Create the file if it doesn't exist + if not os.path.exists(storage_path): + os.makedirs(os.path.dirname(storage_path), exist_ok=True) + with open(storage_path, "w") as f: + json.dump({}, f) + + # Override storage_state with our specific path + self.config.storage_state = storage_path + if self.logger: + self.logger.debug(f"Using persistent storage state at: {storage_path}", tag="BROWSER") + + # Now call the base class implementation which handles everything else + return await super().create_browser_context(crawlerRunConfig) + + def _cleanup_expired_sessions(self): + """Clean up expired sessions based on TTL.""" + current_time = time.time() + expired_sessions = [ + sid + for sid, (_, _, last_used) in self.sessions.items() + if current_time - last_used > self.session_ttl + ] + for sid in expired_sessions: + asyncio.create_task(self._kill_session(sid)) + + async def _kill_session(self, session_id: str): + """Kill a browser session and clean up resources. + + Args: + session_id: The session ID to kill + """ + if session_id in self.sessions: + context, page, _ = self.sessions[session_id] + await page.close() + del self.sessions[session_id] + + async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: + """Get a page for the given configuration. 
+ + Args: + crawlerRunConfig: Configuration object for the crawler run + + Returns: + Tuple of (Page, BrowserContext) + """ + # Clean up expired sessions first + self._cleanup_expired_sessions() + + # If a session_id is provided and we already have it, reuse that page + context + if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions: + context, page, _ = self.sessions[crawlerRunConfig.session_id] + # Update last-used timestamp + self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) + return page, context + + # Otherwise, check if we have an existing context for this config + config_signature = self._make_config_signature(crawlerRunConfig) + + async with self._contexts_lock: + if config_signature in self.contexts_by_config: + context = self.contexts_by_config[config_signature] + else: + # Create and setup a new context + context = await self.create_browser_context(crawlerRunConfig) + await self.setup_context(context, crawlerRunConfig) + self.contexts_by_config[config_signature] = context + + # Create a new page from the chosen context + page = await context.new_page() + + # If a session_id is specified, store this session so we can reuse later + if crawlerRunConfig.session_id: + self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) + + return page, context + + async def close(self): + """Close the browser and clean up resources.""" + if self.config.sleep_on_close: + await asyncio.sleep(0.5) + + # If we have a user_data_dir configured, ensure persistence of storage state + if self.config.user_data_dir and self.browser and self.default_context: + for context in self.browser.contexts: + try: + await context.storage_state(path=os.path.join(self.config.user_data_dir, "Default", "storage_state.json")) + if self.logger: + self.logger.debug("Ensuring storage state is persisted before closing browser", tag="BROWSER") + except Exception as e: + if self.logger: + self.logger.warning( + message="Failed to ensure 
storage persistence: {error}", + tag="BROWSER", + params={"error": str(e)} + ) + + # Close all sessions + session_ids = list(self.sessions.keys()) + for session_id in session_ids: + await self._kill_session(session_id) + + # Close all contexts we created + for ctx in self.contexts_by_config.values(): + try: + await ctx.close() + except Exception as e: + if self.logger: + self.logger.error( + message="Error closing context: {error}", + tag="ERROR", + params={"error": str(e)} + ) + self.contexts_by_config.clear() + + if self.browser: + await self.browser.close() + self.browser = None + + if self.playwright: + await self.playwright.stop() + self.playwright = None diff --git a/docs/examples/crypto_analysis_example.py b/docs/examples/crypto_analysis_example.py index 4160ba35..3cdba2c4 100644 --- a/docs/examples/crypto_analysis_example.py +++ b/docs/examples/crypto_analysis_example.py @@ -18,11 +18,20 @@ Key Features: import asyncio import pandas as pd +import numpy as np +import re import plotly.express as px -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LXMLWebScrapingStrategy +from crawl4ai import ( + AsyncWebCrawler, + BrowserConfig, + CrawlerRunConfig, + CacheMode, + LXMLWebScrapingStrategy, +) from crawl4ai import CrawlResult from typing import List -from IPython.display import HTML + +__current_dir__ = __file__.rsplit("/", 1)[0] class CryptoAlphaGenerator: """ @@ -31,134 +40,319 @@ class CryptoAlphaGenerator: - Liquidity scores - Momentum-risk ratios - Machine learning-inspired trading signals - + Methods: analyze_tables(): Process raw tables into trading insights create_visuals(): Generate institutional-grade visualizations generate_insights(): Create plain English trading recommendations """ - + def clean_data(self, df: pd.DataFrame) -> pd.DataFrame: """ - Convert crypto market data to machine-readable format - Handles currency symbols, units (B=Billions), and percentage values + Convert crypto market data to machine-readable 
format. + Handles currency symbols, units (B=Billions), and percentage values. """ - # Clean numeric columns - df['Price'] = df['Price'].str.replace('[^\d.]', '', regex=True).astype(float) - df['Market Cap'] = df['Market Cap'].str.extract(r'\$([\d.]+)B')[0].astype(float) * 1e9 - df['Volume(24h)'] = df['Volume(24h)'].str.extract(r'\$([\d.]+)B')[0].astype(float) * 1e9 + # Make a copy to avoid SettingWithCopyWarning + df = df.copy() + + # Clean Price column (handle currency symbols) + df["Price"] = df["Price"].astype(str).str.replace("[^\d.]", "", regex=True).astype(float) + + # Handle Market Cap and Volume, considering both Billions and Trillions + def convert_large_numbers(value): + if pd.isna(value): + return float('nan') + value = str(value) + multiplier = 1 + if 'B' in value: + multiplier = 1e9 + elif 'T' in value: + multiplier = 1e12 + # Handle cases where the value might already be numeric + cleaned_value = re.sub(r"[^\d.]", "", value) + return float(cleaned_value) * multiplier if cleaned_value else float('nan') + + df["Market Cap"] = df["Market Cap"].apply(convert_large_numbers) + df["Volume(24h)"] = df["Volume(24h)"].apply(convert_large_numbers) # Convert percentages to decimal values - for col in ['1h %', '24h %', '7d %']: - df[col] = df[col].str.replace('%', '').astype(float) / 100 - + for col in ["1h %", "24h %", "7d %"]: + if col in df.columns: + # First ensure it's string, then clean + df[col] = ( + df[col].astype(str) + .str.replace("%", "") + .str.replace(",", ".") + .replace("nan", np.nan) + ) + df[col] = pd.to_numeric(df[col], errors='coerce') / 100 + return df def calculate_metrics(self, df: pd.DataFrame) -> pd.DataFrame: """ Compute advanced trading metrics used by quantitative funds: - + 1. Volume/Market Cap Ratio - Measures liquidity efficiency - (High ratio = Underestimated attention) - - 2. Volatility Score - Risk-adjusted momentum potential + (High ratio = Underestimated attention, and small-cap = higher growth potential) + + 2. 
Volatility Score - Risk-adjusted momentum potential - Shows how stable is the trend (STD of 1h/24h/7d returns) - - 3. Momentum Score - Weighted average of returns + + 3. Momentum Score - Weighted average of returns - Shows how strong is the trend (1h:30% + 24h:50% + 7d:20%) - + 4. Volume Anomaly - 3σ deviation detection - (Flags potential insider activity) + (Flags potential insider activity) - Unusual trading activity – Flags coins with volume spikes (potential insider buying or news). """ # Liquidity Metrics - df['Volume/Market Cap Ratio'] = df['Volume(24h)'] / df['Market Cap'] - + df["Volume/Market Cap Ratio"] = df["Volume(24h)"] / df["Market Cap"] + # Risk Metrics - df['Volatility Score'] = df[['1h %','24h %','7d %']].std(axis=1) - + df["Volatility Score"] = df[["1h %", "24h %", "7d %"]].std(axis=1) + # Momentum Metrics - df['Momentum Score'] = (df['1h %']*0.3 + df['24h %']*0.5 + df['7d %']*0.2) - + df["Momentum Score"] = df["1h %"] * 0.3 + df["24h %"] * 0.5 + df["7d %"] * 0.2 + # Anomaly Detection - median_vol = df['Volume(24h)'].median() - df['Volume Anomaly'] = df['Volume(24h)'] > 3 * median_vol - + median_vol = df["Volume(24h)"].median() + df["Volume Anomaly"] = df["Volume(24h)"] > 3 * median_vol + # Value Flags - df['Undervalued Flag'] = (df['Market Cap'] < 1e9) & (df['Momentum Score'] > 0.05) - df['Liquid Giant'] = (df['Volume/Market Cap Ratio'] > 0.15) & (df['Market Cap'] > 1e9) - + # Undervalued Flag - Low market cap and high momentum + # (High growth potential and low attention) + df["Undervalued Flag"] = (df["Market Cap"] < 1e9) & ( + df["Momentum Score"] > 0.05 + ) + # Liquid Giant Flag - High volume/market cap ratio and large market cap + # (High liquidity and large market cap = institutional interest) + df["Liquid Giant"] = (df["Volume/Market Cap Ratio"] > 0.15) & ( + df["Market Cap"] > 1e9 + ) + return df - def create_visuals(self, df: pd.DataFrame) -> dict: + def generate_insights_simple(self, df: pd.DataFrame) -> str: """ - Generate three 
institutional-grade visualizations: - - 1. 3D Market Map - X:Size, Y:Liquidity, Z:Momentum - 2. Liquidity Tree - Color:Volume Efficiency - 3. Momentum Leaderboard - Top sustainable movers + Generates an ultra-actionable crypto trading report with: + - Risk-tiered opportunities (High/Medium/Low) + - Concrete examples for each trade type + - Entry/exit strategies spelled out + - Visual cues for quick scanning """ - # 3D Market Overview - fig1 = px.scatter_3d( - df, - x='Market Cap', - y='Volume/Market Cap Ratio', - z='Momentum Score', - size='Volatility Score', - color='Volume Anomaly', - hover_name='Name', - title='Smart Money Market Map: Spot Overlooked Opportunities', - labels={'Market Cap': 'Size (Log $)', 'Volume/Market Cap Ratio': 'Liquidity Power'}, - log_x=True, - template='plotly_dark' - ) - - # Liquidity Efficiency Tree - fig2 = px.treemap( - df, - path=['Name'], - values='Market Cap', - color='Volume/Market Cap Ratio', - hover_data=['Momentum Score'], - title='Liquidity Forest: Green = High Trading Efficiency', - color_continuous_scale='RdYlGn' - ) - - # Momentum Leaders - fig3 = px.bar( - df.sort_values('Momentum Score', ascending=False).head(10), - x='Name', - y='Momentum Score', - color='Volatility Score', - title='Sustainable Momentum Leaders (Low Volatility + High Growth)', - text='7d %', - template='plotly_dark' - ) - - return {'market_map': fig1, 'liquidity_tree': fig2, 'momentum_leaders': fig3} + report = [ + "🚀 **CRYPTO TRADING CHEAT SHEET** 🚀", + "*Based on quantitative signals + hedge fund tactics*", + "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + ] + + # 1. 
HIGH-RISK: Undervalued Small-Caps (Momentum Plays) + high_risk = df[df["Undervalued Flag"]].sort_values("Momentum Score", ascending=False) + if not high_risk.empty: + example_coin = high_risk.iloc[0] + report.extend([ + "\n🔥 **HIGH-RISK: Rocket Fuel Small-Caps**", + f"*Example Trade:* {example_coin['Name']} (Price: ${example_coin['Price']:.6f})", + "📊 *Why?* Tiny market cap (<$1B) but STRONG momentum (+{:.0f}% last week)".format(example_coin['7d %']*100), + "🎯 *Strategy:*", + "1. Wait for 5-10% dip from recent high (${:.6f} → Buy under ${:.6f})".format( + example_coin['Price'] / (1 - example_coin['24h %']), # Approx recent high + example_coin['Price'] * 0.95 + ), + "2. Set stop-loss at -10% (${:.6f})".format(example_coin['Price'] * 0.90), + "3. Take profit at +20% (${:.6f})".format(example_coin['Price'] * 1.20), + "⚠️ *Risk Warning:* These can drop 30% fast! Never bet more than 5% of your portfolio.", + "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + ]) + + # 2. MEDIUM-RISK: Liquid Giants (Swing Trades) + medium_risk = df[df["Liquid Giant"]].sort_values("Volume/Market Cap Ratio", ascending=False) + if not medium_risk.empty: + example_coin = medium_risk.iloc[0] + report.extend([ + "\n💎 **MEDIUM-RISK: Liquid Giants (Safe Swing Trades)**", + f"*Example Trade:* {example_coin['Name']} (Market Cap: ${example_coin['Market Cap']/1e9:.1f}B)", + "📊 *Why?* Huge volume (${:.1f}M/day) makes it easy to enter/exit".format(example_coin['Volume(24h)']/1e6), + "🎯 *Strategy:*", + "1. Buy when 24h volume > 15% of market cap (Current: {:.0f}%)".format(example_coin['Volume/Market Cap Ratio']*100), + "2. Hold 1-4 weeks (Big coins trend longer)", + "3. Exit when momentum drops below 5% (Current: {:.0f}%)".format(example_coin['Momentum Score']*100), + "📉 *Pro Tip:* Watch Bitcoin's trend - if BTC drops 5%, these usually follow.", + "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + ]) + + # 3. 
LOW-RISK: Stable Momentum (DCA Targets) + low_risk = df[ + (df["Momentum Score"] > 0.05) & + (df["Volatility Score"] < 0.03) + ].sort_values("Market Cap", ascending=False) + if not low_risk.empty: + example_coin = low_risk.iloc[0] + report.extend([ + "\n🛡️ **LOW-RISK: Steady Climbers (DCA & Forget)**", + f"*Example Trade:* {example_coin['Name']} (Volatility: {example_coin['Volatility Score']:.2f}/5)", + "📊 *Why?* Rises steadily (+{:.0f}%/week) with LOW drama".format(example_coin['7d %']*100), + "🎯 *Strategy:*", + "1. Buy small amounts every Tuesday/Friday (DCA)", + "2. Hold for 3+ months (Compound gains work best here)", + "3. Sell 10% at every +25% milestone", + "💰 *Best For:* Long-term investors who hate sleepless nights", + "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + ]) + + # Volume Spike Alerts + anomalies = df[df["Volume Anomaly"]].sort_values("Volume(24h)", ascending=False) + if not anomalies.empty: + example_coin = anomalies.iloc[0] + report.extend([ + "\n🚨 **Volume Spike Alert (Possible News/Whale Action)**", + f"*Coin:* {example_coin['Name']} (Volume: ${example_coin['Volume(24h)']/1e6:.1f}M, usual: ${example_coin['Volume(24h)']/3/1e6:.1f}M)", + "🔍 *Check:* Twitter/CoinGecko for news before trading", + "⚡ *If no news:* Could be insider buying - watch price action:", + "- Break above today's high → Buy with tight stop-loss", + "- Fade back down → Avoid (may be a fakeout)" + ]) + + # Pro Tip Footer + report.append("\n✨ *Pro Tip:* Bookmark this report & check back in 24h to see if signals held up.") + + return "\n".join(report) def generate_insights(self, df: pd.DataFrame) -> str: """ - Create plain English trading insights explaining: - - Volume spikes and their implications - - Risk-reward ratios of top movers - - Liquidity warnings for large positions + Generates a tactical trading report with: + - Top 3 trades per risk level (High/Medium/Low) + - Auto-calculated entry/exit prices + - BTC chart toggle tip """ - top_coin = df.sort_values('Momentum Score', 
ascending=False).iloc[0] - anomaly_coins = df[df['Volume Anomaly']].sort_values('Volume(24h)', ascending=False) + # Filter top candidates for each risk level + high_risk = ( + df[df["Undervalued Flag"]] + .sort_values("Momentum Score", ascending=False) + .head(3) + ) + medium_risk = ( + df[df["Liquid Giant"]] + .sort_values("Volume/Market Cap Ratio", ascending=False) + .head(3) + ) + low_risk = ( + df[(df["Momentum Score"] > 0.05) & (df["Volatility Score"] < 0.03)] + .sort_values("Momentum Score", ascending=False) + .head(3) + ) + + report = ["# 🎯 Crypto Trading Tactical Report (Top 3 Per Risk Tier)"] - report = f""" - 🚀 Top Alpha Opportunity: {top_coin['Name']} - - Momentum Score: {top_coin['Momentum Score']:.2%} (Top 1%) - - Risk-Reward Ratio: {top_coin['Momentum Score']/top_coin['Volatility Score']:.1f} - - Liquidity Warning: {'✅ Safe' if top_coin['Liquid Giant'] else '⚠️ Thin Markets'} + # 1. High-Risk Trades (Small-Cap Momentum) + if not high_risk.empty: + report.append("\n## 🔥 HIGH RISK: Small-Cap Rockets (5-50% Potential)") + for i, coin in high_risk.iterrows(): + current_price = coin["Price"] + entry = current_price * 0.95 # -5% dip + stop_loss = current_price * 0.90 # -10% + take_profit = current_price * 1.20 # +20% + + report.append( + f"\n### {coin['Name']} (Momentum: {coin['Momentum Score']:.1%})" + f"\n- **Current Price:** ${current_price:.4f}" + f"\n- **Entry:** < ${entry:.4f} (Wait for pullback)" + f"\n- **Stop-Loss:** ${stop_loss:.4f} (-10%)" + f"\n- **Target:** ${take_profit:.4f} (+20%)" + f"\n- **Risk/Reward:** 1:2" + f"\n- **Watch:** Volume spikes above {coin['Volume(24h)']/1e6:.1f}M" + ) + + # 2. 
Medium-Risk Trades (Liquid Giants) + if not medium_risk.empty: + report.append("\n## 💎 MEDIUM RISK: Liquid Swing Trades (10-30% Potential)") + for i, coin in medium_risk.iterrows(): + current_price = coin["Price"] + entry = current_price * 0.98 # -2% dip + stop_loss = current_price * 0.94 # -6% + take_profit = current_price * 1.15 # +15% + + report.append( + f"\n### {coin['Name']} (Liquidity Score: {coin['Volume/Market Cap Ratio']:.1%})" + f"\n- **Current Price:** ${current_price:.2f}" + f"\n- **Entry:** < ${entry:.2f} (Buy slight dips)" + f"\n- **Stop-Loss:** ${stop_loss:.2f} (-6%)" + f"\n- **Target:** ${take_profit:.2f} (+15%)" + f"\n- **Hold Time:** 1-3 weeks" + f"\n- **Key Metric:** Volume/Cap > 15%" + ) + + # 3. Low-Risk Trades (Stable Momentum) + if not low_risk.empty: + report.append("\n## 🛡️ LOW RISK: Steady Gainers (5-15% Potential)") + for i, coin in low_risk.iterrows(): + current_price = coin["Price"] + entry = current_price * 0.99 # -1% dip + stop_loss = current_price * 0.97 # -3% + take_profit = current_price * 1.10 # +10% + + report.append( + f"\n### {coin['Name']} (Stability Score: {1/coin['Volatility Score']:.1f}x)" + f"\n- **Current Price:** ${current_price:.2f}" + f"\n- **Entry:** < ${entry:.2f} (Safe zone)" + f"\n- **Stop-Loss:** ${stop_loss:.2f} (-3%)" + f"\n- **Target:** ${take_profit:.2f} (+10%)" + f"\n- **DCA Suggestion:** 3 buys over 72 hours" + ) + + # Volume Anomaly Alert + anomalies = df[df["Volume Anomaly"]].sort_values("Volume(24h)", ascending=False).head(2) + if not anomalies.empty: + report.append("\n⚠️ **Volume Spike Alerts**") + for i, coin in anomalies.iterrows(): + report.append( + f"- {coin['Name']}: Volume {coin['Volume(24h)']/1e6:.1f}M " + f"(3x normal) | Price moved: {coin['24h %']:.1%}" + ) + + # Pro Tip + report.append( + "\n📊 **Chart Hack:** Hide BTC in visuals:\n" + "```python\n" + "# For 3D Map:\n" + "fig.update_traces(visible=False, selector={'name':'Bitcoin'})\n" + "# For Treemap:\n" + "df = df[df['Name'] != 
'Bitcoin']\n" + "```" + ) + + return "\n".join(report) + + def create_visuals(self, df: pd.DataFrame) -> dict: + """Enhanced visuals with BTC toggle support""" + # 3D Market Map (with BTC toggle hint) + fig1 = px.scatter_3d( + df, + x="Market Cap", + y="Volume/Market Cap Ratio", + z="Momentum Score", + color="Name", # Color by name to allow toggling + hover_name="Name", + title="Market Map (Toggle BTC in legend to focus on alts)", + log_x=True + ) + fig1.update_traces( + marker=dict(size=df["Volatility Score"]*100 + 5) # Dynamic sizing + ) - 🔥 Volume Spikes Detected ({len(anomaly_coins)} coins): - {anomaly_coins[['Name', 'Volume(24h)']].head(3).to_markdown(index=False)} + # Liquidity Tree (exclude BTC if too dominant) + if df[df["Name"] == "BitcoinBTC"]["Market Cap"].values[0] > df["Market Cap"].median() * 10: + df = df[df["Name"] != "BitcoinBTC"] - 💡 Smart Money Tip: Coins with Volume/Cap > 15% and Momentum > 5% - historically outperform by 22% weekly returns. - """ - return report + fig2 = px.treemap( + df, + path=["Name"], + values="Market Cap", + color="Volume/Market Cap Ratio", + title="Liquidity Tree (BTC auto-removed if dominant)" + ) + + return {"market_map": fig1, "liquidity_tree": fig2} async def main(): """ @@ -171,60 +365,79 @@ async def main(): """ # Configure browser with anti-detection features browser_config = BrowserConfig( - headless=True, - stealth=True, - block_resources=["image", "media"] + headless=False, ) - + # Initialize crawler with smart table detection crawler = AsyncWebCrawler(config=browser_config) await crawler.start() - + try: # Set up scraping parameters crawl_config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS, - scraping_strategy=LXMLWebScrapingStrategy( - table_score_threshold=8, # Strict table detection - keep_data_attributes=True - ) + table_score_threshold=8, # Strict table detection + keep_data_attributes=True, + scraping_strategy=LXMLWebScrapingStrategy(), + scan_full_page=True, + scroll_delay=0.2, ) - - # Execute market 
data extraction - results: List[CrawlResult] = await crawler.arun( - url='https://coinmarketcap.com/?page=1', - config=crawl_config - ) - - # Process results - for result in results: - if result.success and result.media['tables']: - # Extract primary market table - raw_df = pd.DataFrame( - result.media['tables'][0]['rows'], - columns=result.media['tables'][0]['headers'] - ) - - # Initialize analysis engine - analyzer = CryptoAlphaGenerator() - clean_df = analyzer.clean_data(raw_df) - analyzed_df = analyzer.calculate_metrics(clean_df) - - # Generate outputs - visuals = analyzer.create_visuals(analyzed_df) - insights = analyzer.generate_insights(analyzed_df) - - # Save visualizations - visuals['market_map'].write_html("market_map.html") - visuals['liquidity_tree'].write_html("liquidity_tree.html") - - # Display results - print("🔑 Key Trading Insights:") - print(insights) - print("\n📊 Open 'market_map.html' for interactive analysis") + + # # Execute market data extraction + # results: List[CrawlResult] = await crawler.arun( + # url="https://coinmarketcap.com/?page=1", config=crawl_config + # ) + + # # Process results + # raw_df = pd.DataFrame() + # for result in results: + # if result.success and result.media["tables"]: + # # Extract primary market table + # # DataFrame + # raw_df = pd.DataFrame( + # result.media["tables"][0]["rows"], + # columns=result.media["tables"][0]["headers"], + # ) + # break + + + # This is for debugging only + # ////// Remove this in production from here.. 
+ # Save raw data for debugging + # raw_df.to_csv(f"{__current_dir__}/tmp/raw_crypto_data.csv", index=False) + # print("🔍 Raw data saved to 'raw_crypto_data.csv'") + + # Read from file for debugging + raw_df = pd.read_csv(f"{__current_dir__}/tmp/raw_crypto_data.csv") + # ////// ..to here + + # Select top 20 + raw_df = raw_df.head(50) + # Remove "Buy" from name + raw_df["Name"] = raw_df["Name"].str.replace("Buy", "") + + # Initialize analysis engine + analyzer = CryptoAlphaGenerator() + clean_df = analyzer.clean_data(raw_df) + analyzed_df = analyzer.calculate_metrics(clean_df) + + # Generate outputs + visuals = analyzer.create_visuals(analyzed_df) + insights = analyzer.generate_insights(analyzed_df) + + # Save visualizations + visuals["market_map"].write_html(f"{__current_dir__}/tmp/market_map.html") + visuals["liquidity_tree"].write_html(f"{__current_dir__}/tmp/liquidity_tree.html") + + # Display results + print("🔑 Key Trading Insights:") + print(insights) + print("\n📊 Open 'market_map.html' for interactive analysis") + print("\n📊 Open 'liquidity_tree.html' for interactive analysis") finally: await crawler.close() + if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file + asyncio.run(main()) diff --git a/tests/browser/docker/test_docker_browser.py b/tests/browser/docker/test_docker_browser.py index a3901d8d..bd3c4348 100644 --- a/tests/browser/docker/test_docker_browser.py +++ b/tests/browser/docker/test_docker_browser.py @@ -17,9 +17,9 @@ if __name__ == "__main__": from crawl4ai.browser import BrowserManager from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig from crawl4ai.async_logger import AsyncLogger -from crawl4ai.browser.docker_config import DockerConfig -from crawl4ai.browser.docker_registry import DockerRegistry -from crawl4ai.browser.docker_utils import DockerUtils +from crawl4ai.browser import DockerConfig +from crawl4ai.browser import DockerRegistry +from crawl4ai.browser import DockerUtils # Create a logger for 
clear terminal output logger = AsyncLogger(verbose=True, log_file=None) @@ -136,7 +136,7 @@ async def test_docker_components(): # Verify Chrome is installed in the container returncode, stdout, stderr = await docker_utils.exec_in_container( - container_id, ["which", "google-chrome"] + container_id, ["which", "chromium"] ) if returncode != 0: @@ -149,7 +149,7 @@ async def test_docker_components(): # Test Chrome version returncode, stdout, stderr = await docker_utils.exec_in_container( - container_id, ["google-chrome", "--version"] + container_id, ["chromium", "--version"] ) if returncode != 0: @@ -608,13 +608,13 @@ async def run_tests(): return # First test Docker components - setup_result = await test_docker_components() - if not setup_result: - logger.error("Docker component tests failed - skipping browser tests", tag="TEST") - return + # setup_result = await test_docker_components() + # if not setup_result: + # logger.error("Docker component tests failed - skipping browser tests", tag="TEST") + # return # Run browser tests - results.append(await test_docker_connect_mode()) + # results.append(await test_docker_connect_mode()) results.append(await test_docker_launch_mode()) results.append(await test_docker_persistent_storage()) results.append(await test_docker_parallel_pages()) From 64f20ab44a2062d85fbc7761ce4f8692cbbc4f7a Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 28 Mar 2025 15:59:02 +0800 Subject: [PATCH 3/7] refactor(docker): update Dockerfile and browser strategy to use Chromium --- crawl4ai/browser/docker/alpine/launch.Dockerfile | 6 +++++- crawl4ai/browser/docker_utils.py | 2 +- crawl4ai/browser/strategies/docker_strategy.py | 10 +++++----- tests/browser/docker/test_docker_browser.py | 4 ++-- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/crawl4ai/browser/docker/alpine/launch.Dockerfile b/crawl4ai/browser/docker/alpine/launch.Dockerfile index 60b20539..17e3c660 100644 --- a/crawl4ai/browser/docker/alpine/launch.Dockerfile +++ 
b/crawl4ai/browser/docker/alpine/launch.Dockerfile @@ -9,11 +9,15 @@ freetype \ harfbuzz \ ca-certificates \ - ttf-freefont && \ + ttf-freefont \ + socat \ + curl && \ addgroup -S chromium && adduser -S chromium -G chromium && \ mkdir -p /data && chown chromium:chromium /data && \ rm -rf /var/cache/apk/* + ENV PATH="/usr/bin:/bin:/usr/sbin:/sbin" + # Switch to a non-root user for security USER chromium WORKDIR /home/chromium diff --git a/crawl4ai/browser/docker_utils.py b/crawl4ai/browser/docker_utils.py index 7ba48534..f93a51b9 100644 --- a/crawl4ai/browser/docker_utils.py +++ b/crawl4ai/browser/docker_utils.py @@ -501,7 +501,7 @@ class DockerUtils: bool: True if Chrome started successfully, False otherwise """ # Build Chrome command - chrome_cmd = ["google-chrome"] + chrome_cmd = ["chromium"] chrome_cmd.extend(browser_args) returncode, _, stderr = await self.exec_in_container( diff --git a/crawl4ai/browser/strategies/docker_strategy.py b/crawl4ai/browser/strategies/docker_strategy.py index 33e581be..ca7e314a 100644 --- a/crawl4ai/browser/strategies/docker_strategy.py +++ b/crawl4ai/browser/strategies/docker_strategy.py @@ -14,13 +14,13 @@ from ...async_configs import BrowserConfig from ..models import DockerConfig from ..docker_registry import DockerRegistry from ..docker_utils import DockerUtils -from .builtin import BuiltinBrowserStrategy +from .builtin import CDPBrowserStrategy -class DockerBrowserStrategy(BuiltinBrowserStrategy): +class DockerBrowserStrategy(CDPBrowserStrategy): """Docker-based browser strategy. - Extends the BuiltinBrowserStrategy to run browsers in Docker containers. + Extends the CDPBrowserStrategy to run browsers in Docker containers. Supports two modes: 1. "connect" - Uses a Docker image with Chrome already running 2. 
"launch" - Starts Chrome within the container with custom settings @@ -342,7 +342,7 @@ class DockerBrowserStrategy(BuiltinBrowserStrategy): # Get PIDs for later cleanup self.chrome_process_id = await self.docker_utils.get_process_id_in_container( - container_id, "chrome" + container_id, "chromium" ) self.socat_process_id = await self.docker_utils.get_process_id_in_container( container_id, "socat" @@ -396,7 +396,7 @@ class DockerBrowserStrategy(BuiltinBrowserStrategy): if self.config.light_mode: # Import here to avoid circular import - from .utils import get_browser_disable_options + from ..utils import get_browser_disable_options args.extend(get_browser_disable_options()) if self.config.user_data_dir: diff --git a/tests/browser/docker/test_docker_browser.py b/tests/browser/docker/test_docker_browser.py index bd3c4348..610a230e 100644 --- a/tests/browser/docker/test_docker_browser.py +++ b/tests/browser/docker/test_docker_browser.py @@ -615,8 +615,8 @@ async def run_tests(): # Run browser tests # results.append(await test_docker_connect_mode()) - results.append(await test_docker_launch_mode()) - results.append(await test_docker_persistent_storage()) + # results.append(await test_docker_launch_mode()) + # results.append(await test_docker_persistent_storage()) results.append(await test_docker_parallel_pages()) results.append(await test_docker_registry_reuse()) From 3ff7eec8f3f12e79e782a35f60ae4ed0227ea648 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 28 Mar 2025 22:47:28 +0800 Subject: [PATCH 4/7] refactor(browser): consolidate browser strategy implementations Moves common browser functionality into BaseBrowserStrategy class to reduce code duplication and improve maintainability. 
Key changes: - Adds shared browser argument building and session management to base class - Standardizes storage state handling across strategies - Improves process cleanup and error handling - Consolidates CDP URL management and container lifecycle BREAKING CHANGE: Changes browser_mode="custom" to "cdp" for consistency --- crawl4ai/async_configs.py | 10 +- crawl4ai/browser/manager.py | 18 +- crawl4ai/browser/strategies/base.py | 343 ++++++++++++- crawl4ai/browser/strategies/builtin.py | 263 ++++++---- crawl4ai/browser/strategies/cdp.py | 250 ++++------ .../browser/strategies/docker_strategy.py | 457 +++++++++++------- crawl4ai/browser/strategies/playwright.py | 216 ++------- crawl4ai/browser/utils.py | 139 +++++- tests/browser/docker/test_docker_browser.py | 6 +- tests/browser/test_builtin_browser.py | 11 +- tests/browser/test_cdp_strategy.py | 5 +- tests/browser/test_playwright_strategy.py | 55 ++- 12 files changed, 1102 insertions(+), 671 deletions(-) diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index c7f9e739..a287cfbe 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -29,7 +29,7 @@ from enum import Enum from .proxy_strategy import ProxyConfig try: - from .browser.docker_config import DockerConfig + from .browser.models import DockerConfig except ImportError: DockerConfig = None @@ -176,7 +176,7 @@ class BrowserConfig: browser_mode (str): Determines how the browser should be initialized: "builtin" - use the builtin CDP browser running in background "dedicated" - create a new dedicated browser instance each time - "custom" - use explicit CDP settings provided in cdp_url + "cdp" - use explicit CDP settings provided in cdp_url "docker" - run browser in Docker container with isolation Default: "dedicated" use_managed_browser (bool): Launch the browser using a managed approach (e.g., via CDP), allowing @@ -242,7 +242,7 @@ class BrowserConfig: channel: str = "chromium", proxy: str = None, proxy_config: Union[ProxyConfig, 
dict, None] = None, - docker_config: Union["DockerConfig", dict, None] = None, + docker_config: Union[DockerConfig, dict, None] = None, viewport_width: int = 1080, viewport_height: int = 600, viewport: dict = None, @@ -289,6 +289,10 @@ class BrowserConfig: self.docker_config = DockerConfig.from_kwargs(docker_config) else: self.docker_config = docker_config + + if self.docker_config: + self.user_data_dir = self.docker_config.user_data_dir + self.viewport_width = viewport_width self.viewport_height = viewport_height self.viewport = viewport diff --git a/crawl4ai/browser/manager.py b/crawl4ai/browser/manager.py index 3cb68021..288ad7e9 100644 --- a/crawl4ai/browser/manager.py +++ b/crawl4ai/browser/manager.py @@ -80,7 +80,7 @@ class BrowserManager: ) return PlaywrightBrowserStrategy(self.config, self.logger) return DockerBrowserStrategy(self.config, self.logger) - elif self.config.cdp_url or self.config.use_managed_browser: + elif self.config.browser_mode == "cdp" or self.config.cdp_url or self.config.use_managed_browser: return CDPBrowserStrategy(self.config, self.logger) else: return PlaywrightBrowserStrategy(self.config, self.logger) @@ -159,16 +159,12 @@ class BrowserManager: session_id: The session ID to kill """ # Handle kill_session via our strategy if it supports it - if hasattr(self._strategy, '_kill_session'): - await self._strategy._kill_session(session_id) - elif session_id in self.sessions: - context, page, _ = self.sessions[session_id] - await page.close() - # Only close context if not using CDP - if not self.config.use_managed_browser and not self.config.cdp_url and not self.config.browser_mode == "builtin": - await context.close() - del self.sessions[session_id] - + await self._strategy.kill_session(session_id) + + # sync sessions if needed + if hasattr(self._strategy, 'sessions'): + self.sessions = self._strategy.sessions + def _cleanup_expired_sessions(self): """Clean up expired sessions based on TTL.""" # Use strategy's implementation if available 
diff --git a/crawl4ai/browser/strategies/base.py b/crawl4ai/browser/strategies/base.py index 75613dcd..2c500389 100644 --- a/crawl4ai/browser/strategies/base.py +++ b/crawl4ai/browser/strategies/base.py @@ -8,6 +8,8 @@ from abc import ABC, abstractmethod import asyncio import json import hashlib +import os +import time from typing import Optional, Tuple, List from playwright.async_api import BrowserContext, Page @@ -16,14 +18,31 @@ from ...async_logger import AsyncLogger from ...async_configs import BrowserConfig, CrawlerRunConfig from ...config import DOWNLOAD_PAGE_TIMEOUT from ...js_snippet import load_js_script +from ..utils import get_playwright + class BaseBrowserStrategy(ABC): """Base class for all browser strategies. This abstract class defines the interface that all browser strategies - must implement. It handles common functionality like context caching. + must implement. It handles common functionality like context caching, + browser configuration, and session management. """ + _playwright_instance = None + + @classmethod + async def get_playwright(cls): + """Get or create a shared Playwright instance. + + Returns: + Playwright: The shared Playwright instance + """ + # For now I dont want Singleton pattern for Playwright + if cls._playwright_instance is None or True: + cls._playwright_instance = await get_playwright() + return cls._playwright_instance + def __init__(self, config: BrowserConfig, logger: Optional[AsyncLogger] = None): """Initialize the strategy with configuration and logger. 
@@ -35,23 +54,40 @@ class BaseBrowserStrategy(ABC): self.logger = logger self.browser = None self.default_context = None - self.contexts_by_config = {} - self._contexts_lock = asyncio.Lock() - self.playwright = None + # Context management + self.contexts_by_config = {} # config_signature -> context + + self._contexts_lock = asyncio.Lock() + + # Session management + self.sessions = {} + self.session_ttl = 1800 # 30 minutes default + + # Playwright instance + self.playwright = None + @abstractmethod async def start(self): """Start the browser. + This method should be implemented by concrete strategies to initialize + the browser in the appropriate way (direct launch, CDP connection, etc.) + Returns: self: For method chaining """ - pass + # Base implementation gets the playwright instance + self.playwright = await self.get_playwright() + return self @abstractmethod async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: """Get a page with specified configuration. + This method should be implemented by concrete strategies to create + or retrieve a page according to their browser management approach. + Args: crawlerRunConfig: Crawler run configuration @@ -75,15 +111,122 @@ class BaseBrowserStrategy(ABC): page, context = await self.get_page(crawlerRunConfig) pages.append((page, context)) return pages - - @abstractmethod - async def close(self): - """Close the browser and clean up resources.""" - pass + + def _build_browser_args(self) -> dict: + """Build browser launch arguments from config. 
+ + Returns: + dict: Browser launch arguments for Playwright + """ + # Define common browser arguments that improve performance and stability + args = [ + "--disable-gpu", + "--disable-gpu-compositing", + "--disable-software-rasterizer", + "--no-sandbox", + "--disable-dev-shm-usage", + "--no-first-run", + "--no-default-browser-check", + "--disable-infobars", + "--window-position=0,0", + "--ignore-certificate-errors", + "--ignore-certificate-errors-spki-list", + "--disable-blink-features=AutomationControlled", + "--window-position=400,0", + "--disable-renderer-backgrounding", + "--disable-ipc-flooding-protection", + "--force-color-profile=srgb", + "--mute-audio", + "--disable-background-timer-throttling", + f"--window-size={self.config.viewport_width},{self.config.viewport_height}", + ] + + # Define browser disable options for light mode + browser_disable_options = [ + "--disable-background-networking", + "--disable-background-timer-throttling", + "--disable-backgrounding-occluded-windows", + "--disable-breakpad", + "--disable-client-side-phishing-detection", + "--disable-component-extensions-with-background-pages", + "--disable-default-apps", + "--disable-extensions", + "--disable-features=TranslateUI", + "--disable-hang-monitor", + "--disable-ipc-flooding-protection", + "--disable-popup-blocking", + "--disable-prompt-on-repost", + "--disable-sync", + "--force-color-profile=srgb", + "--metrics-recording-only", + "--no-first-run", + "--password-store=basic", + "--use-mock-keychain", + ] + + # Apply light mode settings if enabled + if self.config.light_mode: + args.extend(browser_disable_options) + + # Apply text mode settings if enabled (disables images, JS, etc) + if self.config.text_mode: + args.extend([ + "--blink-settings=imagesEnabled=false", + "--disable-remote-fonts", + "--disable-images", + "--disable-javascript", + "--disable-software-rasterizer", + "--disable-dev-shm-usage", + ]) + + # Add any extra arguments from the config + if self.config.extra_args: + 
args.extend(self.config.extra_args) + + # Build the core browser args dictionary + browser_args = {"headless": self.config.headless, "args": args} + + # Add chrome channel if specified + if self.config.chrome_channel: + browser_args["channel"] = self.config.chrome_channel + + # Configure downloads + if self.config.accept_downloads: + browser_args["downloads_path"] = self.config.downloads_path or os.path.join( + os.getcwd(), "downloads" + ) + os.makedirs(browser_args["downloads_path"], exist_ok=True) + + # Check for user data directory + if self.config.user_data_dir: + # Ensure the directory exists + os.makedirs(self.config.user_data_dir, exist_ok=True) + browser_args["user_data_dir"] = self.config.user_data_dir + + # Configure proxy settings + if self.config.proxy or self.config.proxy_config: + from playwright.async_api import ProxySettings + + proxy_settings = ( + ProxySettings(server=self.config.proxy) + if self.config.proxy + else ProxySettings( + server=self.config.proxy_config.server, + username=self.config.proxy_config.username, + password=self.config.proxy_config.password, + ) + ) + browser_args["proxy"] = proxy_settings + + return browser_args def _make_config_signature(self, crawlerRunConfig: CrawlerRunConfig) -> str: """Create a signature hash from configuration for context caching. + Converts the crawlerRunConfig into a dict, excludes ephemeral fields, + then returns a hash of the sorted JSON. This yields a stable signature + that identifies configurations requiring a unique browser context. 
+ Args: crawlerRunConfig: Crawler run configuration @@ -157,6 +300,7 @@ class BaseBrowserStrategy(ABC): "viewport": viewport_settings, "proxy": proxy_settings, "accept_downloads": self.config.accept_downloads, + "storage_state": self.config.storage_state, "ignore_https_errors": self.config.ignore_https_errors, "device_scale_factor": 1.0, "java_script_enabled": self.config.java_script_enabled, @@ -167,8 +311,7 @@ class BaseBrowserStrategy(ABC): text_mode_settings = { "has_touch": False, "is_mobile": False, - # Disable javascript in text mode - "java_script_enabled": False + "java_script_enabled": False, # Disable javascript in text mode } # Update context settings with text mode settings context_settings.update(text_mode_settings) @@ -177,16 +320,25 @@ class BaseBrowserStrategy(ABC): # Handle storage state properly - this is key for persistence if self.config.storage_state: - context_settings["storage_state"] = self.config.storage_state if self.logger: if isinstance(self.config.storage_state, str): self.logger.debug(f"Using storage state from file: {self.config.storage_state}", tag="BROWSER") else: self.logger.debug("Using storage state from config object", tag="BROWSER") - - # If user_data_dir is specified, browser persistence should be automatic - if self.config.user_data_dir and self.logger: - self.logger.debug(f"Using user data directory: {self.config.user_data_dir}", tag="BROWSER") + + if self.config.user_data_dir: + # For CDP-based browsers, storage persistence is typically handled by the user_data_dir + # at the browser level, but we'll create a storage_state location for Playwright as well + storage_path = os.path.join(self.config.user_data_dir, "storage_state.json") + if not os.path.exists(storage_path): + # Create parent directory if it doesn't exist + os.makedirs(os.path.dirname(storage_path), exist_ok=True) + with open(storage_path, "w") as f: + json.dump({}, f) + self.config.storage_state = storage_path + + if self.logger: + self.logger.debug(f"Using 
user data directory: {self.config.user_data_dir}", tag="BROWSER") # Apply crawler-specific configurations if provided if crawlerRunConfig: @@ -227,12 +379,19 @@ class BaseBrowserStrategy(ABC): context: The browser context to set up crawlerRunConfig: Configuration object containing all browser settings """ + # Set HTTP headers if self.config.headers: await context.set_extra_http_headers(self.config.headers) + # Add cookies if self.config.cookies: await context.add_cookies(self.config.cookies) + # Apply storage state if provided + if self.config.storage_state: + await context.storage_state(path=None) + + # Configure downloads if self.config.accept_downloads: context.set_default_timeout(DOWNLOAD_PAGE_TIMEOUT) context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT) @@ -250,12 +409,13 @@ class BaseBrowserStrategy(ABC): await context.set_extra_http_headers(combined_headers) # Add default cookie + target_url = (crawlerRunConfig and crawlerRunConfig.url) or "https://crawl4ai.com/" await context.add_cookies( [ { "name": "cookiesEnabled", "value": "true", - "url": crawlerRunConfig and crawlerRunConfig.url or "https://crawl4ai.com/", + "url": target_url, } ] ) @@ -268,3 +428,150 @@ class BaseBrowserStrategy(ABC): or crawlerRunConfig.magic ): await context.add_init_script(load_js_script("navigator_overrider")) + + async def kill_session(self, session_id: str): + """Kill a browser session and clean up resources. + + Args: + session_id (str): The session ID to kill. 
+ """ + if session_id not in self.sessions: + return + + context, page, _ = self.sessions[session_id] + + # Close the page + try: + await page.close() + except Exception as e: + if self.logger: + self.logger.error(f"Error closing page for session {session_id}: {str(e)}", tag="BROWSER") + + # Remove session from tracking + del self.sessions[session_id] + + # Clean up any contexts that no longer have pages + await self._cleanup_unused_contexts() + + if self.logger: + self.logger.debug(f"Killed session: {session_id}", tag="BROWSER") + + async def _cleanup_unused_contexts(self): + """Clean up contexts that no longer have any pages.""" + async with self._contexts_lock: + # Get all contexts we're managing + contexts_to_check = list(self.contexts_by_config.values()) + + for context in contexts_to_check: + # Check if the context has any pages left + if not context.pages: + # No pages left, we can close this context + config_signature = next((sig for sig, ctx in self.contexts_by_config.items() + if ctx == context), None) + if config_signature: + try: + await context.close() + del self.contexts_by_config[config_signature] + if self.logger: + self.logger.debug(f"Closed unused context", tag="BROWSER") + except Exception as e: + if self.logger: + self.logger.error(f"Error closing unused context: {str(e)}", tag="BROWSER") + + def _cleanup_expired_sessions(self): + """Clean up expired sessions based on TTL.""" + current_time = time.time() + expired_sessions = [ + sid + for sid, (_, _, last_used) in self.sessions.items() + if current_time - last_used > self.session_ttl + ] + + for sid in expired_sessions: + if self.logger: + self.logger.debug(f"Session expired: {sid}", tag="BROWSER") + asyncio.create_task(self.kill_session(sid)) + + async def close(self): + """Close the browser and clean up resources. + + This method handles common cleanup tasks like: + 1. Persisting storage state if a user_data_dir is configured + 2. Closing all sessions + 3. Closing all browser contexts + 4. 
Closing the browser + 5. Stopping Playwright + + Child classes should override this method to add their specific cleanup logic, + but should call super().close() to ensure common cleanup tasks are performed. + """ + # Set a flag to prevent race conditions during cleanup + self.shutting_down = True + + try: + # Add brief delay if configured + if self.config.sleep_on_close: + await asyncio.sleep(0.5) + + # Persist storage state if using a user data directory + if self.config.user_data_dir and self.browser: + for context in self.browser.contexts: + try: + # Ensure the directory exists + storage_dir = os.path.join(self.config.user_data_dir, "Default") + os.makedirs(storage_dir, exist_ok=True) + + # Save storage state + storage_path = os.path.join(storage_dir, "storage_state.json") + await context.storage_state(path=storage_path) + + if self.logger: + self.logger.debug("Storage state persisted before closing browser", tag="BROWSER") + except Exception as e: + if self.logger: + self.logger.warning( + message="Failed to ensure storage persistence: {error}", + tag="BROWSER", + params={"error": str(e)} + ) + + # Close all active sessions + session_ids = list(self.sessions.keys()) + for session_id in session_ids: + await self.kill_session(session_id) + + # Close all cached contexts + for ctx in self.contexts_by_config.values(): + try: + await ctx.close() + except Exception as e: + if self.logger: + self.logger.error( + message="Error closing context: {error}", + tag="BROWSER", + params={"error": str(e)} + ) + self.contexts_by_config.clear() + + # Close the browser if it exists + if self.browser: + await self.browser.close() + self.browser = None + + # Stop playwright + if self.playwright: + await self.playwright.stop() + self.playwright = None + + except Exception as e: + if self.logger: + self.logger.error( + message="Error during browser cleanup: {error}", + tag="BROWSER", + params={"error": str(e)} + ) + finally: + # Reset shutting down flag + self.shutting_down = False + 
+ \ No newline at end of file diff --git a/crawl4ai/browser/strategies/builtin.py b/crawl4ai/browser/strategies/builtin.py index fd678ca2..2423ee04 100644 --- a/crawl4ai/browser/strategies/builtin.py +++ b/crawl4ai/browser/strategies/builtin.py @@ -5,16 +5,20 @@ import json import subprocess import shutil import signal -from typing import Optional, Dict, Any +from typing import Optional, Dict, Any, Tuple +from ...async_logger import AsyncLogger +from ...async_configs import CrawlerRunConfig +from playwright.async_api import Page, BrowserContext from ...async_logger import AsyncLogger from ...async_configs import BrowserConfig from ...utils import get_home_folder -from ..utils import get_browser_executable, is_windows, is_browser_running +from ..utils import get_browser_executable, is_windows, is_browser_running, find_process_by_port, terminate_process from .cdp import CDPBrowserStrategy +from .base import BaseBrowserStrategy class BuiltinBrowserStrategy(CDPBrowserStrategy): """Built-in browser strategy. 
@@ -67,29 +71,66 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): Returns: self: For method chaining """ - # Check for existing built-in browser (get_browser_info already checks if running) - browser_info = self.get_browser_info() - if browser_info: - if self.logger: - self.logger.info(f"Using existing built-in browser at {browser_info.get('cdp_url')}", tag="BROWSER") - self.config.cdp_url = browser_info.get('cdp_url') - else: - if self.logger: - self.logger.info("Built-in browser not found, launching new instance...", tag="BROWSER") - cdp_url = await self.launch_builtin_browser( - browser_type=self.config.browser_type, - debugging_port=self.config.debugging_port, - headless=self.config.headless, - ) - if not cdp_url: - if self.logger: - self.logger.warning("Failed to launch built-in browser, falling back to regular CDP strategy", tag="BROWSER") - return await super().start() - self.config.cdp_url = cdp_url + # Initialize Playwright instance via base class method + await BaseBrowserStrategy.start(self) - # Call parent class implementation with updated CDP URL - return await super().start() - + try: + # Check for existing built-in browser (get_browser_info already checks if running) + browser_info = self.get_browser_info() + if browser_info: + if self.logger: + self.logger.info(f"Using existing built-in browser at {browser_info.get('cdp_url')}", tag="BROWSER") + self.config.cdp_url = browser_info.get('cdp_url') + else: + if self.logger: + self.logger.info("Built-in browser not found, launching new instance...", tag="BROWSER") + cdp_url = await self.launch_builtin_browser( + browser_type=self.config.browser_type, + debugging_port=self.config.debugging_port, + headless=self.config.headless, + ) + if not cdp_url: + if self.logger: + self.logger.warning("Failed to launch built-in browser, falling back to regular CDP strategy", tag="BROWSER") + # Call CDP's start but skip BaseBrowserStrategy.start() since we already called it + return await 
CDPBrowserStrategy.start(self) + self.config.cdp_url = cdp_url + + # Connect to the browser using CDP protocol + self.browser = await self.playwright.chromium.connect_over_cdp(self.config.cdp_url) + + # Get or create default context + contexts = self.browser.contexts + if contexts: + self.default_context = contexts[0] + else: + self.default_context = await self.create_browser_context() + + await self.setup_context(self.default_context) + + if self.logger: + self.logger.debug(f"Connected to built-in browser at {self.config.cdp_url}", tag="BUILTIN") + + return self + except Exception as e: + if self.logger: + self.logger.error(f"Failed to start built-in browser: {str(e)}", tag="BUILTIN") + raise + + async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: + """Get a page for the given configuration. + + Inherits behavior from CDPBrowserStrategy for page management. + + Args: + crawlerRunConfig: Configuration object for the crawler run + + Returns: + Tuple of (Page, BrowserContext) + """ + # For built-in browsers, we use the same page management as CDP strategy + return await super().get_page(crawlerRunConfig) + @classmethod def get_builtin_browser_info(cls, debugging_port: int, config_file: str, logger: Optional[AsyncLogger] = None) -> Optional[Dict[str, Any]]: """Get information about the built-in browser for a specific debugging port. 
@@ -116,7 +157,31 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): browser_info = browser_info_dict["port_map"][port_str] # Check if the browser is still running - if not is_browser_running(browser_info.get('pid')): + pids = browser_info.get('pid') + if type(pids) == str and len(pids.split("\n")) > 1: + pids = [int(pid) for pid in pids.split("\n") if pid.isdigit()] + elif type(pids) == str and pids.isdigit(): + pids = [int(pids)] + elif type(pids) == int: + pids = [pids] + else: + pids = [] + # Check if any of the PIDs are running + if not pids: + if logger: + logger.warning(f"Built-in browser on port {debugging_port} has no valid PID", tag="BUILTIN") + # Remove this port from the dictionary + del browser_info_dict["port_map"][port_str] + with open(config_file, 'w') as f: + json.dump(browser_info_dict, f, indent=2) + return None + # Check if any of the PIDs are running + for pid in pids: + if is_browser_running(pid): + browser_info['pid'] = pid + break + else: + # If none of the PIDs are running, remove this port from the dictionary if logger: logger.warning(f"Built-in browser on port {debugging_port} is not running", tag="BUILTIN") # Remove this port from the dictionary @@ -146,7 +211,6 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): logger=self.logger ) - async def launch_builtin_browser(self, browser_type: str = "chromium", debugging_port: int = 9222, @@ -207,57 +271,50 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): return None try: - # Start the browser process detached - if is_windows(): - process = subprocess.Popen( - args, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - creationflags=subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP - ) - else: - process = subprocess.Popen( - args, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - preexec_fn=os.setpgrp # Start in a new process group - ) - - # Wait briefly to ensure the process starts successfully - await asyncio.sleep(2.0) - - # Check if the process is still running - 
if process.poll() is not None: - if self.logger: - self.logger.error(f"Browser process exited immediately with code {process.returncode}", tag="BUILTIN") - return None - - # Construct CDP URL + + # Check if the port is already in use + PID = "" cdp_url = f"http://localhost:{debugging_port}" - - # Try to verify browser is responsive by fetching version info - import aiohttp - json_url = f"{cdp_url}/json/version" - config_json = None - - try: - async with aiohttp.ClientSession() as session: - for _ in range(10): # Try multiple times - try: - async with session.get(json_url) as response: - if response.status == 200: - config_json = await response.json() - break - except Exception: - pass - await asyncio.sleep(0.5) - except Exception as e: + config_json = await self._check_port_in_use(cdp_url) + if config_json: if self.logger: - self.logger.warning(f"Could not verify browser: {str(e)}", tag="BUILTIN") + self.logger.info(f"Port {debugging_port} is already in use.", tag="BUILTIN") + PID = find_process_by_port(debugging_port) + else: + # Start the browser process detached + process = None + if is_windows(): + process = subprocess.Popen( + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + creationflags=subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP + ) + else: + process = subprocess.Popen( + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + preexec_fn=os.setpgrp # Start in a new process group + ) + + # Wait briefly to ensure the process starts successfully + await asyncio.sleep(2.0) + + # Check if the process is still running + if process and process.poll() is not None: + if self.logger: + self.logger.error(f"Browser process exited immediately with code {process.returncode}", tag="BUILTIN") + return None + + PID = process.pid + # Construct CDP URL + config_json = await self._check_port_in_use(cdp_url) + # Create browser info browser_info = { - 'pid': process.pid, + 'pid': PID, 'cdp_url': cdp_url, 'user_data_dir': user_data_dir, 
'browser_type': browser_type, @@ -304,7 +361,37 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): if self.logger: self.logger.error(f"Error launching built-in browser: {str(e)}", tag="BUILTIN") return None - + + async def _check_port_in_use(self, cdp_url: str) -> dict: + """Check if a port is already in use by a Chrome DevTools instance. + + Args: + cdp_url: The CDP URL to check + + Returns: + dict: Chrome DevTools protocol version information or None if not found + """ + import aiohttp + json_url = f"{cdp_url}/json/version" + json_config = None + + try: + async with aiohttp.ClientSession() as session: + try: + async with session.get(json_url, timeout=2.0) as response: + if response.status == 200: + json_config = await response.json() + if self.logger: + self.logger.debug(f"Found CDP server running at {cdp_url}", tag="BUILTIN") + return json_config + except (aiohttp.ClientError, asyncio.TimeoutError): + pass + return None + except Exception as e: + if self.logger: + self.logger.debug(f"Error checking CDP port: {str(e)}", tag="BUILTIN") + return None + async def kill_builtin_browser(self) -> bool: """Kill the built-in browser if it's running. 
@@ -321,20 +408,8 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): if not pid: return False - try: - if is_windows(): - subprocess.run(["taskkill", "/F", "/PID", str(pid)], check=True) - else: - os.kill(pid, signal.SIGTERM) - # Wait for termination - for _ in range(5): - if not is_browser_running(pid): - break - await asyncio.sleep(0.5) - else: - # Force kill if still running - os.kill(pid, signal.SIGKILL) - + success, error_msg = terminate_process(pid, logger=self.logger) + if success: # Update config file to remove this browser with open(self.builtin_config_file, 'r') as f: browser_info_dict = json.load(f) @@ -355,9 +430,9 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): if self.logger: self.logger.success("Built-in browser terminated", tag="BUILTIN") return True - except Exception as e: + else: if self.logger: - self.logger.error(f"Error killing built-in browser: {str(e)}", tag="BUILTIN") + self.logger.error(f"Error killing built-in browser: {error_msg}", tag="BUILTIN") return False async def get_builtin_browser_status(self) -> Dict[str, Any]: @@ -383,12 +458,16 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): 'port': self.config.debugging_port } - # Override the close method to handle built-in browser cleanup async def close(self): """Close the built-in browser and clean up resources.""" + # Store the shutting_down state + was_shutting_down = getattr(self, 'shutting_down', False) + # Call parent class close method await super().close() - # Clean up built-in browser if we created it - if self.shutting_down: + # Clean up built-in browser if we created it and were in shutdown mode + if was_shutting_down: await self.kill_builtin_browser() + if self.logger: + self.logger.debug("Killed built-in browser during shutdown", tag="BUILTIN") \ No newline at end of file diff --git a/crawl4ai/browser/strategies/cdp.py b/crawl4ai/browser/strategies/cdp.py index d1d543dc..26eba00e 100644 --- a/crawl4ai/browser/strategies/cdp.py +++ 
b/crawl4ai/browser/strategies/cdp.py @@ -16,7 +16,7 @@ from playwright.async_api import BrowserContext, Page from ...async_logger import AsyncLogger from ...async_configs import BrowserConfig, CrawlerRunConfig -from ..utils import get_playwright, get_browser_executable, create_temp_directory, is_windows +from ..utils import get_playwright, get_browser_executable, create_temp_directory, is_windows, check_process_is_running, terminate_process from .base import BaseBrowserStrategy @@ -47,22 +47,34 @@ class CDPBrowserStrategy(BaseBrowserStrategy): Returns: self: For method chaining """ - self.playwright = await get_playwright() + # Call the base class start to initialize Playwright + await super().start() - # Get or create CDP URL - cdp_url = await self._get_or_create_cdp_url() - - # Connect to the browser using CDP - self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) - - # Get or create default context - contexts = self.browser.contexts - if contexts: - self.default_context = contexts[0] - else: - self.default_context = await self.create_browser_context() - - await self.setup_context(self.default_context) + try: + # Get or create CDP URL + cdp_url = await self._get_or_create_cdp_url() + + # Connect to the browser using CDP + self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) + + # Get or create default context + contexts = self.browser.contexts + if contexts: + self.default_context = contexts[0] + else: + self.default_context = await self.create_browser_context() + + await self.setup_context(self.default_context) + + if self.logger: + self.logger.debug(f"Connected to CDP browser at {cdp_url}", tag="CDP") + except Exception as e: + if self.logger: + self.logger.error(f"Failed to connect to CDP browser: {str(e)}", tag="CDP") + # Clean up any resources before re-raising + await self._cleanup_process() + raise + return self async def _get_or_create_cdp_url(self) -> str: @@ -105,39 +117,25 @@ class 
CDPBrowserStrategy(BaseBrowserStrategy): ) # Monitor for a short time to make sure it starts properly - await asyncio.sleep(0.5) # Give browser time to start - await self._initial_startup_check() - await asyncio.sleep(2) # Give browser more time to start + is_running, return_code, stdout, stderr = await check_process_is_running(self.browser_process, delay=2) + if not is_running: + if self.logger: + self.logger.error( + message="Browser process terminated unexpectedly | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}", + tag="ERROR", + params={ + "code": return_code, + "stdout": stdout.decode() if stdout else "", + "stderr": stderr.decode() if stderr else "", + }, + ) + await self._cleanup_process() + raise Exception("Browser process terminated unexpectedly") + return f"http://localhost:{self.config.debugging_port}" except Exception as e: await self._cleanup_process() - raise Exception(f"Failed to start browser: {e}") - - async def _initial_startup_check(self): - """Perform a quick check to make sure the browser started successfully.""" - if not self.browser_process: - return - - # Check that process started without immediate termination - await asyncio.sleep(0.5) - if self.browser_process.poll() is not None: - # Process already terminated - stdout, stderr = b"", b"" - try: - stdout, stderr = self.browser_process.communicate(timeout=0.5) - except subprocess.TimeoutExpired: - pass - - if self.logger: - self.logger.error( - message="Browser process terminated during startup | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}", - tag="ERROR", - params={ - "code": self.browser_process.returncode, - "stdout": stdout.decode() if stdout else "", - "stderr": stderr.decode() if stderr else "", - }, - ) + raise Exception(f"Failed to start browser: {e}") async def _get_browser_args(self, user_data_dir: str) -> List[str]: """Returns browser-specific command line arguments. 
@@ -148,6 +146,7 @@ class CDPBrowserStrategy(BaseBrowserStrategy): Returns: List of command-line arguments for the browser """ + browser_args = super()._build_browser_args() browser_path = await get_browser_executable(self.config.browser_type) base_args = [browser_path] @@ -170,7 +169,7 @@ class CDPBrowserStrategy(BaseBrowserStrategy): else: raise NotImplementedError(f"Browser type {self.config.browser_type} not supported") - return base_args + args + return base_args + browser_args + args async def _cleanup_process(self): """Cleanup browser process and temporary directory.""" @@ -179,33 +178,26 @@ class CDPBrowserStrategy(BaseBrowserStrategy): if self.browser_process: try: - # Only terminate if we have proper control over the process - if not self.browser_process.poll(): - # Process is still running - self.browser_process.terminate() - # Wait for process to end gracefully - for _ in range(10): # 10 attempts, 100ms each - if self.browser_process.poll() is not None: - break - await asyncio.sleep(0.1) - - # Force kill if still running - if self.browser_process.poll() is None: - if is_windows(): - # On Windows we might need taskkill for detached processes - try: - subprocess.run(["taskkill", "/F", "/PID", str(self.browser_process.pid)]) - except Exception: - self.browser_process.kill() - else: - self.browser_process.kill() - await asyncio.sleep(0.1) # Brief wait for kill to take effect - + # Only attempt termination if the process is still running + if self.browser_process.poll() is None: + # Use our robust cross-platform termination utility + success = terminate_process( + pid=self.browser_process.pid, + timeout=1.0, # Equivalent to the previous 10*0.1s wait + logger=self.logger + ) + + if not success and self.logger: + self.logger.warning( + message="Failed to terminate browser process cleanly", + tag="PROCESS" + ) + except Exception as e: if self.logger: self.logger.error( - message="Error terminating browser: {error}", - tag="ERROR", + message="Error during 
browser process cleanup: {error}", + tag="ERROR", params={"error": str(e)}, ) @@ -220,54 +212,6 @@ class CDPBrowserStrategy(BaseBrowserStrategy): params={"error": str(e)}, ) - async def create_browser_context(self, crawlerRunConfig: Optional[CrawlerRunConfig] = None) -> BrowserContext: - """Create a new browser context. - - Uses the base class implementation which handles all configurations. - - Args: - crawlerRunConfig: Configuration object for the crawler run - - Returns: - BrowserContext: Browser context object - """ - # Handle user_data_dir for CDP browsers - if self.config.user_data_dir: - # For CDP-based browsers, storage persistence is typically handled by the user_data_dir - # at the browser level, but we'll create a storage_state location for Playwright as well - storage_path = os.path.join(self.config.user_data_dir, "storage_state.json") - if not os.path.exists(storage_path): - # Create parent directory if it doesn't exist - os.makedirs(os.path.dirname(storage_path), exist_ok=True) - with open(storage_path, "w") as f: - json.dump({}, f) - self.config.storage_state = storage_path - - # Use the base class implementation - return await super().create_browser_context(crawlerRunConfig) - - def _cleanup_expired_sessions(self): - """Clean up expired sessions based on TTL.""" - current_time = time.time() - expired_sessions = [ - sid - for sid, (_, _, last_used) in self.sessions.items() - if current_time - last_used > self.session_ttl - ] - for sid in expired_sessions: - asyncio.create_task(self._kill_session(sid)) - - async def _kill_session(self, session_id: str): - """Kill a browser session and clean up resources. - - Args: - session_id: The session ID to kill - """ - if session_id in self.sessions: - context, page, _ = self.sessions[session_id] - await page.close() - del self.sessions[session_id] - async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: """Get a page for the given configuration. 
@@ -277,6 +221,7 @@ class CDPBrowserStrategy(BaseBrowserStrategy): Returns: Tuple of (Page, BrowserContext) """ + # Clean up expired sessions using base class method self._cleanup_expired_sessions() # If a session_id is provided and we already have it, reuse that page + context @@ -289,71 +234,56 @@ class CDPBrowserStrategy(BaseBrowserStrategy): # For CDP, we typically use the shared default_context context = self.default_context pages = context.pages + + # Otherwise, check if we have an existing context for this config + config_signature = self._make_config_signature(crawlerRunConfig) + self.contexts_by_config[config_signature] = context + + await self.setup_context(context, crawlerRunConfig) + + # Check if there's already a page with the target URL page = next((p for p in pages if p.url == crawlerRunConfig.url), None) + + # If not found, create a new page if not page: page = await context.new_page() - # If a session_id is specified, store this session so we can reuse later + # If a session_id is specified, store this session for reuse if crawlerRunConfig.session_id: self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) return page, context - + async def close(self): - """Close the browser and clean up resources.""" + """Close the CDP browser and clean up resources.""" # Skip cleanup if using external CDP URL and not launched by us if self.config.cdp_url and not self.browser_process: + if self.logger: + self.logger.debug("Skipping cleanup for external CDP browser", tag="CDP") return - if self.config.sleep_on_close: - await asyncio.sleep(0.5) + # Call parent implementation for common cleanup + await super().close() - # If we have a user_data_dir configured, ensure persistence of storage state - if self.config.user_data_dir and self.browser and self.default_context: - for context in self.browser.contexts: - try: - await context.storage_state(path=os.path.join(self.config.user_data_dir, "Default", "storage_state.json")) - if self.logger: - 
self.logger.debug("Ensuring storage state is persisted before closing browser", tag="BROWSER") - except Exception as e: - if self.logger: - self.logger.warning( - message="Failed to ensure storage persistence: {error}", - tag="BROWSER", - params={"error": str(e)} - ) - - # Close all sessions - session_ids = list(self.sessions.keys()) - for session_id in session_ids: - await self._kill_session(session_id) - - # Close browser - if self.browser: - await self.browser.close() - self.browser = None - - # Clean up managed browser if we created it + # Additional CDP-specific cleanup if self.browser_process: await asyncio.sleep(0.5) await self._cleanup_process() self.browser_process = None + if self.logger: + self.logger.debug("Cleaned up CDP browser process", tag="CDP") - # Close temporary directory + # Clean up temporary directory if self.temp_dir and os.path.exists(self.temp_dir): try: shutil.rmtree(self.temp_dir) self.temp_dir = None + if self.logger: + self.logger.debug("Removed temporary directory", tag="CDP") except Exception as e: if self.logger: self.logger.error( message="Error removing temporary directory: {error}", - tag="ERROR", - params={"error": str(e)}, - ) - - # Stop playwright - if self.playwright: - await self.playwright.stop() - self.playwright = None - + tag="CDP", + params={"error": str(e)} + ) \ No newline at end of file diff --git a/crawl4ai/browser/strategies/docker_strategy.py b/crawl4ai/browser/strategies/docker_strategy.py index ca7e314a..a71d48e9 100644 --- a/crawl4ai/browser/strategies/docker_strategy.py +++ b/crawl4ai/browser/strategies/docker_strategy.py @@ -19,12 +19,12 @@ from .builtin import CDPBrowserStrategy class DockerBrowserStrategy(CDPBrowserStrategy): """Docker-based browser strategy. - + Extends the CDPBrowserStrategy to run browsers in Docker containers. Supports two modes: 1. "connect" - Uses a Docker image with Chrome already running 2. 
"launch" - Starts Chrome within the container with custom settings - + Attributes: docker_config: Docker-specific configuration options container_id: ID of current Docker container @@ -36,16 +36,16 @@ class DockerBrowserStrategy(CDPBrowserStrategy): internal_cdp_port: Chrome's internal CDP port internal_mapped_port: Port that socat maps to internally """ - + def __init__(self, config: BrowserConfig, logger: Optional[AsyncLogger] = None): """Initialize the Docker browser strategy. - + Args: config: Browser configuration including Docker-specific settings logger: Logger for recording events and errors """ super().__init__(config, logger) - + # Initialize Docker-specific attributes self.docker_config = self.config.docker_config or DockerConfig() self.container_id = None @@ -56,10 +56,9 @@ class DockerBrowserStrategy(CDPBrowserStrategy): if registry_file is None and self.config.user_data_dir: # Use the same registry file as BuiltinBrowserStrategy if possible registry_file = os.path.join( - os.path.dirname(self.config.user_data_dir), - "browser_config.json" + os.path.dirname(self.config.user_data_dir), "browser_config.json" ) - + self.registry = DockerRegistry(self.docker_config.registry_file) self.docker_utils = DockerUtils(logger) self.chrome_process_id = None @@ -70,39 +69,44 @@ class DockerBrowserStrategy(CDPBrowserStrategy): async def start(self): """Start or connect to a browser running in a Docker container. - - This method initializes Playwright and establishes a connection to + + This method initializes Playwright and establishes a connection to a browser running in a Docker container. 
Depending on the configured mode: - "connect": Connects to a container with Chrome already running - "launch": Creates a container and launches Chrome within it - + Returns: self: For method chaining """ # Initialize Playwright from ..utils import get_playwright + self.playwright = await get_playwright() - + if self.logger: self.logger.info( f"Starting Docker browser strategy in {self.docker_config.mode} mode", - tag="DOCKER" + tag="DOCKER", ) - + try: # Get CDP URL by creating or reusing a Docker container # This handles the container management and browser startup cdp_url = await self._get_or_create_cdp_url() - + if not cdp_url: - raise Exception("Failed to establish CDP connection to Docker container") - + raise Exception( + "Failed to establish CDP connection to Docker container" + ) + if self.logger: - self.logger.info(f"Connecting to browser in Docker via CDP: {cdp_url}", tag="DOCKER") - + self.logger.info( + f"Connecting to browser in Docker via CDP: {cdp_url}", tag="DOCKER" + ) + # Connect to the browser using CDP self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) - + # Get existing context or create default context contexts = self.browser.contexts if contexts: @@ -114,33 +118,35 @@ class DockerBrowserStrategy(CDPBrowserStrategy): self.logger.debug("Creating new browser context", tag="DOCKER") self.default_context = await self.create_browser_context() await self.setup_context(self.default_context) - + return self - + except Exception as e: # Clean up resources if startup fails if self.container_id and not self.docker_config.persistent: if self.logger: self.logger.warning( f"Cleaning up container after failed start: {self.container_id[:12]}", - tag="DOCKER" + tag="DOCKER", ) await self.docker_utils.remove_container(self.container_id) self.registry.unregister_container(self.container_id) self.container_id = None - + if self.playwright: await self.playwright.stop() self.playwright = None - + # Re-raise the exception if self.logger: - 
self.logger.error(f"Failed to start Docker browser: {str(e)}", tag="DOCKER") - raise + self.logger.error( + f"Failed to start Docker browser: {str(e)}", tag="DOCKER" + ) + raise async def _generate_config_hash(self) -> str: """Generate a hash of the configuration for container matching. - + Returns: Hash string uniquely identifying this configuration """ @@ -151,66 +157,77 @@ class DockerBrowserStrategy(CDPBrowserStrategy): "browser_type": self.config.browser_type, "headless": self.config.headless, } - + # Add browser-specific config if in launch mode if self.docker_config.mode == "launch": - config_dict.update({ - "text_mode": self.config.text_mode, - "light_mode": self.config.light_mode, - "viewport_width": self.config.viewport_width, - "viewport_height": self.config.viewport_height, - }) - + config_dict.update( + { + "text_mode": self.config.text_mode, + "light_mode": self.config.light_mode, + "viewport_width": self.config.viewport_width, + "viewport_height": self.config.viewport_height, + } + ) + # Use the utility method to generate the hash return self.docker_utils.generate_config_hash(config_dict) - + async def _get_or_create_cdp_url1(self) -> str: """Get CDP URL by either creating a new container or using an existing one. 
- + Returns: CDP URL for connecting to the browser - + Raises: Exception: If container creation or browser launch fails """ # If CDP URL is explicitly provided, use it if self.config.cdp_url: return self.config.cdp_url - + # Ensure Docker image exists (will build if needed) image_name = await self.docker_utils.ensure_docker_image_exists( - self.docker_config.image, - self.docker_config.mode + self.docker_config.image, self.docker_config.mode ) - + # Generate config hash for container matching config_hash = await self._generate_config_hash() - + # Look for existing container with matching config - container_id = self.registry.find_container_by_config(config_hash, self.docker_utils) - + container_id = self.registry.find_container_by_config( + config_hash, self.docker_utils + ) + if container_id: # Use existing container self.container_id = container_id host_port = self.registry.get_container_host_port(container_id) if self.logger: - self.logger.info(f"Using existing Docker container: {container_id[:12]}", tag="DOCKER") + self.logger.info( + f"Using existing Docker container: {container_id[:12]}", + tag="DOCKER", + ) else: # Get a port for the new container - host_port = self.docker_config.host_port or self.registry.get_next_available_port(self.docker_utils) - + host_port = ( + self.docker_config.host_port + or self.registry.get_next_available_port(self.docker_utils) + ) + # Prepare volumes list volumes = list(self.docker_config.volumes) - + # Add user data directory if specified if self.docker_config.user_data_dir: # Ensure user data directory exists os.makedirs(self.docker_config.user_data_dir, exist_ok=True) - volumes.append(f"{self.docker_config.user_data_dir}:{self.docker_config.container_user_data_dir}") - + volumes.append( + f"{self.docker_config.user_data_dir}:{self.docker_config.container_user_data_dir}" + ) + # Update config user_data_dir to point to container path self.config.user_data_dir = self.docker_config.container_user_data_dir - + # Create a new 
container container_id = await self.docker_utils.create_container( image_name=image_name, @@ -219,54 +236,63 @@ class DockerBrowserStrategy(CDPBrowserStrategy): volumes=volumes, network=self.docker_config.network, env_vars=self.docker_config.env_vars, - extra_args=self.docker_config.extra_args + extra_args=self.docker_config.extra_args, ) - + if not container_id: raise Exception("Failed to create Docker container") - + self.container_id = container_id - + # Register the container self.registry.register_container(container_id, host_port, config_hash) - + # Wait for container to be ready await self.docker_utils.wait_for_container_ready(container_id) - + # Handle specific setup based on mode if self.docker_config.mode == "launch": # In launch mode, we need to start socat and Chrome await self.docker_utils.start_socat_in_container(container_id) - + # Build browser arguments browser_args = self._build_browser_args() - + # Launch Chrome - await self.docker_utils.launch_chrome_in_container(container_id, browser_args) - + await self.docker_utils.launch_chrome_in_container( + container_id, browser_args + ) + # Get PIDs for later cleanup - self.chrome_process_id = await self.docker_utils.get_process_id_in_container( - container_id, "chrome" + self.chrome_process_id = ( + await self.docker_utils.get_process_id_in_container( + container_id, "chrome" + ) ) - self.socat_process_id = await self.docker_utils.get_process_id_in_container( - container_id, "socat" + self.socat_process_id = ( + await self.docker_utils.get_process_id_in_container( + container_id, "socat" + ) ) - + # Wait for CDP to be ready await self.docker_utils.wait_for_cdp_ready(host_port) - + if self.logger: - self.logger.success(f"Docker container ready: {container_id[:12]} on port {host_port}", tag="DOCKER") - + self.logger.success( + f"Docker container ready: {container_id[:12]} on port {host_port}", + tag="DOCKER", + ) + # Return CDP URL return f"http://localhost:{host_port}" async def 
_get_or_create_cdp_url(self) -> str: """Get CDP URL by either creating a new container or using an existing one. - + Returns: CDP URL for connecting to the browser - + Raises: Exception: If container creation or browser launch fails """ @@ -276,38 +302,47 @@ class DockerBrowserStrategy(CDPBrowserStrategy): # Ensure Docker image exists (will build if needed) image_name = await self.docker_utils.ensure_docker_image_exists( - self.docker_config.image, - self.docker_config.mode + self.docker_config.image, self.docker_config.mode ) - + # Generate config hash for container matching config_hash = await self._generate_config_hash() - + # Look for existing container with matching config - container_id = await self.registry.find_container_by_config(config_hash, self.docker_utils) - + container_id = await self.registry.find_container_by_config( + config_hash, self.docker_utils + ) + if container_id: # Use existing container self.container_id = container_id host_port = self.registry.get_container_host_port(container_id) if self.logger: - self.logger.info(f"Using existing Docker container: {container_id[:12]}", tag="DOCKER") + self.logger.info( + f"Using existing Docker container: {container_id[:12]}", + tag="DOCKER", + ) else: # Get a port for the new container - host_port = self.docker_config.host_port or self.registry.get_next_available_port(self.docker_utils) - + host_port = ( + self.docker_config.host_port + or self.registry.get_next_available_port(self.docker_utils) + ) + # Prepare volumes list volumes = list(self.docker_config.volumes) - + # Add user data directory if specified if self.docker_config.user_data_dir: # Ensure user data directory exists os.makedirs(self.docker_config.user_data_dir, exist_ok=True) - volumes.append(f"{self.docker_config.user_data_dir}:{self.docker_config.container_user_data_dir}") - - # Update config user_data_dir to point to container path - self.config.user_data_dir = self.docker_config.container_user_data_dir - + volumes.append( + 
f"{self.docker_config.user_data_dir}:{self.docker_config.container_user_data_dir}" + ) + + # # Update config user_data_dir to point to container path + # self.config.user_data_dir = self.docker_config.container_user_data_dir + # Create a new container container_id = await self.docker_utils.create_container( image_name=image_name, @@ -318,148 +353,196 @@ class DockerBrowserStrategy(CDPBrowserStrategy): env_vars=self.docker_config.env_vars, cpu_limit=self.docker_config.cpu_limit, memory_limit=self.docker_config.memory_limit, - extra_args=self.docker_config.extra_args + extra_args=self.docker_config.extra_args, ) - + if not container_id: raise Exception("Failed to create Docker container") - - self.container_id = container_id - + + self.container_id = container_id + # Wait for container to be ready await self.docker_utils.wait_for_container_ready(container_id) - + # Handle specific setup based on mode if self.docker_config.mode == "launch": # In launch mode, we need to start socat and Chrome await self.docker_utils.start_socat_in_container(container_id) - + # Build browser arguments browser_args = self._build_browser_args() - + # Launch Chrome - await self.docker_utils.launch_chrome_in_container(container_id, browser_args) - + await self.docker_utils.launch_chrome_in_container( + container_id, browser_args + ) + # Get PIDs for later cleanup - self.chrome_process_id = await self.docker_utils.get_process_id_in_container( - container_id, "chromium" + self.chrome_process_id = ( + await self.docker_utils.get_process_id_in_container( + container_id, "chromium" + ) ) - self.socat_process_id = await self.docker_utils.get_process_id_in_container( - container_id, "socat" + self.socat_process_id = ( + await self.docker_utils.get_process_id_in_container( + container_id, "socat" + ) ) - + # Wait for CDP to be ready cdp_json_config = await self.docker_utils.wait_for_cdp_ready(host_port) if cdp_json_config: # Register the container in the shared registry - 
self.registry.register_container(container_id, host_port, config_hash, cdp_json_config) + self.registry.register_container( + container_id, host_port, config_hash, cdp_json_config + ) else: raise Exception("Failed to get CDP JSON config from Docker container") - + if self.logger: - self.logger.success(f"Docker container ready: {container_id[:12]} on port {host_port}", tag="DOCKER") - + self.logger.success( + f"Docker container ready: {container_id[:12]} on port {host_port}", + tag="DOCKER", + ) + # Return CDP URL return f"http://localhost:{host_port}" def _build_browser_args(self) -> List[str]: """Build Chrome command line arguments based on BrowserConfig. - + Returns: List of command line arguments for Chrome """ - args = [ - "--no-sandbox", - "--disable-gpu", + # Call parent method to get common arguments + browser_args = super()._build_browser_args() + return browser_args["args"] + [ f"--remote-debugging-port={self.internal_cdp_port}", "--remote-debugging-address=0.0.0.0", # Allow external connections "--disable-dev-shm-usage", + "--headless=new", ] - - if self.config.headless: - args.append("--headless=new") - - if self.config.viewport_width and self.config.viewport_height: - args.append(f"--window-size={self.config.viewport_width},{self.config.viewport_height}") - - if self.config.user_agent: - args.append(f"--user-agent={self.config.user_agent}") - - if self.config.text_mode: - args.extend([ - "--blink-settings=imagesEnabled=false", - "--disable-remote-fonts", - "--disable-images", - "--disable-javascript", - ]) - - if self.config.light_mode: - # Import here to avoid circular import - from ..utils import get_browser_disable_options - args.extend(get_browser_disable_options()) - - if self.config.user_data_dir: - args.append(f"--user-data-dir={self.config.user_data_dir}") - - if self.config.extra_args: - args.extend(self.config.extra_args) - - return args - + + # args = [ + # "--no-sandbox", + # "--disable-gpu", + # 
f"--remote-debugging-port={self.internal_cdp_port}", + # "--remote-debugging-address=0.0.0.0", # Allow external connections + # "--disable-dev-shm-usage", + # ] + + # if self.config.headless: + # args.append("--headless=new") + + # if self.config.viewport_width and self.config.viewport_height: + # args.append(f"--window-size={self.config.viewport_width},{self.config.viewport_height}") + + # if self.config.user_agent: + # args.append(f"--user-agent={self.config.user_agent}") + + # if self.config.text_mode: + # args.extend([ + # "--blink-settings=imagesEnabled=false", + # "--disable-remote-fonts", + # "--disable-images", + # "--disable-javascript", + # ]) + + # if self.config.light_mode: + # # Import here to avoid circular import + # from ..utils import get_browser_disable_options + # args.extend(get_browser_disable_options()) + + # if self.config.user_data_dir: + # args.append(f"--user-data-dir={self.config.user_data_dir}") + + # if self.config.extra_args: + # args.extend(self.config.extra_args) + + # return args + async def close(self): """Close the browser and clean up Docker container if needed.""" - # Set shutting_down flag to prevent race conditions - self.shutting_down = True - - # Store state if needed before closing - if self.browser and self.docker_config.user_data_dir and self.docker_config.persistent: + # Set flag to track if we were the ones initiating shutdown + initiated_shutdown = not getattr(self, "shutting_down", False) + + # Storage persistence for Docker needs special handling + # We need to store state before calling super().close() which will close the browser + if ( + self.browser + and self.docker_config.user_data_dir + and self.docker_config.persistent + ): for context in self.browser.contexts: try: - storage_path = os.path.join(self.docker_config.user_data_dir, "storage_state.json") + # Ensure directory exists + os.makedirs(self.docker_config.user_data_dir, exist_ok=True) + + # Save storage state to user data directory + storage_path = 
os.path.join( + self.docker_config.user_data_dir, "storage_state.json" + ) await context.storage_state(path=storage_path) if self.logger: - self.logger.debug("Persisted storage state before closing browser", tag="DOCKER") + self.logger.debug( + "Persisted Docker-specific storage state", tag="DOCKER" + ) except Exception as e: if self.logger: self.logger.warning( - message="Failed to persist storage state: {error}", + message="Failed to persist Docker storage state: {error}", tag="DOCKER", - params={"error": str(e)} + params={"error": str(e)}, ) - - # Close browser connection (but not container) - if self.browser: - await self.browser.close() - self.browser = None - - # Only clean up container if not persistent - if self.container_id and not self.docker_config.persistent: - # Stop Chrome process in "launch" mode - if self.docker_config.mode == "launch" and self.chrome_process_id: - await self.docker_utils.stop_process_in_container( - self.container_id, self.chrome_process_id - ) - - # Stop socat process in "launch" mode - if self.docker_config.mode == "launch" and self.socat_process_id: - await self.docker_utils.stop_process_in_container( - self.container_id, self.socat_process_id - ) - - # Remove or stop container based on configuration - if self.docker_config.remove_on_exit: - await self.docker_utils.remove_container(self.container_id) - # Unregister from registry - self.registry.unregister_container(self.container_id) - else: - await self.docker_utils.stop_container(self.container_id) - - self.container_id = None - - # Close Playwright - if self.playwright: - await self.playwright.stop() - self.playwright = None - - self.shutting_down = False \ No newline at end of file + + # Call parent method to handle common cleanup + await super().close() + + # Only perform container cleanup if we initiated shutdown + # and we need to handle Docker-specific resources + if initiated_shutdown: + # Only clean up container if not persistent + if self.container_id and not 
self.docker_config.persistent: + # Stop Chrome process in "launch" mode + if self.docker_config.mode == "launch" and self.chrome_process_id: + await self.docker_utils.stop_process_in_container( + self.container_id, self.chrome_process_id + ) + if self.logger: + self.logger.debug( + f"Stopped Chrome process {self.chrome_process_id} in container", + tag="DOCKER", + ) + + # Stop socat process in "launch" mode + if self.docker_config.mode == "launch" and self.socat_process_id: + await self.docker_utils.stop_process_in_container( + self.container_id, self.socat_process_id + ) + if self.logger: + self.logger.debug( + f"Stopped socat process {self.socat_process_id} in container", + tag="DOCKER", + ) + + # Remove or stop container based on configuration + if self.docker_config.remove_on_exit: + await self.docker_utils.remove_container(self.container_id) + # Unregister from registry + if hasattr(self, "registry") and self.registry: + self.registry.unregister_container(self.container_id) + if self.logger: + self.logger.debug( + f"Removed Docker container {self.container_id}", + tag="DOCKER", + ) + else: + await self.docker_utils.stop_container(self.container_id) + if self.logger: + self.logger.debug( + f"Stopped Docker container {self.container_id}", + tag="DOCKER", + ) + + self.container_id = None diff --git a/crawl4ai/browser/strategies/playwright.py b/crawl4ai/browser/strategies/playwright.py index 817603ca..b24edf72 100644 --- a/crawl4ai/browser/strategies/playwright.py +++ b/crawl4ai/browser/strategies/playwright.py @@ -4,17 +4,13 @@ This module implements the browser strategy pattern for different browser implementations, including Playwright, CDP, and builtin browsers. 
""" -import asyncio -import os import time -import json from typing import Optional, Tuple -from playwright.async_api import BrowserContext, Page, ProxySettings +from playwright.async_api import BrowserContext, Page from ...async_logger import AsyncLogger from ...async_configs import BrowserConfig, CrawlerRunConfig -from ..utils import get_playwright, get_browser_disable_options from playwright_stealth import StealthConfig @@ -50,9 +46,7 @@ class PlaywrightBrowserStrategy(BaseBrowserStrategy): logger: Logger for recording events and errors """ super().__init__(config, logger) - # Add session management - self.sessions = {} - self.session_ttl = 1800 # 30 minutes + # No need to re-initialize sessions and session_ttl as they're now in the base class async def start(self): """Start the browser instance. @@ -60,142 +54,32 @@ class PlaywrightBrowserStrategy(BaseBrowserStrategy): Returns: self: For method chaining """ - self.playwright = await get_playwright() + # Call the base class start to initialize Playwright + await super().start() + + # Build browser arguments using the base class method browser_args = self._build_browser_args() - # Launch appropriate browser type - if self.config.browser_type == "firefox": - self.browser = await self.playwright.firefox.launch(**browser_args) - elif self.config.browser_type == "webkit": - self.browser = await self.playwright.webkit.launch(**browser_args) - else: - self.browser = await self.playwright.chromium.launch(**browser_args) - - self.default_context = self.browser - return self - - def _build_browser_args(self) -> dict: - """Build browser launch arguments from config. 
- - Returns: - dict: Browser launch arguments - """ - args = [ - "--disable-gpu", - "--disable-gpu-compositing", - "--disable-software-rasterizer", - "--no-sandbox", - "--disable-dev-shm-usage", - "--no-first-run", - "--no-default-browser-check", - "--disable-infobars", - "--window-position=0,0", - "--ignore-certificate-errors", - "--ignore-certificate-errors-spki-list", - "--disable-blink-features=AutomationControlled", - "--window-position=400,0", - "--disable-renderer-backgrounding", - "--disable-ipc-flooding-protection", - "--force-color-profile=srgb", - "--mute-audio", - "--disable-background-timer-throttling", - f"--window-size={self.config.viewport_width},{self.config.viewport_height}", - ] - - if self.config.light_mode: - args.extend(get_browser_disable_options()) - - if self.config.text_mode: - args.extend( - [ - "--blink-settings=imagesEnabled=false", - "--disable-remote-fonts", - "--disable-images", - "--disable-javascript", - "--disable-software-rasterizer", - "--disable-dev-shm-usage", - ] - ) - - if self.config.extra_args: - args.extend(self.config.extra_args) - - browser_args = {"headless": self.config.headless, "args": args} - - if self.config.chrome_channel: - browser_args["channel"] = self.config.chrome_channel - - if self.config.accept_downloads: - browser_args["downloads_path"] = self.config.downloads_path or os.path.join( - os.getcwd(), "downloads" - ) - os.makedirs(browser_args["downloads_path"], exist_ok=True) - - if self.config.proxy or self.config.proxy_config: - proxy_settings = ( - ProxySettings(server=self.config.proxy) - if self.config.proxy - else ProxySettings( - server=self.config.proxy_config.server, - username=self.config.proxy_config.username, - password=self.config.proxy_config.password, - ) - ) - browser_args["proxy"] = proxy_settings - - return browser_args - - async def create_browser_context(self, crawlerRunConfig: Optional[CrawlerRunConfig] = None) -> BrowserContext: - """Creates and returns a new browser context with 
configured settings. - - This implementation extends the base class version to handle user_data_dir specifically. - - Args: - crawlerRunConfig: Configuration object for the crawler run - - Returns: - BrowserContext: Browser context object with the specified configurations - """ - # Handle user_data_dir explicitly to ensure storage persistence - if self.config.user_data_dir: - # Create a storage state file path if none exists - storage_path = os.path.join(self.config.user_data_dir, "Default", "storage_state.json") - - # Create the file if it doesn't exist - if not os.path.exists(storage_path): - os.makedirs(os.path.dirname(storage_path), exist_ok=True) - with open(storage_path, "w") as f: - json.dump({}, f) - - # Override storage_state with our specific path - self.config.storage_state = storage_path - if self.logger: - self.logger.debug(f"Using persistent storage state at: {storage_path}", tag="BROWSER") + try: + # Launch appropriate browser type + if self.config.browser_type == "firefox": + self.browser = await self.playwright.firefox.launch(**browser_args) + elif self.config.browser_type == "webkit": + self.browser = await self.playwright.webkit.launch(**browser_args) + else: + self.browser = await self.playwright.chromium.launch(**browser_args) - # Now call the base class implementation which handles everything else - return await super().create_browser_context(crawlerRunConfig) - - def _cleanup_expired_sessions(self): - """Clean up expired sessions based on TTL.""" - current_time = time.time() - expired_sessions = [ - sid - for sid, (_, _, last_used) in self.sessions.items() - if current_time - last_used > self.session_ttl - ] - for sid in expired_sessions: - asyncio.create_task(self._kill_session(sid)) - - async def _kill_session(self, session_id: str): - """Kill a browser session and clean up resources. 
- - Args: - session_id: The session ID to kill - """ - if session_id in self.sessions: - context, page, _ = self.sessions[session_id] - await page.close() - del self.sessions[session_id] + self.default_context = self.browser + + if self.logger: + self.logger.debug(f"Launched {self.config.browser_type} browser", tag="BROWSER") + + except Exception as e: + if self.logger: + self.logger.error(f"Failed to launch browser: {str(e)}", tag="BROWSER") + raise + + return self async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: """Get a page for the given configuration. @@ -236,49 +120,9 @@ class PlaywrightBrowserStrategy(BaseBrowserStrategy): self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) return page, context - - async def close(self): - """Close the browser and clean up resources.""" - if self.config.sleep_on_close: - await asyncio.sleep(0.5) - - # If we have a user_data_dir configured, ensure persistence of storage state - if self.config.user_data_dir and self.browser and self.default_context: - for context in self.browser.contexts: - try: - await context.storage_state(path=os.path.join(self.config.user_data_dir, "Default", "storage_state.json")) - if self.logger: - self.logger.debug("Ensuring storage state is persisted before closing browser", tag="BROWSER") - except Exception as e: - if self.logger: - self.logger.warning( - message="Failed to ensure storage persistence: {error}", - tag="BROWSER", - params={"error": str(e)} - ) - # Close all sessions - session_ids = list(self.sessions.keys()) - for session_id in session_ids: - await self._kill_session(session_id) - - # Close all contexts we created - for ctx in self.contexts_by_config.values(): - try: - await ctx.close() - except Exception as e: - if self.logger: - self.logger.error( - message="Error closing context: {error}", - tag="ERROR", - params={"error": str(e)} - ) - self.contexts_by_config.clear() - - if self.browser: - await self.browser.close() 
- self.browser = None - - if self.playwright: - await self.playwright.stop() - self.playwright = None + async def close(self): + """Close the Playwright browser and clean up resources.""" + # The base implementation already handles everything needed for Playwright + # including storage persistence, sessions, contexts, browser and playwright + await super().close() \ No newline at end of file diff --git a/crawl4ai/browser/utils.py b/crawl4ai/browser/utils.py index 74d2ea12..421230bf 100644 --- a/crawl4ai/browser/utils.py +++ b/crawl4ai/browser/utils.py @@ -11,7 +11,9 @@ import sys import time import tempfile import subprocess -from typing import Optional +from typing import Optional, Tuple, Union +import signal +import psutil from playwright.async_api import async_playwright @@ -93,6 +95,8 @@ def is_browser_running(pid: Optional[int]) -> bool: return False try: + if type(pid) is str: + pid = int(pid) # Check if the process exists if is_windows(): process = subprocess.run(["tasklist", "/FI", f"PID eq {pid}"], @@ -326,3 +330,136 @@ async def find_optimal_browser_config(total_urls=50, verbose=True, rate_limit_de "optimal": optimal, "all_configs": results } + + +# Find process ID of the existing browser using os +def find_process_by_port(port: int) -> str: + """Find process ID listening on a specific port. 
+ + Args: + port: Port number to check + + Returns: + str: Process ID or empty string if not found + """ + try: + if is_windows(): + cmd = f"netstat -ano | findstr :{port}" + result = subprocess.check_output(cmd, shell=True).decode() + return result.strip().split()[-1] if result else "" + else: + cmd = f"lsof -i :{port} -t" + return subprocess.check_output(cmd, shell=True).decode().strip() + except subprocess.CalledProcessError: + return "" + +async def check_process_is_running(process: subprocess.Popen, delay: float = 0.5) -> Tuple[bool, Optional[int], bytes, bytes]: + """Perform a quick check to make sure the browser started successfully.""" + if not process: + return False, None, b"", b"" + + # Check that process started without immediate termination + await asyncio.sleep(delay) + if process.poll() is not None: + # Process already terminated + stdout, stderr = b"", b"" + try: + stdout, stderr = process.communicate(timeout=0.5) + except subprocess.TimeoutExpired: + pass + + return False, process.returncode, stdout, stderr + + + return True, 0, b"", b"" + + +def terminate_process( + pid: Union[int, str], + timeout: float = 5.0, + force_kill_timeout: float = 3.0, + logger = None +) -> Tuple[bool, Optional[str]]: + """ + Robustly terminate a process across platforms with verification. 
+ + Args: + pid: Process ID to terminate (int or string) + timeout: Seconds to wait for graceful termination before force killing + force_kill_timeout: Seconds to wait after force kill before considering it failed + logger: Optional logger object with error, warning, and info methods + + Returns: + Tuple of (success: bool, error_message: Optional[str]) + """ + # Convert pid to int if it's a string + if isinstance(pid, str): + try: + pid = int(pid) + except ValueError: + error_msg = f"Invalid PID format: {pid}" + if logger: + logger.error(error_msg) + return False, error_msg + + # Check if process exists + if not psutil.pid_exists(pid): + return True, None # Process already terminated + + try: + process = psutil.Process(pid) + + # First attempt: graceful termination + if logger: + logger.info(f"Attempting graceful termination of process {pid}") + + if os.name == 'nt': # Windows + subprocess.run(["taskkill", "/PID", str(pid)], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + check=False) + else: # Unix/Linux/MacOS + process.send_signal(signal.SIGTERM) + + # Wait for process to terminate + try: + process.wait(timeout=timeout) + if logger: + logger.info(f"Process {pid} terminated gracefully") + return True, None + except psutil.TimeoutExpired: + if logger: + logger.warning(f"Process {pid} did not terminate gracefully within {timeout} seconds, forcing termination") + + # Second attempt: force kill + if os.name == 'nt': # Windows + subprocess.run(["taskkill", "/F", "/PID", str(pid)], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + check=False) + else: # Unix/Linux/MacOS + process.send_signal(signal.SIGKILL) + + # Verify process is killed + gone, alive = psutil.wait_procs([process], timeout=force_kill_timeout) + if process in alive: + error_msg = f"Failed to kill process {pid} even after force kill" + if logger: + logger.error(error_msg) + return False, error_msg + + if logger: + logger.info(f"Process {pid} terminated by force") + return True, None 
+ + except psutil.NoSuchProcess: + # Process terminated while we were working with it + if logger: + logger.info(f"Process {pid} already terminated") + return True, None + + except Exception as e: + error_msg = f"Error terminating process {pid}: {str(e)}" + if logger: + logger.error(error_msg) + return False, error_msg \ No newline at end of file diff --git a/tests/browser/docker/test_docker_browser.py b/tests/browser/docker/test_docker_browser.py index 610a230e..cde5df23 100644 --- a/tests/browser/docker/test_docker_browser.py +++ b/tests/browser/docker/test_docker_browser.py @@ -614,9 +614,9 @@ async def run_tests(): # return # Run browser tests - # results.append(await test_docker_connect_mode()) - # results.append(await test_docker_launch_mode()) - # results.append(await test_docker_persistent_storage()) + results.append(await test_docker_connect_mode()) + results.append(await test_docker_launch_mode()) + results.append(await test_docker_persistent_storage()) results.append(await test_docker_parallel_pages()) results.append(await test_docker_registry_reuse()) diff --git a/tests/browser/test_builtin_browser.py b/tests/browser/test_builtin_browser.py index 013da637..0735e457 100644 --- a/tests/browser/test_builtin_browser.py +++ b/tests/browser/test_builtin_browser.py @@ -77,7 +77,7 @@ async def test_builtin_browser_creation(): # Step 4: Get browser info from the strategy print(f"\n{INFO}4. Getting browser information{RESET}") - browser_info = manager._strategy.get_builtin_browser_info() + browser_info = manager._strategy.get_browser_info() if browser_info: print(f"{SUCCESS}Browser info retrieved:{RESET}") for key, value in browser_info.items(): @@ -205,7 +205,7 @@ async def test_multiple_managers(): # Step 1: Create first manager print(f"\n{INFO}1. 
Creating first browser manager{RESET}") - browser_config1 = (BrowserConfig(browser_mode="builtin", headless=True),) + browser_config1 = BrowserConfig(browser_mode="builtin", headless=True) manager1 = BrowserManager(browser_config=browser_config1, logger=logger) # Step 2: Create second manager @@ -781,15 +781,16 @@ async def main(): # await manager.close() # Run multiple managers test - # await test_multiple_managers() + await test_multiple_managers() # Run performance scaling test await test_performance_scaling() + # Run cleanup test - # await cleanup_browsers() + await cleanup_browsers() # Run edge cases test - # await test_edge_cases() + await test_edge_cases() print(f"\n{SUCCESS}All tests completed!{RESET}") diff --git a/tests/browser/test_cdp_strategy.py b/tests/browser/test_cdp_strategy.py index abadf42a..1df089a5 100644 --- a/tests/browser/test_cdp_strategy.py +++ b/tests/browser/test_cdp_strategy.py @@ -25,6 +25,7 @@ async def test_cdp_launch_connect(): browser_config = BrowserConfig( use_managed_browser=True, + browser_mode="cdp", headless=True ) @@ -70,8 +71,8 @@ async def test_cdp_with_user_data_dir(): logger.info(f"Created temporary user data directory: {user_data_dir}", tag="TEST") browser_config = BrowserConfig( - use_managed_browser=True, headless=True, + browser_mode="cdp", user_data_dir=user_data_dir ) @@ -210,7 +211,7 @@ async def run_tests(): results = [] # results.append(await test_cdp_launch_connect()) - # results.append(await test_cdp_with_user_data_dir()) + results.append(await test_cdp_with_user_data_dir()) results.append(await test_cdp_session_management()) # Print summary diff --git a/tests/browser/test_playwright_strategy.py b/tests/browser/test_playwright_strategy.py index 2344c9ba..94003b53 100644 --- a/tests/browser/test_playwright_strategy.py +++ b/tests/browser/test_playwright_strategy.py @@ -6,6 +6,7 @@ and serve as functional tests. 
import asyncio import os +import re import sys # Add the project root to Python path if running directly @@ -19,6 +20,53 @@ from crawl4ai.async_logger import AsyncLogger # Create a logger for clear terminal output logger = AsyncLogger(verbose=True, log_file=None) + + +async def test_start_close(): + # Create browser config for standard Playwright + browser_config = BrowserConfig( + headless=True, + viewport_width=1280, + viewport_height=800 + ) + + # Create browser manager with the config + manager = BrowserManager(browser_config=browser_config, logger=logger) + + try: + for _ in range(4): + # Start the browser + await manager.start() + logger.info("Browser started successfully", tag="TEST") + + # Get a page + page, context = await manager.get_page(CrawlerRunConfig()) + logger.info("Got page successfully", tag="TEST") + + # Navigate to a website + await page.goto("https://example.com") + logger.info("Navigated to example.com", tag="TEST") + + # Get page title + title = await page.title() + logger.info(f"Page title: {title}", tag="TEST") + + # Clean up + await manager.close() + logger.info("Browser closed successfully", tag="TEST") + + await asyncio.sleep(1) # Wait for a moment before restarting + + except Exception as e: + logger.error(f"Test failed: {str(e)}", tag="TEST") + # Ensure cleanup + try: + await manager.close() + except: + pass + return False + return True + async def test_playwright_basic(): """Test basic Playwright browser functionality.""" logger.info("Testing standard Playwright browser", tag="TEST") @@ -248,9 +296,10 @@ async def run_tests(): """Run all tests sequentially.""" results = [] - results.append(await test_playwright_basic()) - results.append(await test_playwright_text_mode()) - results.append(await test_playwright_context_reuse()) + # results.append(await test_start_close()) + # results.append(await test_playwright_basic()) + # results.append(await test_playwright_text_mode()) + # results.append(await test_playwright_context_reuse()) 
results.append(await test_playwright_session_management()) # Print summary From bb0239808683a11589aed77cc19faf8b3b6d27fb Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 30 Mar 2025 20:58:39 +0800 Subject: [PATCH 5/7] refactor(browser): improve browser strategy architecture and lifecycle management Major refactoring of browser strategy implementations to improve code organization and reliability: - Move CrawlResultContainer and RunManyReturn types from async_webcrawler to models.py - Simplify browser lifecycle management in AsyncWebCrawler - Standardize browser strategy interface with _generate_page method - Improve headless mode handling and browser args construction - Clean up Docker and Playwright strategy implementations - Fix session management and context handling across strategies BREAKING CHANGE: Browser strategy interface has changed with new _generate_page method requirement --- crawl4ai/async_configs.py | 2 +- crawl4ai/async_webcrawler.py | 180 ++++++------------ crawl4ai/browser/manager.py | 58 ++---- crawl4ai/browser/strategies/base.py | 41 ++-- crawl4ai/browser/strategies/builtin.py | 65 +++---- crawl4ai/browser/strategies/cdp.py | 128 ++++++------- .../browser/strategies/docker_strategy.py | 124 +----------- crawl4ai/browser/strategies/playwright.py | 38 ++-- crawl4ai/models.py | 62 +++--- tests/browser/docker/test_docker_browser.py | 4 +- tests/browser/test_builtin_browser.py | 28 +-- 11 files changed, 271 insertions(+), 459 deletions(-) diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index a287cfbe..9198fa1d 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -270,7 +270,7 @@ class BrowserConfig: host: str = "localhost", ): self.browser_type = browser_type - self.headless = headless + self.headless = headless and "new" or False self.browser_mode = browser_mode self.use_managed_browser = use_managed_browser self.cdp_url = cdp_url diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py 
index bbee502b..fca2d673 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -4,18 +4,25 @@ import sys import time from colorama import Fore from pathlib import Path -from typing import Optional, List, Generic, TypeVar +from typing import Optional, List import json import asyncio # from contextlib import nullcontext, asynccontextmanager from contextlib import asynccontextmanager -from .models import CrawlResult, MarkdownGenerationResult, DispatchResult, ScrapingResult +from .models import ( + CrawlResult, + MarkdownGenerationResult, + DispatchResult, + ScrapingResult, + CrawlResultContainer, + RunManyReturn +) from .async_database import async_db_manager from .chunking_strategy import * # noqa: F403 from .chunking_strategy import IdentityChunking from .content_filter_strategy import * # noqa: F403 -from .extraction_strategy import * # noqa: F403 +from .extraction_strategy import * # noqa: F403 from .extraction_strategy import NoExtractionStrategy from .async_crawler_strategy import ( AsyncCrawlerStrategy, @@ -30,7 +37,7 @@ from .markdown_generation_strategy import ( from .deep_crawling import DeepCrawlDecorator from .async_logger import AsyncLogger, AsyncLoggerBase from .async_configs import BrowserConfig, CrawlerRunConfig -from .async_dispatcher import * # noqa: F403 +from .async_dispatcher import * # noqa: F403 from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter from .utils import ( @@ -42,45 +49,6 @@ from .utils import ( RobotsParser, ) -from typing import Union, AsyncGenerator - -CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult) -# RunManyReturn = Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]] - -class CrawlResultContainer(Generic[CrawlResultT]): - def __init__(self, results: Union[CrawlResultT, List[CrawlResultT]]): - # Normalize to a list - if isinstance(results, list): - self._results = results - else: - self._results = [results] - - def __iter__(self): - return 
iter(self._results) - - def __getitem__(self, index): - return self._results[index] - - def __len__(self): - return len(self._results) - - def __getattr__(self, attr): - # Delegate attribute access to the first element. - if self._results: - return getattr(self._results[0], attr) - raise AttributeError(f"{self.__class__.__name__} object has no attribute '{attr}'") - - def __repr__(self): - return f"{self.__class__.__name__}({self._results!r})" - -# Redefine the union type. Now synchronous calls always return a container, -# while stream mode is handled with an AsyncGenerator. -RunManyReturn = Union[ - CrawlResultContainer[CrawlResultT], - AsyncGenerator[CrawlResultT, None] -] - - class AsyncWebCrawler: """ @@ -193,45 +161,18 @@ class AsyncWebCrawler: # Decorate arun method with deep crawling capabilities self._deep_handler = DeepCrawlDecorator(self) - self.arun = self._deep_handler(self.arun) + self.arun = self._deep_handler(self.arun) async def start(self): """ Start the crawler explicitly without using context manager. This is equivalent to using 'async with' but gives more control over the lifecycle. - - This method will: - 1. Check for builtin browser if browser_mode is 'builtin' - 2. Initialize the browser and context - 3. Perform warmup sequence - 4. 
Return the crawler instance for method chaining - Returns: AsyncWebCrawler: The initialized crawler instance """ - # Check for builtin browser if requested - if self.browser_config.browser_mode == "builtin" and not self.browser_config.cdp_url: - # Import here to avoid circular imports - from .browser_profiler import BrowserProfiler - profiler = BrowserProfiler(logger=self.logger) - - # Get builtin browser info or launch if needed - browser_info = profiler.get_builtin_browser_info() - if not browser_info: - self.logger.info("Builtin browser not found, launching new instance...", tag="BROWSER") - cdp_url = await profiler.launch_builtin_browser() - if not cdp_url: - self.logger.warning("Failed to launch builtin browser, falling back to dedicated browser", tag="BROWSER") - else: - self.browser_config.cdp_url = cdp_url - self.browser_config.use_managed_browser = True - else: - self.logger.info(f"Using existing builtin browser at {browser_info.get('cdp_url')}", tag="BROWSER") - self.browser_config.cdp_url = browser_info.get('cdp_url') - self.browser_config.use_managed_browser = True - await self.crawler_strategy.__aenter__() - await self.awarmup() + self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT") + self.ready = True return self async def close(self): @@ -251,18 +192,6 @@ class AsyncWebCrawler: async def __aexit__(self, exc_type, exc_val, exc_tb): await self.close() - async def awarmup(self): - """ - Initialize the crawler with warm-up sequence. - - This method: - 1. Logs initialization info - 2. Sets up browser configuration - 3. 
Marks the crawler as ready - """ - self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT") - self.ready = True - @asynccontextmanager async def nullcontext(self): """异步空上下文管理器""" @@ -305,7 +234,7 @@ class AsyncWebCrawler: # Auto-start if not ready if not self.ready: await self.start() - + config = config or CrawlerRunConfig() if not isinstance(url, str) or not url: raise ValueError("Invalid URL, make sure the URL is a non-empty string") @@ -319,9 +248,7 @@ class AsyncWebCrawler: config.cache_mode = CacheMode.ENABLED # Create cache context - cache_context = CacheContext( - url, config.cache_mode, False - ) + cache_context = CacheContext(url, config.cache_mode, False) # Initialize processing variables async_response: AsyncCrawlResponse = None @@ -351,7 +278,7 @@ class AsyncWebCrawler: # if config.screenshot and not screenshot or config.pdf and not pdf: if config.screenshot and not screenshot_data: cached_result = None - + if config.pdf and not pdf_data: cached_result = None @@ -383,14 +310,18 @@ class AsyncWebCrawler: # Check robots.txt if enabled if config and config.check_robots_txt: - if not await self.robots_parser.can_fetch(url, self.browser_config.user_agent): + if not await self.robots_parser.can_fetch( + url, self.browser_config.user_agent + ): return CrawlResult( url=url, html="", success=False, status_code=403, error_message="Access denied by robots.txt", - response_headers={"X-Robots-Status": "Blocked by robots.txt"} + response_headers={ + "X-Robots-Status": "Blocked by robots.txt" + }, ) ############################## @@ -417,7 +348,7 @@ class AsyncWebCrawler: ############################################################### # Process the HTML content, Call CrawlerStrategy.process_html # ############################################################### - crawl_result : CrawlResult = await self.aprocess_html( + crawl_result: CrawlResult = await self.aprocess_html( url=url, html=html, extracted_content=extracted_content, @@ -494,7 +425,7 @@ class 
AsyncWebCrawler: tag="ERROR", ) - return CrawlResultContainer( + return CrawlResultContainer( CrawlResult( url=url, html="", success=False, error_message=error_message ) @@ -539,15 +470,14 @@ class AsyncWebCrawler: # Process HTML content params = config.__dict__.copy() - params.pop("url", None) + params.pop("url", None) # add keys from kwargs to params that doesn't exist in params params.update({k: v for k, v in kwargs.items() if k not in params.keys()}) - ################################ # Scraping Strategy Execution # ################################ - result : ScrapingResult = scraping_strategy.scrap(url, html, **params) + result: ScrapingResult = scraping_strategy.scrap(url, html, **params) if result is None: raise ValueError( @@ -596,7 +526,10 @@ class AsyncWebCrawler: self.logger.info( message="{url:.50}... | Time: {timing}s", tag="SCRAPE", - params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000) / 1000}, + params={ + "url": _url, + "timing": int((time.perf_counter() - t1) * 1000) / 1000, + }, ) ################################ @@ -671,7 +604,7 @@ class AsyncWebCrawler: async def arun_many( self, urls: List[str], - config: Optional[CrawlerRunConfig] = None, + config: Optional[CrawlerRunConfig] = None, dispatcher: Optional[BaseDispatcher] = None, # Legacy parameters maintained for backwards compatibility # word_count_threshold=MIN_WORD_THRESHOLD, @@ -685,8 +618,8 @@ class AsyncWebCrawler: # pdf: bool = False, # user_agent: str = None, # verbose=True, - **kwargs - ) -> RunManyReturn: + **kwargs, + ) -> RunManyReturn: """ Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy. 
@@ -742,37 +675,32 @@ class AsyncWebCrawler: def transform_result(task_result): return ( - setattr(task_result.result, 'dispatch_result', - DispatchResult( - task_id=task_result.task_id, - memory_usage=task_result.memory_usage, - peak_memory=task_result.peak_memory, - start_time=task_result.start_time, - end_time=task_result.end_time, - error_message=task_result.error_message, - ) - ) or task_result.result + setattr( + task_result.result, + "dispatch_result", + DispatchResult( + task_id=task_result.task_id, + memory_usage=task_result.memory_usage, + peak_memory=task_result.peak_memory, + start_time=task_result.start_time, + end_time=task_result.end_time, + error_message=task_result.error_message, + ), ) + or task_result.result + ) stream = config.stream - + if stream: + async def result_transformer(): - async for task_result in dispatcher.run_urls_stream(crawler=self, urls=urls, config=config): + async for task_result in dispatcher.run_urls_stream( + crawler=self, urls=urls, config=config + ): yield transform_result(task_result) + return result_transformer() else: _results = await dispatcher.run_urls(crawler=self, urls=urls, config=config) - return [transform_result(res) for res in _results] - - async def aclear_cache(self): - """Clear the cache database.""" - await async_db_manager.cleanup() - - async def aflush_cache(self): - """Flush the cache database.""" - await async_db_manager.aflush_db() - - async def aget_cache_size(self): - """Get the total number of cached items.""" - return await async_db_manager.aget_total_count() + return [transform_result(res) for res in _results] diff --git a/crawl4ai/browser/manager.py b/crawl4ai/browser/manager.py index 288ad7e9..129a940b 100644 --- a/crawl4ai/browser/manager.py +++ b/crawl4ai/browser/manager.py @@ -50,7 +50,7 @@ class BrowserManager: self.logger = logger # Create strategy based on configuration - self._strategy = self._create_strategy() + self.strategy = self._create_strategy() # Initialize state variables for 
compatibility with existing code self.browser = None @@ -92,23 +92,23 @@ class BrowserManager: self: For method chaining """ # Start the strategy - await self._strategy.start() + await self.strategy.start() # Update legacy references - self.browser = self._strategy.browser - self.default_context = self._strategy.default_context + self.browser = self.strategy.browser + self.default_context = self.strategy.default_context # Set browser process reference (for CDP strategy) - if hasattr(self._strategy, 'browser_process'): - self.managed_browser = self._strategy + if hasattr(self.strategy, 'browser_process'): + self.managed_browser = self.strategy # Set Playwright reference - self.playwright = self._strategy.playwright + self.playwright = self.strategy.playwright # Sync sessions if needed - if hasattr(self._strategy, 'sessions'): - self.sessions = self._strategy.sessions - self.session_ttl = self._strategy.session_ttl + if hasattr(self.strategy, 'sessions'): + self.sessions = self.strategy.sessions + self.session_ttl = self.strategy.session_ttl return self @@ -122,11 +122,11 @@ class BrowserManager: Tuple of (Page, BrowserContext) """ # Delegate to strategy - page, context = await self._strategy.get_page(crawlerRunConfig) + page, context = await self.strategy.get_page(crawlerRunConfig) # Sync sessions if needed - if hasattr(self._strategy, 'sessions'): - self.sessions = self._strategy.sessions + if hasattr(self.strategy, 'sessions'): + self.sessions = self.strategy.sessions return page, context @@ -144,14 +144,15 @@ class BrowserManager: List of (Page, Context) tuples """ # Delegate to strategy - pages = await self._strategy.get_pages(crawlerRunConfig, count) + pages = await self.strategy.get_pages(crawlerRunConfig, count) # Sync sessions if needed - if hasattr(self._strategy, 'sessions'): - self.sessions = self._strategy.sessions + if hasattr(self.strategy, 'sessions'): + self.sessions = self.strategy.sessions return pages + # Just for legacy compatibility async def 
kill_session(self, session_id: str): """Kill a browser session and clean up resources. @@ -159,33 +160,16 @@ class BrowserManager: session_id: The session ID to kill """ # Handle kill_session via our strategy if it supports it - await self._strategy.kill_session(session_id) + await self.strategy.kill_session(session_id) # sync sessions if needed - if hasattr(self._strategy, 'sessions'): - self.sessions = self._strategy.sessions - - def _cleanup_expired_sessions(self): - """Clean up expired sessions based on TTL.""" - # Use strategy's implementation if available - if hasattr(self._strategy, '_cleanup_expired_sessions'): - self._strategy._cleanup_expired_sessions() - return - - # Otherwise use our own implementation - current_time = time.time() - expired_sessions = [ - sid - for sid, (_, _, last_used) in self.sessions.items() - if current_time - last_used > self.session_ttl - ] - for sid in expired_sessions: - asyncio.create_task(self.kill_session(sid)) + if hasattr(self.strategy, 'sessions'): + self.sessions = self.strategy.sessions async def close(self): """Close the browser and clean up resources.""" # Delegate to strategy - await self._strategy.close() + await self.strategy.close() # Reset legacy references self.browser = None diff --git a/crawl4ai/browser/strategies/base.py b/crawl4ai/browser/strategies/base.py index 2c500389..5c46cbe4 100644 --- a/crawl4ai/browser/strategies/base.py +++ b/crawl4ai/browser/strategies/base.py @@ -82,6 +82,9 @@ class BaseBrowserStrategy(ABC): return self @abstractmethod + async def _generate_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: + pass + async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: """Get a page with specified configuration. 
@@ -94,6 +97,23 @@ class BaseBrowserStrategy(ABC): Returns: Tuple of (Page, BrowserContext) """ + # Clean up expired sessions first + self._cleanup_expired_sessions() + + # If a session_id is provided and we already have it, reuse that page + context + if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions: + context, page, _ = self.sessions[crawlerRunConfig.session_id] + # Update last-used timestamp + self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) + return page, context + + page, context = await self._generate_page(crawlerRunConfig) + + # If a session_id is specified, store this session so we can reuse later + if crawlerRunConfig.session_id: + self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) + + return page, context pass async def get_pages(self, crawlerRunConfig: CrawlerRunConfig, count: int = 1) -> List[Tuple[Page, BrowserContext]]: @@ -120,31 +140,29 @@ class BaseBrowserStrategy(ABC): """ # Define common browser arguments that improve performance and stability args = [ - "--disable-gpu", - "--disable-gpu-compositing", - "--disable-software-rasterizer", "--no-sandbox", - "--disable-dev-shm-usage", "--no-first-run", "--no-default-browser-check", - "--disable-infobars", "--window-position=0,0", "--ignore-certificate-errors", "--ignore-certificate-errors-spki-list", - "--disable-blink-features=AutomationControlled", "--window-position=400,0", - "--disable-renderer-backgrounding", - "--disable-ipc-flooding-protection", "--force-color-profile=srgb", "--mute-audio", + "--disable-gpu", + "--disable-gpu-compositing", + "--disable-software-rasterizer", + "--disable-dev-shm-usage", + "--disable-infobars", + "--disable-blink-features=AutomationControlled", + "--disable-renderer-backgrounding", + "--disable-ipc-flooding-protection", "--disable-background-timer-throttling", f"--window-size={self.config.viewport_width},{self.config.viewport_height}", ] # Define browser disable options for light mode 
browser_disable_options = [ - "--disable-background-networking", - "--disable-background-timer-throttling", "--disable-backgrounding-occluded-windows", "--disable-breakpad", "--disable-client-side-phishing-detection", @@ -153,13 +171,10 @@ class BaseBrowserStrategy(ABC): "--disable-extensions", "--disable-features=TranslateUI", "--disable-hang-monitor", - "--disable-ipc-flooding-protection", "--disable-popup-blocking", "--disable-prompt-on-repost", "--disable-sync", - "--force-color-profile=srgb", "--metrics-recording-only", - "--no-first-run", "--password-store=basic", "--use-mock-keychain", ] diff --git a/crawl4ai/browser/strategies/builtin.py b/crawl4ai/browser/strategies/builtin.py index 2423ee04..678346fc 100644 --- a/crawl4ai/browser/strategies/builtin.py +++ b/crawl4ai/browser/strategies/builtin.py @@ -115,24 +115,11 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): except Exception as e: if self.logger: self.logger.error(f"Failed to start built-in browser: {str(e)}", tag="BUILTIN") + + # There is a possibility that at this point I need to clean up some resourece raise - async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: - """Get a page for the given configuration. - - Inherits behavior from CDPBrowserStrategy for page management. - - Args: - crawlerRunConfig: Configuration object for the crawler run - - Returns: - Tuple of (Page, BrowserContext) - """ - # For built-in browsers, we use the same page management as CDP strategy - return await super().get_page(crawlerRunConfig) - - @classmethod - def get_builtin_browser_info(cls, debugging_port: int, config_file: str, logger: Optional[AsyncLogger] = None) -> Optional[Dict[str, Any]]: + def _get_builtin_browser_info(cls, debugging_port: int, config_file: str, logger: Optional[AsyncLogger] = None) -> Optional[Dict[str, Any]]: """Get information about the built-in browser for a specific debugging port. 
Args: @@ -157,15 +144,14 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): browser_info = browser_info_dict["port_map"][port_str] # Check if the browser is still running - pids = browser_info.get('pid') - if type(pids) == str and len(pids.split("\n")) > 1: - pids = [int(pid) for pid in pids.split("\n") if pid.isdigit()] - elif type(pids) == str and pids.isdigit(): - pids = [int(pids)] - elif type(pids) == int: + pids = browser_info.get('pid', '') + if isinstance(pids, str): + pids = [int(pid) for pid in pids.split() if pid.isdigit()] + elif isinstance(pids, int): pids = [pids] else: pids = [] + # Check if any of the PIDs are running if not pids: if logger: @@ -205,7 +191,7 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): Returns: dict: Browser information or None if no running browser is configured """ - return self.get_builtin_browser_info( + return self._get_builtin_browser_info( debugging_port=self.config.debugging_port, config_file=self.builtin_config_file, logger=self.logger @@ -226,7 +212,7 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): str: CDP URL for the browser, or None if launch failed """ # Check if there's an existing browser still running - browser_info = self.get_builtin_browser_info( + browser_info = self._get_builtin_browser_info( debugging_port=debugging_port, config_file=self.builtin_config_file, logger=self.logger @@ -238,6 +224,7 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): # Create a user data directory for the built-in browser user_data_dir = os.path.join(self.builtin_browser_dir, "user_data") + # Raise error if user data dir is already engaged if self._check_user_dir_is_engaged(user_data_dir): raise Exception(f"User data directory {user_data_dir} is already engaged by another browser instance.") @@ -246,15 +233,19 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): os.makedirs(user_data_dir, exist_ok=True) # Prepare browser launch arguments + browser_args = super()._build_browser_args() browser_path = await 
get_browser_executable(browser_type) + base_args = [browser_path] + if browser_type == "chromium": args = [ browser_path, f"--remote-debugging-port={debugging_port}", f"--user-data-dir={user_data_dir}", ] - if headless: - args.append("--headless=new") + # if headless: + # args.append("--headless=new") + elif browser_type == "firefox": args = [ browser_path, @@ -270,6 +261,8 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): self.logger.error(f"Browser type {browser_type} not supported for built-in browser", tag="BUILTIN") return None + args = base_args + browser_args + args + try: # Check if the port is already in use @@ -333,11 +326,12 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): # Check if it already uses port mapping if isinstance(existing_data, dict) and "port_map" in existing_data: port_map = existing_data["port_map"] - # Convert legacy format to port mapping - elif isinstance(existing_data, dict) and "debugging_port" in existing_data: - old_port = str(existing_data.get("debugging_port")) - if self._is_browser_running(existing_data.get("pid")): - port_map[old_port] = existing_data + + # # Convert legacy format to port mapping + # elif isinstance(existing_data, dict) and "debugging_port" in existing_data: + # old_port = str(existing_data.get("debugging_port")) + # if self._is_browser_running(existing_data.get("pid")): + # port_map[old_port] = existing_data except Exception as e: if self.logger: self.logger.warning(f"Could not read existing config: {str(e)}", tag="BUILTIN") @@ -413,15 +407,19 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): # Update config file to remove this browser with open(self.builtin_config_file, 'r') as f: browser_info_dict = json.load(f) + # Remove this port from the dictionary port_str = str(self.config.debugging_port) if port_str in browser_info_dict.get("port_map", {}): del browser_info_dict["port_map"][port_str] + with open(self.builtin_config_file, 'w') as f: json.dump(browser_info_dict, f, indent=2) + # Remove user 
data directory if it exists if os.path.exists(self.builtin_browser_dir): shutil.rmtree(self.builtin_browser_dir) + # Clear the browser info cache self.browser = None self.temp_dir = None @@ -460,14 +458,11 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): async def close(self): """Close the built-in browser and clean up resources.""" - # Store the shutting_down state - was_shutting_down = getattr(self, 'shutting_down', False) - # Call parent class close method await super().close() # Clean up built-in browser if we created it and were in shutdown mode - if was_shutting_down: + if self.shutting_down: await self.kill_builtin_browser() if self.logger: self.logger.debug("Killed built-in browser during shutdown", tag="BUILTIN") \ No newline at end of file diff --git a/crawl4ai/browser/strategies/cdp.py b/crawl4ai/browser/strategies/cdp.py index 26eba00e..e5982065 100644 --- a/crawl4ai/browser/strategies/cdp.py +++ b/crawl4ai/browser/strategies/cdp.py @@ -68,9 +68,11 @@ class CDPBrowserStrategy(BaseBrowserStrategy): if self.logger: self.logger.debug(f"Connected to CDP browser at {cdp_url}", tag="CDP") + except Exception as e: if self.logger: self.logger.error(f"Failed to connect to CDP browser: {str(e)}", tag="CDP") + # Clean up any resources before re-raising await self._cleanup_process() raise @@ -95,7 +97,32 @@ class CDPBrowserStrategy(BaseBrowserStrategy): user_data_dir = self.config.user_data_dir # Get browser args based on OS and browser type - args = await self._get_browser_args(user_data_dir) + # args = await self._get_browser_args(user_data_dir) + browser_args = super()._build_browser_args() + browser_path = await get_browser_executable(self.config.browser_type) + base_args = [browser_path] + + if self.config.browser_type == "chromium": + args = [ + f"--remote-debugging-port={self.config.debugging_port}", + f"--user-data-dir={user_data_dir}", + ] + # if self.config.headless: + # args.append("--headless=new") + + elif self.config.browser_type == "firefox": + 
args = [ + "--remote-debugging-port", + str(self.config.debugging_port), + "--profile", + user_data_dir, + ] + if self.config.headless: + args.append("--headless") + else: + raise NotImplementedError(f"Browser type {self.config.browser_type} not supported") + + args = base_args + browser_args + args # Start browser process try: @@ -136,40 +163,6 @@ class CDPBrowserStrategy(BaseBrowserStrategy): except Exception as e: await self._cleanup_process() raise Exception(f"Failed to start browser: {e}") - - async def _get_browser_args(self, user_data_dir: str) -> List[str]: - """Returns browser-specific command line arguments. - - Args: - user_data_dir: Path to user data directory - - Returns: - List of command-line arguments for the browser - """ - browser_args = super()._build_browser_args() - browser_path = await get_browser_executable(self.config.browser_type) - base_args = [browser_path] - - if self.config.browser_type == "chromium": - args = [ - f"--remote-debugging-port={self.config.debugging_port}", - f"--user-data-dir={user_data_dir}", - ] - if self.config.headless: - args.append("--headless=new") - elif self.config.browser_type == "firefox": - args = [ - "--remote-debugging-port", - str(self.config.debugging_port), - "--profile", - user_data_dir, - ] - if self.config.headless: - args.append("--headless") - else: - raise NotImplementedError(f"Browser type {self.config.browser_type} not supported") - - return base_args + browser_args + args async def _cleanup_process(self): """Cleanup browser process and temporary directory.""" @@ -204,15 +197,40 @@ class CDPBrowserStrategy(BaseBrowserStrategy): if self.temp_dir and os.path.exists(self.temp_dir): try: shutil.rmtree(self.temp_dir) + self.temp_dir = None + if self.logger: + self.logger.debug("Removed temporary directory", tag="CDP") except Exception as e: if self.logger: self.logger.error( message="Error removing temporary directory: {error}", - tag="ERROR", - params={"error": str(e)}, + tag="CDP", + params={"error": 
str(e)} ) + + self.browser_process = None - async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: + async def _generate_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: + # For CDP, we typically use the shared default_context + context = self.default_context + pages = context.pages + + # Otherwise, check if we have an existing context for this config + config_signature = self._make_config_signature(crawlerRunConfig) + self.contexts_by_config[config_signature] = context + + await self.setup_context(context, crawlerRunConfig) + + # Check if there's already a page with the target URL + page = next((p for p in pages if p.url == crawlerRunConfig.url), None) + + # If not found, create a new page + if not page: + page = await context.new_page() + + return page, context + + async def _get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: """Get a page for the given configuration. Args: @@ -221,15 +239,8 @@ class CDPBrowserStrategy(BaseBrowserStrategy): Returns: Tuple of (Page, BrowserContext) """ - # Clean up expired sessions using base class method - self._cleanup_expired_sessions() - - # If a session_id is provided and we already have it, reuse that page + context - if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions: - context, page, _ = self.sessions[crawlerRunConfig.session_id] - # Update last-used timestamp - self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) - return page, context + # Call parent method to ensure browser is started + await super().get_page(crawlerRunConfig) # For CDP, we typically use the shared default_context context = self.default_context @@ -266,24 +277,5 @@ class CDPBrowserStrategy(BaseBrowserStrategy): await super().close() # Additional CDP-specific cleanup - if self.browser_process: - await asyncio.sleep(0.5) - await self._cleanup_process() - self.browser_process = None - if self.logger: - 
self.logger.debug("Cleaned up CDP browser process", tag="CDP") - - # Clean up temporary directory - if self.temp_dir and os.path.exists(self.temp_dir): - try: - shutil.rmtree(self.temp_dir) - self.temp_dir = None - if self.logger: - self.logger.debug("Removed temporary directory", tag="CDP") - except Exception as e: - if self.logger: - self.logger.error( - message="Error removing temporary directory: {error}", - tag="CDP", - params={"error": str(e)} - ) \ No newline at end of file + await asyncio.sleep(0.5) + await self._cleanup_process() diff --git a/crawl4ai/browser/strategies/docker_strategy.py b/crawl4ai/browser/strategies/docker_strategy.py index a71d48e9..5390fc8a 100644 --- a/crawl4ai/browser/strategies/docker_strategy.py +++ b/crawl4ai/browser/strategies/docker_strategy.py @@ -15,7 +15,7 @@ from ..models import DockerConfig from ..docker_registry import DockerRegistry from ..docker_utils import DockerUtils from .builtin import CDPBrowserStrategy - +from .base import BaseBrowserStrategy class DockerBrowserStrategy(CDPBrowserStrategy): """Docker-based browser strategy. @@ -79,9 +79,7 @@ class DockerBrowserStrategy(CDPBrowserStrategy): self: For method chaining """ # Initialize Playwright - from ..utils import get_playwright - - self.playwright = await get_playwright() + await BaseBrowserStrategy.start(self) if self.logger: self.logger.info( @@ -172,121 +170,6 @@ class DockerBrowserStrategy(CDPBrowserStrategy): # Use the utility method to generate the hash return self.docker_utils.generate_config_hash(config_dict) - async def _get_or_create_cdp_url1(self) -> str: - """Get CDP URL by either creating a new container or using an existing one. 
- - Returns: - CDP URL for connecting to the browser - - Raises: - Exception: If container creation or browser launch fails - """ - # If CDP URL is explicitly provided, use it - if self.config.cdp_url: - return self.config.cdp_url - - # Ensure Docker image exists (will build if needed) - image_name = await self.docker_utils.ensure_docker_image_exists( - self.docker_config.image, self.docker_config.mode - ) - - # Generate config hash for container matching - config_hash = await self._generate_config_hash() - - # Look for existing container with matching config - container_id = self.registry.find_container_by_config( - config_hash, self.docker_utils - ) - - if container_id: - # Use existing container - self.container_id = container_id - host_port = self.registry.get_container_host_port(container_id) - if self.logger: - self.logger.info( - f"Using existing Docker container: {container_id[:12]}", - tag="DOCKER", - ) - else: - # Get a port for the new container - host_port = ( - self.docker_config.host_port - or self.registry.get_next_available_port(self.docker_utils) - ) - - # Prepare volumes list - volumes = list(self.docker_config.volumes) - - # Add user data directory if specified - if self.docker_config.user_data_dir: - # Ensure user data directory exists - os.makedirs(self.docker_config.user_data_dir, exist_ok=True) - volumes.append( - f"{self.docker_config.user_data_dir}:{self.docker_config.container_user_data_dir}" - ) - - # Update config user_data_dir to point to container path - self.config.user_data_dir = self.docker_config.container_user_data_dir - - # Create a new container - container_id = await self.docker_utils.create_container( - image_name=image_name, - host_port=host_port, - container_name=self.container_name, - volumes=volumes, - network=self.docker_config.network, - env_vars=self.docker_config.env_vars, - extra_args=self.docker_config.extra_args, - ) - - if not container_id: - raise Exception("Failed to create Docker container") - - 
self.container_id = container_id - - # Register the container - self.registry.register_container(container_id, host_port, config_hash) - - # Wait for container to be ready - await self.docker_utils.wait_for_container_ready(container_id) - - # Handle specific setup based on mode - if self.docker_config.mode == "launch": - # In launch mode, we need to start socat and Chrome - await self.docker_utils.start_socat_in_container(container_id) - - # Build browser arguments - browser_args = self._build_browser_args() - - # Launch Chrome - await self.docker_utils.launch_chrome_in_container( - container_id, browser_args - ) - - # Get PIDs for later cleanup - self.chrome_process_id = ( - await self.docker_utils.get_process_id_in_container( - container_id, "chrome" - ) - ) - self.socat_process_id = ( - await self.docker_utils.get_process_id_in_container( - container_id, "socat" - ) - ) - - # Wait for CDP to be ready - await self.docker_utils.wait_for_cdp_ready(host_port) - - if self.logger: - self.logger.success( - f"Docker container ready: {container_id[:12]} on port {host_port}", - tag="DOCKER", - ) - - # Return CDP URL - return f"http://localhost:{host_port}" - async def _get_or_create_cdp_url(self) -> str: """Get CDP URL by either creating a new container or using an existing one. 
@@ -465,8 +348,7 @@ class DockerBrowserStrategy(CDPBrowserStrategy): async def close(self): """Close the browser and clean up Docker container if needed.""" # Set flag to track if we were the ones initiating shutdown - initiated_shutdown = not getattr(self, "shutting_down", False) - + initiated_shutdown = not self.shutting_down # Storage persistence for Docker needs special handling # We need to store state before calling super().close() which will close the browser if ( diff --git a/crawl4ai/browser/strategies/playwright.py b/crawl4ai/browser/strategies/playwright.py index b24edf72..bea99753 100644 --- a/crawl4ai/browser/strategies/playwright.py +++ b/crawl4ai/browser/strategies/playwright.py @@ -80,8 +80,26 @@ class PlaywrightBrowserStrategy(BaseBrowserStrategy): raise return self - - async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: + + async def _generate_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: + # Otherwise, check if we have an existing context for this config + config_signature = self._make_config_signature(crawlerRunConfig) + + async with self._contexts_lock: + if config_signature in self.contexts_by_config: + context = self.contexts_by_config[config_signature] + else: + # Create and setup a new context + context = await self.create_browser_context(crawlerRunConfig) + await self.setup_context(context, crawlerRunConfig) + self.contexts_by_config[config_signature] = context + + # Create a new page from the chosen context + page = await context.new_page() + + return page, context + + async def _get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: """Get a page for the given configuration. 
Args: @@ -90,15 +108,8 @@ class PlaywrightBrowserStrategy(BaseBrowserStrategy): Returns: Tuple of (Page, BrowserContext) """ - # Clean up expired sessions first - self._cleanup_expired_sessions() - - # If a session_id is provided and we already have it, reuse that page + context - if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions: - context, page, _ = self.sessions[crawlerRunConfig.session_id] - # Update last-used timestamp - self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) - return page, context + # Call parent method to ensure browser is started + await super().get_page(crawlerRunConfig) # Otherwise, check if we have an existing context for this config config_signature = self._make_config_signature(crawlerRunConfig) @@ -121,8 +132,3 @@ class PlaywrightBrowserStrategy(BaseBrowserStrategy): return page, context - async def close(self): - """Close the Playwright browser and clean up resources.""" - # The base implementation already handles everything needed for Playwright - # including storage persistence, sessions, contexts, browser and playwright - await super().close() \ No newline at end of file diff --git a/crawl4ai/models.py b/crawl4ai/models.py index f9551c1a..aad14a1d 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -1,5 +1,7 @@ from pydantic import BaseModel, HttpUrl, PrivateAttr from typing import List, Dict, Optional, Callable, Awaitable, Union, Any +from typing import AsyncGenerator +from typing import Generic, TypeVar from enum import Enum from dataclasses import dataclass from .ssl_certificate import SSLCertificate @@ -34,34 +36,12 @@ class CrawlerTaskResult: def success(self) -> bool: return self.result.success - class CrawlStatus(Enum): QUEUED = "QUEUED" IN_PROGRESS = "IN_PROGRESS" COMPLETED = "COMPLETED" FAILED = "FAILED" - -# @dataclass -# class CrawlStats: -# task_id: str -# url: str -# status: CrawlStatus -# start_time: Optional[datetime] = None -# end_time: Optional[datetime] = 
None -# memory_usage: float = 0.0 -# peak_memory: float = 0.0 -# error_message: str = "" - -# @property -# def duration(self) -> str: -# if not self.start_time: -# return "0:00" -# end = self.end_time or datetime.now() -# duration = end - self.start_time -# return str(timedelta(seconds=int(duration.total_seconds()))) - - @dataclass class CrawlStats: task_id: str @@ -95,7 +75,6 @@ class CrawlStats: duration = end - start return str(timedelta(seconds=int(duration.total_seconds()))) - class DisplayMode(Enum): DETAILED = "DETAILED" AGGREGATED = "AGGREGATED" @@ -112,12 +91,10 @@ class TokenUsage: completion_tokens_details: Optional[dict] = None prompt_tokens_details: Optional[dict] = None - class UrlModel(BaseModel): url: HttpUrl forced: bool = False - class MarkdownGenerationResult(BaseModel): raw_markdown: str markdown_with_citations: str @@ -284,6 +261,40 @@ class StringCompatibleMarkdown(str): def __getattr__(self, name): return getattr(self._markdown_result, name) +CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult) + +class CrawlResultContainer(Generic[CrawlResultT]): + def __init__(self, results: Union[CrawlResultT, List[CrawlResultT]]): + # Normalize to a list + if isinstance(results, list): + self._results = results + else: + self._results = [results] + + def __iter__(self): + return iter(self._results) + + def __getitem__(self, index): + return self._results[index] + + def __len__(self): + return len(self._results) + + def __getattr__(self, attr): + # Delegate attribute access to the first element. + if self._results: + return getattr(self._results[0], attr) + raise AttributeError(f"{self.__class__.__name__} object has no attribute '{attr}'") + + def __repr__(self): + return f"{self.__class__.__name__}({self._results!r})" + +RunManyReturn = Union[ + CrawlResultContainer[CrawlResultT], + AsyncGenerator[CrawlResultT, None] +] + + # END of backward compatibility code for markdown/markdown_v2. # When removing this code in the future, make sure to: # 1. 
Replace the private attribute and property with a standard field @@ -304,7 +315,6 @@ class AsyncCrawlResponse(BaseModel): class Config: arbitrary_types_allowed = True - ############################### # Scraping Models ############################### diff --git a/tests/browser/docker/test_docker_browser.py b/tests/browser/docker/test_docker_browser.py index cde5df23..2ec64a6b 100644 --- a/tests/browser/docker/test_docker_browser.py +++ b/tests/browser/docker/test_docker_browser.py @@ -530,7 +530,7 @@ async def test_docker_registry_reuse(): logger.info("First browser started successfully", tag="TEST") # Get container ID from the strategy - docker_strategy1 = manager1._strategy + docker_strategy1 = manager1.strategy container_id1 = docker_strategy1.container_id logger.info(f"First browser container ID: {container_id1[:12]}", tag="TEST") @@ -560,7 +560,7 @@ async def test_docker_registry_reuse(): logger.info("Second browser started successfully", tag="TEST") # Get container ID from the second strategy - docker_strategy2 = manager2._strategy + docker_strategy2 = manager2.strategy container_id2 = docker_strategy2.container_id logger.info(f"Second browser container ID: {container_id2[:12]}", tag="TEST") diff --git a/tests/browser/test_builtin_browser.py b/tests/browser/test_builtin_browser.py index 0735e457..4797648c 100644 --- a/tests/browser/test_builtin_browser.py +++ b/tests/browser/test_builtin_browser.py @@ -56,13 +56,13 @@ async def test_builtin_browser_creation(): # Step 2: Check if we have a BuiltinBrowserStrategy print(f"\n{INFO}2. 
Checking if we have a BuiltinBrowserStrategy{RESET}") - if isinstance(manager._strategy, BuiltinBrowserStrategy): + if isinstance(manager.strategy, BuiltinBrowserStrategy): print( - f"{SUCCESS}Correct strategy type: {manager._strategy.__class__.__name__}{RESET}" + f"{SUCCESS}Correct strategy type: {manager.strategy.__class__.__name__}{RESET}" ) else: print( - f"{ERROR}Wrong strategy type: {manager._strategy.__class__.__name__}{RESET}" + f"{ERROR}Wrong strategy type: {manager.strategy.__class__.__name__}{RESET}" ) return None @@ -77,7 +77,7 @@ async def test_builtin_browser_creation(): # Step 4: Get browser info from the strategy print(f"\n{INFO}4. Getting browser information{RESET}") - browser_info = manager._strategy.get_browser_info() + browser_info = manager.strategy.get_browser_info() if browser_info: print(f"{SUCCESS}Browser info retrieved:{RESET}") for key, value in browser_info.items(): @@ -149,7 +149,7 @@ async def test_browser_status_management(manager: BrowserManager): # Step 1: Get browser status print(f"\n{INFO}1. Getting browser status{RESET}") try: - status = await manager._strategy.get_builtin_browser_status() + status = await manager.strategy.get_builtin_browser_status() print(f"{SUCCESS}Browser status:{RESET}") print(f" Running: {status['running']}") print(f" CDP URL: {status['cdp_url']}") @@ -160,7 +160,7 @@ async def test_browser_status_management(manager: BrowserManager): # Step 2: Test killing the browser print(f"\n{INFO}2. Testing killing the browser{RESET}") try: - result = await manager._strategy.kill_builtin_browser() + result = await manager.strategy.kill_builtin_browser() if result: print(f"{SUCCESS}Browser killed successfully{RESET}") else: @@ -172,7 +172,7 @@ async def test_browser_status_management(manager: BrowserManager): # Step 3: Check status after kill print(f"\n{INFO}3. 
Checking status after kill{RESET}") try: - status = await manager._strategy.get_builtin_browser_status() + status = await manager.strategy.get_builtin_browser_status() if not status["running"]: print(f"{SUCCESS}Browser is correctly reported as not running{RESET}") else: @@ -184,7 +184,7 @@ async def test_browser_status_management(manager: BrowserManager): # Step 4: Launch a new browser print(f"\n{INFO}4. Launching a new browser{RESET}") try: - cdp_url = await manager._strategy.launch_builtin_browser( + cdp_url = await manager.strategy.launch_builtin_browser( browser_type="chromium", headless=True ) if cdp_url: @@ -223,8 +223,8 @@ async def test_multiple_managers(): print(f"{SUCCESS}Second manager started{RESET}") # Check if they got the same CDP URL - cdp_url1 = manager1._strategy.config.cdp_url - cdp_url2 = manager2._strategy.config.cdp_url + cdp_url1 = manager1.strategy.config.cdp_url + cdp_url2 = manager2.strategy.config.cdp_url if cdp_url1 == cdp_url2: print( @@ -316,7 +316,7 @@ async def test_edge_cases(): # Kill the browser directly print(f"{INFO}Killing the browser...{RESET}") - await manager._strategy.kill_builtin_browser() + await manager.strategy.kill_builtin_browser() print(f"{SUCCESS}Browser killed{RESET}") # Try to get a page (should fail or launch a new browser) @@ -350,7 +350,7 @@ async def cleanup_browsers(): try: # No need to start, just access the strategy directly - strategy = manager._strategy + strategy = manager.strategy if isinstance(strategy, BuiltinBrowserStrategy): result = await strategy.kill_builtin_browser() if result: @@ -420,7 +420,7 @@ async def test_performance_scaling(): user_data_dir=os.path.join(temp_dir, f"browser_profile_{i}"), ) manager = BrowserManager(browser_config=browser_config, logger=logger) - manager._strategy.shutting_down = True + manager.strategy.shutting_down = True manager_configs.append((manager, i, port)) # Define async function to start a single manager @@ -614,7 +614,7 @@ async def 
test_performance_scaling_lab( num_browsers: int = 10, pages_per_browse user_data_dir=os.path.join(temp_dir, f"browser_profile_{i}"), ) manager = BrowserManager(browser_config=browser_config, logger=logger) - manager._strategy.shutting_down = True + manager.strategy.shutting_down = True manager_configs.append((manager, i, port)) # Define async function to start a single manager From 555455d71032b16e96c98340ccc3bc6db0a28d9c Mon Sep 17 00:00:00 2001 From: UncleCode Date: Mon, 31 Mar 2025 21:55:07 +0800 Subject: [PATCH 6/7] feat(browser): implement browser pooling and page pre-warming Adds a new BrowserManager implementation with browser pooling and page pre-warming capabilities: - Adds support for managing multiple browser instances per configuration - Implements page pre-warming for improved performance - Adds configurable behavior for when no browsers are available - Includes comprehensive status reporting and monitoring - Maintains backward compatibility with existing API - Adds demo script showcasing new features BREAKING CHANGE: BrowserManager API now returns a strategy instance along with page and context --- crawl4ai/async_configs.py | 2 +- crawl4ai/browser/manager copy.py | 177 ++++ crawl4ai/browser/manager.py | 867 ++++++++++++++++-- crawl4ai/browser/strategies/base.py | 9 + crawl4ai/browser/strategies/cdp.py | 2 +- tests/browser/manager/demo_browser_manager.py | 525 +++++++++++ 6 files changed, 1484 insertions(+), 98 deletions(-) create mode 100644 crawl4ai/browser/manager copy.py create mode 100644 tests/browser/manager/demo_browser_manager.py diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 9198fa1d..8833eea5 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -270,7 +270,7 @@ class BrowserConfig: host: str = "localhost", ): self.browser_type = browser_type - self.headless = headless and "new" or False + self.headless = headless or True self.browser_mode = browser_mode self.use_managed_browser = 
use_managed_browser self.cdp_url = cdp_url diff --git a/crawl4ai/browser/manager copy.py b/crawl4ai/browser/manager copy.py new file mode 100644 index 00000000..97aaf587 --- /dev/null +++ b/crawl4ai/browser/manager copy.py @@ -0,0 +1,177 @@ +"""Browser manager module for Crawl4AI. + +This module provides a central browser management class that uses the +strategy pattern internally while maintaining the existing API. +It also implements a page pooling mechanism for improved performance. +""" + +from typing import Optional, Tuple, List + +from playwright.async_api import Page, BrowserContext + +from ..async_logger import AsyncLogger +from ..async_configs import BrowserConfig, CrawlerRunConfig + +from .strategies import ( + BaseBrowserStrategy, + PlaywrightBrowserStrategy, + CDPBrowserStrategy, + BuiltinBrowserStrategy, + DockerBrowserStrategy +) + +class BrowserManager: + """Main interface for browser management in Crawl4AI. + + This class maintains backward compatibility with the existing implementation + while using the strategy pattern internally for different browser types. + + Attributes: + config (BrowserConfig): Configuration object containing all browser settings + logger: Logger instance for recording events and errors + browser: The browser instance + default_context: The default browser context + managed_browser: The managed browser instance + playwright: The Playwright instance + sessions: Dictionary to store session information + session_ttl: Session timeout in seconds + """ + + def __init__(self, browser_config: Optional[BrowserConfig] = None, logger: Optional[AsyncLogger] = None): + """Initialize the BrowserManager with a browser configuration. 
+ + Args: + browser_config: Configuration object containing all browser settings + logger: Logger instance for recording events and errors + """ + self.config = browser_config or BrowserConfig() + self.logger = logger + + # Create strategy based on configuration + self.strategy = self._create_strategy() + + # Initialize state variables for compatibility with existing code + self.browser = None + self.default_context = None + self.managed_browser = None + self.playwright = None + + # For session management (from existing implementation) + self.sessions = {} + self.session_ttl = 1800 # 30 minutes + + def _create_strategy(self) -> BaseBrowserStrategy: + """Create appropriate browser strategy based on configuration. + + Returns: + BaseBrowserStrategy: The selected browser strategy + """ + if self.config.browser_mode == "builtin": + return BuiltinBrowserStrategy(self.config, self.logger) + elif self.config.browser_mode == "docker": + if DockerBrowserStrategy is None: + if self.logger: + self.logger.error( + "Docker browser strategy requested but not available. " + "Falling back to PlaywrightBrowserStrategy.", + tag="BROWSER" + ) + return PlaywrightBrowserStrategy(self.config, self.logger) + return DockerBrowserStrategy(self.config, self.logger) + elif self.config.browser_mode == "cdp" or self.config.cdp_url or self.config.use_managed_browser: + return CDPBrowserStrategy(self.config, self.logger) + else: + return PlaywrightBrowserStrategy(self.config, self.logger) + + async def start(self): + """Start the browser instance and set up the default context. 
+ + Returns: + self: For method chaining + """ + # Start the strategy + await self.strategy.start() + + # Update legacy references + self.browser = self.strategy.browser + self.default_context = self.strategy.default_context + + # Set browser process reference (for CDP strategy) + if hasattr(self.strategy, 'browser_process'): + self.managed_browser = self.strategy + + # Set Playwright reference + self.playwright = self.strategy.playwright + + # Sync sessions if needed + if hasattr(self.strategy, 'sessions'): + self.sessions = self.strategy.sessions + self.session_ttl = self.strategy.session_ttl + + return self + + async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: + """Get a page for the given configuration. + + Args: + crawlerRunConfig: Configuration object for the crawler run + + Returns: + Tuple of (Page, BrowserContext) + """ + # Delegate to strategy + page, context = await self.strategy.get_page(crawlerRunConfig) + + # Sync sessions if needed + if hasattr(self.strategy, 'sessions'): + self.sessions = self.strategy.sessions + + return page, context + + async def get_pages(self, crawlerRunConfig: CrawlerRunConfig, count: int = 1) -> List[Tuple[Page, BrowserContext]]: + """Get multiple pages with the same configuration. + + This method efficiently creates multiple browser pages using the same configuration, + which is useful for parallel crawling of multiple URLs. + + Args: + crawlerRunConfig: Configuration for the pages + count: Number of pages to create + + Returns: + List of (Page, Context) tuples + """ + # Delegate to strategy + pages = await self.strategy.get_pages(crawlerRunConfig, count) + + # Sync sessions if needed + if hasattr(self.strategy, 'sessions'): + self.sessions = self.strategy.sessions + + return pages + + # Just for legacy compatibility + async def kill_session(self, session_id: str): + """Kill a browser session and clean up resources. 
+ + Args: + session_id: The session ID to kill + """ + # Handle kill_session via our strategy if it supports it + await self.strategy.kill_session(session_id) + + # sync sessions if needed + if hasattr(self.strategy, 'sessions'): + self.sessions = self.strategy.sessions + + async def close(self): + """Close the browser and clean up resources.""" + # Delegate to strategy + await self.strategy.close() + + # Reset legacy references + self.browser = None + self.default_context = None + self.managed_browser = None + self.playwright = None + self.sessions = {} diff --git a/crawl4ai/browser/manager.py b/crawl4ai/browser/manager.py index 129a940b..ba48cbd7 100644 --- a/crawl4ai/browser/manager.py +++ b/crawl4ai/browser/manager.py @@ -2,12 +2,15 @@ This module provides a central browser management class that uses the strategy pattern internally while maintaining the existing API. -It also implements a page pooling mechanism for improved performance. +It also implements browser pooling for improved performance. """ import asyncio -import time -from typing import Optional, Tuple, List +import hashlib +import json +import math +from enum import Enum +from typing import Dict, List, Optional, Tuple, Any from playwright.async_api import Page, BrowserContext @@ -22,55 +25,111 @@ from .strategies import ( DockerBrowserStrategy ) +class UnavailableBehavior(Enum): + """Behavior when no browser is available.""" + ON_DEMAND = "on_demand" # Create new browser on demand + PENDING = "pending" # Wait until a browser is available + EXCEPTION = "exception" # Raise an exception + + class BrowserManager: - """Main interface for browser management in Crawl4AI. + """Main interface for browser management and pooling in Crawl4AI. This class maintains backward compatibility with the existing implementation while using the strategy pattern internally for different browser types. + It also implements browser pooling for improved performance. 
Attributes: - config (BrowserConfig): Configuration object containing all browser settings - logger: Logger instance for recording events and errors - browser: The browser instance - default_context: The default browser context - managed_browser: The managed browser instance - playwright: The Playwright instance - sessions: Dictionary to store session information - session_ttl: Session timeout in seconds + config (BrowserConfig): Default configuration object for browsers + logger (AsyncLogger): Logger instance for recording events and errors + browser_pool (Dict): Dictionary to store browser instances by configuration + browser_in_use (Dict): Dictionary to track which browsers are in use + request_queues (Dict): Queues for pending requests by configuration + unavailable_behavior (UnavailableBehavior): Behavior when no browser is available """ - def __init__(self, browser_config: Optional[BrowserConfig] = None, logger: Optional[AsyncLogger] = None): + def __init__( + self, + browser_config: Optional[BrowserConfig] = None, + logger: Optional[AsyncLogger] = None, + unavailable_behavior: UnavailableBehavior = UnavailableBehavior.EXCEPTION, + max_browsers_per_config: int = 10, + max_pages_per_browser: int = 5 + ): """Initialize the BrowserManager with a browser configuration. 
Args: browser_config: Configuration object containing all browser settings logger: Logger instance for recording events and errors + unavailable_behavior: Behavior when no browser is available + max_browsers_per_config: Maximum number of browsers per configuration + max_pages_per_browser: Maximum number of pages per browser """ self.config = browser_config or BrowserConfig() self.logger = logger + self.unavailable_behavior = unavailable_behavior + self.max_browsers_per_config = max_browsers_per_config + self.max_pages_per_browser = max_pages_per_browser - # Create strategy based on configuration - self.strategy = self._create_strategy() + # Browser pool management + self.browser_pool = {} # config_hash -> list of browser strategies + self.browser_in_use = {} # strategy instance -> Boolean + self.request_queues = {} # config_hash -> asyncio.Queue() + self._browser_locks = {} # config_hash -> asyncio.Lock() + self._browser_pool_lock = asyncio.Lock() # Global lock for pool modifications - # Initialize state variables for compatibility with existing code + # Page pool management + self.page_pool = {} # (browser_config_hash, crawler_config_hash) -> list of (page, context, strategy) + self._page_pool_lock = asyncio.Lock() + + self.browser_page_counts = {} # strategy instance -> current page count + self._page_count_lock = asyncio.Lock() # Lock for thread-safe access to page counts + + # For session management (from existing implementation) + self.sessions = {} + self.session_ttl = 1800 # 30 minutes + + # For legacy compatibility self.browser = None self.default_context = None self.managed_browser = None self.playwright = None - - # For session management (from existing implementation) - self.sessions = {} - self.session_ttl = 1800 # 30 minutes + self.strategy = None - def _create_strategy(self) -> BaseBrowserStrategy: + def _create_browser_config_hash(self, browser_config: BrowserConfig) -> str: + """Create a hash of the browser configuration for browser pooling. 
+ + Args: + browser_config: Browser configuration + + Returns: + str: Hash of the browser configuration + """ + # Convert config to dictionary, excluding any callable objects + config_dict = browser_config.__dict__.copy() + for key in list(config_dict.keys()): + if callable(config_dict[key]): + del config_dict[key] + + # Convert to canonical JSON string + config_json = json.dumps(config_dict, sort_keys=True, default=str) + + # Hash the JSON + config_hash = hashlib.sha256(config_json.encode()).hexdigest() + return config_hash + + def _create_strategy(self, browser_config: BrowserConfig) -> BaseBrowserStrategy: """Create appropriate browser strategy based on configuration. + Args: + browser_config: Browser configuration + Returns: BaseBrowserStrategy: The selected browser strategy """ - if self.config.browser_mode == "builtin": - return BuiltinBrowserStrategy(self.config, self.logger) - elif self.config.browser_mode == "docker": + if browser_config.browser_mode == "builtin": + return BuiltinBrowserStrategy(browser_config, self.logger) + elif browser_config.browser_mode == "docker": if DockerBrowserStrategy is None: if self.logger: self.logger.error( @@ -78,102 +137,718 @@ class BrowserManager: "Falling back to PlaywrightBrowserStrategy.", tag="BROWSER" ) - return PlaywrightBrowserStrategy(self.config, self.logger) - return DockerBrowserStrategy(self.config, self.logger) - elif self.config.browser_mode == "cdp" or self.config.cdp_url or self.config.use_managed_browser: - return CDPBrowserStrategy(self.config, self.logger) + return PlaywrightBrowserStrategy(browser_config, self.logger) + return DockerBrowserStrategy(browser_config, self.logger) + elif browser_config.browser_mode == "cdp" or browser_config.cdp_url or browser_config.use_managed_browser: + return CDPBrowserStrategy(browser_config, self.logger) else: - return PlaywrightBrowserStrategy(self.config, self.logger) + return PlaywrightBrowserStrategy(browser_config, self.logger) + async def initialize_pool( + 
self, + browser_configs: List[BrowserConfig] = None, + browsers_per_config: int = 1, + page_configs: Optional[List[Tuple[BrowserConfig, CrawlerRunConfig, int]]] = None + ): + """Initialize the browser pool with multiple browser configurations. + + Args: + browser_configs: List of browser configurations to initialize + browsers_per_config: Number of browser instances per configuration + page_configs: Optional list of (browser_config, crawler_run_config, count) tuples + for pre-warming pages + + Returns: + self: For method chaining + """ + if not browser_configs: + browser_configs = [self.config] + + # Calculate how many browsers we'll need based on page_configs + browsers_needed = {} + if page_configs: + for browser_config, _, page_count in page_configs: + config_hash = self._create_browser_config_hash(browser_config) + # Calculate browsers based on max_pages_per_browser + browsers_needed_for_config = math.ceil(page_count / self.max_pages_per_browser) + browsers_needed[config_hash] = max( + browsers_needed.get(config_hash, 0), + browsers_needed_for_config + ) + + # Adjust browsers_per_config if needed to ensure enough capacity + config_browsers_needed = {} + for browser_config in browser_configs: + config_hash = self._create_browser_config_hash(browser_config) + + # Estimate browsers needed based on page requirements + browsers_for_config = browsers_per_config + if config_hash in browsers_needed: + browsers_for_config = max(browsers_for_config, browsers_needed[config_hash]) + + config_browsers_needed[config_hash] = browsers_for_config + + # Update max_browsers_per_config if needed + if browsers_for_config > self.max_browsers_per_config: + self.max_browsers_per_config = browsers_for_config + if self.logger: + self.logger.info( + f"Increased max_browsers_per_config to {browsers_for_config} to accommodate page requirements", + tag="POOL" + ) + + # Initialize locks and queues for each config + async with self._browser_pool_lock: + for browser_config in browser_configs: 
+ config_hash = self._create_browser_config_hash(browser_config) + + # Initialize lock for this config if needed + if config_hash not in self._browser_locks: + self._browser_locks[config_hash] = asyncio.Lock() + + # Initialize queue for this config if needed + if config_hash not in self.request_queues: + self.request_queues[config_hash] = asyncio.Queue() + + # Initialize pool for this config if needed + if config_hash not in self.browser_pool: + self.browser_pool[config_hash] = [] + + # Create browser instances for each configuration in parallel + browser_tasks = [] + + for browser_config in browser_configs: + config_hash = self._create_browser_config_hash(browser_config) + browsers_to_create = config_browsers_needed.get( + config_hash, + browsers_per_config + ) - len(self.browser_pool.get(config_hash, [])) + + if browsers_to_create <= 0: + continue + + for _ in range(browsers_to_create): + # Create a task for each browser initialization + task = self._create_and_add_browser(browser_config, config_hash) + browser_tasks.append(task) + + # Wait for all browser initializations to complete + if browser_tasks: + if self.logger: + self.logger.info(f"Initializing {len(browser_tasks)} browsers in parallel...", tag="POOL") + await asyncio.gather(*browser_tasks) + + # Pre-warm pages if requested + if page_configs: + page_tasks = [] + for browser_config, crawler_run_config, count in page_configs: + task = self._prewarm_pages(browser_config, crawler_run_config, count) + page_tasks.append(task) + + if page_tasks: + if self.logger: + self.logger.info(f"Pre-warming pages with {len(page_tasks)} configurations...", tag="POOL") + await asyncio.gather(*page_tasks) + + # Update legacy references + if self.browser_pool and next(iter(self.browser_pool.values()), []): + strategy = next(iter(self.browser_pool.values()))[0] + self.strategy = strategy + self.browser = strategy.browser + self.default_context = strategy.default_context + self.playwright = strategy.playwright + + return self + 
+ async def _create_and_add_browser(self, browser_config: BrowserConfig, config_hash: str): + """Create and add a browser to the pool. + + Args: + browser_config: Browser configuration + config_hash: Hash of the configuration + """ + try: + strategy = self._create_strategy(browser_config) + await strategy.start() + + async with self._browser_pool_lock: + if config_hash not in self.browser_pool: + self.browser_pool[config_hash] = [] + self.browser_pool[config_hash].append(strategy) + self.browser_in_use[strategy] = False + + if self.logger: + self.logger.debug( + f"Added browser to pool: {browser_config.browser_type} " + f"({browser_config.browser_mode})", + tag="POOL" + ) + except Exception as e: + if self.logger: + self.logger.error( + f"Failed to create browser: {str(e)}", + tag="POOL" + ) + raise + + def _make_config_signature(self, crawlerRunConfig: CrawlerRunConfig) -> str: + """Create a signature hash from crawler configuration. + + Args: + crawlerRunConfig: Crawler run configuration + + Returns: + str: Hash of the crawler configuration + """ + config_dict = crawlerRunConfig.__dict__.copy() + # Exclude items that do not affect page creation + ephemeral_keys = [ + "session_id", + "js_code", + "scraping_strategy", + "extraction_strategy", + "chunking_strategy", + "cache_mode", + "content_filter", + "semaphore_count", + "url" + ] + for key in ephemeral_keys: + if key in config_dict: + del config_dict[key] + + # Convert to canonical JSON string + config_json = json.dumps(config_dict, sort_keys=True, default=str) + + # Hash the JSON + config_hash = hashlib.sha256(config_json.encode("utf-8")).hexdigest() + return config_hash + + async def _prewarm_pages( + self, + browser_config: BrowserConfig, + crawler_run_config: CrawlerRunConfig, + count: int + ): + """Pre-warm pages for a specific configuration. 
+ + Args: + browser_config: Browser configuration + crawler_run_config: Crawler run configuration + count: Number of pages to pre-warm + """ + try: + # Create individual page tasks and run them in parallel + browser_config_hash = self._create_browser_config_hash(browser_config) + crawler_config_hash = self._make_config_signature(crawler_run_config) + async def get_single_page(): + strategy = await self.get_available_browser(browser_config) + try: + page, context = await strategy.get_page(crawler_run_config) + # Store config hashes on the page object for later retrieval + setattr(page, "_browser_config_hash", browser_config_hash) + setattr(page, "_crawler_config_hash", crawler_config_hash) + return page, context, strategy + except Exception as e: + # Release the browser back to the pool + await self.release_browser(strategy, browser_config) + raise e + + # Create tasks for parallel execution + page_tasks = [get_single_page() for _ in range(count)] + + # Execute all page creation tasks in parallel + pages_contexts_strategies = await asyncio.gather(*page_tasks) + + # Add pages to the page pool + browser_config_hash = self._create_browser_config_hash(browser_config) + crawler_config_hash = self._make_config_signature(crawler_run_config) + pool_key = (browser_config_hash, crawler_config_hash) + + async with self._page_pool_lock: + if pool_key not in self.page_pool: + self.page_pool[pool_key] = [] + + # Add all pages to the pool + self.page_pool[pool_key].extend(pages_contexts_strategies) + + if self.logger: + self.logger.debug( + f"Pre-warmed {count} pages in parallel with config {crawler_run_config}", + tag="POOL" + ) + except Exception as e: + if self.logger: + self.logger.error( + f"Failed to pre-warm pages: {str(e)}", + tag="POOL" + ) + raise + + async def get_available_browser( + self, + browser_config: Optional[BrowserConfig] = None + ) -> BaseBrowserStrategy: + """Get an available browser from the pool for the given configuration. 
+ + Args: + browser_config: Browser configuration to match + + Returns: + BaseBrowserStrategy: An available browser strategy + + Raises: + Exception: If no browser is available and behavior is EXCEPTION + """ + browser_config = browser_config or self.config + config_hash = self._create_browser_config_hash(browser_config) + + async with self._browser_locks.get(config_hash, asyncio.Lock()): + # Check if we have browsers for this config + if config_hash not in self.browser_pool or not self.browser_pool[config_hash]: + if self.unavailable_behavior == UnavailableBehavior.ON_DEMAND: + # Create a new browser on demand + if self.logger: + self.logger.info( + f"1> Creating new browser on demand for config {config_hash[:8]}", + tag="POOL" + ) + + # Initialize pool for this config if needed + async with self._browser_pool_lock: + if config_hash not in self.browser_pool: + self.browser_pool[config_hash] = [] + + strategy = self._create_strategy(browser_config) + await strategy.start() + + self.browser_pool[config_hash].append(strategy) + self.browser_in_use[strategy] = False + + elif self.unavailable_behavior == UnavailableBehavior.EXCEPTION: + raise Exception(f"No browsers available for configuration {config_hash[:8]}") + + # Check for an available browser with capacity in the pool + for strategy in self.browser_pool[config_hash]: + # Check if this browser has capacity for more pages + async with self._page_count_lock: + current_pages = self.browser_page_counts.get(strategy, 0) + + if current_pages < self.max_pages_per_browser: + # Increment the page count + self.browser_page_counts[strategy] = current_pages + 1 + + self.browser_in_use[strategy] = True + + # Get browser information for better logging + browser_type = getattr(strategy.config, 'browser_type', 'unknown') + browser_mode = getattr(strategy.config, 'browser_mode', 'unknown') + strategy_id = id(strategy) # Use object ID as a unique identifier + + if self.logger: + self.logger.debug( + f"Selected browser 
#{strategy_id} ({browser_type}/{browser_mode}) - " + f"pages: {current_pages+1}/{self.max_pages_per_browser}", + tag="POOL" + ) + + return strategy + + # All browsers are at capacity or in use + if self.unavailable_behavior == UnavailableBehavior.ON_DEMAND: + # Check if we've reached the maximum number of browsers + if len(self.browser_pool[config_hash]) >= self.max_browsers_per_config: + if self.logger: + self.logger.warning( + f"Maximum browsers reached for config {config_hash[:8]} and all at page capacity", + tag="POOL" + ) + if self.unavailable_behavior == UnavailableBehavior.EXCEPTION: + raise Exception("Maximum browsers reached and all at page capacity") + + # Create a new browser on demand + if self.logger: + self.logger.info( + f"2> Creating new browser on demand for config {config_hash[:8]}", + tag="POOL" + ) + + strategy = self._create_strategy(browser_config) + await strategy.start() + + async with self._browser_pool_lock: + self.browser_pool[config_hash].append(strategy) + self.browser_in_use[strategy] = True + + return strategy + + # If we get here, either behavior is EXCEPTION or PENDING + if self.unavailable_behavior == UnavailableBehavior.EXCEPTION: + raise Exception(f"All browsers in use or at page capacity for configuration {config_hash[:8]}") + + # For PENDING behavior, set up waiting mechanism + if config_hash not in self.request_queues: + self.request_queues[config_hash] = asyncio.Queue() + + # Create a future to wait on + future = asyncio.Future() + await self.request_queues[config_hash].put(future) + + if self.logger: + self.logger.debug( + f"Waiting for available browser for config {config_hash[:8]}", + tag="POOL" + ) + + # Wait for a browser to become available + strategy = await future + return strategy + + async def get_page( + self, + crawlerRunConfig: CrawlerRunConfig, + browser_config: Optional[BrowserConfig] = None + ) -> Tuple[Page, BrowserContext, BaseBrowserStrategy]: + """Get a page from the browser pool.""" + browser_config = 
browser_config or self.config + + # Check if we have a pre-warmed page available + browser_config_hash = self._create_browser_config_hash(browser_config) + crawler_config_hash = self._make_config_signature(crawlerRunConfig) + pool_key = (browser_config_hash, crawler_config_hash) + + # Try to get a page from the pool + async with self._page_pool_lock: + if pool_key in self.page_pool and self.page_pool[pool_key]: + # Get a page from the pool + page, context, strategy = self.page_pool[pool_key].pop() + + # Mark browser as in use (it already is, but ensure consistency) + self.browser_in_use[strategy] = True + + if self.logger: + self.logger.debug( + f"Using pre-warmed page for config {crawler_config_hash[:8]}", + tag="POOL" + ) + + # Note: We don't increment page count since it was already counted when created + + return page, context, strategy + + # No pre-warmed page available, create a new one + # get_available_browser already increments the page count + strategy = await self.get_available_browser(browser_config) + + try: + # Get a page from the browser + page, context = await strategy.get_page(crawlerRunConfig) + + # Store config hashes on the page object for later retrieval + setattr(page, "_browser_config_hash", browser_config_hash) + setattr(page, "_crawler_config_hash", crawler_config_hash) + + return page, context, strategy + except Exception as e: + # Release the browser back to the pool and decrement the page count + await self.release_browser(strategy, browser_config, decrement_page_count=True) + raise e + + async def release_page( + self, + page: Page, + strategy: BaseBrowserStrategy, + browser_config: Optional[BrowserConfig] = None, + keep_alive: bool = True, + return_to_pool: bool = True + ): + """Release a page back to the pool.""" + browser_config = browser_config or self.config + + page_url = page.url if page else None + + # If not keeping the page alive, close it and decrement count + if not keep_alive: + try: + await page.close() + except Exception 
as e: + if self.logger: + self.logger.error( + f"Error closing page: {str(e)}", + tag="POOL" + ) + # Release the browser with page count decrement + await self.release_browser(strategy, browser_config, decrement_page_count=True) + return + + # If returning to pool + if return_to_pool: + # Get the configuration hashes from the page object + browser_config_hash = getattr(page, "_browser_config_hash", None) + crawler_config_hash = getattr(page, "_crawler_config_hash", None) + + if browser_config_hash and crawler_config_hash: + pool_key = (browser_config_hash, crawler_config_hash) + + async with self._page_pool_lock: + if pool_key not in self.page_pool: + self.page_pool[pool_key] = [] + + # Add page back to the pool + self.page_pool[pool_key].append((page, page.context, strategy)) + + if self.logger: + self.logger.debug( + f"Returned page to pool for config {crawler_config_hash[:8]}, url: {page_url}", + tag="POOL" + ) + + # Note: We don't decrement the page count here since the page is still "in use" + # from the browser's perspective, just in our pool + return + else: + # If we can't identify the configuration, log a warning + if self.logger: + self.logger.warning( + "Cannot return page to pool - missing configuration hashes", + tag="POOL" + ) + + # If we got here, we couldn't return to pool, so just release the browser + await self.release_browser(strategy, browser_config, decrement_page_count=True) + + async def release_browser( + self, + strategy: BaseBrowserStrategy, + browser_config: Optional[BrowserConfig] = None, + decrement_page_count: bool = True + ): + """Release a browser back to the pool.""" + browser_config = browser_config or self.config + config_hash = self._create_browser_config_hash(browser_config) + + # Decrement page count + if decrement_page_count: + async with self._page_count_lock: + current_count = self.browser_page_counts.get(strategy, 1) + self.browser_page_counts[strategy] = max(0, current_count - 1) + + if self.logger: + self.logger.debug( + 
f"Decremented page count for browser (now: {self.browser_page_counts[strategy]})", + tag="POOL" + ) + + # Mark as not in use + self.browser_in_use[strategy] = False + + # Process any waiting requests + if config_hash in self.request_queues and not self.request_queues[config_hash].empty(): + future = await self.request_queues[config_hash].get() + if not future.done(): + future.set_result(strategy) + + async def get_pages( + self, + crawlerRunConfig: CrawlerRunConfig, + count: int = 1, + browser_config: Optional[BrowserConfig] = None + ) -> List[Tuple[Page, BrowserContext, BaseBrowserStrategy]]: + """Get multiple pages from the browser pool. + + Args: + crawlerRunConfig: Configuration for the crawler run + count: Number of pages to get + browser_config: Browser configuration to use + + Returns: + List of (Page, Context, Strategy) tuples + """ + results = [] + for _ in range(count): + try: + result = await self.get_page(crawlerRunConfig, browser_config) + results.append(result) + except Exception as e: + # Release any pages we've already gotten + for page, _, strategy in results: + await self.release_page(page, strategy, browser_config) + raise e + + return results + + async def get_page_pool_status(self) -> Dict[str, Any]: + """Get information about the page pool status. + + Returns: + Dict with page pool status information + """ + status = { + "total_pooled_pages": 0, + "configs": {} + } + + async with self._page_pool_lock: + for (browser_hash, crawler_hash), pages in self.page_pool.items(): + config_key = f"{browser_hash[:8]}_{crawler_hash[:8]}" + status["configs"][config_key] = len(pages) + status["total_pooled_pages"] += len(pages) + + if self.logger: + self.logger.debug( + f"Page pool status: {status['total_pooled_pages']} pages available", + tag="POOL" + ) + + return status + + async def get_pool_status(self) -> Dict[str, Any]: + """Get information about the browser pool status. 
+ + Returns: + Dict with pool status information + """ + status = { + "total_browsers": 0, + "browsers_in_use": 0, + "total_pages": 0, + "configs": {} + } + + for config_hash, strategies in self.browser_pool.items(): + config_pages = 0 + in_use = 0 + + for strategy in strategies: + is_in_use = self.browser_in_use.get(strategy, False) + if is_in_use: + in_use += 1 + + # Get page count for this browser + try: + page_count = len(await strategy.get_opened_pages()) + config_pages += page_count + except Exception as e: + if self.logger: + self.logger.error(f"Error getting page count: {str(e)}", tag="POOL") + + config_status = { + "total_browsers": len(strategies), + "browsers_in_use": in_use, + "pages_open": config_pages, + "waiting_requests": self.request_queues.get(config_hash, asyncio.Queue()).qsize(), + "max_capacity": len(strategies) * self.max_pages_per_browser, + "utilization_pct": round((config_pages / (len(strategies) * self.max_pages_per_browser)) * 100, 1) + if strategies else 0 + } + + status["configs"][config_hash] = config_status + status["total_browsers"] += config_status["total_browsers"] + status["browsers_in_use"] += config_status["browsers_in_use"] + status["total_pages"] += config_pages + + # Add overall utilization + if status["total_browsers"] > 0: + max_capacity = status["total_browsers"] * self.max_pages_per_browser + status["overall_utilization_pct"] = round((status["total_pages"] / max_capacity) * 100, 1) + else: + status["overall_utilization_pct"] = 0 + + return status + + async def start(self): - """Start the browser instance and set up the default context. + """Start at least one browser instance in the pool. + + This method is kept for backward compatibility. 
Returns: self: For method chaining """ - # Start the strategy - await self.strategy.start() - - # Update legacy references - self.browser = self.strategy.browser - self.default_context = self.strategy.default_context - - # Set browser process reference (for CDP strategy) - if hasattr(self.strategy, 'browser_process'): - self.managed_browser = self.strategy - - # Set Playwright reference - self.playwright = self.strategy.playwright - - # Sync sessions if needed - if hasattr(self.strategy, 'sessions'): - self.sessions = self.strategy.sessions - self.session_ttl = self.strategy.session_ttl - + await self.initialize_pool([self.config], 1) return self - - async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: - """Get a page for the given configuration. - Args: - crawlerRunConfig: Configuration object for the crawler run - - Returns: - Tuple of (Page, BrowserContext) - """ - # Delegate to strategy - page, context = await self.strategy.get_page(crawlerRunConfig) - - # Sync sessions if needed - if hasattr(self.strategy, 'sessions'): - self.sessions = self.strategy.sessions - - return page, context - - async def get_pages(self, crawlerRunConfig: CrawlerRunConfig, count: int = 1) -> List[Tuple[Page, BrowserContext]]: - """Get multiple pages with the same configuration. - - This method efficiently creates multiple browser pages using the same configuration, - which is useful for parallel crawling of multiple URLs. - - Args: - crawlerRunConfig: Configuration for the pages - count: Number of pages to create - - Returns: - List of (Page, Context) tuples - """ - # Delegate to strategy - pages = await self.strategy.get_pages(crawlerRunConfig, count) - - # Sync sessions if needed - if hasattr(self.strategy, 'sessions'): - self.sessions = self.strategy.sessions - - return pages - - # Just for legacy compatibility async def kill_session(self, session_id: str): """Kill a browser session and clean up resources. + Delegated to the strategy. 
This method is kept for backward compatibility. + Args: session_id: The session ID to kill """ - # Handle kill_session via our strategy if it supports it + if not self.strategy: + return + await self.strategy.kill_session(session_id) - - # sync sessions if needed + + # Sync sessions if hasattr(self.strategy, 'sessions'): self.sessions = self.strategy.sessions async def close(self): - """Close the browser and clean up resources.""" - # Delegate to strategy - await self.strategy.close() + """Close all browsers in the pool and clean up resources.""" + # Close all browsers in the pool + for strategies in self.browser_pool.values(): + for strategy in strategies: + try: + await strategy.close() + except Exception as e: + if self.logger: + self.logger.error( + f"Error closing browser: {str(e)}", + tag="POOL" + ) + + # Clear pool data + self.browser_pool = {} + self.browser_in_use = {} # Reset legacy references self.browser = None self.default_context = None self.managed_browser = None self.playwright = None + self.strategy = None self.sessions = {} + + +async def create_browser_manager( + browser_config: Optional[BrowserConfig] = None, + logger: Optional[AsyncLogger] = None, + unavailable_behavior: UnavailableBehavior = UnavailableBehavior.EXCEPTION, + max_browsers_per_config: int = 10, + initial_pool_size: int = 1, + page_configs: Optional[List[Tuple[BrowserConfig, CrawlerRunConfig, int]]] = None +) -> BrowserManager: + """Factory function to create and initialize a BrowserManager. 
+ + Args: + browser_config: Configuration for the browsers + logger: Logger for recording events + unavailable_behavior: Behavior when no browser is available + max_browsers_per_config: Maximum browsers per configuration + initial_pool_size: Initial number of browsers per configuration + page_configs: Optional configurations for pre-warming pages + + Returns: + Initialized BrowserManager + """ + manager = BrowserManager( + browser_config=browser_config, + logger=logger, + unavailable_behavior=unavailable_behavior, + max_browsers_per_config=max_browsers_per_config + ) + + await manager.initialize_pool( + [browser_config] if browser_config else None, + initial_pool_size, + page_configs + ) + + return manager + + + + + diff --git a/crawl4ai/browser/strategies/base.py b/crawl4ai/browser/strategies/base.py index 5c46cbe4..14f7464d 100644 --- a/crawl4ai/browser/strategies/base.py +++ b/crawl4ai/browser/strategies/base.py @@ -109,6 +109,9 @@ class BaseBrowserStrategy(ABC): page, context = await self._generate_page(crawlerRunConfig) + import uuid + setattr(page, "guid", uuid.uuid4()) + # If a session_id is specified, store this session so we can reuse later if crawlerRunConfig.session_id: self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) @@ -132,6 +135,12 @@ class BaseBrowserStrategy(ABC): pages.append((page, context)) return pages + async def get_opened_pages(self) -> List[Page]: + """Get all opened pages in the + browser. + """ + return [page for context in self.contexts_by_config.values() for page in context.pages] + def _build_browser_args(self) -> dict: """Build browser launch arguments from config. 
diff --git a/crawl4ai/browser/strategies/cdp.py b/crawl4ai/browser/strategies/cdp.py index e5982065..0bef6fec 100644 --- a/crawl4ai/browser/strategies/cdp.py +++ b/crawl4ai/browser/strategies/cdp.py @@ -122,7 +122,7 @@ class CDPBrowserStrategy(BaseBrowserStrategy): else: raise NotImplementedError(f"Browser type {self.config.browser_type} not supported") - args = base_args + browser_args + args + args = base_args + browser_args['args'] + args # Start browser process try: diff --git a/tests/browser/manager/demo_browser_manager.py b/tests/browser/manager/demo_browser_manager.py new file mode 100644 index 00000000..2fde7e8a --- /dev/null +++ b/tests/browser/manager/demo_browser_manager.py @@ -0,0 +1,525 @@ +"""Demo script for testing the enhanced BrowserManager. + +This script demonstrates the browser pooling capabilities of the enhanced +BrowserManager with various configurations and usage patterns. +""" + +import asyncio +import time +import random + +from crawl4ai.browser.manager import BrowserManager, UnavailableBehavior +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig +from crawl4ai.async_logger import AsyncLogger + +import playwright + +SAFE_URLS = [ + "https://example.com", + "https://example.com/page1", + "https://httpbin.org/get", + "https://httpbin.org/html", + "https://httpbin.org/ip", + "https://httpbin.org/user-agent", + "https://httpbin.org/headers", + "https://httpbin.org/cookies", + "https://httpstat.us/200", + "https://httpstat.us/301", + "https://httpstat.us/404", + "https://httpstat.us/500", + "https://jsonplaceholder.typicode.com/posts/1", + "https://jsonplaceholder.typicode.com/posts/2", + "https://jsonplaceholder.typicode.com/posts/3", + "https://jsonplaceholder.typicode.com/posts/4", + "https://jsonplaceholder.typicode.com/posts/5", + "https://jsonplaceholder.typicode.com/comments/1", + "https://jsonplaceholder.typicode.com/comments/2", + "https://jsonplaceholder.typicode.com/users/1", + 
"https://jsonplaceholder.typicode.com/users/2", + "https://jsonplaceholder.typicode.com/albums/1", + "https://jsonplaceholder.typicode.com/albums/2", + "https://jsonplaceholder.typicode.com/photos/1", + "https://jsonplaceholder.typicode.com/photos/2", + "https://jsonplaceholder.typicode.com/todos/1", + "https://jsonplaceholder.typicode.com/todos/2", + "https://www.iana.org", + "https://www.iana.org/domains", + "https://www.iana.org/numbers", + "https://www.iana.org/protocols", + "https://www.iana.org/about", + "https://www.iana.org/time-zones", + "https://www.data.gov", + "https://catalog.data.gov/dataset", + "https://www.archives.gov", + "https://www.usa.gov", + "https://www.loc.gov", + "https://www.irs.gov", + "https://www.census.gov", + "https://www.bls.gov", + "https://www.gpo.gov", + "https://www.w3.org", + "https://www.w3.org/standards", + "https://www.w3.org/WAI", + "https://www.rfc-editor.org", + "https://www.ietf.org", + "https://www.icann.org", + "https://www.internetsociety.org", + "https://www.python.org" +] + +async def basic_pooling_demo(): + """Demonstrate basic browser pooling functionality.""" + print("\n=== Basic Browser Pooling Demo ===") + + # Create logger + logger = AsyncLogger(verbose=True) + + # Create browser configurations + config1 = BrowserConfig( + browser_type="chromium", + headless=True, + browser_mode="playwright" + ) + + config2 = BrowserConfig( + browser_type="chromium", + headless=True, + browser_mode="cdp" + ) + + # Create browser manager with on-demand behavior + manager = BrowserManager( + browser_config=config1, + logger=logger, + unavailable_behavior=UnavailableBehavior.ON_DEMAND, + max_browsers_per_config=3 + ) + + try: + # Initialize pool with both configurations + print("Initializing browser pool...") + await manager.initialize_pool( + browser_configs=[config1, config2], + browsers_per_config=2 + ) + + # Display initial pool status + status = await manager.get_pool_status() + print(f"Initial pool status: {status}") + + # 
Create crawler run configurations + run_config1 = CrawlerRunConfig() + run_config2 = CrawlerRunConfig() + + # Simulate concurrent page requests + print("\nGetting pages for parallel crawling...") + + # Function to simulate crawling + async def simulate_crawl(index: int, config: BrowserConfig, run_config: CrawlerRunConfig): + print(f"Crawler {index}: Requesting page...") + page, context, strategy = await manager.get_page(run_config, config) + print(f"Crawler {index}: Got page, navigating to example.com...") + + try: + await page.goto("https://example.com") + title = await page.title() + print(f"Crawler {index}: Page title: {title}") + + # Simulate work + await asyncio.sleep(random.uniform(1, 3)) + print(f"Crawler {index}: Work completed, releasing page...") + + # Check dynamic page content + content = await page.content() + content_length = len(content) + print(f"Crawler {index}: Page content length: {content_length}") + + except Exception as e: + print(f"Crawler {index}: Error: {str(e)}") + finally: + # Release the page + await manager.release_page(page, strategy, config) + print(f"Crawler {index}: Page released") + + # Create 5 parallel crawls + crawl_tasks = [] + for i in range(5): + # Alternate between configurations + config = config1 if i % 2 == 0 else config2 + run_config = run_config1 if i % 2 == 0 else run_config2 + + task = asyncio.create_task(simulate_crawl(i+1, config, run_config)) + crawl_tasks.append(task) + + # Wait for all crawls to complete + await asyncio.gather(*crawl_tasks) + + # Display final pool status + status = await manager.get_pool_status() + print(f"\nFinal pool status: {status}") + + finally: + # Clean up + print("\nClosing browser manager...") + await manager.close() + print("Browser manager closed") + + +async def prewarm_pages_demo(): + """Demonstrate page pre-warming functionality.""" + print("\n=== Page Pre-warming Demo ===") + + # Create logger + logger = AsyncLogger(verbose=True) + + # Create browser configuration + config = 
BrowserConfig( + browser_type="chromium", + headless=True, + browser_mode="playwright" + ) + + # Create crawler run configurations for pre-warming + run_config1 = CrawlerRunConfig( + user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + ) + + run_config2 = CrawlerRunConfig( + user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15" + ) + + # Create page pre-warm configurations + page_configs = [ + (config, run_config1, 2), # 2 pages with run_config1 + (config, run_config2, 3) # 3 pages with run_config2 + ] + + # Create browser manager + manager = BrowserManager( + browser_config=config, + logger=logger, + unavailable_behavior=UnavailableBehavior.EXCEPTION + ) + + try: + # Initialize pool with pre-warmed pages + print("Initializing browser pool with pre-warmed pages...") + await manager.initialize_pool( + browser_configs=[config], + browsers_per_config=2, + page_configs=page_configs + ) + + # Display pool status + status = await manager.get_pool_status() + print(f"Pool status after pre-warming: {status}") + + # Simulate using pre-warmed pages + print("\nUsing pre-warmed pages...") + + async def use_prewarm_page(index: int, run_config: CrawlerRunConfig): + print(f"Task {index}: Requesting pre-warmed page...") + page, context, strategy = await manager.get_page(run_config, config) + + try: + print(f"Task {index}: Got page, navigating to example.com...") + await page.goto("https://example.com") + + # Verify user agent was applied correctly + user_agent = await page.evaluate("() => navigator.userAgent") + print(f"Task {index}: User agent: {user_agent}") + + # Get page title + title = await page.title() + print(f"Task {index}: Page title: {title}") + + # Simulate work + await asyncio.sleep(1) + finally: + # Release the page + print(f"Task {index}: Releasing page...") + await manager.release_page(page, strategy, config) + + 
# Create tasks to use pre-warmed pages + tasks = [] + # Use run_config1 pages + for i in range(2): + tasks.append(asyncio.create_task(use_prewarm_page(i+1, run_config1))) + + # Use run_config2 pages + for i in range(3): + tasks.append(asyncio.create_task(use_prewarm_page(i+3, run_config2))) + + # Wait for all tasks to complete + await asyncio.gather(*tasks) + + # Try to use more pages than we pre-warmed (should raise exception) + print("\nTrying to use more pages than pre-warmed...") + try: + page, context, strategy = await manager.get_page(run_config1, config) + try: + print("Got extra page (unexpected)") + await page.goto("https://example.com") + finally: + await manager.release_page(page, strategy, config) + except Exception as e: + print(f"Expected exception when requesting more pages: {str(e)}") + + finally: + # Clean up + print("\nClosing browser manager...") + await manager.close() + print("Browser manager closed") + + +async def prewarm_on_demand_demo(): + """Demonstrate pre-warming with on-demand browser creation.""" + print("\n=== Pre-warming with On-Demand Browser Creation Demo ===") + + # Create logger + logger = AsyncLogger(verbose=True) + + # Create browser configuration + config = BrowserConfig( + browser_type="chromium", + headless=True, + browser_mode="playwright" + ) + + # Create crawler run configurations + run_config = CrawlerRunConfig( + user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + ) + + # Create page pre-warm configurations - just pre-warm 2 pages + page_configs = [ + (config, run_config, 2) + ] + + # Create browser manager with ON_DEMAND behavior + manager = BrowserManager( + browser_config=config, + logger=logger, + unavailable_behavior=UnavailableBehavior.ON_DEMAND, + max_browsers_per_config=5 # Allow up to 5 browsers + ) + + try: + # Initialize pool with pre-warmed pages + print("Initializing browser pool with pre-warmed pages...") + await 
manager.initialize_pool( + browser_configs=[config], + browsers_per_config=1, # Start with just 1 browser + page_configs=page_configs + ) + + # Display initial pool status + status = await manager.get_pool_status() + print(f"Initial pool status: {status}") + + # Simulate using more pages than pre-warmed - should create browsers on demand + print("\nUsing more pages than pre-warmed (should create on demand)...") + + async def use_page(index: int): + print(f"Task {index}: Requesting page...") + page, context, strategy = await manager.get_page(run_config, config) + + try: + print(f"Task {index}: Got page, navigating to example.com...") + await page.goto("https://example.com") + + # Get page title + title = await page.title() + print(f"Task {index}: Page title: {title}") + + # Simulate work for a varying amount of time + work_time = 1 + (index * 0.5) # Stagger completion times + print(f"Task {index}: Working for {work_time} seconds...") + await asyncio.sleep(work_time) + print(f"Task {index}: Work completed") + finally: + # Release the page + print(f"Task {index}: Releasing page...") + await manager.release_page(page, strategy, config) + + # Create more tasks than pre-warmed pages + tasks = [] + for i in range(5): # Try to use 5 pages when only 2 are pre-warmed + tasks.append(asyncio.create_task(use_page(i+1))) + + # Wait for all tasks to complete + await asyncio.gather(*tasks) + + # Display final pool status - should show on-demand created browsers + status = await manager.get_pool_status() + print(f"\nFinal pool status: {status}") + + finally: + # Clean up + print("\nClosing browser manager...") + await manager.close() + print("Browser manager closed") + + +async def high_volume_demo(): + """Demonstrate high-volume access to pre-warmed pages.""" + print("\n=== High Volume Pre-warmed Pages Demo ===") + + # Create logger + logger = AsyncLogger(verbose=True) + + # Create browser configuration + config = BrowserConfig( + browser_type="chromium", + headless=True, + 
browser_mode="playwright" + ) + + # Create crawler run configuration + run_config = CrawlerRunConfig( + user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + ) + + # Set up dimensions + browser_count = 10 + pages_per_browser = 5 + total_pages = browser_count * pages_per_browser + + # Create page pre-warm configuration + page_configs = [ + (config, run_config, total_pages) + ] + + print(f"Preparing {browser_count} browsers with {pages_per_browser} pages each ({total_pages} total pages)") + + # Create browser manager with ON_DEMAND behavior as fallback + # No need to specify max_browsers_per_config as it will be calculated automatically + manager = BrowserManager( + browser_config=config, + logger=logger, + unavailable_behavior=UnavailableBehavior.ON_DEMAND + ) + + try: + # Initialize pool with browsers and pre-warmed pages + print(f"Pre-warming {total_pages} pages...") + start_time = time.time() + await manager.initialize_pool( + browser_configs=[config], + browsers_per_config=browser_count, + page_configs=page_configs + ) + warmup_time = time.time() - start_time + print(f"Pre-warming completed in {warmup_time:.2f} seconds") + + # Display pool status + status = await manager.get_pool_status() + print(f"Pool status after pre-warming: {status}") + + # Simulate using all pre-warmed pages simultaneously + print(f"\nSending {total_pages} crawl requests simultaneously...") + + async def crawl_page(index: int): + # url = f"https://example.com/page{index}" + url = SAFE_URLS[index % len(SAFE_URLS)] + print(f"Page {index}: Requesting page...") + # Measure time to acquire page + page_start = time.time() + page, context, strategy = await manager.get_page(run_config, config) + page_acquisition_time = time.time() - page_start + + try: + # Navigate to the URL + nav_start = time.time() + await page.goto(url, timeout=5000) + navigation_time = time.time() - nav_start + + # Get the page title + title = await 
page.title() + + return { + "index": index, + "url": url, + "title": title, + "page_acquisition_time": page_acquisition_time, + "navigation_time": navigation_time + } + except playwright._impl._errors.TimeoutError as e: + # print(f"Page {index}: Navigation timed out - {e}") + return { + "index": index, + "url": url, + "title": "Navigation timed out", + "page_acquisition_time": page_acquisition_time, + "navigation_time": 0 + } + finally: + # Release the page + await manager.release_page(page, strategy, config) + + # Create and execute all tasks simultaneously + start_time = time.time() + + # Non-parallel way + # for i in range(total_pages): + # await crawl_page(i+1) + + tasks = [crawl_page(i+1) for i in range(total_pages)] + results = await asyncio.gather(*tasks) + total_time = time.time() - start_time + + # # Print all titles + # for result in results: + # print(f"Page {result['index']} ({result['url']}): Title: {result['title']}") + # print(f" Page acquisition time: {result['page_acquisition_time']:.4f}s") + # print(f" Navigation time: {result['navigation_time']:.4f}s") + # print(f" Total time: {result['page_acquisition_time'] + result['navigation_time']:.4f}s") + # print("-" * 40) + + # Report results + print(f"\nAll {total_pages} crawls completed in {total_time:.2f} seconds") + + # Calculate statistics + acquisition_times = [r["page_acquisition_time"] for r in results] + navigation_times = [r["navigation_time"] for r in results] + + avg_acquisition = sum(acquisition_times) / len(acquisition_times) + max_acquisition = max(acquisition_times) + min_acquisition = min(acquisition_times) + + avg_navigation = sum(navigation_times) / len(navigation_times) + max_navigation = max(navigation_times) + min_navigation = min(navigation_times) + + print("\nPage acquisition times:") + print(f" Average: {avg_acquisition:.4f}s") + print(f" Min: {min_acquisition:.4f}s") + print(f" Max: {max_acquisition:.4f}s") + + print("\nPage navigation times:") + print(f" Average: 
{avg_navigation:.4f}s") + print(f" Min: {min_navigation:.4f}s") + print(f" Max: {max_navigation:.4f}s") + + # Display final pool status + status = await manager.get_pool_status() + print(f"\nFinal pool status: {status}") + + finally: + # Clean up + print("\nClosing browser manager...") + await manager.close() + print("Browser manager closed") + + +async def main(): + """Run all demos.""" + # await basic_pooling_demo() + # await prewarm_pages_demo() + # await prewarm_on_demand_demo() + await high_volume_demo() + # Additional demo functions can be added here + + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file From c5cac2b45907a2fcde86fc9eeaa9b8a6f9a95d94 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 1 Apr 2025 20:35:02 +0800 Subject: [PATCH 7/7] feat(browser): add BrowserHub for centralized browser management and resource sharing --- crawl4ai/browser/browser_hub.py | 183 +++++++++++++++++++++ crawl4ai/browser/docker/connect.Dockerfile | 55 ------- crawl4ai/browser/docker/launch.Dockerfile | 57 ------- crawl4ai/browser/manager.py | 1 - 4 files changed, 183 insertions(+), 113 deletions(-) create mode 100644 crawl4ai/browser/browser_hub.py delete mode 100644 crawl4ai/browser/docker/connect.Dockerfile delete mode 100644 crawl4ai/browser/docker/launch.Dockerfile diff --git a/crawl4ai/browser/browser_hub.py b/crawl4ai/browser/browser_hub.py new file mode 100644 index 00000000..33144319 --- /dev/null +++ b/crawl4ai/browser/browser_hub.py @@ -0,0 +1,183 @@ +# browser_hub_manager.py +import hashlib +import json +import asyncio +from typing import Dict, Optional +from .manager import BrowserManager, UnavailableBehavior +from ..async_configs import BrowserConfig +from ..async_logger import AsyncLogger + +class BrowserHub: + """ + Manages Browser-Hub instances for sharing across multiple pipelines. 
+ + This class provides centralized management for browser resources, allowing + multiple pipelines to share browser instances efficiently, connect to + existing browser hubs, or create new ones with custom configurations. + """ + _instances: Dict[str, BrowserManager] = {} + _lock = asyncio.Lock() + + @classmethod + async def get_or_create_hub( + cls, + config: Optional[BrowserConfig] = None, + hub_id: Optional[str] = None, + connection_info: Optional[str] = None, + logger: Optional[AsyncLogger] = None, + max_browsers_per_config: int = 10, + max_pages_per_browser: int = 5, + initial_pool_size: int = 1, + page_configs: Optional[list] = None + ) -> BrowserManager: + """ + Get an existing Browser-Hub or create a new one based on parameters. + + Args: + config: Browser configuration for new hub + hub_id: Identifier for the hub instance + connection_info: Connection string for existing hub + logger: Logger for recording events and errors + max_browsers_per_config: Maximum browsers per configuration + max_pages_per_browser: Maximum pages per browser + initial_pool_size: Initial number of browsers to create + page_configs: Optional configurations for pre-warming pages + + Returns: + BrowserManager: The requested browser manager instance + """ + async with cls._lock: + # Scenario 3: Use existing hub via connection info + if connection_info: + instance_key = f"connection:{connection_info}" + if instance_key not in cls._instances: + cls._instances[instance_key] = await cls._connect_to_browser_hub( + connection_info, logger + ) + return cls._instances[instance_key] + + # Scenario 2: Custom configured hub + if config: + config_hash = cls._hash_config(config) + instance_key = hub_id or f"config:{config_hash}" + if instance_key not in cls._instances: + cls._instances[instance_key] = await cls._create_browser_hub( + config, + logger, + max_browsers_per_config, + max_pages_per_browser, + initial_pool_size, + page_configs + ) + return cls._instances[instance_key] + + # Scenario 1: 
Default hub + instance_key = "default" + if instance_key not in cls._instances: + cls._instances[instance_key] = await cls._create_default_browser_hub( + logger, + max_browsers_per_config, + max_pages_per_browser, + initial_pool_size + ) + return cls._instances[instance_key] + + @classmethod + async def _create_browser_hub( + cls, + config: BrowserConfig, + logger: Optional[AsyncLogger], + max_browsers_per_config: int, + max_pages_per_browser: int, + initial_pool_size: int, + page_configs: Optional[list] + ) -> BrowserManager: + """Create a new browser hub with the specified configuration.""" + manager = BrowserManager( + browser_config=config, + logger=logger, + unavailable_behavior=UnavailableBehavior.ON_DEMAND, + max_browsers_per_config=max_browsers_per_config + ) + + # Initialize the pool + await manager.initialize_pool( + browser_configs=[config] if config else None, + browsers_per_config=initial_pool_size, + page_configs=page_configs + ) + + return manager + + @classmethod + async def _create_default_browser_hub( + cls, + logger: Optional[AsyncLogger], + max_browsers_per_config: int, + max_pages_per_browser: int, + initial_pool_size: int + ) -> BrowserManager: + """Create a default browser hub with standard settings.""" + config = BrowserConfig(headless=True) + return await cls._create_browser_hub( + config, + logger, + max_browsers_per_config, + max_pages_per_browser, + initial_pool_size, + None + ) + + @classmethod + async def _connect_to_browser_hub( + cls, + connection_info: str, + logger: Optional[AsyncLogger] + ) -> BrowserManager: + """ + Connect to an existing browser hub. + + Note: This is a placeholder for future remote connection functionality. + Currently creates a local instance. + """ + if logger: + logger.info( + message="Remote browser hub connections not yet implemented. 
Creating local instance.", + tag="BROWSER_HUB" + ) + # For now, create a default local instance + return await cls._create_default_browser_hub( + logger, + max_browsers_per_config=10, + max_pages_per_browser=5, + initial_pool_size=1 + ) + + @classmethod + def _hash_config(cls, config: BrowserConfig) -> str: + """Create a hash of the browser configuration for identification.""" + # Convert config to dictionary, excluding any callable objects + config_dict = config.__dict__.copy() + for key in list(config_dict.keys()): + if callable(config_dict[key]): + del config_dict[key] + + # Convert to canonical JSON string + config_json = json.dumps(config_dict, sort_keys=True, default=str) + + # Hash the JSON + config_hash = hashlib.sha256(config_json.encode()).hexdigest() + return config_hash + + @classmethod + async def shutdown_all(cls): + """Close all browser hub instances and clear the registry.""" + async with cls._lock: + shutdown_tasks = [] + for hub in cls._instances.values(): + shutdown_tasks.append(hub.close()) + + if shutdown_tasks: + await asyncio.gather(*shutdown_tasks) + + cls._instances.clear() \ No newline at end of file diff --git a/crawl4ai/browser/docker/connect.Dockerfile b/crawl4ai/browser/docker/connect.Dockerfile deleted file mode 100644 index c83fedb8..00000000 --- a/crawl4ai/browser/docker/connect.Dockerfile +++ /dev/null @@ -1,55 +0,0 @@ -FROM ubuntu:22.04 - -# Install dependencies with comprehensive Chromium support -RUN apt-get update && apt-get install -y --no-install-recommends \ - wget \ - curl \ - gnupg \ - ca-certificates \ - fonts-liberation \ - # Core dependencies - libasound2 \ - libatk1.0-0 \ - libatk-bridge2.0-0 \ - libdrm2 \ - libgbm1 \ - libgtk-3-0 \ - libxcomposite1 \ - libxdamage1 \ - libxext6 \ - libxfixes3 \ - libxrandr2 \ - libx11-6 \ - libxcb1 \ - libxkbcommon0 \ - libpango-1.0-0 \ - libcairo2 \ - libcups2 \ - libdbus-1-3 \ - libnss3 \ - libnspr4 \ - libglib2.0-0 \ - # Utilities - xdg-utils \ - socat \ - # Clean up - && rm -rf 
/var/lib/apt/lists/* - -# Install Chromium with codecs -RUN apt-get update && \ - apt-get install -y \ - chromium-browser \ - chromium-codecs-ffmpeg-extra \ - && rm -rf /var/lib/apt/lists/* - -# Create Chrome alias for compatibility -RUN ln -s /usr/bin/chromium-browser /usr/bin/google-chrome - -# Create data directory -RUN mkdir -p /data && chmod 777 /data - -# Add startup script -COPY start.sh /start.sh -RUN chmod +x /start.sh - -ENTRYPOINT ["/start.sh"] \ No newline at end of file diff --git a/crawl4ai/browser/docker/launch.Dockerfile b/crawl4ai/browser/docker/launch.Dockerfile deleted file mode 100644 index 63d2cee2..00000000 --- a/crawl4ai/browser/docker/launch.Dockerfile +++ /dev/null @@ -1,57 +0,0 @@ -FROM ubuntu:22.04 - -# Install dependencies with comprehensive Chromium support -RUN apt-get update && apt-get install -y --no-install-recommends \ - wget \ - gnupg \ - ca-certificates \ - fonts-liberation \ - # Sound support - libasound2 \ - # Accessibility support - libatspi2.0-0 \ - libatk1.0-0 \ - libatk-bridge2.0-0 \ - # Graphics and rendering - libdrm2 \ - libgbm1 \ - libgtk-3-0 \ - libxcomposite1 \ - libxdamage1 \ - libxext6 \ - libxfixes3 \ - libxrandr2 \ - # X11 and window system - libx11-6 \ - libxcb1 \ - libxkbcommon0 \ - # Text and internationalization - libpango-1.0-0 \ - libcairo2 \ - # Printing support - libcups2 \ - # System libraries - libdbus-1-3 \ - libnss3 \ - libnspr4 \ - libglib2.0-0 \ - # Utilities - xdg-utils \ - socat \ - # Process management - procps \ - # Clean up - && rm -rf /var/lib/apt/lists/* - -# Install Chrome (new method) -RUN curl -fsSL https://dl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /usr/share/keyrings/googlechrome-linux-keyring.gpg && \ - echo "deb [arch=amd64 signed-by=/usr/share/keyrings/googlechrome-linux-keyring.gpg] http://dl.google.com/linux/chrome/deb/ stable main" | tee /etc/apt/sources.list.d/google-chrome.list && \ - apt-get update && \ - apt-get install -y google-chrome-stable && \ - rm -rf 
/var/lib/apt/lists/* - -# Create data directory for user data -RUN mkdir -p /data && chmod 777 /data - -# Keep container running without starting Chrome -CMD ["tail", "-f", "/dev/null"] \ No newline at end of file diff --git a/crawl4ai/browser/manager.py b/crawl4ai/browser/manager.py index ba48cbd7..429d2516 100644 --- a/crawl4ai/browser/manager.py +++ b/crawl4ai/browser/manager.py @@ -756,7 +756,6 @@ class BrowserManager: return status - async def start(self): """Start at least one browser instance in the pool.