Merge branch 'next' into 2025-MAR-ALPHA-1
This commit is contained in:
@@ -28,6 +28,10 @@ from typing import Any, Dict, Optional
|
||||
from enum import Enum
|
||||
|
||||
from .proxy_strategy import ProxyConfig
|
||||
try:
|
||||
from .browser.docker_config import DockerConfig
|
||||
except ImportError:
|
||||
DockerConfig = None
|
||||
|
||||
|
||||
def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
|
||||
@@ -173,6 +177,7 @@ class BrowserConfig:
|
||||
"builtin" - use the builtin CDP browser running in background
|
||||
"dedicated" - create a new dedicated browser instance each time
|
||||
"custom" - use explicit CDP settings provided in cdp_url
|
||||
"docker" - run browser in Docker container with isolation
|
||||
Default: "dedicated"
|
||||
use_managed_browser (bool): Launch the browser using a managed approach (e.g., via CDP), allowing
|
||||
advanced manipulation. Default: False.
|
||||
@@ -190,6 +195,8 @@ class BrowserConfig:
|
||||
Default: None.
|
||||
proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
|
||||
If None, no additional proxy config. Default: None.
|
||||
docker_config (DockerConfig or dict or None): Configuration for Docker-based browser automation.
|
||||
Contains settings for Docker container operation. Default: None.
|
||||
viewport_width (int): Default viewport width for pages. Default: 1080.
|
||||
viewport_height (int): Default viewport height for pages. Default: 600.
|
||||
viewport (dict): Default viewport dimensions for pages. If set, overrides viewport_width and viewport_height.
|
||||
@@ -200,7 +207,7 @@ class BrowserConfig:
|
||||
Default: False.
|
||||
downloads_path (str or None): Directory to store downloaded files. If None and accept_downloads is True,
|
||||
a default path will be created. Default: None.
|
||||
storage_state (str or dict or None): Path or object describing storage state (cookies, localStorage).
|
||||
storage_state (str or dict or None): An in-memory storage state (cookies, localStorage).
|
||||
Default: None.
|
||||
ignore_https_errors (bool): Ignore HTTPS certificate errors. Default: True.
|
||||
java_script_enabled (bool): Enable JavaScript execution in pages. Default: True.
|
||||
@@ -235,6 +242,7 @@ class BrowserConfig:
|
||||
channel: str = "chromium",
|
||||
proxy: str = None,
|
||||
proxy_config: Union[ProxyConfig, dict, None] = None,
|
||||
docker_config: Union["DockerConfig", dict, None] = None,
|
||||
viewport_width: int = 1080,
|
||||
viewport_height: int = 600,
|
||||
viewport: dict = None,
|
||||
@@ -275,6 +283,12 @@ class BrowserConfig:
|
||||
self.chrome_channel = ""
|
||||
self.proxy = proxy
|
||||
self.proxy_config = proxy_config
|
||||
|
||||
# Handle docker configuration
|
||||
if isinstance(docker_config, dict) and DockerConfig is not None:
|
||||
self.docker_config = DockerConfig.from_kwargs(docker_config)
|
||||
else:
|
||||
self.docker_config = docker_config
|
||||
self.viewport_width = viewport_width
|
||||
self.viewport_height = viewport_height
|
||||
self.viewport = viewport
|
||||
@@ -315,6 +329,10 @@ class BrowserConfig:
|
||||
# Builtin mode uses managed browser connecting to builtin CDP endpoint
|
||||
self.use_managed_browser = True
|
||||
# cdp_url will be set later by browser_manager
|
||||
elif self.browser_mode == "docker":
|
||||
# Docker mode uses managed browser with CDP to connect to browser in container
|
||||
self.use_managed_browser = True
|
||||
# cdp_url will be set later by docker browser strategy
|
||||
elif self.browser_mode == "custom" and self.cdp_url:
|
||||
# Custom mode with explicit CDP URL
|
||||
self.use_managed_browser = True
|
||||
@@ -340,6 +358,7 @@ class BrowserConfig:
|
||||
channel=kwargs.get("channel", "chromium"),
|
||||
proxy=kwargs.get("proxy"),
|
||||
proxy_config=kwargs.get("proxy_config", None),
|
||||
docker_config=kwargs.get("docker_config", None),
|
||||
viewport_width=kwargs.get("viewport_width", 1080),
|
||||
viewport_height=kwargs.get("viewport_height", 600),
|
||||
accept_downloads=kwargs.get("accept_downloads", False),
|
||||
@@ -364,7 +383,7 @@ class BrowserConfig:
|
||||
)
|
||||
|
||||
def to_dict(self):
|
||||
return {
|
||||
result = {
|
||||
"browser_type": self.browser_type,
|
||||
"headless": self.headless,
|
||||
"browser_mode": self.browser_mode,
|
||||
@@ -396,6 +415,15 @@ class BrowserConfig:
|
||||
"debugging_port": self.debugging_port,
|
||||
"host": self.host,
|
||||
}
|
||||
|
||||
# Include docker_config if it exists
|
||||
if hasattr(self, "docker_config") and self.docker_config is not None:
|
||||
if hasattr(self.docker_config, "to_dict"):
|
||||
result["docker_config"] = self.docker_config.to_dict()
|
||||
else:
|
||||
result["docker_config"] = self.docker_config
|
||||
|
||||
return result
|
||||
|
||||
def clone(self, **kwargs):
|
||||
"""Create a copy of this configuration with updated values.
|
||||
|
||||
10
crawl4ai/browser/__init__.py
Normal file
10
crawl4ai/browser/__init__.py
Normal file
@@ -0,0 +1,10 @@
|
||||
"""Browser management module for Crawl4AI.
|
||||
|
||||
This module provides browser management capabilities using different strategies
|
||||
for browser creation and interaction.
|
||||
"""
|
||||
|
||||
from .manager import BrowserManager
|
||||
from .profiles import BrowserProfileManager
|
||||
|
||||
__all__ = ['BrowserManager', 'BrowserProfileManager']
|
||||
61
crawl4ai/browser/docker/connect.Dockerfile
Normal file
61
crawl4ai/browser/docker/connect.Dockerfile
Normal file
@@ -0,0 +1,61 @@
|
||||
FROM ubuntu:22.04
|
||||
|
||||
# Install dependencies with comprehensive Chromium support
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
wget \
|
||||
gnupg \
|
||||
ca-certificates \
|
||||
fonts-liberation \
|
||||
# Sound support
|
||||
libasound2 \
|
||||
# Accessibility support
|
||||
libatspi2.0-0 \
|
||||
libatk1.0-0 \
|
||||
libatk-bridge2.0-0 \
|
||||
# Graphics and rendering
|
||||
libdrm2 \
|
||||
libgbm1 \
|
||||
libgtk-3-0 \
|
||||
libxcomposite1 \
|
||||
libxdamage1 \
|
||||
libxext6 \
|
||||
libxfixes3 \
|
||||
libxrandr2 \
|
||||
# X11 and window system
|
||||
libx11-6 \
|
||||
libxcb1 \
|
||||
libxkbcommon0 \
|
||||
# Text and internationalization
|
||||
libpango-1.0-0 \
|
||||
libcairo2 \
|
||||
# Printing support
|
||||
libcups2 \
|
||||
# System libraries
|
||||
libdbus-1-3 \
|
||||
libnss3 \
|
||||
libnspr4 \
|
||||
libglib2.0-0 \
|
||||
# Utilities
|
||||
xdg-utils \
|
||||
socat \
|
||||
# Process management
|
||||
procps \
|
||||
# Clean up
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install Chrome
|
||||
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \
|
||||
echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list && \
|
||||
apt-get update && \
|
||||
apt-get install -y google-chrome-stable && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Create data directory for user data
|
||||
RUN mkdir -p /data && chmod 777 /data
|
||||
|
||||
# Add a startup script
|
||||
COPY start.sh /start.sh
|
||||
RUN chmod +x /start.sh
|
||||
|
||||
# Set entrypoint
|
||||
ENTRYPOINT ["/start.sh"]
|
||||
57
crawl4ai/browser/docker/launch.Dockerfile
Normal file
57
crawl4ai/browser/docker/launch.Dockerfile
Normal file
@@ -0,0 +1,57 @@
|
||||
FROM ubuntu:22.04
|
||||
|
||||
# Install dependencies with comprehensive Chromium support
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
wget \
|
||||
gnupg \
|
||||
ca-certificates \
|
||||
fonts-liberation \
|
||||
# Sound support
|
||||
libasound2 \
|
||||
# Accessibility support
|
||||
libatspi2.0-0 \
|
||||
libatk1.0-0 \
|
||||
libatk-bridge2.0-0 \
|
||||
# Graphics and rendering
|
||||
libdrm2 \
|
||||
libgbm1 \
|
||||
libgtk-3-0 \
|
||||
libxcomposite1 \
|
||||
libxdamage1 \
|
||||
libxext6 \
|
||||
libxfixes3 \
|
||||
libxrandr2 \
|
||||
# X11 and window system
|
||||
libx11-6 \
|
||||
libxcb1 \
|
||||
libxkbcommon0 \
|
||||
# Text and internationalization
|
||||
libpango-1.0-0 \
|
||||
libcairo2 \
|
||||
# Printing support
|
||||
libcups2 \
|
||||
# System libraries
|
||||
libdbus-1-3 \
|
||||
libnss3 \
|
||||
libnspr4 \
|
||||
libglib2.0-0 \
|
||||
# Utilities
|
||||
xdg-utils \
|
||||
socat \
|
||||
# Process management
|
||||
procps \
|
||||
# Clean up
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install Chrome
|
||||
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \
|
||||
echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list && \
|
||||
apt-get update && \
|
||||
apt-get install -y google-chrome-stable && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Create data directory for user data
|
||||
RUN mkdir -p /data && chmod 777 /data
|
||||
|
||||
# Keep container running without starting Chrome
|
||||
CMD ["tail", "-f", "/dev/null"]
|
||||
133
crawl4ai/browser/docker_config.py
Normal file
133
crawl4ai/browser/docker_config.py
Normal file
@@ -0,0 +1,133 @@
|
||||
"""Docker configuration module for Crawl4AI browser automation.
|
||||
|
||||
This module provides configuration classes for Docker-based browser automation,
|
||||
allowing flexible configuration of Docker containers for browsing.
|
||||
"""
|
||||
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
|
||||
class DockerConfig:
|
||||
"""Configuration for Docker-based browser automation.
|
||||
|
||||
This class contains Docker-specific settings to avoid cluttering BrowserConfig.
|
||||
|
||||
Attributes:
|
||||
mode (str): Docker operation mode - "connect" or "launch".
|
||||
- "connect": Uses a container with Chrome already running
|
||||
- "launch": Dynamically configures and starts Chrome in container
|
||||
image (str): Docker image to use. If None, defaults from DockerUtils are used.
|
||||
registry_file (str): Path to container registry file for persistence.
|
||||
persistent (bool): Keep container running after browser closes.
|
||||
remove_on_exit (bool): Remove container on exit when not persistent.
|
||||
network (str): Docker network to use.
|
||||
volumes (List[str]): Volume mappings (e.g., ["host_path:container_path"]).
|
||||
env_vars (Dict[str, str]): Environment variables to set in container.
|
||||
extra_args (List[str]): Additional docker run arguments.
|
||||
host_port (int): Host port to map to container's 9223 port.
|
||||
user_data_dir (str): Path to user data directory on host.
|
||||
container_user_data_dir (str): Path to user data directory in container.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
mode: str = "connect", # "connect" or "launch"
|
||||
image: Optional[str] = None, # Docker image to use
|
||||
registry_file: Optional[str] = None, # Path to registry file
|
||||
persistent: bool = False, # Keep container running after browser closes
|
||||
remove_on_exit: bool = True, # Remove container on exit when not persistent
|
||||
network: Optional[str] = None, # Docker network to use
|
||||
volumes: List[str] = None, # Volume mappings
|
||||
env_vars: Dict[str, str] = None, # Environment variables
|
||||
extra_args: List[str] = None, # Additional docker run arguments
|
||||
host_port: Optional[int] = None, # Host port to map to container's 9223
|
||||
user_data_dir: Optional[str] = None, # Path to user data directory on host
|
||||
container_user_data_dir: str = "/data", # Path to user data directory in container
|
||||
):
|
||||
"""Initialize Docker configuration.
|
||||
|
||||
Args:
|
||||
mode: Docker operation mode ("connect" or "launch")
|
||||
image: Docker image to use
|
||||
registry_file: Path to container registry file
|
||||
persistent: Whether to keep container running after browser closes
|
||||
remove_on_exit: Whether to remove container on exit when not persistent
|
||||
network: Docker network to use
|
||||
volumes: Volume mappings as list of strings
|
||||
env_vars: Environment variables as dictionary
|
||||
extra_args: Additional docker run arguments
|
||||
host_port: Host port to map to container's 9223
|
||||
user_data_dir: Path to user data directory on host
|
||||
container_user_data_dir: Path to user data directory in container
|
||||
"""
|
||||
self.mode = mode
|
||||
self.image = image # If None, defaults will be used from DockerUtils
|
||||
self.registry_file = registry_file
|
||||
self.persistent = persistent
|
||||
self.remove_on_exit = remove_on_exit
|
||||
self.network = network
|
||||
self.volumes = volumes or []
|
||||
self.env_vars = env_vars or {}
|
||||
self.extra_args = extra_args or []
|
||||
self.host_port = host_port
|
||||
self.user_data_dir = user_data_dir
|
||||
self.container_user_data_dir = container_user_data_dir
|
||||
|
||||
def to_dict(self) -> Dict:
|
||||
"""Convert this configuration to a dictionary.
|
||||
|
||||
Returns:
|
||||
Dictionary representation of this configuration
|
||||
"""
|
||||
return {
|
||||
"mode": self.mode,
|
||||
"image": self.image,
|
||||
"registry_file": self.registry_file,
|
||||
"persistent": self.persistent,
|
||||
"remove_on_exit": self.remove_on_exit,
|
||||
"network": self.network,
|
||||
"volumes": self.volumes,
|
||||
"env_vars": self.env_vars,
|
||||
"extra_args": self.extra_args,
|
||||
"host_port": self.host_port,
|
||||
"user_data_dir": self.user_data_dir,
|
||||
"container_user_data_dir": self.container_user_data_dir
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def from_kwargs(kwargs: Dict) -> "DockerConfig":
|
||||
"""Create a DockerConfig from a dictionary of keyword arguments.
|
||||
|
||||
Args:
|
||||
kwargs: Dictionary of configuration options
|
||||
|
||||
Returns:
|
||||
New DockerConfig instance
|
||||
"""
|
||||
return DockerConfig(
|
||||
mode=kwargs.get("mode", "connect"),
|
||||
image=kwargs.get("image"),
|
||||
registry_file=kwargs.get("registry_file"),
|
||||
persistent=kwargs.get("persistent", False),
|
||||
remove_on_exit=kwargs.get("remove_on_exit", True),
|
||||
network=kwargs.get("network"),
|
||||
volumes=kwargs.get("volumes"),
|
||||
env_vars=kwargs.get("env_vars"),
|
||||
extra_args=kwargs.get("extra_args"),
|
||||
host_port=kwargs.get("host_port"),
|
||||
user_data_dir=kwargs.get("user_data_dir"),
|
||||
container_user_data_dir=kwargs.get("container_user_data_dir", "/data")
|
||||
)
|
||||
|
||||
def clone(self, **kwargs) -> "DockerConfig":
|
||||
"""Create a copy of this configuration with updated values.
|
||||
|
||||
Args:
|
||||
**kwargs: Key-value pairs of configuration options to update
|
||||
|
||||
Returns:
|
||||
DockerConfig: A new instance with the specified updates
|
||||
"""
|
||||
config_dict = self.to_dict()
|
||||
config_dict.update(kwargs)
|
||||
return DockerConfig.from_kwargs(config_dict)
|
||||
174
crawl4ai/browser/docker_registry.py
Normal file
174
crawl4ai/browser/docker_registry.py
Normal file
@@ -0,0 +1,174 @@
|
||||
"""Docker registry module for Crawl4AI.
|
||||
|
||||
This module provides a registry system for tracking and reusing Docker containers
|
||||
across browser sessions, improving performance and resource utilization.
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import time
|
||||
from typing import Dict, Optional
|
||||
|
||||
from ..utils import get_home_folder
|
||||
|
||||
|
||||
class DockerRegistry:
|
||||
"""Manages a registry of Docker containers used for browser automation.
|
||||
|
||||
This registry tracks containers by configuration hash, allowing reuse of appropriately
|
||||
configured containers instead of creating new ones for each session.
|
||||
|
||||
Attributes:
|
||||
registry_file (str): Path to the registry file
|
||||
containers (dict): Dictionary of container information
|
||||
port_map (dict): Map of host ports to container IDs
|
||||
last_port (int): Last port assigned
|
||||
"""
|
||||
|
||||
def __init__(self, registry_file: Optional[str] = None):
|
||||
"""Initialize the registry with an optional path to the registry file.
|
||||
|
||||
Args:
|
||||
registry_file: Path to the registry file. If None, uses default path.
|
||||
"""
|
||||
self.registry_file = registry_file or os.path.join(get_home_folder(), "docker_browser_registry.json")
|
||||
self.containers = {}
|
||||
self.port_map = {}
|
||||
self.last_port = 9222
|
||||
self.load()
|
||||
|
||||
def load(self):
|
||||
"""Load container registry from file."""
|
||||
if os.path.exists(self.registry_file):
|
||||
try:
|
||||
with open(self.registry_file, 'r') as f:
|
||||
registry_data = json.load(f)
|
||||
self.containers = registry_data.get("containers", {})
|
||||
self.port_map = registry_data.get("ports", {})
|
||||
self.last_port = registry_data.get("last_port", 9222)
|
||||
except Exception:
|
||||
# Reset to defaults on error
|
||||
self.containers = {}
|
||||
self.port_map = {}
|
||||
self.last_port = 9222
|
||||
else:
|
||||
# Initialize with defaults if file doesn't exist
|
||||
self.containers = {}
|
||||
self.port_map = {}
|
||||
self.last_port = 9222
|
||||
|
||||
def save(self):
|
||||
"""Save container registry to file."""
|
||||
os.makedirs(os.path.dirname(self.registry_file), exist_ok=True)
|
||||
with open(self.registry_file, 'w') as f:
|
||||
json.dump({
|
||||
"containers": self.containers,
|
||||
"ports": self.port_map,
|
||||
"last_port": self.last_port
|
||||
}, f, indent=2)
|
||||
|
||||
def register_container(self, container_id: str, host_port: int, config_hash: str):
|
||||
"""Register a container with its configuration hash and port mapping.
|
||||
|
||||
Args:
|
||||
container_id: Docker container ID
|
||||
host_port: Host port mapped to container
|
||||
config_hash: Hash of configuration used to create container
|
||||
"""
|
||||
self.containers[container_id] = {
|
||||
"host_port": host_port,
|
||||
"config_hash": config_hash,
|
||||
"created_at": time.time()
|
||||
}
|
||||
self.port_map[str(host_port)] = container_id
|
||||
self.save()
|
||||
|
||||
def unregister_container(self, container_id: str):
|
||||
"""Unregister a container.
|
||||
|
||||
Args:
|
||||
container_id: Docker container ID to unregister
|
||||
"""
|
||||
if container_id in self.containers:
|
||||
host_port = self.containers[container_id]["host_port"]
|
||||
if str(host_port) in self.port_map:
|
||||
del self.port_map[str(host_port)]
|
||||
del self.containers[container_id]
|
||||
self.save()
|
||||
|
||||
def find_container_by_config(self, config_hash: str, docker_utils) -> Optional[str]:
|
||||
"""Find a container that matches the given configuration hash.
|
||||
|
||||
Args:
|
||||
config_hash: Hash of configuration to match
|
||||
docker_utils: DockerUtils instance to check running containers
|
||||
|
||||
Returns:
|
||||
Container ID if found, None otherwise
|
||||
"""
|
||||
for container_id, data in self.containers.items():
|
||||
if data["config_hash"] == config_hash and docker_utils.is_container_running(container_id):
|
||||
return container_id
|
||||
return None
|
||||
|
||||
def get_container_host_port(self, container_id: str) -> Optional[int]:
|
||||
"""Get the host port mapped to the container.
|
||||
|
||||
Args:
|
||||
container_id: Docker container ID
|
||||
|
||||
Returns:
|
||||
Host port if container is registered, None otherwise
|
||||
"""
|
||||
if container_id in self.containers:
|
||||
return self.containers[container_id]["host_port"]
|
||||
return None
|
||||
|
||||
def get_next_available_port(self, docker_utils) -> int:
|
||||
"""Get the next available host port for Docker mapping.
|
||||
|
||||
Args:
|
||||
docker_utils: DockerUtils instance to check port availability
|
||||
|
||||
Returns:
|
||||
Available port number
|
||||
"""
|
||||
# Start from last port + 1
|
||||
port = self.last_port + 1
|
||||
|
||||
# Check if port is in use (either in our registry or system-wide)
|
||||
while port in self.port_map or docker_utils.is_port_in_use(port):
|
||||
port += 1
|
||||
|
||||
# Update last port
|
||||
self.last_port = port
|
||||
self.save()
|
||||
|
||||
return port
|
||||
|
||||
def get_container_config_hash(self, container_id: str) -> Optional[str]:
|
||||
"""Get the configuration hash for a container.
|
||||
|
||||
Args:
|
||||
container_id: Docker container ID
|
||||
|
||||
Returns:
|
||||
Configuration hash if container is registered, None otherwise
|
||||
"""
|
||||
if container_id in self.containers:
|
||||
return self.containers[container_id]["config_hash"]
|
||||
return None
|
||||
|
||||
def cleanup_stale_containers(self, docker_utils):
|
||||
"""Clean up containers that are no longer running.
|
||||
|
||||
Args:
|
||||
docker_utils: DockerUtils instance to check container status
|
||||
"""
|
||||
to_remove = []
|
||||
for container_id in self.containers:
|
||||
if not docker_utils.is_container_running(container_id):
|
||||
to_remove.append(container_id)
|
||||
|
||||
for container_id in to_remove:
|
||||
self.unregister_container(container_id)
|
||||
286
crawl4ai/browser/docker_strategy.py
Normal file
286
crawl4ai/browser/docker_strategy.py
Normal file
@@ -0,0 +1,286 @@
|
||||
"""Docker browser strategy module for Crawl4AI.
|
||||
|
||||
This module provides browser strategies for running browsers in Docker containers,
|
||||
which offers better isolation, consistency across platforms, and easy scaling.
|
||||
"""
|
||||
|
||||
import os
|
||||
import uuid
|
||||
import asyncio
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
from pathlib import Path
|
||||
|
||||
from playwright.async_api import Page, BrowserContext
|
||||
|
||||
from ..async_logger import AsyncLogger
|
||||
from ..async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from .docker_config import DockerConfig
|
||||
from .docker_registry import DockerRegistry
|
||||
from .docker_utils import DockerUtils
|
||||
from .strategies import BuiltinBrowserStrategy
|
||||
|
||||
|
||||
class DockerBrowserStrategy(BuiltinBrowserStrategy):
|
||||
"""Docker-based browser strategy.
|
||||
|
||||
Extends the BuiltinBrowserStrategy to run browsers in Docker containers.
|
||||
Supports two modes:
|
||||
1. "connect" - Uses a Docker image with Chrome already running
|
||||
2. "launch" - Starts Chrome within the container with custom settings
|
||||
|
||||
Attributes:
|
||||
docker_config: Docker-specific configuration options
|
||||
container_id: ID of current Docker container
|
||||
container_name: Name assigned to the container
|
||||
registry: Registry for tracking and reusing containers
|
||||
docker_utils: Utilities for Docker operations
|
||||
chrome_process_id: Process ID of Chrome within container
|
||||
socat_process_id: Process ID of socat within container
|
||||
internal_cdp_port: Chrome's internal CDP port
|
||||
internal_mapped_port: Port that socat maps to internally
|
||||
"""
|
||||
|
||||
def __init__(self, config: BrowserConfig, logger: Optional[AsyncLogger] = None):
|
||||
"""Initialize the Docker browser strategy.
|
||||
|
||||
Args:
|
||||
config: Browser configuration including Docker-specific settings
|
||||
logger: Logger for recording events and errors
|
||||
"""
|
||||
super().__init__(config, logger)
|
||||
|
||||
# Initialize Docker-specific attributes
|
||||
self.docker_config = self.config.docker_config or DockerConfig()
|
||||
self.container_id = None
|
||||
self.container_name = f"crawl4ai-browser-{uuid.uuid4().hex[:8]}"
|
||||
self.registry = DockerRegistry(self.docker_config.registry_file)
|
||||
self.docker_utils = DockerUtils(logger)
|
||||
self.chrome_process_id = None
|
||||
self.socat_process_id = None
|
||||
self.internal_cdp_port = 9222 # Chrome's internal CDP port
|
||||
self.internal_mapped_port = 9223 # Port that socat maps to internally
|
||||
self.shutting_down = False
|
||||
|
||||
async def _generate_config_hash(self) -> str:
|
||||
"""Generate a hash of the configuration for container matching.
|
||||
|
||||
Returns:
|
||||
Hash string uniquely identifying this configuration
|
||||
"""
|
||||
# Create a dict with the relevant parts of the config
|
||||
config_dict = {
|
||||
"image": self.docker_config.image,
|
||||
"mode": self.docker_config.mode,
|
||||
"browser_type": self.config.browser_type,
|
||||
"headless": self.config.headless,
|
||||
}
|
||||
|
||||
# Add browser-specific config if in launch mode
|
||||
if self.docker_config.mode == "launch":
|
||||
config_dict.update({
|
||||
"text_mode": self.config.text_mode,
|
||||
"light_mode": self.config.light_mode,
|
||||
"viewport_width": self.config.viewport_width,
|
||||
"viewport_height": self.config.viewport_height,
|
||||
})
|
||||
|
||||
# Use the utility method to generate the hash
|
||||
return self.docker_utils.generate_config_hash(config_dict)
|
||||
|
||||
async def _get_or_create_cdp_url(self) -> str:
|
||||
"""Get CDP URL by either creating a new container or using an existing one.
|
||||
|
||||
Returns:
|
||||
CDP URL for connecting to the browser
|
||||
|
||||
Raises:
|
||||
Exception: If container creation or browser launch fails
|
||||
"""
|
||||
# If CDP URL is explicitly provided, use it
|
||||
if self.config.cdp_url:
|
||||
return self.config.cdp_url
|
||||
|
||||
# Ensure Docker image exists (will build if needed)
|
||||
image_name = await self.docker_utils.ensure_docker_image_exists(
|
||||
self.docker_config.image,
|
||||
self.docker_config.mode
|
||||
)
|
||||
|
||||
# Generate config hash for container matching
|
||||
config_hash = await self._generate_config_hash()
|
||||
|
||||
# Look for existing container with matching config
|
||||
container_id = self.registry.find_container_by_config(config_hash, self.docker_utils)
|
||||
|
||||
if container_id:
|
||||
# Use existing container
|
||||
self.container_id = container_id
|
||||
host_port = self.registry.get_container_host_port(container_id)
|
||||
if self.logger:
|
||||
self.logger.info(f"Using existing Docker container: {container_id[:12]}", tag="DOCKER")
|
||||
else:
|
||||
# Get a port for the new container
|
||||
host_port = self.docker_config.host_port or self.registry.get_next_available_port(self.docker_utils)
|
||||
|
||||
# Prepare volumes list
|
||||
volumes = list(self.docker_config.volumes)
|
||||
|
||||
# Add user data directory if specified
|
||||
if self.docker_config.user_data_dir:
|
||||
# Ensure user data directory exists
|
||||
os.makedirs(self.docker_config.user_data_dir, exist_ok=True)
|
||||
volumes.append(f"{self.docker_config.user_data_dir}:{self.docker_config.container_user_data_dir}")
|
||||
|
||||
# Update config user_data_dir to point to container path
|
||||
self.config.user_data_dir = self.docker_config.container_user_data_dir
|
||||
|
||||
# Create a new container
|
||||
container_id = await self.docker_utils.create_container(
|
||||
image_name=image_name,
|
||||
host_port=host_port,
|
||||
container_name=self.container_name,
|
||||
volumes=volumes,
|
||||
network=self.docker_config.network,
|
||||
env_vars=self.docker_config.env_vars,
|
||||
extra_args=self.docker_config.extra_args
|
||||
)
|
||||
|
||||
if not container_id:
|
||||
raise Exception("Failed to create Docker container")
|
||||
|
||||
self.container_id = container_id
|
||||
|
||||
# Register the container
|
||||
self.registry.register_container(container_id, host_port, config_hash)
|
||||
|
||||
# Wait for container to be ready
|
||||
await self.docker_utils.wait_for_container_ready(container_id)
|
||||
|
||||
# Handle specific setup based on mode
|
||||
if self.docker_config.mode == "launch":
|
||||
# In launch mode, we need to start socat and Chrome
|
||||
await self.docker_utils.start_socat_in_container(container_id)
|
||||
|
||||
# Build browser arguments
|
||||
browser_args = self._build_browser_args()
|
||||
|
||||
# Launch Chrome
|
||||
await self.docker_utils.launch_chrome_in_container(container_id, browser_args)
|
||||
|
||||
# Get PIDs for later cleanup
|
||||
self.chrome_process_id = await self.docker_utils.get_process_id_in_container(
|
||||
container_id, "chrome"
|
||||
)
|
||||
self.socat_process_id = await self.docker_utils.get_process_id_in_container(
|
||||
container_id, "socat"
|
||||
)
|
||||
|
||||
# Wait for CDP to be ready
|
||||
await self.docker_utils.wait_for_cdp_ready(host_port)
|
||||
|
||||
if self.logger:
|
||||
self.logger.success(f"Docker container ready: {container_id[:12]} on port {host_port}", tag="DOCKER")
|
||||
|
||||
# Return CDP URL
|
||||
return f"http://localhost:{host_port}"
|
||||
|
||||
def _build_browser_args(self) -> List[str]:
|
||||
"""Build Chrome command line arguments based on BrowserConfig.
|
||||
|
||||
Returns:
|
||||
List of command line arguments for Chrome
|
||||
"""
|
||||
args = [
|
||||
"--no-sandbox",
|
||||
"--disable-gpu",
|
||||
f"--remote-debugging-port={self.internal_cdp_port}",
|
||||
"--remote-debugging-address=0.0.0.0", # Allow external connections
|
||||
"--disable-dev-shm-usage",
|
||||
]
|
||||
|
||||
if self.config.headless:
|
||||
args.append("--headless=new")
|
||||
|
||||
if self.config.viewport_width and self.config.viewport_height:
|
||||
args.append(f"--window-size={self.config.viewport_width},{self.config.viewport_height}")
|
||||
|
||||
if self.config.user_agent:
|
||||
args.append(f"--user-agent={self.config.user_agent}")
|
||||
|
||||
if self.config.text_mode:
|
||||
args.extend([
|
||||
"--blink-settings=imagesEnabled=false",
|
||||
"--disable-remote-fonts",
|
||||
"--disable-images",
|
||||
"--disable-javascript",
|
||||
])
|
||||
|
||||
if self.config.light_mode:
|
||||
# Import here to avoid circular import
|
||||
from .utils import get_browser_disable_options
|
||||
args.extend(get_browser_disable_options())
|
||||
|
||||
if self.config.user_data_dir:
|
||||
args.append(f"--user-data-dir={self.config.user_data_dir}")
|
||||
|
||||
if self.config.extra_args:
|
||||
args.extend(self.config.extra_args)
|
||||
|
||||
return args
|
||||
|
||||
async def close(self):
|
||||
"""Close the browser and clean up Docker container if needed."""
|
||||
# Set shutting_down flag to prevent race conditions
|
||||
self.shutting_down = True
|
||||
|
||||
# Store state if needed before closing
|
||||
if self.browser and self.docker_config.user_data_dir and self.docker_config.persistent:
|
||||
for context in self.browser.contexts:
|
||||
try:
|
||||
storage_path = os.path.join(self.docker_config.user_data_dir, "storage_state.json")
|
||||
await context.storage_state(path=storage_path)
|
||||
if self.logger:
|
||||
self.logger.debug("Persisted storage state before closing browser", tag="DOCKER")
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.warning(
|
||||
message="Failed to persist storage state: {error}",
|
||||
tag="DOCKER",
|
||||
params={"error": str(e)}
|
||||
)
|
||||
|
||||
# Close browser connection (but not container)
|
||||
if self.browser:
|
||||
await self.browser.close()
|
||||
self.browser = None
|
||||
|
||||
# Only clean up container if not persistent
|
||||
if self.container_id and not self.docker_config.persistent:
|
||||
# Stop Chrome process in "launch" mode
|
||||
if self.docker_config.mode == "launch" and self.chrome_process_id:
|
||||
await self.docker_utils.stop_process_in_container(
|
||||
self.container_id, self.chrome_process_id
|
||||
)
|
||||
|
||||
# Stop socat process in "launch" mode
|
||||
if self.docker_config.mode == "launch" and self.socat_process_id:
|
||||
await self.docker_utils.stop_process_in_container(
|
||||
self.container_id, self.socat_process_id
|
||||
)
|
||||
|
||||
# Remove or stop container based on configuration
|
||||
if self.docker_config.remove_on_exit:
|
||||
await self.docker_utils.remove_container(self.container_id)
|
||||
# Unregister from registry
|
||||
self.registry.unregister_container(self.container_id)
|
||||
else:
|
||||
await self.docker_utils.stop_container(self.container_id)
|
||||
|
||||
self.container_id = None
|
||||
|
||||
# Close Playwright
|
||||
if self.playwright:
|
||||
await self.playwright.stop()
|
||||
self.playwright = None
|
||||
|
||||
self.shutting_down = False
|
||||
582
crawl4ai/browser/docker_utils.py
Normal file
582
crawl4ai/browser/docker_utils.py
Normal file
@@ -0,0 +1,582 @@
|
||||
import os
|
||||
import json
|
||||
import asyncio
|
||||
import hashlib
|
||||
import tempfile
|
||||
import shutil
|
||||
import socket
|
||||
import subprocess
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
|
||||
class DockerUtils:
|
||||
"""Utility class for Docker operations in browser automation.
|
||||
|
||||
This class provides methods for managing Docker images, containers,
|
||||
and related operations needed for browser automation. It handles
|
||||
image building, container lifecycle, port management, and registry operations.
|
||||
|
||||
Attributes:
|
||||
DOCKER_FOLDER (str): Path to folder containing Docker files
|
||||
DOCKER_CONNECT_FILE (str): Path to Dockerfile for connect mode
|
||||
DOCKER_LAUNCH_FILE (str): Path to Dockerfile for launch mode
|
||||
DOCKER_START_SCRIPT (str): Path to startup script for connect mode
|
||||
DEFAULT_CONNECT_IMAGE (str): Default image name for connect mode
|
||||
DEFAULT_LAUNCH_IMAGE (str): Default image name for launch mode
|
||||
logger: Optional logger instance
|
||||
"""
|
||||
|
||||
# File paths for Docker resources
|
||||
DOCKER_FOLDER = os.path.join(os.path.dirname(__file__), "docker")
|
||||
DOCKER_CONNECT_FILE = os.path.join(DOCKER_FOLDER, "connect.Dockerfile")
|
||||
DOCKER_LAUNCH_FILE = os.path.join(DOCKER_FOLDER, "launch.Dockerfile")
|
||||
DOCKER_START_SCRIPT = os.path.join(DOCKER_FOLDER, "start.sh")
|
||||
|
||||
# Default image names
|
||||
DEFAULT_CONNECT_IMAGE = "crawl4ai/browser-connect:latest"
|
||||
DEFAULT_LAUNCH_IMAGE = "crawl4ai/browser-launch:latest"
|
||||
|
||||
def __init__(self, logger=None):
|
||||
"""Initialize Docker utilities.
|
||||
|
||||
Args:
|
||||
logger: Optional logger for recording operations
|
||||
"""
|
||||
self.logger = logger
|
||||
|
||||
# Image Management Methods
|
||||
|
||||
async def check_image_exists(self, image_name: str) -> bool:
|
||||
"""Check if a Docker image exists.
|
||||
|
||||
Args:
|
||||
image_name: Name of the Docker image to check
|
||||
|
||||
Returns:
|
||||
bool: True if the image exists, False otherwise
|
||||
"""
|
||||
cmd = ["docker", "image", "inspect", image_name]
|
||||
|
||||
try:
|
||||
process = await asyncio.create_subprocess_exec(
|
||||
*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
|
||||
)
|
||||
_, _ = await process.communicate()
|
||||
return process.returncode == 0
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.debug(f"Error checking if image exists: {str(e)}", tag="DOCKER")
|
||||
return False
|
||||
|
||||
async def build_docker_image(self, image_name: str, dockerfile_path: str,
|
||||
files_to_copy: Dict[str, str] = None) -> bool:
|
||||
"""Build a Docker image from a Dockerfile.
|
||||
|
||||
Args:
|
||||
image_name: Name to give the built image
|
||||
dockerfile_path: Path to the Dockerfile
|
||||
files_to_copy: Dict of {dest_name: source_path} for files to copy to build context
|
||||
|
||||
Returns:
|
||||
bool: True if image was built successfully, False otherwise
|
||||
"""
|
||||
# Create a temporary build context
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
# Copy the Dockerfile
|
||||
shutil.copy(dockerfile_path, os.path.join(temp_dir, "Dockerfile"))
|
||||
|
||||
# Copy any additional files needed
|
||||
if files_to_copy:
|
||||
for dest_name, source_path in files_to_copy.items():
|
||||
shutil.copy(source_path, os.path.join(temp_dir, dest_name))
|
||||
|
||||
# Build the image
|
||||
cmd = [
|
||||
"docker", "build",
|
||||
"-t", image_name,
|
||||
temp_dir
|
||||
]
|
||||
|
||||
if self.logger:
|
||||
self.logger.debug(f"Building Docker image with command: {' '.join(cmd)}", tag="DOCKER")
|
||||
|
||||
process = await asyncio.create_subprocess_exec(
|
||||
*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
|
||||
)
|
||||
stdout, stderr = await process.communicate()
|
||||
|
||||
if process.returncode != 0:
|
||||
if self.logger:
|
||||
self.logger.error(
|
||||
message="Failed to build Docker image: {error}",
|
||||
tag="DOCKER",
|
||||
params={"error": stderr.decode()}
|
||||
)
|
||||
return False
|
||||
|
||||
if self.logger:
|
||||
self.logger.success(f"Successfully built Docker image: {image_name}", tag="DOCKER")
|
||||
return True
|
||||
|
||||
async def ensure_docker_image_exists(self, image_name: str, mode: str = "connect") -> str:
|
||||
"""Ensure the required Docker image exists, creating it if necessary.
|
||||
|
||||
Args:
|
||||
image_name: Name of the Docker image
|
||||
mode: Either "connect" or "launch" to determine which image to build
|
||||
|
||||
Returns:
|
||||
str: Name of the available Docker image
|
||||
|
||||
Raises:
|
||||
Exception: If image doesn't exist and can't be built
|
||||
"""
|
||||
# If image name is not specified, use default based on mode
|
||||
if not image_name:
|
||||
image_name = self.DEFAULT_CONNECT_IMAGE if mode == "connect" else self.DEFAULT_LAUNCH_IMAGE
|
||||
|
||||
# Check if the image already exists
|
||||
if await self.check_image_exists(image_name):
|
||||
if self.logger:
|
||||
self.logger.debug(f"Docker image {image_name} already exists", tag="DOCKER")
|
||||
return image_name
|
||||
|
||||
# If we're using a custom image that doesn't exist, warn and fail
|
||||
if (image_name != self.DEFAULT_CONNECT_IMAGE and image_name != self.DEFAULT_LAUNCH_IMAGE):
|
||||
if self.logger:
|
||||
self.logger.warning(
|
||||
f"Custom Docker image {image_name} not found and cannot be automatically created",
|
||||
tag="DOCKER"
|
||||
)
|
||||
raise Exception(f"Docker image {image_name} not found")
|
||||
|
||||
# Build the appropriate default image
|
||||
if self.logger:
|
||||
self.logger.info(f"Docker image {image_name} not found, creating it now...", tag="DOCKER")
|
||||
|
||||
if mode == "connect":
|
||||
success = await self.build_docker_image(
|
||||
image_name,
|
||||
self.DOCKER_CONNECT_FILE,
|
||||
{"start.sh": self.DOCKER_START_SCRIPT}
|
||||
)
|
||||
else:
|
||||
success = await self.build_docker_image(
|
||||
image_name,
|
||||
self.DOCKER_LAUNCH_FILE
|
||||
)
|
||||
|
||||
if not success:
|
||||
raise Exception(f"Failed to create Docker image {image_name}")
|
||||
|
||||
return image_name
|
||||
|
||||
# Container Management Methods
|
||||
|
||||
async def create_container(self, image_name: str, host_port: int,
|
||||
container_name: Optional[str] = None,
|
||||
volumes: List[str] = None,
|
||||
network: Optional[str] = None,
|
||||
env_vars: Dict[str, str] = None,
|
||||
extra_args: List[str] = None) -> Optional[str]:
|
||||
"""Create a new Docker container.
|
||||
|
||||
Args:
|
||||
image_name: Docker image to use
|
||||
host_port: Port on host to map to container port 9223
|
||||
container_name: Optional name for the container
|
||||
volumes: List of volume mappings (e.g., ["host_path:container_path"])
|
||||
network: Optional Docker network to use
|
||||
env_vars: Dictionary of environment variables
|
||||
extra_args: Additional docker run arguments
|
||||
|
||||
Returns:
|
||||
str: Container ID if successful, None otherwise
|
||||
"""
|
||||
# Prepare container command
|
||||
cmd = [
|
||||
"docker", "run",
|
||||
"--detach",
|
||||
]
|
||||
|
||||
# Add container name if specified
|
||||
if container_name:
|
||||
cmd.extend(["--name", container_name])
|
||||
|
||||
# Add port mapping
|
||||
cmd.extend(["-p", f"{host_port}:9223"])
|
||||
|
||||
# Add volumes
|
||||
if volumes:
|
||||
for volume in volumes:
|
||||
cmd.extend(["-v", volume])
|
||||
|
||||
# Add network if specified
|
||||
if network:
|
||||
cmd.extend(["--network", network])
|
||||
|
||||
# Add environment variables
|
||||
if env_vars:
|
||||
for key, value in env_vars.items():
|
||||
cmd.extend(["-e", f"{key}={value}"])
|
||||
|
||||
# Add extra args
|
||||
if extra_args:
|
||||
cmd.extend(extra_args)
|
||||
|
||||
# Add image
|
||||
cmd.append(image_name)
|
||||
|
||||
if self.logger:
|
||||
self.logger.debug(f"Creating Docker container with command: {' '.join(cmd)}", tag="DOCKER")
|
||||
|
||||
# Run docker command
|
||||
try:
|
||||
process = await asyncio.create_subprocess_exec(
|
||||
*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
|
||||
)
|
||||
stdout, stderr = await process.communicate()
|
||||
|
||||
if process.returncode != 0:
|
||||
if self.logger:
|
||||
self.logger.error(
|
||||
message="Failed to create Docker container: {error}",
|
||||
tag="DOCKER",
|
||||
params={"error": stderr.decode()}
|
||||
)
|
||||
return None
|
||||
|
||||
# Get container ID
|
||||
container_id = stdout.decode().strip()
|
||||
|
||||
if self.logger:
|
||||
self.logger.success(f"Created Docker container: {container_id[:12]}", tag="DOCKER")
|
||||
|
||||
return container_id
|
||||
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.error(
|
||||
message="Error creating Docker container: {error}",
|
||||
tag="DOCKER",
|
||||
params={"error": str(e)}
|
||||
)
|
||||
return None
|
||||
|
||||
async def is_container_running(self, container_id: str) -> bool:
|
||||
"""Check if a container is running.
|
||||
|
||||
Args:
|
||||
container_id: ID of the container to check
|
||||
|
||||
Returns:
|
||||
bool: True if the container is running, False otherwise
|
||||
"""
|
||||
cmd = ["docker", "inspect", "--format", "{{.State.Running}}", container_id]
|
||||
|
||||
try:
|
||||
process = await asyncio.create_subprocess_exec(
|
||||
*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
|
||||
)
|
||||
stdout, _ = await process.communicate()
|
||||
|
||||
return process.returncode == 0 and stdout.decode().strip() == "true"
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.debug(f"Error checking if container is running: {str(e)}", tag="DOCKER")
|
||||
return False
|
||||
|
||||
async def wait_for_container_ready(self, container_id: str, timeout: int = 30) -> bool:
|
||||
"""Wait for the container to be in running state.
|
||||
|
||||
Args:
|
||||
container_id: ID of the container to wait for
|
||||
timeout: Maximum time to wait in seconds
|
||||
|
||||
Returns:
|
||||
bool: True if container is ready, False if timeout occurred
|
||||
"""
|
||||
for _ in range(timeout):
|
||||
if await self.is_container_running(container_id):
|
||||
return True
|
||||
await asyncio.sleep(1)
|
||||
|
||||
if self.logger:
|
||||
self.logger.warning(f"Container {container_id[:12]} not ready after {timeout}s timeout", tag="DOCKER")
|
||||
return False
|
||||
|
||||
async def stop_container(self, container_id: str) -> bool:
|
||||
"""Stop a Docker container.
|
||||
|
||||
Args:
|
||||
container_id: ID of the container to stop
|
||||
|
||||
Returns:
|
||||
bool: True if stopped successfully, False otherwise
|
||||
"""
|
||||
cmd = ["docker", "stop", container_id]
|
||||
|
||||
try:
|
||||
process = await asyncio.create_subprocess_exec(*cmd)
|
||||
await process.communicate()
|
||||
|
||||
if self.logger:
|
||||
self.logger.debug(f"Stopped container: {container_id[:12]}", tag="DOCKER")
|
||||
|
||||
return process.returncode == 0
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.warning(
|
||||
message="Failed to stop container: {error}",
|
||||
tag="DOCKER",
|
||||
params={"error": str(e)}
|
||||
)
|
||||
return False
|
||||
|
||||
async def remove_container(self, container_id: str, force: bool = True) -> bool:
|
||||
"""Remove a Docker container.
|
||||
|
||||
Args:
|
||||
container_id: ID of the container to remove
|
||||
force: Whether to force removal
|
||||
|
||||
Returns:
|
||||
bool: True if removed successfully, False otherwise
|
||||
"""
|
||||
cmd = ["docker", "rm"]
|
||||
if force:
|
||||
cmd.append("-f")
|
||||
cmd.append(container_id)
|
||||
|
||||
try:
|
||||
process = await asyncio.create_subprocess_exec(*cmd)
|
||||
await process.communicate()
|
||||
|
||||
if self.logger:
|
||||
self.logger.debug(f"Removed container: {container_id[:12]}", tag="DOCKER")
|
||||
|
||||
return process.returncode == 0
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.warning(
|
||||
message="Failed to remove container: {error}",
|
||||
tag="DOCKER",
|
||||
params={"error": str(e)}
|
||||
)
|
||||
return False
|
||||
|
||||
# Container Command Execution Methods
|
||||
|
||||
async def exec_in_container(self, container_id: str, command: List[str],
|
||||
detach: bool = False) -> Tuple[int, str, str]:
|
||||
"""Execute a command in a running container.
|
||||
|
||||
Args:
|
||||
container_id: ID of the container
|
||||
command: Command to execute as a list of strings
|
||||
detach: Whether to run the command in detached mode
|
||||
|
||||
Returns:
|
||||
Tuple of (return_code, stdout, stderr)
|
||||
"""
|
||||
cmd = ["docker", "exec"]
|
||||
if detach:
|
||||
cmd.append("-d")
|
||||
cmd.append(container_id)
|
||||
cmd.extend(command)
|
||||
|
||||
try:
|
||||
process = await asyncio.create_subprocess_exec(
|
||||
*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
|
||||
)
|
||||
stdout, stderr = await process.communicate()
|
||||
|
||||
return process.returncode, stdout.decode(), stderr.decode()
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.error(
|
||||
message="Error executing command in container: {error}",
|
||||
tag="DOCKER",
|
||||
params={"error": str(e)}
|
||||
)
|
||||
return -1, "", str(e)
|
||||
|
||||
async def start_socat_in_container(self, container_id: str) -> bool:
|
||||
"""Start socat in the container to map port 9222 to 9223.
|
||||
|
||||
Args:
|
||||
container_id: ID of the container
|
||||
|
||||
Returns:
|
||||
bool: True if socat started successfully, False otherwise
|
||||
"""
|
||||
# Command to run socat as a background process
|
||||
cmd = ["socat", "TCP-LISTEN:9223,fork", "TCP:localhost:9222"]
|
||||
|
||||
returncode, _, stderr = await self.exec_in_container(container_id, cmd, detach=True)
|
||||
|
||||
if returncode != 0:
|
||||
if self.logger:
|
||||
self.logger.error(
|
||||
message="Failed to start socat in container: {error}",
|
||||
tag="DOCKER",
|
||||
params={"error": stderr}
|
||||
)
|
||||
return False
|
||||
|
||||
if self.logger:
|
||||
self.logger.debug(f"Started socat in container: {container_id[:12]}", tag="DOCKER")
|
||||
|
||||
# Wait a moment for socat to start
|
||||
await asyncio.sleep(1)
|
||||
return True
|
||||
|
||||
async def launch_chrome_in_container(self, container_id: str, browser_args: List[str]) -> bool:
|
||||
"""Launch Chrome inside the container with specified arguments.
|
||||
|
||||
Args:
|
||||
container_id: ID of the container
|
||||
browser_args: Chrome command line arguments
|
||||
|
||||
Returns:
|
||||
bool: True if Chrome started successfully, False otherwise
|
||||
"""
|
||||
# Build Chrome command
|
||||
chrome_cmd = ["google-chrome"]
|
||||
chrome_cmd.extend(browser_args)
|
||||
|
||||
returncode, _, stderr = await self.exec_in_container(container_id, chrome_cmd, detach=True)
|
||||
|
||||
if returncode != 0:
|
||||
if self.logger:
|
||||
self.logger.error(
|
||||
message="Failed to launch Chrome in container: {error}",
|
||||
tag="DOCKER",
|
||||
params={"error": stderr}
|
||||
)
|
||||
return False
|
||||
|
||||
if self.logger:
|
||||
self.logger.debug(f"Launched Chrome in container: {container_id[:12]}", tag="DOCKER")
|
||||
|
||||
return True
|
||||
|
||||
async def get_process_id_in_container(self, container_id: str, process_name: str) -> Optional[int]:
|
||||
"""Get the process ID for a process in the container.
|
||||
|
||||
Args:
|
||||
container_id: ID of the container
|
||||
process_name: Name pattern to search for
|
||||
|
||||
Returns:
|
||||
int: Process ID if found, None otherwise
|
||||
"""
|
||||
cmd = ["pgrep", "-f", process_name]
|
||||
|
||||
returncode, stdout, _ = await self.exec_in_container(container_id, cmd)
|
||||
|
||||
if returncode == 0 and stdout.strip():
|
||||
pid = int(stdout.strip().split("\n")[0])
|
||||
return pid
|
||||
|
||||
return None
|
||||
|
||||
async def stop_process_in_container(self, container_id: str, pid: int) -> bool:
|
||||
"""Stop a process in the container by PID.
|
||||
|
||||
Args:
|
||||
container_id: ID of the container
|
||||
pid: Process ID to stop
|
||||
|
||||
Returns:
|
||||
bool: True if process was stopped, False otherwise
|
||||
"""
|
||||
cmd = ["kill", "-TERM", str(pid)]
|
||||
|
||||
returncode, _, stderr = await self.exec_in_container(container_id, cmd)
|
||||
|
||||
if returncode != 0:
|
||||
if self.logger:
|
||||
self.logger.warning(
|
||||
message="Failed to stop process in container: {error}",
|
||||
tag="DOCKER",
|
||||
params={"error": stderr}
|
||||
)
|
||||
return False
|
||||
|
||||
if self.logger:
|
||||
self.logger.debug(f"Stopped process {pid} in container: {container_id[:12]}", tag="DOCKER")
|
||||
|
||||
return True
|
||||
|
||||
# Network and Port Methods
|
||||
|
||||
async def wait_for_cdp_ready(self, host_port: int, timeout: int = 30) -> bool:
|
||||
"""Wait for the CDP endpoint to be ready.
|
||||
|
||||
Args:
|
||||
host_port: Port to check for CDP endpoint
|
||||
timeout: Maximum time to wait in seconds
|
||||
|
||||
Returns:
|
||||
bool: True if CDP endpoint is ready, False if timeout occurred
|
||||
"""
|
||||
import aiohttp
|
||||
|
||||
url = f"http://localhost:{host_port}/json/version"
|
||||
|
||||
for _ in range(timeout):
|
||||
try:
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get(url, timeout=1) as response:
|
||||
if response.status == 200:
|
||||
if self.logger:
|
||||
self.logger.debug(f"CDP endpoint ready on port {host_port}", tag="DOCKER")
|
||||
return True
|
||||
except Exception:
|
||||
pass
|
||||
await asyncio.sleep(1)
|
||||
|
||||
if self.logger:
|
||||
self.logger.warning(f"CDP endpoint not ready on port {host_port} after {timeout}s timeout", tag="DOCKER")
|
||||
return False
|
||||
|
||||
def is_port_in_use(self, port: int) -> bool:
|
||||
"""Check if a port is already in use on the host.
|
||||
|
||||
Args:
|
||||
port: Port number to check
|
||||
|
||||
Returns:
|
||||
bool: True if port is in use, False otherwise
|
||||
"""
|
||||
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
||||
return s.connect_ex(('localhost', port)) == 0
|
||||
|
||||
def get_next_available_port(self, start_port: int = 9223) -> int:
|
||||
"""Get the next available port starting from a given port.
|
||||
|
||||
Args:
|
||||
start_port: Port number to start checking from
|
||||
|
||||
Returns:
|
||||
int: First available port number
|
||||
"""
|
||||
port = start_port
|
||||
while self.is_port_in_use(port):
|
||||
port += 1
|
||||
return port
|
||||
|
||||
# Configuration Hash Methods
|
||||
|
||||
def generate_config_hash(self, config_dict: Dict) -> str:
|
||||
"""Generate a hash of the configuration for container matching.
|
||||
|
||||
Args:
|
||||
config_dict: Dictionary of configuration parameters
|
||||
|
||||
Returns:
|
||||
str: Hash string uniquely identifying this configuration
|
||||
"""
|
||||
# Convert to canonical JSON string and hash
|
||||
config_json = json.dumps(config_dict, sort_keys=True)
|
||||
return hashlib.sha256(config_json.encode()).hexdigest()
|
||||
204
crawl4ai/browser/manager.py
Normal file
204
crawl4ai/browser/manager.py
Normal file
@@ -0,0 +1,204 @@
|
||||
"""Browser manager module for Crawl4AI.
|
||||
|
||||
This module provides a central browser management class that uses the
|
||||
strategy pattern internally while maintaining the existing API.
|
||||
It also implements a page pooling mechanism for improved performance.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
from typing import Optional, Tuple, List
|
||||
|
||||
from playwright.async_api import Page, BrowserContext
|
||||
|
||||
from ..async_logger import AsyncLogger
|
||||
from ..async_configs import BrowserConfig, CrawlerRunConfig
|
||||
|
||||
from .strategies import (
|
||||
BaseBrowserStrategy,
|
||||
PlaywrightBrowserStrategy,
|
||||
CDPBrowserStrategy,
|
||||
BuiltinBrowserStrategy
|
||||
)
|
||||
|
||||
# Import DockerBrowserStrategy if available
|
||||
try:
|
||||
from .docker_strategy import DockerBrowserStrategy
|
||||
except ImportError:
|
||||
DockerBrowserStrategy = None
|
||||
|
||||
class BrowserManager:
|
||||
"""Main interface for browser management in Crawl4AI.
|
||||
|
||||
This class maintains backward compatibility with the existing implementation
|
||||
while using the strategy pattern internally for different browser types.
|
||||
|
||||
Attributes:
|
||||
config (BrowserConfig): Configuration object containing all browser settings
|
||||
logger: Logger instance for recording events and errors
|
||||
browser: The browser instance
|
||||
default_context: The default browser context
|
||||
managed_browser: The managed browser instance
|
||||
playwright: The Playwright instance
|
||||
sessions: Dictionary to store session information
|
||||
session_ttl: Session timeout in seconds
|
||||
"""
|
||||
|
||||
def __init__(self, browser_config: Optional[BrowserConfig] = None, logger: Optional[AsyncLogger] = None):
|
||||
"""Initialize the BrowserManager with a browser configuration.
|
||||
|
||||
Args:
|
||||
browser_config: Configuration object containing all browser settings
|
||||
logger: Logger instance for recording events and errors
|
||||
"""
|
||||
self.config = browser_config or BrowserConfig()
|
||||
self.logger = logger
|
||||
|
||||
# Create strategy based on configuration
|
||||
self._strategy = self._create_strategy()
|
||||
|
||||
# Initialize state variables for compatibility with existing code
|
||||
self.browser = None
|
||||
self.default_context = None
|
||||
self.managed_browser = None
|
||||
self.playwright = None
|
||||
|
||||
# For session management (from existing implementation)
|
||||
self.sessions = {}
|
||||
self.session_ttl = 1800 # 30 minutes
|
||||
|
||||
def _create_strategy(self) -> BaseBrowserStrategy:
|
||||
"""Create appropriate browser strategy based on configuration.
|
||||
|
||||
Returns:
|
||||
BaseBrowserStrategy: The selected browser strategy
|
||||
"""
|
||||
if self.config.browser_mode == "builtin":
|
||||
return BuiltinBrowserStrategy(self.config, self.logger)
|
||||
elif self.config.browser_mode == "docker":
|
||||
if DockerBrowserStrategy is None:
|
||||
if self.logger:
|
||||
self.logger.error(
|
||||
"Docker browser strategy requested but not available. "
|
||||
"Falling back to PlaywrightBrowserStrategy.",
|
||||
tag="BROWSER"
|
||||
)
|
||||
return PlaywrightBrowserStrategy(self.config, self.logger)
|
||||
return DockerBrowserStrategy(self.config, self.logger)
|
||||
elif self.config.cdp_url or self.config.use_managed_browser:
|
||||
return CDPBrowserStrategy(self.config, self.logger)
|
||||
else:
|
||||
return PlaywrightBrowserStrategy(self.config, self.logger)
|
||||
|
||||
async def start(self):
|
||||
"""Start the browser instance and set up the default context.
|
||||
|
||||
Returns:
|
||||
self: For method chaining
|
||||
"""
|
||||
# Start the strategy
|
||||
await self._strategy.start()
|
||||
|
||||
# Update legacy references
|
||||
self.browser = self._strategy.browser
|
||||
self.default_context = self._strategy.default_context
|
||||
|
||||
# Set browser process reference (for CDP strategy)
|
||||
if hasattr(self._strategy, 'browser_process'):
|
||||
self.managed_browser = self._strategy
|
||||
|
||||
# Set Playwright reference
|
||||
self.playwright = self._strategy.playwright
|
||||
|
||||
# Sync sessions if needed
|
||||
if hasattr(self._strategy, 'sessions'):
|
||||
self.sessions = self._strategy.sessions
|
||||
self.session_ttl = self._strategy.session_ttl
|
||||
|
||||
return self
|
||||
|
||||
async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]:
|
||||
"""Get a page for the given configuration.
|
||||
|
||||
Args:
|
||||
crawlerRunConfig: Configuration object for the crawler run
|
||||
|
||||
Returns:
|
||||
Tuple of (Page, BrowserContext)
|
||||
"""
|
||||
# Delegate to strategy
|
||||
page, context = await self._strategy.get_page(crawlerRunConfig)
|
||||
|
||||
# Sync sessions if needed
|
||||
if hasattr(self._strategy, 'sessions'):
|
||||
self.sessions = self._strategy.sessions
|
||||
|
||||
return page, context
|
||||
|
||||
async def get_pages(self, crawlerRunConfig: CrawlerRunConfig, count: int = 1) -> List[Tuple[Page, BrowserContext]]:
|
||||
"""Get multiple pages with the same configuration.
|
||||
|
||||
This method efficiently creates multiple browser pages using the same configuration,
|
||||
which is useful for parallel crawling of multiple URLs.
|
||||
|
||||
Args:
|
||||
crawlerRunConfig: Configuration for the pages
|
||||
count: Number of pages to create
|
||||
|
||||
Returns:
|
||||
List of (Page, Context) tuples
|
||||
"""
|
||||
# Delegate to strategy
|
||||
pages = await self._strategy.get_pages(crawlerRunConfig, count)
|
||||
|
||||
# Sync sessions if needed
|
||||
if hasattr(self._strategy, 'sessions'):
|
||||
self.sessions = self._strategy.sessions
|
||||
|
||||
return pages
|
||||
|
||||
async def kill_session(self, session_id: str):
|
||||
"""Kill a browser session and clean up resources.
|
||||
|
||||
Args:
|
||||
session_id: The session ID to kill
|
||||
"""
|
||||
# Handle kill_session via our strategy if it supports it
|
||||
if hasattr(self._strategy, '_kill_session'):
|
||||
await self._strategy._kill_session(session_id)
|
||||
elif session_id in self.sessions:
|
||||
context, page, _ = self.sessions[session_id]
|
||||
await page.close()
|
||||
# Only close context if not using CDP
|
||||
if not self.config.use_managed_browser and not self.config.cdp_url and not self.config.browser_mode == "builtin":
|
||||
await context.close()
|
||||
del self.sessions[session_id]
|
||||
|
||||
def _cleanup_expired_sessions(self):
|
||||
"""Clean up expired sessions based on TTL."""
|
||||
# Use strategy's implementation if available
|
||||
if hasattr(self._strategy, '_cleanup_expired_sessions'):
|
||||
self._strategy._cleanup_expired_sessions()
|
||||
return
|
||||
|
||||
# Otherwise use our own implementation
|
||||
current_time = time.time()
|
||||
expired_sessions = [
|
||||
sid
|
||||
for sid, (_, _, last_used) in self.sessions.items()
|
||||
if current_time - last_used > self.session_ttl
|
||||
]
|
||||
for sid in expired_sessions:
|
||||
asyncio.create_task(self.kill_session(sid))
|
||||
|
||||
async def close(self):
|
||||
"""Close the browser and clean up resources."""
|
||||
# Delegate to strategy
|
||||
await self._strategy.close()
|
||||
|
||||
# Reset legacy references
|
||||
self.browser = None
|
||||
self.default_context = None
|
||||
self.managed_browser = None
|
||||
self.playwright = None
|
||||
self.sessions = {}
|
||||
0
crawl4ai/browser/models.py
Normal file
0
crawl4ai/browser/models.py
Normal file
457
crawl4ai/browser/profiles.py
Normal file
457
crawl4ai/browser/profiles.py
Normal file
@@ -0,0 +1,457 @@
|
||||
"""Browser profile management module for Crawl4AI.
|
||||
|
||||
This module provides functionality for creating and managing browser profiles
|
||||
that can be used for authenticated browsing.
|
||||
"""
|
||||
|
||||
import os
|
||||
import asyncio
|
||||
import signal
|
||||
import sys
|
||||
import datetime
|
||||
import uuid
|
||||
import shutil
|
||||
from typing import List, Dict, Optional, Any
|
||||
from colorama import Fore, Style, init
|
||||
|
||||
from ..async_configs import BrowserConfig
|
||||
from ..async_logger import AsyncLogger, AsyncLoggerBase
|
||||
from ..utils import get_home_folder
|
||||
|
||||
class BrowserProfileManager:
|
||||
"""Manages browser profiles for Crawl4AI.
|
||||
|
||||
This class provides functionality to create and manage browser profiles
|
||||
that can be used for authenticated browsing with Crawl4AI.
|
||||
|
||||
Profiles are stored by default in ~/.crawl4ai/profiles/
|
||||
"""
|
||||
|
||||
def __init__(self, logger: Optional[AsyncLoggerBase] = None):
|
||||
"""Initialize the BrowserProfileManager.
|
||||
|
||||
Args:
|
||||
logger: Logger for outputting messages. If None, a default AsyncLogger is created.
|
||||
"""
|
||||
# Initialize colorama for colorful terminal output
|
||||
init()
|
||||
|
||||
# Create a logger if not provided
|
||||
if logger is None:
|
||||
self.logger = AsyncLogger(verbose=True)
|
||||
elif not isinstance(logger, AsyncLoggerBase):
|
||||
self.logger = AsyncLogger(verbose=True)
|
||||
else:
|
||||
self.logger = logger
|
||||
|
||||
# Ensure profiles directory exists
|
||||
self.profiles_dir = os.path.join(get_home_folder(), "profiles")
|
||||
os.makedirs(self.profiles_dir, exist_ok=True)
|
||||
|
||||
async def create_profile(self,
|
||||
profile_name: Optional[str] = None,
|
||||
browser_config: Optional[BrowserConfig] = None) -> Optional[str]:
|
||||
"""Create a browser profile interactively.
|
||||
|
||||
Args:
|
||||
profile_name: Name for the profile. If None, a name is generated.
|
||||
browser_config: Configuration for the browser. If None, a default configuration is used.
|
||||
|
||||
Returns:
|
||||
Path to the created profile directory, or None if creation failed
|
||||
"""
|
||||
# Create default browser config if none provided
|
||||
if browser_config is None:
|
||||
browser_config = BrowserConfig(
|
||||
browser_type="chromium",
|
||||
headless=False, # Must be visible for user interaction
|
||||
verbose=True
|
||||
)
|
||||
else:
|
||||
# Ensure headless is False for user interaction
|
||||
browser_config.headless = False
|
||||
|
||||
# Generate profile name if not provided
|
||||
if not profile_name:
|
||||
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
profile_name = f"profile_{timestamp}_{uuid.uuid4().hex[:6]}"
|
||||
|
||||
# Sanitize profile name (replace spaces and special chars)
|
||||
profile_name = "".join(c if c.isalnum() or c in "-_" else "_" for c in profile_name)
|
||||
|
||||
# Set user data directory
|
||||
profile_path = os.path.join(self.profiles_dir, profile_name)
|
||||
os.makedirs(profile_path, exist_ok=True)
|
||||
|
||||
# Print instructions for the user with colorama formatting
|
||||
border = f"{Fore.CYAN}{'='*80}{Style.RESET_ALL}"
|
||||
self.logger.info(f"\n{border}", tag="PROFILE")
|
||||
self.logger.info(f"Creating browser profile: {Fore.GREEN}{profile_name}{Style.RESET_ALL}", tag="PROFILE")
|
||||
self.logger.info(f"Profile directory: {Fore.YELLOW}{profile_path}{Style.RESET_ALL}", tag="PROFILE")
|
||||
|
||||
self.logger.info("\nInstructions:", tag="PROFILE")
|
||||
self.logger.info("1. A browser window will open for you to set up your profile.", tag="PROFILE")
|
||||
self.logger.info(f"2. {Fore.CYAN}Log in to websites{Style.RESET_ALL}, configure settings, etc. as needed.", tag="PROFILE")
|
||||
self.logger.info(f"3. When you're done, {Fore.YELLOW}press 'q' in this terminal{Style.RESET_ALL} to close the browser.", tag="PROFILE")
|
||||
self.logger.info("4. The profile will be saved and ready to use with Crawl4AI.", tag="PROFILE")
|
||||
self.logger.info(f"{border}\n", tag="PROFILE")
|
||||
|
||||
# Import the necessary classes with local imports to avoid circular references
|
||||
from .strategies import CDPBrowserStrategy
|
||||
|
||||
# Set browser config to use the profile path
|
||||
browser_config.user_data_dir = profile_path
|
||||
|
||||
# Create a CDP browser strategy for the profile creation
|
||||
browser_strategy = CDPBrowserStrategy(browser_config, self.logger)
|
||||
|
||||
# Set up signal handlers to ensure cleanup on interrupt
|
||||
original_sigint = signal.getsignal(signal.SIGINT)
|
||||
original_sigterm = signal.getsignal(signal.SIGTERM)
|
||||
|
||||
# Define cleanup handler for signals
|
||||
async def cleanup_handler(sig, frame):
|
||||
self.logger.warning("\nCleaning up browser process...", tag="PROFILE")
|
||||
await browser_strategy.close()
|
||||
# Restore original signal handlers
|
||||
signal.signal(signal.SIGINT, original_sigint)
|
||||
signal.signal(signal.SIGTERM, original_sigterm)
|
||||
if sig == signal.SIGINT:
|
||||
self.logger.error("Profile creation interrupted. Profile may be incomplete.", tag="PROFILE")
|
||||
sys.exit(1)
|
||||
|
||||
# Set signal handlers
|
||||
def sigint_handler(sig, frame):
|
||||
asyncio.create_task(cleanup_handler(sig, frame))
|
||||
|
||||
signal.signal(signal.SIGINT, sigint_handler)
|
||||
signal.signal(signal.SIGTERM, sigint_handler)
|
||||
|
||||
# Event to signal when user is done with the browser
|
||||
user_done_event = asyncio.Event()
|
||||
|
||||
# Run keyboard input loop in a separate task
|
||||
async def listen_for_quit_command():
|
||||
import termios
|
||||
import tty
|
||||
import select
|
||||
|
||||
# First output the prompt
|
||||
self.logger.info(f"{Fore.CYAN}Press '{Fore.WHITE}q{Fore.CYAN}' when you've finished using the browser...{Style.RESET_ALL}", tag="PROFILE")
|
||||
|
||||
# Save original terminal settings
|
||||
fd = sys.stdin.fileno()
|
||||
old_settings = termios.tcgetattr(fd)
|
||||
|
||||
try:
|
||||
# Switch to non-canonical mode (no line buffering)
|
||||
tty.setcbreak(fd)
|
||||
|
||||
while True:
|
||||
# Check if input is available (non-blocking)
|
||||
readable, _, _ = select.select([sys.stdin], [], [], 0.5)
|
||||
if readable:
|
||||
key = sys.stdin.read(1)
|
||||
if key.lower() == 'q':
|
||||
self.logger.info(f"{Fore.GREEN}Closing browser and saving profile...{Style.RESET_ALL}", tag="PROFILE")
|
||||
user_done_event.set()
|
||||
return
|
||||
|
||||
# Check if the browser process has already exited
|
||||
if browser_strategy.browser_process and browser_strategy.browser_process.poll() is not None:
|
||||
self.logger.info("Browser already closed. Ending input listener.", tag="PROFILE")
|
||||
user_done_event.set()
|
||||
return
|
||||
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
finally:
|
||||
# Restore terminal settings
|
||||
termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
|
||||
|
||||
try:
|
||||
# Start the browser
|
||||
await browser_strategy.start()
|
||||
|
||||
# Check if browser started successfully
|
||||
if not browser_strategy.browser_process:
|
||||
self.logger.error("Failed to start browser process.", tag="PROFILE")
|
||||
return None
|
||||
|
||||
self.logger.info(f"Browser launched. {Fore.CYAN}Waiting for you to finish...{Style.RESET_ALL}", tag="PROFILE")
|
||||
|
||||
# Start listening for keyboard input
|
||||
listener_task = asyncio.create_task(listen_for_quit_command())
|
||||
|
||||
# Wait for either the user to press 'q' or for the browser process to exit naturally
|
||||
while not user_done_event.is_set() and browser_strategy.browser_process.poll() is None:
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
# Cancel the listener task if it's still running
|
||||
if not listener_task.done():
|
||||
listener_task.cancel()
|
||||
try:
|
||||
await listener_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
# If the browser is still running and the user pressed 'q', terminate it
|
||||
if browser_strategy.browser_process.poll() is None and user_done_event.is_set():
|
||||
self.logger.info("Terminating browser process...", tag="PROFILE")
|
||||
await browser_strategy.close()
|
||||
|
||||
self.logger.success(f"Browser closed. Profile saved at: {Fore.GREEN}{profile_path}{Style.RESET_ALL}", tag="PROFILE")
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error creating profile: {str(e)}", tag="PROFILE")
|
||||
await browser_strategy.close()
|
||||
return None
|
||||
finally:
|
||||
# Restore original signal handlers
|
||||
signal.signal(signal.SIGINT, original_sigint)
|
||||
signal.signal(signal.SIGTERM, original_sigterm)
|
||||
|
||||
# Make sure browser is fully cleaned up
|
||||
await browser_strategy.close()
|
||||
|
||||
# Return the profile path
|
||||
return profile_path
|
||||
|
||||
def list_profiles(self) -> List[Dict[str, Any]]:
|
||||
"""List all available browser profiles.
|
||||
|
||||
Returns:
|
||||
List of dictionaries containing profile information
|
||||
"""
|
||||
if not os.path.exists(self.profiles_dir):
|
||||
return []
|
||||
|
||||
profiles = []
|
||||
|
||||
for name in os.listdir(self.profiles_dir):
|
||||
profile_path = os.path.join(self.profiles_dir, name)
|
||||
|
||||
# Skip if not a directory
|
||||
if not os.path.isdir(profile_path):
|
||||
continue
|
||||
|
||||
# Check if this looks like a valid browser profile
|
||||
# For Chromium: Look for Preferences file
|
||||
# For Firefox: Look for prefs.js file
|
||||
is_valid = False
|
||||
|
||||
if os.path.exists(os.path.join(profile_path, "Preferences")) or \
|
||||
os.path.exists(os.path.join(profile_path, "Default", "Preferences")):
|
||||
is_valid = "chromium"
|
||||
elif os.path.exists(os.path.join(profile_path, "prefs.js")):
|
||||
is_valid = "firefox"
|
||||
|
||||
if is_valid:
|
||||
# Get creation time
|
||||
created = datetime.datetime.fromtimestamp(
|
||||
os.path.getctime(profile_path)
|
||||
)
|
||||
|
||||
profiles.append({
|
||||
"name": name,
|
||||
"path": profile_path,
|
||||
"created": created,
|
||||
"type": is_valid
|
||||
})
|
||||
|
||||
# Sort by creation time, newest first
|
||||
profiles.sort(key=lambda x: x["created"], reverse=True)
|
||||
|
||||
return profiles
|
||||
|
||||
def get_profile_path(self, profile_name: str) -> Optional[str]:
|
||||
"""Get the full path to a profile by name.
|
||||
|
||||
Args:
|
||||
profile_name: Name of the profile (not the full path)
|
||||
|
||||
Returns:
|
||||
Full path to the profile directory, or None if not found
|
||||
"""
|
||||
profile_path = os.path.join(self.profiles_dir, profile_name)
|
||||
|
||||
# Check if path exists and is a valid profile
|
||||
if not os.path.isdir(profile_path):
|
||||
# Check if profile_name itself is full path
|
||||
if os.path.isabs(profile_name):
|
||||
profile_path = profile_name
|
||||
else:
|
||||
return None
|
||||
|
||||
# Look for profile indicators
|
||||
is_profile = (
|
||||
os.path.exists(os.path.join(profile_path, "Preferences")) or
|
||||
os.path.exists(os.path.join(profile_path, "Default", "Preferences")) or
|
||||
os.path.exists(os.path.join(profile_path, "prefs.js"))
|
||||
)
|
||||
|
||||
if not is_profile:
|
||||
return None # Not a valid browser profile
|
||||
|
||||
return profile_path
|
||||
|
||||
def delete_profile(self, profile_name_or_path: str) -> bool:
|
||||
"""Delete a browser profile by name or path.
|
||||
|
||||
Args:
|
||||
profile_name_or_path: Name of the profile or full path to profile directory
|
||||
|
||||
Returns:
|
||||
True if the profile was deleted successfully, False otherwise
|
||||
"""
|
||||
# Determine if input is a name or a path
|
||||
if os.path.isabs(profile_name_or_path):
|
||||
# Full path provided
|
||||
profile_path = profile_name_or_path
|
||||
else:
|
||||
# Just a name provided, construct path
|
||||
profile_path = os.path.join(self.profiles_dir, profile_name_or_path)
|
||||
|
||||
# Check if path exists and is a valid profile
|
||||
if not os.path.isdir(profile_path):
|
||||
return False
|
||||
|
||||
# Look for profile indicators
|
||||
is_profile = (
|
||||
os.path.exists(os.path.join(profile_path, "Preferences")) or
|
||||
os.path.exists(os.path.join(profile_path, "Default", "Preferences")) or
|
||||
os.path.exists(os.path.join(profile_path, "prefs.js"))
|
||||
)
|
||||
|
||||
if not is_profile:
|
||||
return False # Not a valid browser profile
|
||||
|
||||
# Delete the profile directory
|
||||
try:
|
||||
shutil.rmtree(profile_path)
|
||||
return True
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
async def interactive_manager(self, crawl_callback=None):
|
||||
"""Launch an interactive profile management console.
|
||||
|
||||
Args:
|
||||
crawl_callback: Function to call when selecting option to use
|
||||
a profile for crawling. It will be called with (profile_path, url).
|
||||
"""
|
||||
while True:
|
||||
self.logger.info(f"\n{Fore.CYAN}Profile Management Options:{Style.RESET_ALL}", tag="MENU")
|
||||
self.logger.info(f"1. {Fore.GREEN}Create a new profile{Style.RESET_ALL}", tag="MENU")
|
||||
self.logger.info(f"2. {Fore.YELLOW}List available profiles{Style.RESET_ALL}", tag="MENU")
|
||||
self.logger.info(f"3. {Fore.RED}Delete a profile{Style.RESET_ALL}", tag="MENU")
|
||||
|
||||
# Only show crawl option if callback provided
|
||||
if crawl_callback:
|
||||
self.logger.info(f"4. {Fore.CYAN}Use a profile to crawl a website{Style.RESET_ALL}", tag="MENU")
|
||||
self.logger.info(f"5. {Fore.MAGENTA}Exit{Style.RESET_ALL}", tag="MENU")
|
||||
exit_option = "5"
|
||||
else:
|
||||
self.logger.info(f"4. {Fore.MAGENTA}Exit{Style.RESET_ALL}", tag="MENU")
|
||||
exit_option = "4"
|
||||
|
||||
choice = input(f"\n{Fore.CYAN}Enter your choice (1-{exit_option}): {Style.RESET_ALL}")
|
||||
|
||||
if choice == "1":
|
||||
# Create new profile
|
||||
name = input(f"{Fore.GREEN}Enter a name for the new profile (or press Enter for auto-generated name): {Style.RESET_ALL}")
|
||||
await self.create_profile(name or None)
|
||||
|
||||
elif choice == "2":
|
||||
# List profiles
|
||||
profiles = self.list_profiles()
|
||||
|
||||
if not profiles:
|
||||
self.logger.warning(" No profiles found. Create one first with option 1.", tag="PROFILES")
|
||||
continue
|
||||
|
||||
# Print profile information with colorama formatting
|
||||
self.logger.info("\nAvailable profiles:", tag="PROFILES")
|
||||
for i, profile in enumerate(profiles):
|
||||
self.logger.info(f"[{i+1}] {Fore.CYAN}{profile['name']}{Style.RESET_ALL}", tag="PROFILES")
|
||||
self.logger.info(f" Path: {Fore.YELLOW}{profile['path']}{Style.RESET_ALL}", tag="PROFILES")
|
||||
self.logger.info(f" Created: {profile['created'].strftime('%Y-%m-%d %H:%M:%S')}", tag="PROFILES")
|
||||
self.logger.info(f" Browser type: {profile['type']}", tag="PROFILES")
|
||||
self.logger.info("", tag="PROFILES") # Empty line for spacing
|
||||
|
||||
elif choice == "3":
|
||||
# Delete profile
|
||||
profiles = self.list_profiles()
|
||||
if not profiles:
|
||||
self.logger.warning("No profiles found to delete", tag="PROFILES")
|
||||
continue
|
||||
|
||||
# Display numbered list
|
||||
self.logger.info(f"\n{Fore.YELLOW}Available profiles:{Style.RESET_ALL}", tag="PROFILES")
|
||||
for i, profile in enumerate(profiles):
|
||||
self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES")
|
||||
|
||||
# Get profile to delete
|
||||
profile_idx = input(f"{Fore.RED}Enter the number of the profile to delete (or 'c' to cancel): {Style.RESET_ALL}")
|
||||
if profile_idx.lower() == 'c':
|
||||
continue
|
||||
|
||||
try:
|
||||
idx = int(profile_idx) - 1
|
||||
if 0 <= idx < len(profiles):
|
||||
profile_name = profiles[idx]["name"]
|
||||
self.logger.info(f"Deleting profile: {Fore.YELLOW}{profile_name}{Style.RESET_ALL}", tag="PROFILES")
|
||||
|
||||
# Confirm deletion
|
||||
confirm = input(f"{Fore.RED}Are you sure you want to delete this profile? (y/n): {Style.RESET_ALL}")
|
||||
if confirm.lower() == 'y':
|
||||
success = self.delete_profile(profiles[idx]["path"])
|
||||
|
||||
if success:
|
||||
self.logger.success(f"Profile {Fore.GREEN}{profile_name}{Style.RESET_ALL} deleted successfully", tag="PROFILES")
|
||||
else:
|
||||
self.logger.error(f"Failed to delete profile {Fore.RED}{profile_name}{Style.RESET_ALL}", tag="PROFILES")
|
||||
else:
|
||||
self.logger.error("Invalid profile number", tag="PROFILES")
|
||||
except ValueError:
|
||||
self.logger.error("Please enter a valid number", tag="PROFILES")
|
||||
|
||||
elif choice == "4" and crawl_callback:
|
||||
# Use profile to crawl a site
|
||||
profiles = self.list_profiles()
|
||||
if not profiles:
|
||||
self.logger.warning("No profiles found. Create one first.", tag="PROFILES")
|
||||
continue
|
||||
|
||||
# Display numbered list
|
||||
self.logger.info(f"\n{Fore.YELLOW}Available profiles:{Style.RESET_ALL}", tag="PROFILES")
|
||||
for i, profile in enumerate(profiles):
|
||||
self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES")
|
||||
|
||||
# Get profile to use
|
||||
profile_idx = input(f"{Fore.CYAN}Enter the number of the profile to use (or 'c' to cancel): {Style.RESET_ALL}")
|
||||
if profile_idx.lower() == 'c':
|
||||
continue
|
||||
|
||||
try:
|
||||
idx = int(profile_idx) - 1
|
||||
if 0 <= idx < len(profiles):
|
||||
profile_path = profiles[idx]["path"]
|
||||
url = input(f"{Fore.CYAN}Enter the URL to crawl: {Style.RESET_ALL}")
|
||||
if url:
|
||||
# Call the provided crawl callback
|
||||
await crawl_callback(profile_path, url)
|
||||
else:
|
||||
self.logger.error("No URL provided", tag="CRAWL")
|
||||
else:
|
||||
self.logger.error("Invalid profile number", tag="PROFILES")
|
||||
except ValueError:
|
||||
self.logger.error("Please enter a valid number", tag="PROFILES")
|
||||
|
||||
elif (choice == "4" and not crawl_callback) or (choice == "5" and crawl_callback):
|
||||
# Exit
|
||||
self.logger.info("Exiting profile management", tag="MENU")
|
||||
break
|
||||
|
||||
else:
|
||||
self.logger.error(f"Invalid choice. Please enter a number between 1 and {exit_option}.", tag="MENU")
|
||||
1256
crawl4ai/browser/strategies.py
Normal file
1256
crawl4ai/browser/strategies.py
Normal file
File diff suppressed because it is too large
Load Diff
328
crawl4ai/browser/utils.py
Normal file
328
crawl4ai/browser/utils.py
Normal file
@@ -0,0 +1,328 @@
|
||||
"""Browser utilities module for Crawl4AI.
|
||||
|
||||
This module provides utility functions for browser management,
|
||||
including process management, CDP connection utilities,
|
||||
and Playwright instance management.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import tempfile
|
||||
import subprocess
|
||||
from typing import Optional
|
||||
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
from ..utils import get_chromium_path
|
||||
from ..async_configs import BrowserConfig, CrawlerRunConfig
|
||||
|
||||
from ..async_logger import AsyncLogger
|
||||
|
||||
|
||||
_playwright_instance = None
|
||||
|
||||
async def get_playwright():
|
||||
"""Get or create the Playwright instance (singleton pattern).
|
||||
|
||||
Returns:
|
||||
Playwright: The Playwright instance
|
||||
"""
|
||||
global _playwright_instance
|
||||
if _playwright_instance is None or True:
|
||||
_playwright_instance = await async_playwright().start()
|
||||
return _playwright_instance
|
||||
|
||||
async def get_browser_executable(browser_type: str) -> str:
|
||||
"""Get the path to browser executable, with platform-specific handling.
|
||||
|
||||
Args:
|
||||
browser_type: Type of browser (chromium, firefox, webkit)
|
||||
|
||||
Returns:
|
||||
Path to browser executable
|
||||
"""
|
||||
return await get_chromium_path(browser_type)
|
||||
|
||||
def create_temp_directory(prefix="browser-profile-") -> str:
|
||||
"""Create a temporary directory for browser data.
|
||||
|
||||
Args:
|
||||
prefix: Prefix for the temporary directory name
|
||||
|
||||
Returns:
|
||||
Path to the created temporary directory
|
||||
"""
|
||||
return tempfile.mkdtemp(prefix=prefix)
|
||||
|
||||
def is_windows() -> bool:
|
||||
"""Check if the current platform is Windows.
|
||||
|
||||
Returns:
|
||||
True if Windows, False otherwise
|
||||
"""
|
||||
return sys.platform == "win32"
|
||||
|
||||
def is_macos() -> bool:
|
||||
"""Check if the current platform is macOS.
|
||||
|
||||
Returns:
|
||||
True if macOS, False otherwise
|
||||
"""
|
||||
return sys.platform == "darwin"
|
||||
|
||||
def is_linux() -> bool:
|
||||
"""Check if the current platform is Linux.
|
||||
|
||||
Returns:
|
||||
True if Linux, False otherwise
|
||||
"""
|
||||
return not (is_windows() or is_macos())
|
||||
|
||||
def is_browser_running(pid: Optional[int]) -> bool:
|
||||
"""Check if a process with the given PID is running.
|
||||
|
||||
Args:
|
||||
pid: Process ID to check
|
||||
|
||||
Returns:
|
||||
bool: True if the process is running, False otherwise
|
||||
"""
|
||||
if not pid:
|
||||
return False
|
||||
|
||||
try:
|
||||
# Check if the process exists
|
||||
if is_windows():
|
||||
process = subprocess.run(["tasklist", "/FI", f"PID eq {pid}"],
|
||||
capture_output=True, text=True)
|
||||
return str(pid) in process.stdout
|
||||
else:
|
||||
# Unix-like systems
|
||||
os.kill(pid, 0) # This doesn't actually kill the process, just checks if it exists
|
||||
return True
|
||||
except (ProcessLookupError, PermissionError, OSError):
|
||||
return False
|
||||
|
||||
def get_browser_disable_options() -> list:
|
||||
"""Get standard list of browser disable options for performance.
|
||||
|
||||
Returns:
|
||||
List of command-line options to disable various browser features
|
||||
"""
|
||||
return [
|
||||
"--disable-background-networking",
|
||||
"--disable-background-timer-throttling",
|
||||
"--disable-backgrounding-occluded-windows",
|
||||
"--disable-breakpad",
|
||||
"--disable-client-side-phishing-detection",
|
||||
"--disable-component-extensions-with-background-pages",
|
||||
"--disable-default-apps",
|
||||
"--disable-extensions",
|
||||
"--disable-features=TranslateUI",
|
||||
"--disable-hang-monitor",
|
||||
"--disable-ipc-flooding-protection",
|
||||
"--disable-popup-blocking",
|
||||
"--disable-prompt-on-repost",
|
||||
"--disable-sync",
|
||||
"--force-color-profile=srgb",
|
||||
"--metrics-recording-only",
|
||||
"--no-first-run",
|
||||
"--password-store=basic",
|
||||
"--use-mock-keychain",
|
||||
]
|
||||
|
||||
|
||||
async def find_optimal_browser_config(total_urls=50, verbose=True, rate_limit_delay=0.2):
|
||||
"""Find optimal browser configuration for crawling a specific number of URLs.
|
||||
|
||||
Args:
|
||||
total_urls: Number of URLs to crawl
|
||||
verbose: Whether to print progress
|
||||
rate_limit_delay: Delay between page loads to avoid rate limiting
|
||||
|
||||
Returns:
|
||||
dict: Contains fastest, lowest_memory, and optimal configurations
|
||||
"""
|
||||
from .manager import BrowserManager
|
||||
if verbose:
|
||||
print(f"\n=== Finding optimal configuration for crawling {total_urls} URLs ===\n")
|
||||
|
||||
# Generate test URLs with timestamp to avoid caching
|
||||
timestamp = int(time.time())
|
||||
urls = [f"https://example.com/page_{i}?t={timestamp}" for i in range(total_urls)]
|
||||
|
||||
# Limit browser configurations to test (1 browser to max 10)
|
||||
max_browsers = min(10, total_urls)
|
||||
configs_to_test = []
|
||||
|
||||
# Generate configurations (browser count, pages distribution)
|
||||
for num_browsers in range(1, max_browsers + 1):
|
||||
base_pages = total_urls // num_browsers
|
||||
remainder = total_urls % num_browsers
|
||||
|
||||
# Create distribution array like [3, 3, 2, 2] (some browsers get one more page)
|
||||
if remainder > 0:
|
||||
distribution = [base_pages + 1] * remainder + [base_pages] * (num_browsers - remainder)
|
||||
else:
|
||||
distribution = [base_pages] * num_browsers
|
||||
|
||||
configs_to_test.append((num_browsers, distribution))
|
||||
|
||||
results = []
|
||||
|
||||
# Test each configuration
|
||||
for browser_count, page_distribution in configs_to_test:
|
||||
if verbose:
|
||||
print(f"Testing {browser_count} browsers with distribution {tuple(page_distribution)}")
|
||||
|
||||
try:
|
||||
# Track memory if possible
|
||||
try:
|
||||
import psutil
|
||||
process = psutil.Process()
|
||||
start_memory = process.memory_info().rss / (1024 * 1024) # MB
|
||||
except ImportError:
|
||||
if verbose:
|
||||
print("Memory tracking not available (psutil not installed)")
|
||||
start_memory = 0
|
||||
|
||||
# Start browsers in parallel
|
||||
managers = []
|
||||
start_tasks = []
|
||||
start_time = time.time()
|
||||
|
||||
logger = AsyncLogger(verbose=True, log_file=None)
|
||||
|
||||
for i in range(browser_count):
|
||||
config = BrowserConfig(headless=True)
|
||||
manager = BrowserManager(browser_config=config, logger=logger)
|
||||
start_tasks.append(manager.start())
|
||||
managers.append(manager)
|
||||
|
||||
await asyncio.gather(*start_tasks)
|
||||
|
||||
# Distribute URLs among browsers
|
||||
urls_per_manager = {}
|
||||
url_index = 0
|
||||
|
||||
for i, manager in enumerate(managers):
|
||||
pages_for_this_browser = page_distribution[i]
|
||||
end_index = url_index + pages_for_this_browser
|
||||
urls_per_manager[manager] = urls[url_index:end_index]
|
||||
url_index = end_index
|
||||
|
||||
# Create pages for each browser
|
||||
all_pages = []
|
||||
for manager, manager_urls in urls_per_manager.items():
|
||||
if not manager_urls:
|
||||
continue
|
||||
pages = await manager.get_pages(CrawlerRunConfig(), count=len(manager_urls))
|
||||
all_pages.extend(zip(pages, manager_urls))
|
||||
|
||||
# Crawl pages with delay to avoid rate limiting
|
||||
async def crawl_page(page_ctx, url):
|
||||
page, _ = page_ctx
|
||||
try:
|
||||
await page.goto(url)
|
||||
if rate_limit_delay > 0:
|
||||
await asyncio.sleep(rate_limit_delay)
|
||||
title = await page.title()
|
||||
return title
|
||||
finally:
|
||||
await page.close()
|
||||
|
||||
crawl_start = time.time()
|
||||
crawl_tasks = [crawl_page(page_ctx, url) for page_ctx, url in all_pages]
|
||||
await asyncio.gather(*crawl_tasks)
|
||||
crawl_time = time.time() - crawl_start
|
||||
total_time = time.time() - start_time
|
||||
|
||||
# Measure final memory usage
|
||||
if start_memory > 0:
|
||||
end_memory = process.memory_info().rss / (1024 * 1024)
|
||||
memory_used = end_memory - start_memory
|
||||
else:
|
||||
memory_used = 0
|
||||
|
||||
# Close all browsers
|
||||
for manager in managers:
|
||||
await manager.close()
|
||||
|
||||
# Calculate metrics
|
||||
pages_per_second = total_urls / crawl_time
|
||||
|
||||
# Calculate efficiency score (higher is better)
|
||||
# This balances speed vs memory
|
||||
if memory_used > 0:
|
||||
efficiency = pages_per_second / (memory_used + 1)
|
||||
else:
|
||||
efficiency = pages_per_second
|
||||
|
||||
# Store result
|
||||
result = {
|
||||
"browser_count": browser_count,
|
||||
"distribution": tuple(page_distribution),
|
||||
"crawl_time": crawl_time,
|
||||
"total_time": total_time,
|
||||
"memory_used": memory_used,
|
||||
"pages_per_second": pages_per_second,
|
||||
"efficiency": efficiency
|
||||
}
|
||||
|
||||
results.append(result)
|
||||
|
||||
if verbose:
|
||||
print(f" ✓ Crawled {total_urls} pages in {crawl_time:.2f}s ({pages_per_second:.1f} pages/sec)")
|
||||
if memory_used > 0:
|
||||
print(f" ✓ Memory used: {memory_used:.1f}MB ({memory_used/total_urls:.1f}MB per page)")
|
||||
print(f" ✓ Efficiency score: {efficiency:.4f}")
|
||||
|
||||
except Exception as e:
|
||||
if verbose:
|
||||
print(f" ✗ Error: {str(e)}")
|
||||
|
||||
# Clean up
|
||||
for manager in managers:
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
|
||||
# If no successful results, return None
|
||||
if not results:
|
||||
return None
|
||||
|
||||
# Find best configurations
|
||||
fastest = sorted(results, key=lambda x: x["crawl_time"])[0]
|
||||
|
||||
# Only consider memory if available
|
||||
memory_results = [r for r in results if r["memory_used"] > 0]
|
||||
if memory_results:
|
||||
lowest_memory = sorted(memory_results, key=lambda x: x["memory_used"])[0]
|
||||
else:
|
||||
lowest_memory = fastest
|
||||
|
||||
# Find most efficient (balanced speed vs memory)
|
||||
optimal = sorted(results, key=lambda x: x["efficiency"], reverse=True)[0]
|
||||
|
||||
# Print summary
|
||||
if verbose:
|
||||
print("\n=== OPTIMAL CONFIGURATIONS ===")
|
||||
print(f"⚡ Fastest: {fastest['browser_count']} browsers {fastest['distribution']}")
|
||||
print(f" {fastest['crawl_time']:.2f}s, {fastest['pages_per_second']:.1f} pages/sec")
|
||||
|
||||
print(f"💾 Memory-efficient: {lowest_memory['browser_count']} browsers {lowest_memory['distribution']}")
|
||||
if lowest_memory["memory_used"] > 0:
|
||||
print(f" {lowest_memory['memory_used']:.1f}MB, {lowest_memory['memory_used']/total_urls:.2f}MB per page")
|
||||
|
||||
print(f"🌟 Balanced optimal: {optimal['browser_count']} browsers {optimal['distribution']}")
|
||||
print(f" {optimal['crawl_time']:.2f}s, {optimal['pages_per_second']:.1f} pages/sec, score: {optimal['efficiency']:.4f}")
|
||||
|
||||
return {
|
||||
"fastest": fastest,
|
||||
"lowest_memory": lowest_memory,
|
||||
"optimal": optimal,
|
||||
"all_configs": results
|
||||
}
|
||||
@@ -163,6 +163,7 @@ class ManagedBrowser:
|
||||
)
|
||||
|
||||
# We'll monitor for a short time to make sure it starts properly, but won't keep monitoring
|
||||
await asyncio.sleep(0.5) # Give browser time to start
|
||||
await self._initial_startup_check()
|
||||
await asyncio.sleep(2) # Give browser time to start
|
||||
return f"http://{self.host}:{self.debugging_port}"
|
||||
|
||||
@@ -555,7 +555,6 @@ class BrowserProfiler:
|
||||
else:
|
||||
self.logger.error(f"Invalid choice. Please enter a number between 1 and {exit_option}.", tag="MENU")
|
||||
|
||||
|
||||
async def launch_standalone_browser(self,
|
||||
browser_type: str = "chromium",
|
||||
user_data_dir: Optional[str] = None,
|
||||
|
||||
@@ -9,6 +9,26 @@ from crawl4ai import (
|
||||
CrawlResult
|
||||
)
|
||||
|
||||
async def example_cdp():
|
||||
browser_conf = BrowserConfig(
|
||||
headless=False,
|
||||
cdp_url="http://localhost:9223"
|
||||
)
|
||||
crawler_config = CrawlerRunConfig(
|
||||
session_id="test",
|
||||
js_code = """(() => { return {"result": "Hello World!"} })()""",
|
||||
js_only=True
|
||||
)
|
||||
async with AsyncWebCrawler(
|
||||
config=browser_conf,
|
||||
verbose=True,
|
||||
) as crawler:
|
||||
result : CrawlResult = await crawler.arun(
|
||||
url="https://www.helloworld.org",
|
||||
config=crawler_config,
|
||||
)
|
||||
print(result.js_execution_result)
|
||||
|
||||
|
||||
async def main():
|
||||
browser_config = BrowserConfig(headless=True, verbose=True)
|
||||
@@ -16,18 +36,15 @@ async def main():
|
||||
crawler_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
# content_filter=PruningContentFilter(
|
||||
# threshold=0.48, threshold_type="fixed", min_word_threshold=0
|
||||
# )
|
||||
content_filter=PruningContentFilter(
|
||||
threshold=0.48, threshold_type="fixed", min_word_threshold=0
|
||||
)
|
||||
),
|
||||
)
|
||||
result : CrawlResult = await crawler.arun(
|
||||
# url="https://www.helloworld.org", config=crawler_config
|
||||
url="https://www.kidocode.com", config=crawler_config
|
||||
url="https://www.helloworld.org", config=crawler_config
|
||||
)
|
||||
print(result.markdown.raw_markdown[:500])
|
||||
# print(result.model_dump())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
|
||||
4
tests/browser/docker/__init__.py
Normal file
4
tests/browser/docker/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
"""Docker browser strategy tests.
|
||||
|
||||
This package contains tests for the Docker browser strategy implementation.
|
||||
"""
|
||||
653
tests/browser/docker/test_docker_browser.py
Normal file
653
tests/browser/docker/test_docker_browser.py
Normal file
@@ -0,0 +1,653 @@
|
||||
"""Test examples for Docker Browser Strategy.
|
||||
|
||||
These examples demonstrate the functionality of Docker Browser Strategy
|
||||
and serve as functional tests.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
import shutil
|
||||
import uuid
|
||||
import json
|
||||
from typing import List, Dict, Any, Optional, Tuple
|
||||
|
||||
# Add the project root to Python path if running directly
|
||||
if __name__ == "__main__":
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..')))
|
||||
|
||||
from crawl4ai.browser import BrowserManager
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
from crawl4ai.browser.docker_config import DockerConfig
|
||||
from crawl4ai.browser.docker_registry import DockerRegistry
|
||||
from crawl4ai.browser.docker_utils import DockerUtils
|
||||
|
||||
# Create a logger for clear terminal output
|
||||
logger = AsyncLogger(verbose=True, log_file=None)
|
||||
|
||||
# Global Docker utils instance
|
||||
docker_utils = DockerUtils(logger)
|
||||
|
||||
async def test_docker_components():
|
||||
"""Test Docker utilities, registry, and image building.
|
||||
|
||||
This function tests the core Docker components before running the browser tests.
|
||||
It validates DockerRegistry, DockerUtils, and builds test images to ensure
|
||||
everything is functioning correctly.
|
||||
"""
|
||||
logger.info("Testing Docker components", tag="SETUP")
|
||||
|
||||
# Create a test registry directory
|
||||
registry_dir = os.path.join(os.path.dirname(__file__), "test_registry")
|
||||
registry_file = os.path.join(registry_dir, "test_registry.json")
|
||||
os.makedirs(registry_dir, exist_ok=True)
|
||||
|
||||
try:
|
||||
# 1. Test DockerRegistry
|
||||
logger.info("Testing DockerRegistry...", tag="SETUP")
|
||||
registry = DockerRegistry(registry_file)
|
||||
|
||||
# Test saving and loading registry
|
||||
test_container_id = "test-container-123"
|
||||
registry.register_container(test_container_id, 9876, "test-hash-123")
|
||||
registry.save()
|
||||
|
||||
# Create a new registry instance that loads from the file
|
||||
registry2 = DockerRegistry(registry_file)
|
||||
port = registry2.get_container_host_port(test_container_id)
|
||||
hash_value = registry2.get_container_config_hash(test_container_id)
|
||||
|
||||
if port != 9876 or hash_value != "test-hash-123":
|
||||
logger.error("DockerRegistry persistence failed", tag="SETUP")
|
||||
return False
|
||||
|
||||
# Clean up test container from registry
|
||||
registry2.unregister_container(test_container_id)
|
||||
logger.success("DockerRegistry works correctly", tag="SETUP")
|
||||
|
||||
# 2. Test DockerUtils
|
||||
logger.info("Testing DockerUtils...", tag="SETUP")
|
||||
|
||||
# Test port detection
|
||||
in_use = docker_utils.is_port_in_use(22) # SSH port is usually in use
|
||||
logger.info(f"Port 22 in use: {in_use}", tag="SETUP")
|
||||
|
||||
# Get next available port
|
||||
available_port = docker_utils.get_next_available_port(9000)
|
||||
logger.info(f"Next available port: {available_port}", tag="SETUP")
|
||||
|
||||
# Test config hash generation
|
||||
config_dict = {"mode": "connect", "headless": True}
|
||||
config_hash = docker_utils.generate_config_hash(config_dict)
|
||||
logger.info(f"Generated config hash: {config_hash[:8]}...", tag="SETUP")
|
||||
|
||||
# 3. Test Docker is available
|
||||
logger.info("Checking Docker availability...", tag="SETUP")
|
||||
if not await check_docker_available():
|
||||
logger.error("Docker is not available - cannot continue tests", tag="SETUP")
|
||||
return False
|
||||
|
||||
# 4. Test building connect image
|
||||
logger.info("Building connect mode Docker image...", tag="SETUP")
|
||||
connect_image = await docker_utils.ensure_docker_image_exists(None, "connect")
|
||||
if not connect_image:
|
||||
logger.error("Failed to build connect mode image", tag="SETUP")
|
||||
return False
|
||||
logger.success(f"Successfully built connect image: {connect_image}", tag="SETUP")
|
||||
|
||||
# 5. Test building launch image
|
||||
logger.info("Building launch mode Docker image...", tag="SETUP")
|
||||
launch_image = await docker_utils.ensure_docker_image_exists(None, "launch")
|
||||
if not launch_image:
|
||||
logger.error("Failed to build launch mode image", tag="SETUP")
|
||||
return False
|
||||
logger.success(f"Successfully built launch image: {launch_image}", tag="SETUP")
|
||||
|
||||
# 6. Test creating and removing container
|
||||
logger.info("Testing container creation and removal...", tag="SETUP")
|
||||
container_id = await docker_utils.create_container(
|
||||
image_name=launch_image,
|
||||
host_port=available_port,
|
||||
container_name="crawl4ai-test-container"
|
||||
)
|
||||
|
||||
if not container_id:
|
||||
logger.error("Failed to create test container", tag="SETUP")
|
||||
return False
|
||||
|
||||
logger.info(f"Created test container: {container_id[:12]}", tag="SETUP")
|
||||
|
||||
# Verify container is running
|
||||
running = await docker_utils.is_container_running(container_id)
|
||||
if not running:
|
||||
logger.error("Test container is not running", tag="SETUP")
|
||||
await docker_utils.remove_container(container_id)
|
||||
return False
|
||||
|
||||
# Test commands in container
|
||||
logger.info("Testing command execution in container...", tag="SETUP")
|
||||
returncode, stdout, stderr = await docker_utils.exec_in_container(
|
||||
container_id, ["ls", "-la", "/"]
|
||||
)
|
||||
|
||||
if returncode != 0:
|
||||
logger.error(f"Command execution failed: {stderr}", tag="SETUP")
|
||||
await docker_utils.remove_container(container_id)
|
||||
return False
|
||||
|
||||
# Verify Chrome is installed in the container
|
||||
returncode, stdout, stderr = await docker_utils.exec_in_container(
|
||||
container_id, ["which", "google-chrome"]
|
||||
)
|
||||
|
||||
if returncode != 0:
|
||||
logger.error("Chrome not found in container", tag="SETUP")
|
||||
await docker_utils.remove_container(container_id)
|
||||
return False
|
||||
|
||||
chrome_path = stdout.strip()
|
||||
logger.info(f"Chrome found at: {chrome_path}", tag="SETUP")
|
||||
|
||||
# Test Chrome version
|
||||
returncode, stdout, stderr = await docker_utils.exec_in_container(
|
||||
container_id, ["google-chrome", "--version"]
|
||||
)
|
||||
|
||||
if returncode != 0:
|
||||
logger.error(f"Failed to get Chrome version: {stderr}", tag="SETUP")
|
||||
await docker_utils.remove_container(container_id)
|
||||
return False
|
||||
|
||||
logger.info(f"Chrome version: {stdout.strip()}", tag="SETUP")
|
||||
|
||||
# Remove test container
|
||||
removed = await docker_utils.remove_container(container_id)
|
||||
if not removed:
|
||||
logger.error("Failed to remove test container", tag="SETUP")
|
||||
return False
|
||||
|
||||
logger.success("Test container removed successfully", tag="SETUP")
|
||||
|
||||
# All components tested successfully
|
||||
logger.success("All Docker components tested successfully", tag="SETUP")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Docker component tests failed: {str(e)}", tag="SETUP")
|
||||
return False
|
||||
finally:
|
||||
# Clean up registry test directory
|
||||
if os.path.exists(registry_dir):
|
||||
shutil.rmtree(registry_dir)
|
||||
|
||||
async def test_docker_connect_mode():
|
||||
"""Test Docker browser in connect mode.
|
||||
|
||||
This tests the basic functionality of creating a browser in Docker
|
||||
connect mode and using it for navigation.
|
||||
"""
|
||||
logger.info("Testing Docker browser in connect mode", tag="TEST")
|
||||
|
||||
# Create temp directory for user data
|
||||
temp_dir = os.path.join(os.path.dirname(__file__), "tmp_user_data")
|
||||
os.makedirs(temp_dir, exist_ok=True)
|
||||
|
||||
try:
|
||||
# Create Docker configuration
|
||||
docker_config = DockerConfig(
|
||||
mode="connect",
|
||||
persistent=False,
|
||||
remove_on_exit=True,
|
||||
user_data_dir=temp_dir
|
||||
)
|
||||
|
||||
# Create browser configuration
|
||||
browser_config = BrowserConfig(
|
||||
browser_mode="docker",
|
||||
headless=True,
|
||||
docker_config=docker_config
|
||||
)
|
||||
|
||||
# Create browser manager
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
# Start the browser
|
||||
await manager.start()
|
||||
logger.info("Browser started successfully", tag="TEST")
|
||||
|
||||
# Create crawler config
|
||||
crawler_config = CrawlerRunConfig(url="https://example.com")
|
||||
|
||||
# Get a page
|
||||
page, context = await manager.get_page(crawler_config)
|
||||
logger.info("Got page successfully", tag="TEST")
|
||||
|
||||
# Navigate to a website
|
||||
await page.goto("https://example.com")
|
||||
logger.info("Navigated to example.com", tag="TEST")
|
||||
|
||||
# Get page title
|
||||
title = await page.title()
|
||||
logger.info(f"Page title: {title}", tag="TEST")
|
||||
|
||||
# Clean up
|
||||
await manager.close()
|
||||
logger.info("Browser closed successfully", tag="TEST")
|
||||
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
# Ensure cleanup
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
return False
|
||||
finally:
|
||||
# Clean up the temp directory
|
||||
if os.path.exists(temp_dir):
|
||||
shutil.rmtree(temp_dir)
|
||||
|
||||
async def test_docker_launch_mode():
|
||||
"""Test Docker browser in launch mode.
|
||||
|
||||
This tests launching a Chrome browser within a Docker container
|
||||
on demand with custom settings.
|
||||
"""
|
||||
logger.info("Testing Docker browser in launch mode", tag="TEST")
|
||||
|
||||
# Create temp directory for user data
|
||||
temp_dir = os.path.join(os.path.dirname(__file__), "tmp_user_data_launch")
|
||||
os.makedirs(temp_dir, exist_ok=True)
|
||||
|
||||
try:
|
||||
# Create Docker configuration
|
||||
docker_config = DockerConfig(
|
||||
mode="launch",
|
||||
persistent=False,
|
||||
remove_on_exit=True,
|
||||
user_data_dir=temp_dir
|
||||
)
|
||||
|
||||
# Create browser configuration
|
||||
browser_config = BrowserConfig(
|
||||
browser_mode="docker",
|
||||
headless=True,
|
||||
text_mode=True, # Enable text mode for faster operation
|
||||
docker_config=docker_config
|
||||
)
|
||||
|
||||
# Create browser manager
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
# Start the browser
|
||||
await manager.start()
|
||||
logger.info("Browser started successfully", tag="TEST")
|
||||
|
||||
# Create crawler config
|
||||
crawler_config = CrawlerRunConfig(url="https://example.com")
|
||||
|
||||
# Get a page
|
||||
page, context = await manager.get_page(crawler_config)
|
||||
logger.info("Got page successfully", tag="TEST")
|
||||
|
||||
# Navigate to a website
|
||||
await page.goto("https://example.com")
|
||||
logger.info("Navigated to example.com", tag="TEST")
|
||||
|
||||
# Get page title
|
||||
title = await page.title()
|
||||
logger.info(f"Page title: {title}", tag="TEST")
|
||||
|
||||
# Clean up
|
||||
await manager.close()
|
||||
logger.info("Browser closed successfully", tag="TEST")
|
||||
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
# Ensure cleanup
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
return False
|
||||
finally:
|
||||
# Clean up the temp directory
|
||||
if os.path.exists(temp_dir):
|
||||
shutil.rmtree(temp_dir)
|
||||
|
||||
async def test_docker_persistent_storage():
|
||||
"""Test Docker browser with persistent storage.
|
||||
|
||||
This tests creating localStorage data in one session and verifying
|
||||
it persists to another session when using persistent storage.
|
||||
"""
|
||||
logger.info("Testing Docker browser with persistent storage", tag="TEST")
|
||||
|
||||
# Create a unique temp directory
|
||||
test_id = uuid.uuid4().hex[:8]
|
||||
temp_dir = os.path.join(os.path.dirname(__file__), f"tmp_user_data_persist_{test_id}")
|
||||
os.makedirs(temp_dir, exist_ok=True)
|
||||
|
||||
manager1 = None
|
||||
manager2 = None
|
||||
|
||||
try:
|
||||
# Create Docker configuration with persistence
|
||||
docker_config = DockerConfig(
|
||||
mode="connect",
|
||||
persistent=True, # Keep container running between sessions
|
||||
user_data_dir=temp_dir,
|
||||
container_user_data_dir="/data"
|
||||
)
|
||||
|
||||
# Create browser configuration
|
||||
browser_config = BrowserConfig(
|
||||
browser_mode="docker",
|
||||
headless=True,
|
||||
docker_config=docker_config
|
||||
)
|
||||
|
||||
# Create first browser manager
|
||||
manager1 = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
# Start the browser
|
||||
await manager1.start()
|
||||
logger.info("First browser started successfully", tag="TEST")
|
||||
|
||||
# Create crawler config
|
||||
crawler_config = CrawlerRunConfig()
|
||||
|
||||
# Get a page
|
||||
page1, context1 = await manager1.get_page(crawler_config)
|
||||
|
||||
# Navigate to example.com
|
||||
await page1.goto("https://example.com")
|
||||
|
||||
# Set localStorage item
|
||||
test_value = f"test_value_{test_id}"
|
||||
await page1.evaluate(f"localStorage.setItem('test_key', '{test_value}')")
|
||||
logger.info(f"Set localStorage test_key = {test_value}", tag="TEST")
|
||||
|
||||
# Close the first browser manager
|
||||
await manager1.close()
|
||||
logger.info("First browser closed", tag="TEST")
|
||||
|
||||
# Create second browser manager with same config
|
||||
manager2 = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
# Start the browser
|
||||
await manager2.start()
|
||||
logger.info("Second browser started successfully", tag="TEST")
|
||||
|
||||
# Get a page
|
||||
page2, context2 = await manager2.get_page(crawler_config)
|
||||
|
||||
# Navigate to same site
|
||||
await page2.goto("https://example.com")
|
||||
|
||||
# Get localStorage item
|
||||
value = await page2.evaluate("localStorage.getItem('test_key')")
|
||||
logger.info(f"Retrieved localStorage test_key = {value}", tag="TEST")
|
||||
|
||||
# Check if persistence worked
|
||||
if value == test_value:
|
||||
logger.success("Storage persistence verified!", tag="TEST")
|
||||
else:
|
||||
logger.error(f"Storage persistence failed! Expected {test_value}, got {value}", tag="TEST")
|
||||
|
||||
# Clean up
|
||||
await manager2.close()
|
||||
logger.info("Second browser closed successfully", tag="TEST")
|
||||
|
||||
return value == test_value
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
# Ensure cleanup
|
||||
try:
|
||||
if manager1:
|
||||
await manager1.close()
|
||||
if manager2:
|
||||
await manager2.close()
|
||||
except:
|
||||
pass
|
||||
return False
|
||||
finally:
|
||||
# Clean up the temp directory
|
||||
if os.path.exists(temp_dir):
|
||||
shutil.rmtree(temp_dir)
|
||||
|
||||
async def test_docker_parallel_pages():
|
||||
"""Test Docker browser with parallel page creation.
|
||||
|
||||
This tests the ability to create and use multiple pages in parallel
|
||||
from a single Docker browser instance.
|
||||
"""
|
||||
logger.info("Testing Docker browser with parallel pages", tag="TEST")
|
||||
|
||||
try:
|
||||
# Create Docker configuration
|
||||
docker_config = DockerConfig(
|
||||
mode="connect",
|
||||
persistent=False,
|
||||
remove_on_exit=True
|
||||
)
|
||||
|
||||
# Create browser configuration
|
||||
browser_config = BrowserConfig(
|
||||
browser_mode="docker",
|
||||
headless=True,
|
||||
docker_config=docker_config
|
||||
)
|
||||
|
||||
# Create browser manager
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
# Start the browser
|
||||
await manager.start()
|
||||
logger.info("Browser started successfully", tag="TEST")
|
||||
|
||||
# Create crawler config
|
||||
crawler_config = CrawlerRunConfig()
|
||||
|
||||
# Get multiple pages
|
||||
page_count = 3
|
||||
pages = await manager.get_pages(crawler_config, count=page_count)
|
||||
logger.info(f"Got {len(pages)} pages successfully", tag="TEST")
|
||||
|
||||
if len(pages) != page_count:
|
||||
logger.error(f"Expected {page_count} pages, got {len(pages)}", tag="TEST")
|
||||
await manager.close()
|
||||
return False
|
||||
|
||||
# Navigate to different sites with each page
|
||||
tasks = []
|
||||
for i, (page, _) in enumerate(pages):
|
||||
tasks.append(page.goto(f"https://example.com?page={i}"))
|
||||
|
||||
# Wait for all navigations to complete
|
||||
await asyncio.gather(*tasks)
|
||||
logger.info("All pages navigated successfully", tag="TEST")
|
||||
|
||||
# Get titles from all pages
|
||||
titles = []
|
||||
for i, (page, _) in enumerate(pages):
|
||||
title = await page.title()
|
||||
titles.append(title)
|
||||
logger.info(f"Page {i+1} title: {title}", tag="TEST")
|
||||
|
||||
# Clean up
|
||||
await manager.close()
|
||||
logger.info("Browser closed successfully", tag="TEST")
|
||||
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
# Ensure cleanup
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
return False
|
||||
|
||||
async def test_docker_registry_reuse():
|
||||
"""Test Docker container reuse via registry.
|
||||
|
||||
This tests that containers with matching configurations
|
||||
are reused rather than creating new ones.
|
||||
"""
|
||||
logger.info("Testing Docker container reuse via registry", tag="TEST")
|
||||
|
||||
# Create registry for this test
|
||||
registry_dir = os.path.join(os.path.dirname(__file__), "registry_reuse_test")
|
||||
registry_file = os.path.join(registry_dir, "registry.json")
|
||||
os.makedirs(registry_dir, exist_ok=True)
|
||||
|
||||
manager1 = None
|
||||
manager2 = None
|
||||
container_id1 = None
|
||||
|
||||
try:
|
||||
# Create identical Docker configurations with custom registry
|
||||
docker_config1 = DockerConfig(
|
||||
mode="connect",
|
||||
persistent=True, # Keep container running after closing
|
||||
registry_file=registry_file
|
||||
)
|
||||
|
||||
# Create first browser configuration
|
||||
browser_config1 = BrowserConfig(
|
||||
browser_mode="docker",
|
||||
headless=True,
|
||||
docker_config=docker_config1
|
||||
)
|
||||
|
||||
# Create first browser manager
|
||||
manager1 = BrowserManager(browser_config=browser_config1, logger=logger)
|
||||
|
||||
# Start the first browser
|
||||
await manager1.start()
|
||||
logger.info("First browser started successfully", tag="TEST")
|
||||
|
||||
# Get container ID from the strategy
|
||||
docker_strategy1 = manager1._strategy
|
||||
container_id1 = docker_strategy1.container_id
|
||||
logger.info(f"First browser container ID: {container_id1[:12]}", tag="TEST")
|
||||
|
||||
# Close the first manager but keep container running
|
||||
await manager1.close()
|
||||
logger.info("First browser closed", tag="TEST")
|
||||
|
||||
# Create second Docker configuration identical to first
|
||||
docker_config2 = DockerConfig(
|
||||
mode="connect",
|
||||
persistent=True,
|
||||
registry_file=registry_file
|
||||
)
|
||||
|
||||
# Create second browser configuration
|
||||
browser_config2 = BrowserConfig(
|
||||
browser_mode="docker",
|
||||
headless=True,
|
||||
docker_config=docker_config2
|
||||
)
|
||||
|
||||
# Create second browser manager
|
||||
manager2 = BrowserManager(browser_config=browser_config2, logger=logger)
|
||||
|
||||
# Start the second browser - should reuse existing container
|
||||
await manager2.start()
|
||||
logger.info("Second browser started successfully", tag="TEST")
|
||||
|
||||
# Get container ID from the second strategy
|
||||
docker_strategy2 = manager2._strategy
|
||||
container_id2 = docker_strategy2.container_id
|
||||
logger.info(f"Second browser container ID: {container_id2[:12]}", tag="TEST")
|
||||
|
||||
# Verify container reuse
|
||||
if container_id1 == container_id2:
|
||||
logger.success("Container reuse successful - using same container!", tag="TEST")
|
||||
else:
|
||||
logger.error("Container reuse failed - new container created!", tag="TEST")
|
||||
|
||||
# Clean up
|
||||
docker_strategy2.docker_config.persistent = False
|
||||
docker_strategy2.docker_config.remove_on_exit = True
|
||||
await manager2.close()
|
||||
logger.info("Second browser closed and container removed", tag="TEST")
|
||||
|
||||
return container_id1 == container_id2
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
# Ensure cleanup
|
||||
try:
|
||||
if manager1:
|
||||
await manager1.close()
|
||||
if manager2:
|
||||
await manager2.close()
|
||||
# Make sure container is removed
|
||||
if container_id1:
|
||||
await docker_utils.remove_container(container_id1, force=True)
|
||||
except:
|
||||
pass
|
||||
return False
|
||||
finally:
|
||||
# Clean up registry directory
|
||||
if os.path.exists(registry_dir):
|
||||
shutil.rmtree(registry_dir)
|
||||
|
||||
async def run_tests():
|
||||
"""Run all tests sequentially."""
|
||||
results = []
|
||||
|
||||
logger.info("Starting Docker Browser Strategy tests", tag="TEST")
|
||||
|
||||
# Check if Docker is available
|
||||
if not await check_docker_available():
|
||||
logger.error("Docker is not available - skipping tests", tag="TEST")
|
||||
return
|
||||
|
||||
# First test Docker components
|
||||
setup_result = await test_docker_components()
|
||||
if not setup_result:
|
||||
logger.error("Docker component tests failed - skipping browser tests", tag="TEST")
|
||||
return
|
||||
|
||||
# Run browser tests
|
||||
results.append(await test_docker_connect_mode())
|
||||
results.append(await test_docker_launch_mode())
|
||||
results.append(await test_docker_persistent_storage())
|
||||
results.append(await test_docker_parallel_pages())
|
||||
results.append(await test_docker_registry_reuse())
|
||||
|
||||
# Print summary
|
||||
total = len(results)
|
||||
passed = sum(1 for r in results if r)
|
||||
logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
|
||||
|
||||
if passed == total:
|
||||
logger.success("All tests passed!", tag="SUMMARY")
|
||||
else:
|
||||
logger.error(f"{total - passed} tests failed", tag="SUMMARY")
|
||||
|
||||
async def check_docker_available() -> bool:
|
||||
"""Check if Docker is available on the system.
|
||||
|
||||
Returns:
|
||||
bool: True if Docker is available, False otherwise
|
||||
"""
|
||||
try:
|
||||
proc = await asyncio.create_subprocess_exec(
|
||||
"docker", "--version",
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE
|
||||
)
|
||||
stdout, _ = await proc.communicate()
|
||||
return proc.returncode == 0 and stdout
|
||||
except:
|
||||
return False
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(run_tests())
|
||||
190
tests/browser/test_browser_manager.py
Normal file
190
tests/browser/test_browser_manager.py
Normal file
@@ -0,0 +1,190 @@
|
||||
"""Test examples for BrowserManager.
|
||||
|
||||
These examples demonstrate the functionality of BrowserManager
|
||||
and serve as functional tests.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
from typing import List
|
||||
|
||||
# Add the project root to Python path if running directly
|
||||
if __name__ == "__main__":
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
from crawl4ai.browser import BrowserManager
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
# Create a logger for clear terminal output
|
||||
logger = AsyncLogger(verbose=True, log_file=None)
|
||||
|
||||
async def test_basic_browser_manager():
|
||||
"""Test basic BrowserManager functionality with default configuration."""
|
||||
logger.info("Starting test_basic_browser_manager", tag="TEST")
|
||||
|
||||
try:
|
||||
# Create a browser manager with default config
|
||||
manager = BrowserManager(logger=logger)
|
||||
|
||||
# Start the browser
|
||||
await manager.start()
|
||||
logger.info("Browser started successfully", tag="TEST")
|
||||
|
||||
# Get a page
|
||||
crawler_config = CrawlerRunConfig(url="https://example.com")
|
||||
page, context = await manager.get_page(crawler_config)
|
||||
logger.info("Page created successfully", tag="TEST")
|
||||
|
||||
# Navigate to a website
|
||||
await page.goto("https://example.com")
|
||||
title = await page.title()
|
||||
logger.info(f"Page title: {title}", tag="TEST")
|
||||
|
||||
# Clean up
|
||||
await manager.close()
|
||||
logger.success("test_basic_browser_manager completed successfully", tag="TEST")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"test_basic_browser_manager failed: {str(e)}", tag="TEST")
|
||||
return False
|
||||
|
||||
async def test_custom_browser_config():
|
||||
"""Test BrowserManager with custom browser configuration."""
|
||||
logger.info("Starting test_custom_browser_config", tag="TEST")
|
||||
|
||||
try:
|
||||
# Create a custom browser config
|
||||
browser_config = BrowserConfig(
|
||||
browser_type="chromium",
|
||||
headless=True,
|
||||
viewport_width=1280,
|
||||
viewport_height=800,
|
||||
light_mode=True
|
||||
)
|
||||
|
||||
# Create browser manager with the config
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
# Start the browser
|
||||
await manager.start()
|
||||
logger.info("Browser started successfully with custom config", tag="TEST")
|
||||
|
||||
# Get a page
|
||||
crawler_config = CrawlerRunConfig(url="https://example.com")
|
||||
page, context = await manager.get_page(crawler_config)
|
||||
|
||||
# Navigate to a website
|
||||
await page.goto("https://example.com")
|
||||
title = await page.title()
|
||||
logger.info(f"Page title: {title}", tag="TEST")
|
||||
|
||||
# Verify viewport size
|
||||
viewport_size = await page.evaluate("() => ({ width: window.innerWidth, height: window.innerHeight })")
|
||||
logger.info(f"Viewport size: {viewport_size}", tag="TEST")
|
||||
|
||||
# Clean up
|
||||
await manager.close()
|
||||
logger.success("test_custom_browser_config completed successfully", tag="TEST")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"test_custom_browser_config failed: {str(e)}", tag="TEST")
|
||||
return False
|
||||
|
||||
async def test_multiple_pages():
|
||||
"""Test BrowserManager with multiple pages."""
|
||||
logger.info("Starting test_multiple_pages", tag="TEST")
|
||||
|
||||
try:
|
||||
# Create browser manager
|
||||
manager = BrowserManager(logger=logger)
|
||||
|
||||
# Start the browser
|
||||
await manager.start()
|
||||
logger.info("Browser started successfully", tag="TEST")
|
||||
|
||||
# Create multiple pages
|
||||
pages = []
|
||||
urls = ["https://example.com", "https://example.org", "https://mozilla.org"]
|
||||
|
||||
for i, url in enumerate(urls):
|
||||
crawler_config = CrawlerRunConfig(url=url)
|
||||
page, context = await manager.get_page(crawler_config)
|
||||
await page.goto(url)
|
||||
pages.append((page, url))
|
||||
logger.info(f"Created page {i+1} for {url}", tag="TEST")
|
||||
|
||||
# Verify all pages are loaded correctly
|
||||
for i, (page, url) in enumerate(pages):
|
||||
title = await page.title()
|
||||
logger.info(f"Page {i+1} title: {title}", tag="TEST")
|
||||
|
||||
# Clean up
|
||||
await manager.close()
|
||||
logger.success("test_multiple_pages completed successfully", tag="TEST")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"test_multiple_pages failed: {str(e)}", tag="TEST")
|
||||
return False
|
||||
|
||||
async def test_session_management():
|
||||
"""Test session management in BrowserManager."""
|
||||
logger.info("Starting test_session_management", tag="TEST")
|
||||
|
||||
try:
|
||||
# Create browser manager
|
||||
manager = BrowserManager(logger=logger)
|
||||
|
||||
# Start the browser
|
||||
await manager.start()
|
||||
logger.info("Browser started successfully", tag="TEST")
|
||||
|
||||
# Create a session
|
||||
session_id = "test_session_1"
|
||||
crawler_config = CrawlerRunConfig(url="https://example.com", session_id=session_id)
|
||||
page1, context1 = await manager.get_page(crawler_config)
|
||||
await page1.goto("https://example.com")
|
||||
logger.info(f"Created session with ID: {session_id}", tag="TEST")
|
||||
|
||||
# Get the same session again
|
||||
page2, context2 = await manager.get_page(crawler_config)
|
||||
|
||||
# Verify it's the same page/context
|
||||
is_same_page = page1 == page2
|
||||
is_same_context = context1 == context2
|
||||
logger.info(f"Same page: {is_same_page}, Same context: {is_same_context}", tag="TEST")
|
||||
|
||||
# Kill the session
|
||||
await manager.kill_session(session_id)
|
||||
logger.info(f"Killed session with ID: {session_id}", tag="TEST")
|
||||
|
||||
# Clean up
|
||||
await manager.close()
|
||||
logger.success("test_session_management completed successfully", tag="TEST")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"test_session_management failed: {str(e)}", tag="TEST")
|
||||
return False
|
||||
|
||||
async def run_tests():
|
||||
"""Run all tests sequentially."""
|
||||
results = []
|
||||
|
||||
results.append(await test_basic_browser_manager())
|
||||
results.append(await test_custom_browser_config())
|
||||
results.append(await test_multiple_pages())
|
||||
results.append(await test_session_management())
|
||||
|
||||
# Print summary
|
||||
total = len(results)
|
||||
passed = sum(results)
|
||||
logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
|
||||
|
||||
if passed == total:
|
||||
logger.success("All tests passed!", tag="SUMMARY")
|
||||
else:
|
||||
logger.error(f"{total - passed} tests failed", tag="SUMMARY")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(run_tests())
|
||||
File diff suppressed because it is too large
Load Diff
160
tests/browser/test_builtin_strategy.py
Normal file
160
tests/browser/test_builtin_strategy.py
Normal file
@@ -0,0 +1,160 @@
|
||||
"""Test examples for BuiltinBrowserStrategy.
|
||||
|
||||
These examples demonstrate the functionality of BuiltinBrowserStrategy
|
||||
and serve as functional tests.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Add the project root to Python path if running directly
|
||||
if __name__ == "__main__":
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
from crawl4ai.browser import BrowserManager
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
# Create a logger for clear terminal output
|
||||
logger = AsyncLogger(verbose=True, log_file=None)
|
||||
|
||||
async def test_builtin_browser():
|
||||
"""Test using a builtin browser that persists between sessions."""
|
||||
logger.info("Testing builtin browser", tag="TEST")
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
browser_mode="builtin",
|
||||
headless=True
|
||||
)
|
||||
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
try:
|
||||
# Start should connect to existing builtin browser or create one
|
||||
await manager.start()
|
||||
logger.info("Connected to builtin browser", tag="TEST")
|
||||
|
||||
# Test page creation
|
||||
crawler_config = CrawlerRunConfig()
|
||||
page, context = await manager.get_page(crawler_config)
|
||||
|
||||
# Test navigation
|
||||
await page.goto("https://example.com")
|
||||
title = await page.title()
|
||||
logger.info(f"Page title: {title}", tag="TEST")
|
||||
|
||||
# Close manager (should not close the builtin browser)
|
||||
await manager.close()
|
||||
logger.info("First session closed", tag="TEST")
|
||||
|
||||
# Create a second manager to verify browser persistence
|
||||
logger.info("Creating second session to verify persistence", tag="TEST")
|
||||
manager2 = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
await manager2.start()
|
||||
logger.info("Connected to existing builtin browser", tag="TEST")
|
||||
|
||||
page2, context2 = await manager2.get_page(crawler_config)
|
||||
await page2.goto("https://example.org")
|
||||
title2 = await page2.title()
|
||||
logger.info(f"Second session page title: {title2}", tag="TEST")
|
||||
|
||||
await manager2.close()
|
||||
logger.info("Second session closed successfully", tag="TEST")
|
||||
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
return False
|
||||
|
||||
async def test_builtin_browser_status():
|
||||
"""Test getting status of the builtin browser."""
|
||||
logger.info("Testing builtin browser status", tag="TEST")
|
||||
|
||||
from crawl4ai.browser.strategies import BuiltinBrowserStrategy
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
browser_mode="builtin",
|
||||
headless=True
|
||||
)
|
||||
|
||||
# Create strategy directly to access its status methods
|
||||
strategy = BuiltinBrowserStrategy(browser_config, logger)
|
||||
|
||||
try:
|
||||
# Get status before starting (should be not running)
|
||||
status_before = await strategy.get_builtin_browser_status()
|
||||
logger.info(f"Initial status: {status_before}", tag="TEST")
|
||||
|
||||
# Start the browser
|
||||
await strategy.start()
|
||||
logger.info("Browser started successfully", tag="TEST")
|
||||
|
||||
# Get status after starting
|
||||
status_after = await strategy.get_builtin_browser_status()
|
||||
logger.info(f"Status after start: {status_after}", tag="TEST")
|
||||
|
||||
# Create a page to verify functionality
|
||||
crawler_config = CrawlerRunConfig()
|
||||
page, context = await strategy.get_page(crawler_config)
|
||||
await page.goto("https://example.com")
|
||||
title = await page.title()
|
||||
logger.info(f"Page title: {title}", tag="TEST")
|
||||
|
||||
# Close strategy (should not kill the builtin browser)
|
||||
await strategy.close()
|
||||
logger.info("Strategy closed successfully", tag="TEST")
|
||||
|
||||
# Create a new strategy object
|
||||
strategy2 = BuiltinBrowserStrategy(browser_config, logger)
|
||||
|
||||
# Get status again (should still be running)
|
||||
status_final = await strategy2.get_builtin_browser_status()
|
||||
logger.info(f"Final status: {status_final}", tag="TEST")
|
||||
|
||||
# Verify that the status shows the browser is running
|
||||
is_running = status_final.get('running', False)
|
||||
logger.info(f"Builtin browser persistence confirmed: {is_running}", tag="TEST")
|
||||
|
||||
# Kill the builtin browser to clean up
|
||||
logger.info("Killing builtin browser", tag="TEST")
|
||||
success = await strategy2.kill_builtin_browser()
|
||||
logger.info(f"Killed builtin browser successfully: {success}", tag="TEST")
|
||||
|
||||
return is_running and success
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
try:
|
||||
await strategy.close()
|
||||
|
||||
# Try to kill the builtin browser to clean up
|
||||
strategy2 = BuiltinBrowserStrategy(browser_config, logger)
|
||||
await strategy2.kill_builtin_browser()
|
||||
except:
|
||||
pass
|
||||
return False
|
||||
|
||||
async def run_tests():
|
||||
"""Run all tests sequentially."""
|
||||
results = []
|
||||
|
||||
results.append(await test_builtin_browser())
|
||||
results.append(await test_builtin_browser_status())
|
||||
|
||||
# Print summary
|
||||
total = len(results)
|
||||
passed = sum(results)
|
||||
logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
|
||||
|
||||
if passed == total:
|
||||
logger.success("All tests passed!", tag="SUMMARY")
|
||||
else:
|
||||
logger.error(f"{total - passed} tests failed", tag="SUMMARY")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(run_tests())
|
||||
227
tests/browser/test_cdp_strategy.py
Normal file
227
tests/browser/test_cdp_strategy.py
Normal file
@@ -0,0 +1,227 @@
|
||||
"""Test examples for CDPBrowserStrategy.
|
||||
|
||||
These examples demonstrate the functionality of CDPBrowserStrategy
|
||||
and serve as functional tests.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Add the project root to Python path if running directly
|
||||
if __name__ == "__main__":
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
from crawl4ai.browser import BrowserManager
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
# Create a logger for clear terminal output
|
||||
logger = AsyncLogger(verbose=True, log_file=None)
|
||||
|
||||
async def test_cdp_launch_connect():
|
||||
"""Test launching a browser and connecting via CDP."""
|
||||
logger.info("Testing launch and connect via CDP", tag="TEST")
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
use_managed_browser=True,
|
||||
headless=True
|
||||
)
|
||||
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
try:
|
||||
await manager.start()
|
||||
logger.info("Browser launched and connected via CDP", tag="TEST")
|
||||
|
||||
# Test with multiple pages
|
||||
pages = []
|
||||
for i in range(3):
|
||||
crawler_config = CrawlerRunConfig()
|
||||
page, context = await manager.get_page(crawler_config)
|
||||
await page.goto(f"https://example.com?test={i}")
|
||||
pages.append(page)
|
||||
logger.info(f"Created page {i+1}", tag="TEST")
|
||||
|
||||
# Verify all pages are working
|
||||
for i, page in enumerate(pages):
|
||||
title = await page.title()
|
||||
logger.info(f"Page {i+1} title: {title}", tag="TEST")
|
||||
|
||||
await manager.close()
|
||||
logger.info("Browser closed successfully", tag="TEST")
|
||||
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
return False
|
||||
|
||||
async def test_cdp_with_user_data_dir():
|
||||
"""Test CDP browser with a user data directory."""
|
||||
logger.info("Testing CDP browser with user data directory", tag="TEST")
|
||||
|
||||
# Create a temporary user data directory
|
||||
import tempfile
|
||||
user_data_dir = tempfile.mkdtemp(prefix="crawl4ai-test-")
|
||||
logger.info(f"Created temporary user data directory: {user_data_dir}", tag="TEST")
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
use_managed_browser=True,
|
||||
headless=True,
|
||||
user_data_dir=user_data_dir
|
||||
)
|
||||
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
try:
|
||||
await manager.start()
|
||||
logger.info("Browser launched with user data directory", tag="TEST")
|
||||
|
||||
# Navigate to a page and store some data
|
||||
crawler_config = CrawlerRunConfig()
|
||||
page, context = await manager.get_page(crawler_config)
|
||||
|
||||
# Set a cookie
|
||||
await context.add_cookies([{
|
||||
"name": "test_cookie",
|
||||
"value": "test_value",
|
||||
"url": "https://example.com"
|
||||
}])
|
||||
|
||||
# Visit the site
|
||||
await page.goto("https://example.com")
|
||||
|
||||
# Verify cookie was set
|
||||
cookies = await context.cookies(["https://example.com"])
|
||||
has_test_cookie = any(cookie["name"] == "test_cookie" for cookie in cookies)
|
||||
logger.info(f"Cookie set successfully: {has_test_cookie}", tag="TEST")
|
||||
|
||||
# Close the browser
|
||||
await manager.close()
|
||||
logger.info("First browser session closed", tag="TEST")
|
||||
|
||||
# Start a new browser with the same user data directory
|
||||
logger.info("Starting second browser session with same user data directory", tag="TEST")
|
||||
manager2 = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
await manager2.start()
|
||||
|
||||
# Get a new page and check if the cookie persists
|
||||
page2, context2 = await manager2.get_page(crawler_config)
|
||||
await page2.goto("https://example.com")
|
||||
|
||||
# Verify cookie persisted
|
||||
cookies2 = await context2.cookies(["https://example.com"])
|
||||
has_test_cookie2 = any(cookie["name"] == "test_cookie" for cookie in cookies2)
|
||||
logger.info(f"Cookie persisted across sessions: {has_test_cookie2}", tag="TEST")
|
||||
|
||||
# Clean up
|
||||
await manager2.close()
|
||||
|
||||
# Remove temporary directory
|
||||
import shutil
|
||||
shutil.rmtree(user_data_dir, ignore_errors=True)
|
||||
logger.info(f"Removed temporary user data directory", tag="TEST")
|
||||
|
||||
return has_test_cookie and has_test_cookie2
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
|
||||
# Clean up temporary directory
|
||||
try:
|
||||
import shutil
|
||||
shutil.rmtree(user_data_dir, ignore_errors=True)
|
||||
except:
|
||||
pass
|
||||
|
||||
return False
|
||||
|
||||
async def test_cdp_session_management():
|
||||
"""Test session management with CDP browser."""
|
||||
logger.info("Testing session management with CDP browser", tag="TEST")
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
use_managed_browser=True,
|
||||
headless=True
|
||||
)
|
||||
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
try:
|
||||
await manager.start()
|
||||
logger.info("Browser launched successfully", tag="TEST")
|
||||
|
||||
# Create two sessions
|
||||
session1_id = "test_session_1"
|
||||
session2_id = "test_session_2"
|
||||
|
||||
# Set up first session
|
||||
crawler_config1 = CrawlerRunConfig(session_id=session1_id)
|
||||
page1, context1 = await manager.get_page(crawler_config1)
|
||||
await page1.goto("https://example.com")
|
||||
await page1.evaluate("localStorage.setItem('session1_data', 'test_value')")
|
||||
logger.info(f"Set up session 1 with ID: {session1_id}", tag="TEST")
|
||||
|
||||
# Set up second session
|
||||
crawler_config2 = CrawlerRunConfig(session_id=session2_id)
|
||||
page2, context2 = await manager.get_page(crawler_config2)
|
||||
await page2.goto("https://example.org")
|
||||
await page2.evaluate("localStorage.setItem('session2_data', 'test_value2')")
|
||||
logger.info(f"Set up session 2 with ID: {session2_id}", tag="TEST")
|
||||
|
||||
# Get first session again
|
||||
page1_again, _ = await manager.get_page(crawler_config1)
|
||||
|
||||
# Verify it's the same page and data persists
|
||||
is_same_page = page1 == page1_again
|
||||
data1 = await page1_again.evaluate("localStorage.getItem('session1_data')")
|
||||
logger.info(f"Session 1 reuse successful: {is_same_page}, data: {data1}", tag="TEST")
|
||||
|
||||
# Kill first session
|
||||
await manager.kill_session(session1_id)
|
||||
logger.info(f"Killed session 1", tag="TEST")
|
||||
|
||||
# Verify second session still works
|
||||
data2 = await page2.evaluate("localStorage.getItem('session2_data')")
|
||||
logger.info(f"Session 2 still functional after killing session 1, data: {data2}", tag="TEST")
|
||||
|
||||
# Clean up
|
||||
await manager.close()
|
||||
logger.info("Browser closed successfully", tag="TEST")
|
||||
|
||||
return is_same_page and data1 == "test_value" and data2 == "test_value2"
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
return False
|
||||
|
||||
async def run_tests():
|
||||
"""Run all tests sequentially."""
|
||||
results = []
|
||||
|
||||
# results.append(await test_cdp_launch_connect())
|
||||
# results.append(await test_cdp_with_user_data_dir())
|
||||
results.append(await test_cdp_session_management())
|
||||
|
||||
# Print summary
|
||||
total = len(results)
|
||||
passed = sum(results)
|
||||
logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
|
||||
|
||||
if passed == total:
|
||||
logger.success("All tests passed!", tag="SUMMARY")
|
||||
else:
|
||||
logger.error(f"{total - passed} tests failed", tag="SUMMARY")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(run_tests())
|
||||
77
tests/browser/test_combined.py
Normal file
77
tests/browser/test_combined.py
Normal file
@@ -0,0 +1,77 @@
|
||||
"""Combined test runner for all browser module tests.
|
||||
|
||||
This script runs all the browser module tests in sequence and
|
||||
provides a comprehensive summary.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
# Add the project root to Python path if running directly
|
||||
if __name__ == "__main__":
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
# Create a logger for clear terminal output
|
||||
logger = AsyncLogger(verbose=True, log_file=None)
|
||||
|
||||
async def run_test_module(module_name, header):
|
||||
"""Run all tests in a module and return results."""
|
||||
logger.info(f"\n{'-'*30}", tag="TEST")
|
||||
logger.info(f"RUNNING: {header}", tag="TEST")
|
||||
logger.info(f"{'-'*30}", tag="TEST")
|
||||
|
||||
# Import the module dynamically
|
||||
module = __import__(f"tests.browser.{module_name}", fromlist=["run_tests"])
|
||||
|
||||
# Track time for performance measurement
|
||||
start_time = time.time()
|
||||
|
||||
# Run the tests
|
||||
await module.run_tests()
|
||||
|
||||
# Calculate time taken
|
||||
time_taken = time.time() - start_time
|
||||
logger.info(f"Time taken: {time_taken:.2f} seconds", tag="TIMING")
|
||||
|
||||
return time_taken
|
||||
|
||||
async def main():
|
||||
"""Run all test modules."""
|
||||
logger.info("STARTING COMPREHENSIVE BROWSER MODULE TESTS", tag="MAIN")
|
||||
|
||||
# List of test modules to run
|
||||
test_modules = [
|
||||
("test_browser_manager", "Browser Manager Tests"),
|
||||
("test_playwright_strategy", "Playwright Strategy Tests"),
|
||||
("test_cdp_strategy", "CDP Strategy Tests"),
|
||||
("test_builtin_strategy", "Builtin Browser Strategy Tests"),
|
||||
("test_profiles", "Profile Management Tests")
|
||||
]
|
||||
|
||||
# Run each test module
|
||||
timings = {}
|
||||
for module_name, header in test_modules:
|
||||
try:
|
||||
time_taken = await run_test_module(module_name, header)
|
||||
timings[module_name] = time_taken
|
||||
except Exception as e:
|
||||
logger.error(f"Error running {module_name}: {str(e)}", tag="ERROR")
|
||||
|
||||
# Print summary
|
||||
logger.info("\n\nTEST SUMMARY:", tag="SUMMARY")
|
||||
logger.info(f"{'-'*50}", tag="SUMMARY")
|
||||
for module_name, header in test_modules:
|
||||
if module_name in timings:
|
||||
logger.info(f"{header}: {timings[module_name]:.2f} seconds", tag="SUMMARY")
|
||||
else:
|
||||
logger.error(f"{header}: FAILED TO RUN", tag="SUMMARY")
|
||||
logger.info(f"{'-'*50}", tag="SUMMARY")
|
||||
total_time = sum(timings.values())
|
||||
logger.info(f"Total time: {total_time:.2f} seconds", tag="SUMMARY")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
902
tests/browser/test_parallel_crawling.py
Normal file
902
tests/browser/test_parallel_crawling.py
Normal file
@@ -0,0 +1,902 @@
|
||||
"""
|
||||
Test examples for parallel crawling with the browser module.
|
||||
|
||||
These examples demonstrate the functionality of parallel page creation
|
||||
and serve as functional tests for multi-page crawling performance.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from typing import List
|
||||
|
||||
# Add the project root to Python path if running directly
|
||||
if __name__ == "__main__":
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
from crawl4ai.browser import BrowserManager
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
# Create a logger for clear terminal output
|
||||
logger = AsyncLogger(verbose=True, log_file=None)
|
||||
|
||||
async def test_get_pages_basic():
|
||||
"""Test basic functionality of get_pages method."""
|
||||
logger.info("Testing basic get_pages functionality", tag="TEST")
|
||||
|
||||
browser_config = BrowserConfig(headless=True)
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
try:
|
||||
await manager.start()
|
||||
|
||||
# Request 3 pages
|
||||
crawler_config = CrawlerRunConfig()
|
||||
pages = await manager.get_pages(crawler_config, count=3)
|
||||
|
||||
# Verify we got the correct number of pages
|
||||
assert len(pages) == 3, f"Expected 3 pages, got {len(pages)}"
|
||||
|
||||
# Verify each page is valid
|
||||
for i, (page, context) in enumerate(pages):
|
||||
await page.goto("https://example.com")
|
||||
title = await page.title()
|
||||
logger.info(f"Page {i+1} title: {title}", tag="TEST")
|
||||
assert title, f"Page {i+1} has no title"
|
||||
|
||||
await manager.close()
|
||||
logger.success("Basic get_pages test completed successfully", tag="TEST")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
return False
|
||||
|
||||
async def test_parallel_approaches_comparison():
|
||||
"""Compare two parallel crawling approaches:
|
||||
1. Create a page for each URL on-demand (get_page + gather)
|
||||
2. Get all pages upfront with get_pages, then use them (get_pages + gather)
|
||||
"""
|
||||
logger.info("Comparing different parallel crawling approaches", tag="TEST")
|
||||
|
||||
urls = [
|
||||
"https://example.com/page1",
|
||||
"https://crawl4ai.com",
|
||||
"https://kidocode.com",
|
||||
"https://bbc.com",
|
||||
# "https://example.com/page1",
|
||||
# "https://example.com/page2",
|
||||
# "https://example.com/page3",
|
||||
# "https://example.com/page4",
|
||||
]
|
||||
|
||||
browser_config = BrowserConfig(headless=False)
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
try:
|
||||
await manager.start()
|
||||
|
||||
# Approach 1: Create a page for each URL on-demand and run in parallel
|
||||
logger.info("Testing approach 1: get_page for each URL + gather", tag="TEST")
|
||||
start_time = time.time()
|
||||
|
||||
async def fetch_title_approach1(url):
|
||||
"""Create a new page for each URL, go to the URL, and get title"""
|
||||
crawler_config = CrawlerRunConfig(url=url)
|
||||
page, context = await manager.get_page(crawler_config)
|
||||
try:
|
||||
await page.goto(url)
|
||||
title = await page.title()
|
||||
return title
|
||||
finally:
|
||||
await page.close()
|
||||
|
||||
# Run fetch_title_approach1 for each URL in parallel
|
||||
tasks = [fetch_title_approach1(url) for url in urls]
|
||||
approach1_results = await asyncio.gather(*tasks)
|
||||
|
||||
approach1_time = time.time() - start_time
|
||||
logger.info(f"Approach 1 time (get_page + gather): {approach1_time:.2f}s", tag="TEST")
|
||||
|
||||
# Approach 2: Get all pages upfront with get_pages, then use them in parallel
|
||||
logger.info("Testing approach 2: get_pages upfront + gather", tag="TEST")
|
||||
start_time = time.time()
|
||||
|
||||
# Get all pages upfront
|
||||
crawler_config = CrawlerRunConfig()
|
||||
pages = await manager.get_pages(crawler_config, count=len(urls))
|
||||
|
||||
async def fetch_title_approach2(page_ctx, url):
|
||||
"""Use a pre-created page to go to URL and get title"""
|
||||
page, _ = page_ctx
|
||||
try:
|
||||
await page.goto(url)
|
||||
title = await page.title()
|
||||
return title
|
||||
finally:
|
||||
await page.close()
|
||||
|
||||
# Use the pre-created pages to fetch titles in parallel
|
||||
tasks = [fetch_title_approach2(page_ctx, url) for page_ctx, url in zip(pages, urls)]
|
||||
approach2_results = await asyncio.gather(*tasks)
|
||||
|
||||
approach2_time = time.time() - start_time
|
||||
logger.info(f"Approach 2 time (get_pages + gather): {approach2_time:.2f}s", tag="TEST")
|
||||
|
||||
# Compare results and performance
|
||||
speedup = approach1_time / approach2_time if approach2_time > 0 else 0
|
||||
if speedup > 1:
|
||||
logger.success(f"Approach 2 (get_pages upfront) was {speedup:.2f}x faster", tag="TEST")
|
||||
else:
|
||||
logger.info(f"Approach 1 (get_page + gather) was {1/speedup:.2f}x faster", tag="TEST")
|
||||
|
||||
# Verify same content was retrieved in both approaches
|
||||
assert len(approach1_results) == len(approach2_results), "Result count mismatch"
|
||||
|
||||
# Sort results for comparison since parallel execution might complete in different order
|
||||
assert sorted(approach1_results) == sorted(approach2_results), "Results content mismatch"
|
||||
|
||||
await manager.close()
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
return False
|
||||
|
||||
async def test_multi_browser_scaling(num_browsers=3, pages_per_browser=5):
|
||||
"""Test performance with multiple browsers and pages per browser.
|
||||
Compares two approaches:
|
||||
1. On-demand page creation (get_page + gather)
|
||||
2. Pre-created pages (get_pages + gather)
|
||||
"""
|
||||
logger.info(f"Testing multi-browser scaling with {num_browsers} browsers × {pages_per_browser} pages", tag="TEST")
|
||||
|
||||
# Generate test URLs
|
||||
total_pages = num_browsers * pages_per_browser
|
||||
urls = [f"https://example.com/page_{i}" for i in range(total_pages)]
|
||||
|
||||
# Create browser managers
|
||||
managers = []
|
||||
base_port = 9222
|
||||
|
||||
try:
|
||||
# Start all browsers in parallel
|
||||
start_tasks = []
|
||||
for i in range(num_browsers):
|
||||
browser_config = BrowserConfig(
|
||||
headless=True # Using default browser mode like in test_parallel_approaches_comparison
|
||||
)
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
start_tasks.append(manager.start())
|
||||
managers.append(manager)
|
||||
|
||||
await asyncio.gather(*start_tasks)
|
||||
|
||||
# Distribute URLs among managers
|
||||
urls_per_manager = {}
|
||||
for i, manager in enumerate(managers):
|
||||
start_idx = i * pages_per_browser
|
||||
end_idx = min(start_idx + pages_per_browser, len(urls))
|
||||
urls_per_manager[manager] = urls[start_idx:end_idx]
|
||||
|
||||
# Approach 1: Create a page for each URL on-demand and run in parallel
|
||||
logger.info("Testing approach 1: get_page for each URL + gather", tag="TEST")
|
||||
start_time = time.time()
|
||||
|
||||
async def fetch_title_approach1(manager, url):
|
||||
"""Create a new page for the URL, go to the URL, and get title"""
|
||||
crawler_config = CrawlerRunConfig(url=url)
|
||||
page, context = await manager.get_page(crawler_config)
|
||||
try:
|
||||
await page.goto(url)
|
||||
title = await page.title()
|
||||
return title
|
||||
finally:
|
||||
await page.close()
|
||||
|
||||
# Run fetch_title_approach1 for each URL in parallel
|
||||
tasks = []
|
||||
for manager, manager_urls in urls_per_manager.items():
|
||||
for url in manager_urls:
|
||||
tasks.append(fetch_title_approach1(manager, url))
|
||||
|
||||
approach1_results = await asyncio.gather(*tasks)
|
||||
|
||||
approach1_time = time.time() - start_time
|
||||
logger.info(f"Approach 1 time (get_page + gather): {approach1_time:.2f}s", tag="TEST")
|
||||
|
||||
# Approach 2: Get all pages upfront with get_pages, then use them in parallel
|
||||
logger.info("Testing approach 2: get_pages upfront + gather", tag="TEST")
|
||||
start_time = time.time()
|
||||
|
||||
# Get all pages upfront for each manager
|
||||
all_pages = []
|
||||
for manager, manager_urls in urls_per_manager.items():
|
||||
crawler_config = CrawlerRunConfig()
|
||||
pages = await manager.get_pages(crawler_config, count=len(manager_urls))
|
||||
all_pages.extend(zip(pages, manager_urls))
|
||||
|
||||
async def fetch_title_approach2(page_ctx, url):
|
||||
"""Use a pre-created page to go to URL and get title"""
|
||||
page, _ = page_ctx
|
||||
try:
|
||||
await page.goto(url)
|
||||
title = await page.title()
|
||||
return title
|
||||
finally:
|
||||
await page.close()
|
||||
|
||||
# Use the pre-created pages to fetch titles in parallel
|
||||
tasks = [fetch_title_approach2(page_ctx, url) for page_ctx, url in all_pages]
|
||||
approach2_results = await asyncio.gather(*tasks)
|
||||
|
||||
approach2_time = time.time() - start_time
|
||||
logger.info(f"Approach 2 time (get_pages + gather): {approach2_time:.2f}s", tag="TEST")
|
||||
|
||||
# Compare results and performance
|
||||
speedup = approach1_time / approach2_time if approach2_time > 0 else 0
|
||||
pages_per_second = total_pages / approach2_time
|
||||
|
||||
# Show a simple summary
|
||||
logger.info(f"📊 Summary: {num_browsers} browsers × {pages_per_browser} pages = {total_pages} total crawls", tag="TEST")
|
||||
logger.info(f"⚡ Performance: {pages_per_second:.1f} pages/second ({pages_per_second*60:.0f} pages/minute)", tag="TEST")
|
||||
logger.info(f"🚀 Total crawl time: {approach2_time:.2f} seconds", tag="TEST")
|
||||
|
||||
if speedup > 1:
|
||||
logger.success(f"✅ Approach 2 (get_pages upfront) was {speedup:.2f}x faster", tag="TEST")
|
||||
else:
|
||||
logger.info(f"✅ Approach 1 (get_page + gather) was {1/speedup:.2f}x faster", tag="TEST")
|
||||
|
||||
# Close all managers
|
||||
for manager in managers:
|
||||
await manager.close()
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
# Clean up
|
||||
for manager in managers:
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
return False
|
||||
|
||||
async def grid_search_optimal_configuration(total_urls=50):
|
||||
"""Perform a grid search to find the optimal balance between number of browsers and pages per browser.
|
||||
|
||||
This function tests different combinations of browser count and pages per browser,
|
||||
while keeping the total number of URLs constant. It measures performance metrics
|
||||
for each configuration to find the "sweet spot" that provides the best speed
|
||||
with reasonable memory usage.
|
||||
|
||||
Args:
|
||||
total_urls: Total number of URLs to crawl (default: 50)
|
||||
"""
|
||||
logger.info(f"=== GRID SEARCH FOR OPTIMAL CRAWLING CONFIGURATION ({total_urls} URLs) ===", tag="TEST")
|
||||
|
||||
# Generate test URLs once
|
||||
urls = [f"https://example.com/page_{i}" for i in range(total_urls)]
|
||||
|
||||
# Define grid search configurations
|
||||
# We'll use more flexible approach: test all browser counts from 1 to min(20, total_urls)
|
||||
# and distribute pages evenly (some browsers may have 1 more page than others)
|
||||
configurations = []
|
||||
|
||||
# Maximum number of browsers to test
|
||||
max_browsers_to_test = min(20, total_urls)
|
||||
|
||||
# Try configurations with 1 to max_browsers_to_test browsers
|
||||
for num_browsers in range(1, max_browsers_to_test + 1):
|
||||
base_pages_per_browser = total_urls // num_browsers
|
||||
remainder = total_urls % num_browsers
|
||||
|
||||
# Generate exact page distribution array
|
||||
if remainder > 0:
|
||||
# First 'remainder' browsers get one more page
|
||||
page_distribution = [base_pages_per_browser + 1] * remainder + [base_pages_per_browser] * (num_browsers - remainder)
|
||||
pages_distribution = f"{base_pages_per_browser+1} pages × {remainder} browsers, {base_pages_per_browser} pages × {num_browsers - remainder} browsers"
|
||||
else:
|
||||
# All browsers get the same number of pages
|
||||
page_distribution = [base_pages_per_browser] * num_browsers
|
||||
pages_distribution = f"{base_pages_per_browser} pages × {num_browsers} browsers"
|
||||
|
||||
# Format the distribution as a tuple string like (4, 4, 3, 3)
|
||||
distribution_str = str(tuple(page_distribution))
|
||||
|
||||
configurations.append((num_browsers, base_pages_per_browser, pages_distribution, page_distribution, distribution_str))
|
||||
|
||||
# Track results
|
||||
results = []
|
||||
|
||||
# Test each configuration
|
||||
for num_browsers, pages_per_browser, pages_distribution, page_distribution, distribution_str in configurations:
|
||||
logger.info("-" * 80, tag="TEST")
|
||||
logger.info(f"Testing configuration: {num_browsers} browsers with distribution: {distribution_str}", tag="TEST")
|
||||
logger.info(f"Details: {pages_distribution}", tag="TEST")
|
||||
# Sleep a bit for randomness
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
try:
|
||||
# Import psutil for memory tracking
|
||||
try:
|
||||
import psutil
|
||||
process = psutil.Process()
|
||||
initial_memory = process.memory_info().rss / (1024 * 1024) # MB
|
||||
except ImportError:
|
||||
logger.warning("psutil not available, memory metrics will not be tracked", tag="TEST")
|
||||
initial_memory = 0
|
||||
|
||||
# Create and start browser managers
|
||||
managers = []
|
||||
start_time = time.time()
|
||||
|
||||
# Start all browsers in parallel
|
||||
start_tasks = []
|
||||
for i in range(num_browsers):
|
||||
browser_config = BrowserConfig(
|
||||
headless=True
|
||||
)
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
start_tasks.append(manager.start())
|
||||
managers.append(manager)
|
||||
|
||||
await asyncio.gather(*start_tasks)
|
||||
browser_startup_time = time.time() - start_time
|
||||
|
||||
# Measure memory after browser startup
|
||||
if initial_memory > 0:
|
||||
browser_memory = process.memory_info().rss / (1024 * 1024) - initial_memory
|
||||
else:
|
||||
browser_memory = 0
|
||||
|
||||
# Distribute URLs among managers using the exact page distribution
|
||||
urls_per_manager = {}
|
||||
total_assigned = 0
|
||||
|
||||
for i, manager in enumerate(managers):
|
||||
if i < len(page_distribution):
|
||||
# Get the exact number of pages for this browser from our distribution
|
||||
manager_pages = page_distribution[i]
|
||||
|
||||
# Get the URL slice for this manager
|
||||
start_idx = total_assigned
|
||||
end_idx = start_idx + manager_pages
|
||||
urls_per_manager[manager] = urls[start_idx:end_idx]
|
||||
total_assigned += manager_pages
|
||||
else:
|
||||
# If we have more managers than our distribution (should never happen)
|
||||
urls_per_manager[manager] = []
|
||||
|
||||
# Use the more efficient approach (pre-created pages)
|
||||
logger.info("Running page crawling test...", tag="TEST")
|
||||
crawl_start_time = time.time()
|
||||
|
||||
# Get all pages upfront for each manager
|
||||
all_pages = []
|
||||
for manager, manager_urls in urls_per_manager.items():
|
||||
if not manager_urls: # Skip managers with no URLs
|
||||
continue
|
||||
crawler_config = CrawlerRunConfig()
|
||||
pages = await manager.get_pages(crawler_config, count=len(manager_urls))
|
||||
all_pages.extend(zip(pages, manager_urls))
|
||||
|
||||
# Measure memory after page creation
|
||||
if initial_memory > 0:
|
||||
pages_memory = process.memory_info().rss / (1024 * 1024) - browser_memory - initial_memory
|
||||
else:
|
||||
pages_memory = 0
|
||||
|
||||
# Function to crawl a URL with a pre-created page
|
||||
async def fetch_title(page_ctx, url):
|
||||
page, _ = page_ctx
|
||||
try:
|
||||
await page.goto(url)
|
||||
title = await page.title()
|
||||
return title
|
||||
finally:
|
||||
await page.close()
|
||||
|
||||
# Use the pre-created pages to fetch titles in parallel
|
||||
tasks = [fetch_title(page_ctx, url) for page_ctx, url in all_pages]
|
||||
crawl_results = await asyncio.gather(*tasks)
|
||||
|
||||
crawl_time = time.time() - crawl_start_time
|
||||
total_time = time.time() - start_time
|
||||
|
||||
# Final memory measurement
|
||||
if initial_memory > 0:
|
||||
peak_memory = max(browser_memory + pages_memory, process.memory_info().rss / (1024 * 1024) - initial_memory)
|
||||
else:
|
||||
peak_memory = 0
|
||||
|
||||
# Close all managers
|
||||
for manager in managers:
|
||||
await manager.close()
|
||||
|
||||
# Calculate metrics
|
||||
pages_per_second = total_urls / crawl_time
|
||||
|
||||
# Store result metrics
|
||||
result = {
|
||||
"num_browsers": num_browsers,
|
||||
"pages_per_browser": pages_per_browser,
|
||||
"page_distribution": page_distribution,
|
||||
"distribution_str": distribution_str,
|
||||
"total_urls": total_urls,
|
||||
"browser_startup_time": browser_startup_time,
|
||||
"crawl_time": crawl_time,
|
||||
"total_time": total_time,
|
||||
"browser_memory": browser_memory,
|
||||
"pages_memory": pages_memory,
|
||||
"peak_memory": peak_memory,
|
||||
"pages_per_second": pages_per_second,
|
||||
# Calculate efficiency score (higher is better)
|
||||
# This balances speed vs memory usage
|
||||
"efficiency_score": pages_per_second / (peak_memory + 1) if peak_memory > 0 else pages_per_second,
|
||||
}
|
||||
|
||||
results.append(result)
|
||||
|
||||
# Log the results
|
||||
logger.info(f"Browser startup: {browser_startup_time:.2f}s", tag="TEST")
|
||||
logger.info(f"Crawl time: {crawl_time:.2f}s", tag="TEST")
|
||||
logger.info(f"Total time: {total_time:.2f}s", tag="TEST")
|
||||
logger.info(f"Performance: {pages_per_second:.1f} pages/second", tag="TEST")
|
||||
|
||||
if peak_memory > 0:
|
||||
logger.info(f"Browser memory: {browser_memory:.1f}MB", tag="TEST")
|
||||
logger.info(f"Pages memory: {pages_memory:.1f}MB", tag="TEST")
|
||||
logger.info(f"Peak memory: {peak_memory:.1f}MB", tag="TEST")
|
||||
logger.info(f"Efficiency score: {result['efficiency_score']:.6f}", tag="TEST")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error testing configuration: {str(e)}", tag="TEST")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
# Clean up
|
||||
for manager in managers:
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
|
||||
# Print summary of all configurations
|
||||
logger.info("=" * 100, tag="TEST")
|
||||
logger.info("GRID SEARCH RESULTS SUMMARY", tag="TEST")
|
||||
logger.info("=" * 100, tag="TEST")
|
||||
|
||||
# Rank configurations by efficiency score
|
||||
ranked_results = sorted(results, key=lambda x: x["efficiency_score"], reverse=True)
|
||||
|
||||
# Also determine rankings by different metrics
|
||||
fastest = sorted(results, key=lambda x: x["crawl_time"])[0]
|
||||
lowest_memory = sorted(results, key=lambda x: x["peak_memory"] if x["peak_memory"] > 0 else float('inf'))[0]
|
||||
most_efficient = ranked_results[0]
|
||||
|
||||
# Print top performers by category
|
||||
logger.info("🏆 TOP PERFORMERS BY CATEGORY:", tag="TEST")
|
||||
logger.info(f"⚡ Fastest: {fastest['num_browsers']} browsers × ~{fastest['pages_per_browser']} pages " +
|
||||
f"({fastest['crawl_time']:.2f}s, {fastest['pages_per_second']:.1f} pages/s)", tag="TEST")
|
||||
|
||||
if lowest_memory["peak_memory"] > 0:
|
||||
logger.info(f"💾 Lowest memory: {lowest_memory['num_browsers']} browsers × ~{lowest_memory['pages_per_browser']} pages " +
|
||||
f"({lowest_memory['peak_memory']:.1f}MB)", tag="TEST")
|
||||
|
||||
logger.info(f"🌟 Most efficient: {most_efficient['num_browsers']} browsers × ~{most_efficient['pages_per_browser']} pages " +
|
||||
f"(score: {most_efficient['efficiency_score']:.6f})", tag="TEST")
|
||||
|
||||
# Print result table header
|
||||
logger.info("\n📊 COMPLETE RANKING TABLE (SORTED BY EFFICIENCY SCORE):", tag="TEST")
|
||||
logger.info("-" * 120, tag="TEST")
|
||||
|
||||
# Define table header
|
||||
header = f"{'Rank':<5} | {'Browsers':<8} | {'Distribution':<55} | {'Total Time(s)':<12} | {'Speed(p/s)':<12} | {'Memory(MB)':<12} | {'Efficiency':<10} | {'Notes'}"
|
||||
logger.info(header, tag="TEST")
|
||||
logger.info("-" * 120, tag="TEST")
|
||||
|
||||
# Print each configuration in ranked order
|
||||
for rank, result in enumerate(ranked_results, 1):
|
||||
# Add special notes for top performers
|
||||
notes = []
|
||||
if result == fastest:
|
||||
notes.append("⚡ Fastest")
|
||||
if result == lowest_memory:
|
||||
notes.append("💾 Lowest Memory")
|
||||
if result == most_efficient:
|
||||
notes.append("🌟 Most Efficient")
|
||||
|
||||
notes_str = " | ".join(notes) if notes else ""
|
||||
|
||||
# Format memory if available
|
||||
memory_str = f"{result['peak_memory']:.1f}" if result['peak_memory'] > 0 else "N/A"
|
||||
|
||||
# Get the distribution string
|
||||
dist_str = result.get('distribution_str', str(tuple([result['pages_per_browser']] * result['num_browsers'])))
|
||||
|
||||
# Build the row
|
||||
row = f"{rank:<5} | {result['num_browsers']:<8} | {dist_str:<55} | {result['total_time']:.2f}s{' ':<7} | "
|
||||
row += f"{result['pages_per_second']:.2f}{' ':<6} | {memory_str}{' ':<6} | {result['efficiency_score']:.4f}{' ':<4} | {notes_str}"
|
||||
|
||||
logger.info(row, tag="TEST")
|
||||
|
||||
logger.info("-" * 120, tag="TEST")
|
||||
|
||||
# Generate visualization if matplotlib is available
|
||||
try:
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
|
||||
# Extract data for plotting from ranked results
|
||||
browser_counts = [r["num_browsers"] for r in ranked_results]
|
||||
efficiency_scores = [r["efficiency_score"] for r in ranked_results]
|
||||
crawl_times = [r["crawl_time"] for r in ranked_results]
|
||||
total_times = [r["total_time"] for r in ranked_results]
|
||||
|
||||
# Filter results with memory data
|
||||
memory_results = [r for r in ranked_results if r["peak_memory"] > 0]
|
||||
memory_browser_counts = [r["num_browsers"] for r in memory_results]
|
||||
peak_memories = [r["peak_memory"] for r in memory_results]
|
||||
|
||||
# Create figure with clean design
|
||||
plt.figure(figsize=(14, 12), facecolor='white')
|
||||
plt.style.use('ggplot')
|
||||
|
||||
# Create grid for subplots
|
||||
gs = plt.GridSpec(3, 1, height_ratios=[1, 1, 1], hspace=0.3)
|
||||
|
||||
# Plot 1: Efficiency Score (higher is better)
|
||||
ax1 = plt.subplot(gs[0])
|
||||
bar_colors = ['#3498db'] * len(browser_counts)
|
||||
|
||||
# Highlight the most efficient
|
||||
most_efficient_idx = browser_counts.index(most_efficient["num_browsers"])
|
||||
bar_colors[most_efficient_idx] = '#e74c3c' # Red for most efficient
|
||||
|
||||
bars = ax1.bar(range(len(browser_counts)), efficiency_scores, color=bar_colors)
|
||||
ax1.set_xticks(range(len(browser_counts)))
|
||||
ax1.set_xticklabels([f"{bc}" for bc in browser_counts], rotation=45)
|
||||
ax1.set_xlabel('Number of Browsers')
|
||||
ax1.set_ylabel('Efficiency Score (higher is better)')
|
||||
ax1.set_title('Browser Configuration Efficiency (higher is better)')
|
||||
|
||||
# Add value labels on top of bars
|
||||
for bar, score in zip(bars, efficiency_scores):
|
||||
height = bar.get_height()
|
||||
ax1.text(bar.get_x() + bar.get_width()/2., height + 0.02*max(efficiency_scores),
|
||||
f'{score:.3f}', ha='center', va='bottom', rotation=90, fontsize=8)
|
||||
|
||||
# Highlight best configuration
|
||||
ax1.text(0.02, 0.90, f"🌟 Most Efficient: {most_efficient['num_browsers']} browsers with ~{most_efficient['pages_per_browser']} pages",
|
||||
transform=ax1.transAxes, fontsize=12, verticalalignment='top',
|
||||
bbox=dict(boxstyle='round,pad=0.5', facecolor='yellow', alpha=0.3))
|
||||
|
||||
# Plot 2: Time Performance
|
||||
ax2 = plt.subplot(gs[1])
|
||||
|
||||
# Plot both total time and crawl time
|
||||
ax2.plot(browser_counts, crawl_times, 'bo-', label='Crawl Time (s)', linewidth=2)
|
||||
ax2.plot(browser_counts, total_times, 'go--', label='Total Time (s)', linewidth=2, alpha=0.6)
|
||||
|
||||
# Mark the fastest configuration
|
||||
fastest_idx = browser_counts.index(fastest["num_browsers"])
|
||||
ax2.plot(browser_counts[fastest_idx], crawl_times[fastest_idx], 'ro', ms=10,
|
||||
label=f'Fastest: {fastest["num_browsers"]} browsers')
|
||||
|
||||
ax2.set_xlabel('Number of Browsers')
|
||||
ax2.set_ylabel('Time (seconds)')
|
||||
ax2.set_title(f'Time Performance for {total_urls} URLs by Browser Count')
|
||||
ax2.grid(True, linestyle='--', alpha=0.7)
|
||||
ax2.legend(loc='upper right')
|
||||
|
||||
# Plot pages per second on second y-axis
|
||||
pages_per_second = [total_urls/t for t in crawl_times]
|
||||
ax2_twin = ax2.twinx()
|
||||
ax2_twin.plot(browser_counts, pages_per_second, 'r^--', label='Pages/second', alpha=0.5)
|
||||
ax2_twin.set_ylabel('Pages per second')
|
||||
|
||||
# Add note about the fastest configuration
|
||||
ax2.text(0.02, 0.90, f"⚡ Fastest: {fastest['num_browsers']} browsers with ~{fastest['pages_per_browser']} pages" +
|
||||
f"\n {fastest['crawl_time']:.2f}s ({fastest['pages_per_second']:.1f} pages/s)",
|
||||
transform=ax2.transAxes, fontsize=12, verticalalignment='top',
|
||||
bbox=dict(boxstyle='round,pad=0.5', facecolor='lightblue', alpha=0.3))
|
||||
|
||||
# Plot 3: Memory Usage (if available)
|
||||
if memory_results:
|
||||
ax3 = plt.subplot(gs[2])
|
||||
|
||||
# Prepare data for grouped bar chart
|
||||
memory_per_browser = [m/n for m, n in zip(peak_memories, memory_browser_counts)]
|
||||
memory_per_page = [m/(n*p) for m, n, p in zip(
|
||||
[r["peak_memory"] for r in memory_results],
|
||||
[r["num_browsers"] for r in memory_results],
|
||||
[r["pages_per_browser"] for r in memory_results])]
|
||||
|
||||
x = np.arange(len(memory_browser_counts))
|
||||
width = 0.35
|
||||
|
||||
# Create grouped bars
|
||||
ax3.bar(x - width/2, peak_memories, width, label='Total Memory (MB)', color='#9b59b6')
|
||||
ax3.bar(x + width/2, memory_per_browser, width, label='Memory per Browser (MB)', color='#3498db')
|
||||
|
||||
# Configure axis
|
||||
ax3.set_xticks(x)
|
||||
ax3.set_xticklabels([f"{bc}" for bc in memory_browser_counts], rotation=45)
|
||||
ax3.set_xlabel('Number of Browsers')
|
||||
ax3.set_ylabel('Memory (MB)')
|
||||
ax3.set_title('Memory Usage by Browser Configuration')
|
||||
ax3.legend(loc='upper left')
|
||||
ax3.grid(True, linestyle='--', alpha=0.7)
|
||||
|
||||
# Add second y-axis for memory per page
|
||||
ax3_twin = ax3.twinx()
|
||||
ax3_twin.plot(x, memory_per_page, 'ro-', label='Memory per Page (MB)')
|
||||
ax3_twin.set_ylabel('Memory per Page (MB)')
|
||||
|
||||
# Get lowest memory configuration
|
||||
lowest_memory_idx = memory_browser_counts.index(lowest_memory["num_browsers"])
|
||||
|
||||
# Add note about lowest memory configuration
|
||||
ax3.text(0.02, 0.90, f"💾 Lowest Memory: {lowest_memory['num_browsers']} browsers with ~{lowest_memory['pages_per_browser']} pages" +
|
||||
f"\n {lowest_memory['peak_memory']:.1f}MB ({lowest_memory['peak_memory']/total_urls:.2f}MB per page)",
|
||||
transform=ax3.transAxes, fontsize=12, verticalalignment='top',
|
||||
bbox=dict(boxstyle='round,pad=0.5', facecolor='lightgreen', alpha=0.3))
|
||||
|
||||
# Add overall title
|
||||
plt.suptitle(f'Browser Scaling Grid Search Results for {total_urls} URLs', fontsize=16, y=0.98)
|
||||
|
||||
# Add timestamp and info at the bottom
|
||||
plt.figtext(0.5, 0.01, f"Generated by Crawl4AI at {time.strftime('%Y-%m-%d %H:%M:%S')}",
|
||||
ha="center", fontsize=10, style='italic')
|
||||
|
||||
# Get current directory and save the figure there
|
||||
import os
|
||||
__current_file = os.path.abspath(__file__)
|
||||
current_dir = os.path.dirname(__current_file)
|
||||
output_file = os.path.join(current_dir, 'browser_scaling_grid_search.png')
|
||||
|
||||
# Adjust layout and save figure with high DPI
|
||||
plt.tight_layout(rect=[0, 0.03, 1, 0.97])
|
||||
plt.savefig(output_file, dpi=200, bbox_inches='tight')
|
||||
logger.success(f"Visualization saved to {output_file}", tag="TEST")
|
||||
|
||||
except ImportError:
|
||||
logger.warning("matplotlib not available, skipping visualization", tag="TEST")
|
||||
|
||||
return most_efficient["num_browsers"], most_efficient["pages_per_browser"]
|
||||
|
||||
async def find_optimal_browser_config(total_urls=50, verbose=True, rate_limit_delay=0.2):
|
||||
"""Find optimal browser configuration for crawling a specific number of URLs.
|
||||
|
||||
Args:
|
||||
total_urls: Number of URLs to crawl
|
||||
verbose: Whether to print progress
|
||||
rate_limit_delay: Delay between page loads to avoid rate limiting
|
||||
|
||||
Returns:
|
||||
dict: Contains fastest, lowest_memory, and optimal configurations
|
||||
"""
|
||||
if verbose:
|
||||
print(f"\n=== Finding optimal configuration for crawling {total_urls} URLs ===\n")
|
||||
|
||||
# Generate test URLs with timestamp to avoid caching
|
||||
timestamp = int(time.time())
|
||||
urls = [f"https://example.com/page_{i}?t={timestamp}" for i in range(total_urls)]
|
||||
|
||||
# Limit browser configurations to test (1 browser to max 10)
|
||||
max_browsers = min(10, total_urls)
|
||||
configs_to_test = []
|
||||
|
||||
# Generate configurations (browser count, pages distribution)
|
||||
for num_browsers in range(1, max_browsers + 1):
|
||||
base_pages = total_urls // num_browsers
|
||||
remainder = total_urls % num_browsers
|
||||
|
||||
# Create distribution array like [3, 3, 2, 2] (some browsers get one more page)
|
||||
if remainder > 0:
|
||||
distribution = [base_pages + 1] * remainder + [base_pages] * (num_browsers - remainder)
|
||||
else:
|
||||
distribution = [base_pages] * num_browsers
|
||||
|
||||
configs_to_test.append((num_browsers, distribution))
|
||||
|
||||
results = []
|
||||
|
||||
# Test each configuration
|
||||
for browser_count, page_distribution in configs_to_test:
|
||||
if verbose:
|
||||
print(f"Testing {browser_count} browsers with distribution {tuple(page_distribution)}")
|
||||
|
||||
try:
|
||||
# Track memory if possible
|
||||
try:
|
||||
import psutil
|
||||
process = psutil.Process()
|
||||
start_memory = process.memory_info().rss / (1024 * 1024) # MB
|
||||
except ImportError:
|
||||
if verbose:
|
||||
print("Memory tracking not available (psutil not installed)")
|
||||
start_memory = 0
|
||||
|
||||
# Start browsers in parallel
|
||||
managers = []
|
||||
start_tasks = []
|
||||
start_time = time.time()
|
||||
|
||||
for i in range(browser_count):
|
||||
config = BrowserConfig(headless=True)
|
||||
manager = BrowserManager(browser_config=config, logger=logger)
|
||||
start_tasks.append(manager.start())
|
||||
managers.append(manager)
|
||||
|
||||
await asyncio.gather(*start_tasks)
|
||||
|
||||
# Distribute URLs among browsers
|
||||
urls_per_manager = {}
|
||||
url_index = 0
|
||||
|
||||
for i, manager in enumerate(managers):
|
||||
pages_for_this_browser = page_distribution[i]
|
||||
end_index = url_index + pages_for_this_browser
|
||||
urls_per_manager[manager] = urls[url_index:end_index]
|
||||
url_index = end_index
|
||||
|
||||
# Create pages for each browser
|
||||
all_pages = []
|
||||
for manager, manager_urls in urls_per_manager.items():
|
||||
if not manager_urls:
|
||||
continue
|
||||
pages = await manager.get_pages(CrawlerRunConfig(), count=len(manager_urls))
|
||||
all_pages.extend(zip(pages, manager_urls))
|
||||
|
||||
# Crawl pages with delay to avoid rate limiting
|
||||
async def crawl_page(page_ctx, url):
|
||||
page, _ = page_ctx
|
||||
try:
|
||||
await page.goto(url)
|
||||
if rate_limit_delay > 0:
|
||||
await asyncio.sleep(rate_limit_delay)
|
||||
title = await page.title()
|
||||
return title
|
||||
finally:
|
||||
await page.close()
|
||||
|
||||
crawl_start = time.time()
|
||||
crawl_tasks = [crawl_page(page_ctx, url) for page_ctx, url in all_pages]
|
||||
await asyncio.gather(*crawl_tasks)
|
||||
crawl_time = time.time() - crawl_start
|
||||
total_time = time.time() - start_time
|
||||
|
||||
# Measure final memory usage
|
||||
if start_memory > 0:
|
||||
end_memory = process.memory_info().rss / (1024 * 1024)
|
||||
memory_used = end_memory - start_memory
|
||||
else:
|
||||
memory_used = 0
|
||||
|
||||
# Close all browsers
|
||||
for manager in managers:
|
||||
await manager.close()
|
||||
|
||||
# Calculate metrics
|
||||
pages_per_second = total_urls / crawl_time
|
||||
|
||||
# Calculate efficiency score (higher is better)
|
||||
# This balances speed vs memory
|
||||
if memory_used > 0:
|
||||
efficiency = pages_per_second / (memory_used + 1)
|
||||
else:
|
||||
efficiency = pages_per_second
|
||||
|
||||
# Store result
|
||||
result = {
|
||||
"browser_count": browser_count,
|
||||
"distribution": tuple(page_distribution),
|
||||
"crawl_time": crawl_time,
|
||||
"total_time": total_time,
|
||||
"memory_used": memory_used,
|
||||
"pages_per_second": pages_per_second,
|
||||
"efficiency": efficiency
|
||||
}
|
||||
|
||||
results.append(result)
|
||||
|
||||
if verbose:
|
||||
print(f" ✓ Crawled {total_urls} pages in {crawl_time:.2f}s ({pages_per_second:.1f} pages/sec)")
|
||||
if memory_used > 0:
|
||||
print(f" ✓ Memory used: {memory_used:.1f}MB ({memory_used/total_urls:.1f}MB per page)")
|
||||
print(f" ✓ Efficiency score: {efficiency:.4f}")
|
||||
|
||||
except Exception as e:
|
||||
if verbose:
|
||||
print(f" ✗ Error: {str(e)}")
|
||||
|
||||
# Clean up
|
||||
for manager in managers:
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
|
||||
# If no successful results, return None
|
||||
if not results:
|
||||
return None
|
||||
|
||||
# Find best configurations
|
||||
fastest = sorted(results, key=lambda x: x["crawl_time"])[0]
|
||||
|
||||
# Only consider memory if available
|
||||
memory_results = [r for r in results if r["memory_used"] > 0]
|
||||
if memory_results:
|
||||
lowest_memory = sorted(memory_results, key=lambda x: x["memory_used"])[0]
|
||||
else:
|
||||
lowest_memory = fastest
|
||||
|
||||
# Find most efficient (balanced speed vs memory)
|
||||
optimal = sorted(results, key=lambda x: x["efficiency"], reverse=True)[0]
|
||||
|
||||
# Print summary
|
||||
if verbose:
|
||||
print("\n=== OPTIMAL CONFIGURATIONS ===")
|
||||
print(f"⚡ Fastest: {fastest['browser_count']} browsers {fastest['distribution']}")
|
||||
print(f" {fastest['crawl_time']:.2f}s, {fastest['pages_per_second']:.1f} pages/sec")
|
||||
|
||||
print(f"💾 Memory-efficient: {lowest_memory['browser_count']} browsers {lowest_memory['distribution']}")
|
||||
if lowest_memory["memory_used"] > 0:
|
||||
print(f" {lowest_memory['memory_used']:.1f}MB, {lowest_memory['memory_used']/total_urls:.2f}MB per page")
|
||||
|
||||
print(f"🌟 Balanced optimal: {optimal['browser_count']} browsers {optimal['distribution']}")
|
||||
print(f" {optimal['crawl_time']:.2f}s, {optimal['pages_per_second']:.1f} pages/sec, score: {optimal['efficiency']:.4f}")
|
||||
|
||||
return {
|
||||
"fastest": fastest,
|
||||
"lowest_memory": lowest_memory,
|
||||
"optimal": optimal,
|
||||
"all_configs": results
|
||||
}
|
||||
|
||||
async def run_tests():
|
||||
"""Run all tests sequentially."""
|
||||
results = []
|
||||
|
||||
# Find optimal configuration using our utility function
|
||||
configs = await find_optimal_browser_config(
|
||||
total_urls=20, # Use a small number for faster testing
|
||||
verbose=True,
|
||||
rate_limit_delay=0.2 # 200ms delay between page loads to avoid rate limiting
|
||||
)
|
||||
|
||||
if configs:
|
||||
# Show the optimal configuration
|
||||
optimal = configs["optimal"]
|
||||
print(f"\n🎯 Recommended configuration for production use:")
|
||||
print(f" {optimal['browser_count']} browsers with distribution {optimal['distribution']}")
|
||||
print(f" Estimated performance: {optimal['pages_per_second']:.1f} pages/second")
|
||||
results.append(True)
|
||||
else:
|
||||
print("\n❌ Failed to find optimal configuration")
|
||||
results.append(False)
|
||||
|
||||
# Print summary
|
||||
total = len(results)
|
||||
passed = sum(results)
|
||||
print(f"\nTests complete: {passed}/{total} passed")
|
||||
|
||||
if passed == total:
|
||||
print("All tests passed!")
|
||||
else:
|
||||
print(f"{total - passed} tests failed")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(run_tests())
|
||||
267
tests/browser/test_playwright_strategy.py
Normal file
267
tests/browser/test_playwright_strategy.py
Normal file
@@ -0,0 +1,267 @@
|
||||
"""Test examples for PlaywrightBrowserStrategy.
|
||||
|
||||
These examples demonstrate the functionality of PlaywrightBrowserStrategy
|
||||
and serve as functional tests.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Add the project root to Python path if running directly
|
||||
if __name__ == "__main__":
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
from crawl4ai.browser import BrowserManager
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
# Create a logger for clear terminal output
|
||||
logger = AsyncLogger(verbose=True, log_file=None)
|
||||
|
||||
async def test_playwright_basic():
|
||||
"""Test basic Playwright browser functionality."""
|
||||
logger.info("Testing standard Playwright browser", tag="TEST")
|
||||
|
||||
# Create browser config for standard Playwright
|
||||
browser_config = BrowserConfig(
|
||||
headless=True,
|
||||
viewport_width=1280,
|
||||
viewport_height=800
|
||||
)
|
||||
|
||||
# Create browser manager with the config
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
try:
|
||||
# Start the browser
|
||||
await manager.start()
|
||||
logger.info("Browser started successfully", tag="TEST")
|
||||
|
||||
# Create crawler config
|
||||
crawler_config = CrawlerRunConfig(url="https://example.com")
|
||||
|
||||
# Get a page
|
||||
page, context = await manager.get_page(crawler_config)
|
||||
logger.info("Got page successfully", tag="TEST")
|
||||
|
||||
# Navigate to a website
|
||||
await page.goto("https://example.com")
|
||||
logger.info("Navigated to example.com", tag="TEST")
|
||||
|
||||
# Get page title
|
||||
title = await page.title()
|
||||
logger.info(f"Page title: {title}", tag="TEST")
|
||||
|
||||
# Clean up
|
||||
await manager.close()
|
||||
logger.info("Browser closed successfully", tag="TEST")
|
||||
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
# Ensure cleanup
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
return False
|
||||
|
||||
async def test_playwright_text_mode():
|
||||
"""Test Playwright browser in text-only mode."""
|
||||
logger.info("Testing Playwright text mode", tag="TEST")
|
||||
|
||||
# Create browser config with text mode enabled
|
||||
browser_config = BrowserConfig(
|
||||
headless=True,
|
||||
text_mode=True # Enable text-only mode
|
||||
)
|
||||
|
||||
# Create browser manager with the config
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
try:
|
||||
# Start the browser
|
||||
await manager.start()
|
||||
logger.info("Browser started successfully in text mode", tag="TEST")
|
||||
|
||||
# Get a page
|
||||
crawler_config = CrawlerRunConfig(url="https://example.com")
|
||||
page, context = await manager.get_page(crawler_config)
|
||||
|
||||
# Navigate to a website
|
||||
await page.goto("https://example.com")
|
||||
logger.info("Navigated to example.com", tag="TEST")
|
||||
|
||||
# Get page title
|
||||
title = await page.title()
|
||||
logger.info(f"Page title: {title}", tag="TEST")
|
||||
|
||||
# Check if images are blocked in text mode
|
||||
# We'll check if any image requests were made
|
||||
has_images = False
|
||||
async with page.expect_request("**/*.{png,jpg,jpeg,gif,webp,svg}", timeout=1000) as request_info:
|
||||
try:
|
||||
# Try to load a page with images
|
||||
await page.goto("https://picsum.photos/", wait_until="domcontentloaded")
|
||||
request = await request_info.value
|
||||
has_images = True
|
||||
except:
|
||||
# Timeout without image requests means text mode is working
|
||||
has_images = False
|
||||
|
||||
logger.info(f"Text mode image blocking working: {not has_images}", tag="TEST")
|
||||
|
||||
# Clean up
|
||||
await manager.close()
|
||||
logger.info("Browser closed successfully", tag="TEST")
|
||||
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
# Ensure cleanup
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
return False
|
||||
|
||||
async def test_playwright_context_reuse():
|
||||
"""Test context caching and reuse with identical configurations."""
|
||||
logger.info("Testing context reuse with identical configurations", tag="TEST")
|
||||
|
||||
# Create browser config
|
||||
browser_config = BrowserConfig(headless=True)
|
||||
|
||||
# Create browser manager
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
try:
|
||||
# Start the browser
|
||||
await manager.start()
|
||||
logger.info("Browser started successfully", tag="TEST")
|
||||
|
||||
# Create identical crawler configs
|
||||
crawler_config1 = CrawlerRunConfig(
|
||||
css_selector="body",
|
||||
)
|
||||
|
||||
crawler_config2 = CrawlerRunConfig(
|
||||
css_selector="body",
|
||||
)
|
||||
|
||||
# Get pages with these configs
|
||||
page1, context1 = await manager.get_page(crawler_config1)
|
||||
page2, context2 = await manager.get_page(crawler_config2)
|
||||
|
||||
# Check if contexts are reused
|
||||
is_same_context = context1 == context2
|
||||
logger.info(f"Contexts reused: {is_same_context}", tag="TEST")
|
||||
|
||||
# Now try with a different config
|
||||
crawler_config3 = CrawlerRunConfig()
|
||||
|
||||
page3, context3 = await manager.get_page(crawler_config3)
|
||||
|
||||
# This should be a different context
|
||||
is_different_context = context1 != context3
|
||||
logger.info(f"Different contexts for different configs: {is_different_context}", tag="TEST")
|
||||
|
||||
# Clean up
|
||||
await manager.close()
|
||||
logger.info("Browser closed successfully", tag="TEST")
|
||||
|
||||
# Both tests should pass for success
|
||||
return is_same_context and is_different_context
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
# Ensure cleanup
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
return False
|
||||
|
||||
async def test_playwright_session_management():
|
||||
"""Test session management with Playwright browser."""
|
||||
logger.info("Testing session management with Playwright browser", tag="TEST")
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=True
|
||||
)
|
||||
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
try:
|
||||
await manager.start()
|
||||
logger.info("Browser launched successfully", tag="TEST")
|
||||
|
||||
# Create two sessions
|
||||
session1_id = "playwright_session_1"
|
||||
session2_id = "playwright_session_2"
|
||||
|
||||
# Set up first session
|
||||
crawler_config1 = CrawlerRunConfig(session_id=session1_id, url="https://example.com")
|
||||
page1, context1 = await manager.get_page(crawler_config1)
|
||||
await page1.goto("https://example.com")
|
||||
await page1.evaluate("localStorage.setItem('playwright_session1_data', 'test_value1')")
|
||||
logger.info(f"Set up session 1 with ID: {session1_id}", tag="TEST")
|
||||
|
||||
# Set up second session
|
||||
crawler_config2 = CrawlerRunConfig(session_id=session2_id, url="https://example.org")
|
||||
page2, context2 = await manager.get_page(crawler_config2)
|
||||
await page2.goto("https://example.org")
|
||||
await page2.evaluate("localStorage.setItem('playwright_session2_data', 'test_value2')")
|
||||
logger.info(f"Set up session 2 with ID: {session2_id}", tag="TEST")
|
||||
|
||||
# Get first session again
|
||||
page1_again, context1_again = await manager.get_page(crawler_config1)
|
||||
|
||||
# Verify it's the same page and data persists
|
||||
is_same_page = page1 == page1_again
|
||||
is_same_context = context1 == context1_again
|
||||
data1 = await page1_again.evaluate("localStorage.getItem('playwright_session1_data')")
|
||||
logger.info(f"Session 1 reuse successful: {is_same_page}, data: {data1}", tag="TEST")
|
||||
|
||||
# Kill first session
|
||||
await manager.kill_session(session1_id)
|
||||
logger.info(f"Killed session 1", tag="TEST")
|
||||
|
||||
# Verify second session still works
|
||||
data2 = await page2.evaluate("localStorage.getItem('playwright_session2_data')")
|
||||
logger.info(f"Session 2 still functional after killing session 1, data: {data2}", tag="TEST")
|
||||
|
||||
# Clean up
|
||||
await manager.close()
|
||||
logger.info("Browser closed successfully", tag="TEST")
|
||||
|
||||
return is_same_page and is_same_context and data1 == "test_value1" and data2 == "test_value2"
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
return False
|
||||
|
||||
async def run_tests():
|
||||
"""Run all tests sequentially."""
|
||||
results = []
|
||||
|
||||
results.append(await test_playwright_basic())
|
||||
results.append(await test_playwright_text_mode())
|
||||
results.append(await test_playwright_context_reuse())
|
||||
results.append(await test_playwright_session_management())
|
||||
|
||||
# Print summary
|
||||
total = len(results)
|
||||
passed = sum(results)
|
||||
logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
|
||||
|
||||
if passed == total:
|
||||
logger.success("All tests passed!", tag="SUMMARY")
|
||||
else:
|
||||
logger.error(f"{total - passed} tests failed", tag="SUMMARY")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(run_tests())
|
||||
176
tests/browser/test_profiles.py
Normal file
176
tests/browser/test_profiles.py
Normal file
@@ -0,0 +1,176 @@
|
||||
"""Test examples for BrowserProfileManager.
|
||||
|
||||
These examples demonstrate the functionality of BrowserProfileManager
|
||||
and serve as functional tests.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
import uuid
|
||||
import shutil
|
||||
|
||||
# Add the project root to Python path if running directly
|
||||
if __name__ == "__main__":
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
from crawl4ai.browser import BrowserManager, BrowserProfileManager
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
# Create a logger for clear terminal output
|
||||
logger = AsyncLogger(verbose=True, log_file=None)
|
||||
|
||||
async def test_profile_creation():
|
||||
"""Test creating and managing browser profiles."""
|
||||
logger.info("Testing profile creation and management", tag="TEST")
|
||||
|
||||
profile_manager = BrowserProfileManager(logger=logger)
|
||||
|
||||
try:
|
||||
# List existing profiles
|
||||
profiles = profile_manager.list_profiles()
|
||||
logger.info(f"Found {len(profiles)} existing profiles", tag="TEST")
|
||||
|
||||
# Generate a unique profile name for testing
|
||||
test_profile_name = f"test-profile-{uuid.uuid4().hex[:8]}"
|
||||
|
||||
# Create a test profile directory
|
||||
profile_path = os.path.join(profile_manager.profiles_dir, test_profile_name)
|
||||
os.makedirs(os.path.join(profile_path, "Default"), exist_ok=True)
|
||||
|
||||
# Create a dummy Preferences file to simulate a Chrome profile
|
||||
with open(os.path.join(profile_path, "Default", "Preferences"), "w") as f:
|
||||
f.write("{\"test\": true}")
|
||||
|
||||
logger.info(f"Created test profile at: {profile_path}", tag="TEST")
|
||||
|
||||
# Verify the profile is now in the list
|
||||
profiles = profile_manager.list_profiles()
|
||||
profile_found = any(p["name"] == test_profile_name for p in profiles)
|
||||
logger.info(f"Profile found in list: {profile_found}", tag="TEST")
|
||||
|
||||
# Try to get the profile path
|
||||
retrieved_path = profile_manager.get_profile_path(test_profile_name)
|
||||
path_match = retrieved_path == profile_path
|
||||
logger.info(f"Retrieved correct profile path: {path_match}", tag="TEST")
|
||||
|
||||
# Delete the profile
|
||||
success = profile_manager.delete_profile(test_profile_name)
|
||||
logger.info(f"Profile deletion successful: {success}", tag="TEST")
|
||||
|
||||
# Verify it's gone
|
||||
profiles_after = profile_manager.list_profiles()
|
||||
profile_removed = not any(p["name"] == test_profile_name for p in profiles_after)
|
||||
logger.info(f"Profile removed from list: {profile_removed}", tag="TEST")
|
||||
|
||||
# Clean up just in case
|
||||
if os.path.exists(profile_path):
|
||||
shutil.rmtree(profile_path, ignore_errors=True)
|
||||
|
||||
return profile_found and path_match and success and profile_removed
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
# Clean up test directory
|
||||
try:
|
||||
if os.path.exists(profile_path):
|
||||
shutil.rmtree(profile_path, ignore_errors=True)
|
||||
except:
|
||||
pass
|
||||
return False
|
||||
|
||||
async def test_profile_with_browser():
|
||||
"""Test using a profile with a browser."""
|
||||
logger.info("Testing using a profile with a browser", tag="TEST")
|
||||
|
||||
profile_manager = BrowserProfileManager(logger=logger)
|
||||
test_profile_name = f"test-browser-profile-{uuid.uuid4().hex[:8]}"
|
||||
profile_path = None
|
||||
|
||||
try:
|
||||
# Create a test profile directory
|
||||
profile_path = os.path.join(profile_manager.profiles_dir, test_profile_name)
|
||||
os.makedirs(os.path.join(profile_path, "Default"), exist_ok=True)
|
||||
|
||||
# Create a dummy Preferences file to simulate a Chrome profile
|
||||
with open(os.path.join(profile_path, "Default", "Preferences"), "w") as f:
|
||||
f.write("{\"test\": true}")
|
||||
|
||||
logger.info(f"Created test profile at: {profile_path}", tag="TEST")
|
||||
|
||||
# Now use this profile with a browser
|
||||
browser_config = BrowserConfig(
|
||||
user_data_dir=profile_path,
|
||||
headless=True
|
||||
)
|
||||
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
# Start the browser with the profile
|
||||
await manager.start()
|
||||
logger.info("Browser started with profile", tag="TEST")
|
||||
|
||||
# Create a page
|
||||
crawler_config = CrawlerRunConfig()
|
||||
page, context = await manager.get_page(crawler_config)
|
||||
|
||||
# Navigate and set some data to verify profile works
|
||||
await page.goto("https://example.com")
|
||||
await page.evaluate("localStorage.setItem('test_data', 'profile_value')")
|
||||
|
||||
# Close browser
|
||||
await manager.close()
|
||||
logger.info("First browser session closed", tag="TEST")
|
||||
|
||||
# Create a new browser with the same profile
|
||||
manager2 = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
await manager2.start()
|
||||
logger.info("Second browser session started with same profile", tag="TEST")
|
||||
|
||||
# Get a page and check if the data persists
|
||||
page2, context2 = await manager2.get_page(crawler_config)
|
||||
await page2.goto("https://example.com")
|
||||
data = await page2.evaluate("localStorage.getItem('test_data')")
|
||||
|
||||
# Verify data persisted
|
||||
data_persisted = data == "profile_value"
|
||||
logger.info(f"Data persisted across sessions: {data_persisted}", tag="TEST")
|
||||
|
||||
# Clean up
|
||||
await manager2.close()
|
||||
logger.info("Second browser session closed", tag="TEST")
|
||||
|
||||
# Delete the test profile
|
||||
success = profile_manager.delete_profile(test_profile_name)
|
||||
logger.info(f"Test profile deleted: {success}", tag="TEST")
|
||||
|
||||
return data_persisted and success
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
# Clean up
|
||||
try:
|
||||
if profile_path and os.path.exists(profile_path):
|
||||
shutil.rmtree(profile_path, ignore_errors=True)
|
||||
except:
|
||||
pass
|
||||
return False
|
||||
|
||||
async def run_tests():
|
||||
"""Run all tests sequentially."""
|
||||
results = []
|
||||
|
||||
results.append(await test_profile_creation())
|
||||
results.append(await test_profile_with_browser())
|
||||
|
||||
# Print summary
|
||||
total = len(results)
|
||||
passed = sum(results)
|
||||
logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
|
||||
|
||||
if passed == total:
|
||||
logger.success("All tests passed!", tag="SUMMARY")
|
||||
else:
|
||||
logger.error(f"{total - passed} tests failed", tag="SUMMARY")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(run_tests())
|
||||
Reference in New Issue
Block a user