From 4ab0893ffb7d6308d8ccdaf29cedb9ae8bdb919f Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 21 Mar 2025 22:50:00 +0800 Subject: [PATCH 1/5] feat(browser): implement modular browser management system Adds a new browser management system with strategy pattern implementation: - Introduces BrowserManager class with strategy pattern support - Adds PlaywrightBrowserStrategy, CDPBrowserStrategy, and BuiltinBrowserStrategy - Implements BrowserProfileManager for profile management - Adds PagePoolConfig for browser page pooling - Includes comprehensive test suite for all browser strategies BREAKING CHANGE: Browser management has been moved to browser/ module. Direct usage of browser_manager.py and browser_profiler.py is deprecated. --- crawl4ai/async_configs.py | 48 + crawl4ai/browser/__init__.py | 10 + crawl4ai/browser/manager.py | 165 ++++ crawl4ai/browser/models.py | 0 crawl4ai/browser/profiles.py | 458 +++++++++ crawl4ai/browser/strategies.py | 1048 +++++++++++++++++++++ crawl4ai/browser/utils.py | 105 +++ crawl4ai/browser_manager.py | 1 + crawl4ai/browser_profiler.py | 1 - docs/examples/hello_world.py | 31 +- tests/browser/test_browser_manager.py | 190 ++++ tests/browser/test_builtin_strategy.py | 160 ++++ tests/browser/test_cdp_strategy.py | 227 +++++ tests/browser/test_combined.py | 77 ++ tests/browser/test_playwright_strategy.py | 275 ++++++ tests/browser/test_profiles.py | 176 ++++ 16 files changed, 2964 insertions(+), 8 deletions(-) create mode 100644 crawl4ai/browser/__init__.py create mode 100644 crawl4ai/browser/manager.py create mode 100644 crawl4ai/browser/models.py create mode 100644 crawl4ai/browser/profiles.py create mode 100644 crawl4ai/browser/strategies.py create mode 100644 crawl4ai/browser/utils.py create mode 100644 tests/browser/test_browser_manager.py create mode 100644 tests/browser/test_builtin_strategy.py create mode 100644 tests/browser/test_cdp_strategy.py create mode 100644 tests/browser/test_combined.py create mode 100644 tests/browser/test_playwright_strategy.py create mode 100644 tests/browser/test_profiles.py diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index fa0b97f4..0606c656 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -156,6 +156,41 @@ def is_empty_value(value: Any) -> bool: return False +class PagePoolConfig: + """Configuration for browser page pooling. + + This class configures the page pooling mechanism that maintains pre-warmed + browser pages ready for immediate use, improving performance for scenarios + where multiple URLs need to be processed in sequence. + + Attributes: + mode (str): Pooling mode - "static" or "adaptive". + "static" uses a fixed pool size defined by static_size. + "adaptive" calculates optimal size based on available system memory. + Default: "static". + static_size (int): Number of pages to maintain in the pool when mode is "static". + Default: 10. + memory_per_page (int): Estimated memory used by a single page in MB. + Used for "adaptive" mode calculations. + Default: 200. + memory_threshold (float): Maximum percentage of system memory to use in "adaptive" mode. + Default: 0.7 (70% of available memory). + timeout (float): Seconds to wait for a page from the pool before creating a new one. + Default: 5.0. 
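+
+    Example (illustrative):
+        pool = PagePoolConfig(mode="adaptive", memory_per_page=250)
+        config = BrowserConfig(page_pool_config=pool)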
+ """ + + def __init__(self, + mode="static", + static_size=10, + memory_per_page=200, + memory_threshold=0.7, + timeout=5.0): + self.mode = mode + self.static_size = static_size + self.memory_per_page = memory_per_page + self.memory_threshold = memory_threshold + self.timeout = timeout + class BrowserConfig: """ Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy. @@ -220,6 +255,9 @@ class BrowserConfig: light_mode (bool): Disables certain background features for performance gains. Default: False. extra_args (list): Additional command-line arguments passed to the browser. Default: []. + page_pool_config (PagePoolConfig or None): Configuration for page pooling mechanism. + If None, page pooling is disabled. + Default: None. """ def __init__( @@ -260,6 +298,7 @@ class BrowserConfig: extra_args: list = None, debugging_port: int = 9222, host: str = "localhost", + page_pool_config: Optional[PagePoolConfig] = None, ): self.browser_type = browser_type self.headless = headless @@ -298,6 +337,7 @@ class BrowserConfig: self.verbose = verbose self.debugging_port = debugging_port self.host = host + self.page_pool_config = page_pool_config fa_user_agenr_generator = ValidUAGenerator() if self.user_agent_mode == "random": @@ -328,6 +368,12 @@ class BrowserConfig: @staticmethod def from_kwargs(kwargs: dict) -> "BrowserConfig": + # Handle page_pool_config + page_pool_config = kwargs.get("page_pool_config") + if isinstance(page_pool_config, dict): + # If it's a dict, convert to PagePoolConfig + page_pool_config = PagePoolConfig(**page_pool_config) + return BrowserConfig( browser_type=kwargs.get("browser_type", "chromium"), headless=kwargs.get("headless", True), @@ -361,6 +407,7 @@ class BrowserConfig: extra_args=kwargs.get("extra_args", []), debugging_port=kwargs.get("debugging_port", 9222), host=kwargs.get("host", "localhost"), + page_pool_config=page_pool_config, ) def to_dict(self): @@ -395,6 +442,7 @@ class BrowserConfig: "verbose": self.verbose, "debugging_port": self.debugging_port, "host": self.host, + "page_pool_config": self.page_pool_config, } def clone(self, **kwargs): diff --git a/crawl4ai/browser/__init__.py b/crawl4ai/browser/__init__.py new file mode 100644 index 00000000..fb14b59d --- /dev/null +++ b/crawl4ai/browser/__init__.py @@ -0,0 +1,10 @@ +"""Browser management module for Crawl4AI. + +This module provides browser management capabilities using different strategies +for browser creation and interaction. +""" + +from .manager import BrowserManager +from .profiles import BrowserProfileManager + +__all__ = ['BrowserManager', 'BrowserProfileManager'] \ No newline at end of file diff --git a/crawl4ai/browser/manager.py b/crawl4ai/browser/manager.py new file mode 100644 index 00000000..4ebee637 --- /dev/null +++ b/crawl4ai/browser/manager.py @@ -0,0 +1,165 @@ +"""Browser manager module for Crawl4AI. + +This module provides a central browser management class that uses the +strategy pattern internally while maintaining the existing API. +""" + +import asyncio +import time +from typing import Optional, Tuple, Dict, Any + +from playwright.async_api import Page, BrowserContext + +from ..async_logger import AsyncLogger +from ..async_configs import BrowserConfig, CrawlerRunConfig + +from .strategies import ( + BaseBrowserStrategy, + PlaywrightBrowserStrategy, + CDPBrowserStrategy, + BuiltinBrowserStrategy +) + +class BrowserManager: + """Main interface for browser management in Crawl4AI. 
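+
+    Typical usage (illustrative sketch, inside an async function):
+
+        manager = BrowserManager(BrowserConfig(headless=True))
+        await manager.start()
+        page, context = await manager.get_page(CrawlerRunConfig(url="https://example.com"))
+        await manager.close()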
+ + This class maintains backward compatibility with the existing implementation + while using the strategy pattern internally for different browser types. + + Attributes: + config (BrowserConfig): Configuration object containing all browser settings + logger: Logger instance for recording events and errors + browser: The browser instance + default_context: The default browser context + managed_browser: The managed browser instance + playwright: The Playwright instance + sessions: Dictionary to store session information + session_ttl: Session timeout in seconds + """ + + def __init__(self, browser_config: Optional[BrowserConfig] = None, logger: Optional[AsyncLogger] = None): + """Initialize the BrowserManager with a browser configuration. + + Args: + browser_config: Configuration object containing all browser settings + logger: Logger instance for recording events and errors + """ + self.config = browser_config or BrowserConfig() + self.logger = logger + + # Create strategy based on configuration + self._strategy = self._create_strategy() + + # Initialize state variables for compatibility with existing code + self.browser = None + self.default_context = None + self.managed_browser = None + self.playwright = None + + # For session management (from existing implementation) + self.sessions = {} + self.session_ttl = 1800 # 30 minutes + + def _create_strategy(self) -> BaseBrowserStrategy: + """Create appropriate browser strategy based on configuration. + + Returns: + BaseBrowserStrategy: The selected browser strategy + """ + if self.config.browser_mode == "builtin": + return BuiltinBrowserStrategy(self.config, self.logger) + elif self.config.cdp_url or self.config.use_managed_browser: + return CDPBrowserStrategy(self.config, self.logger) + else: + return PlaywrightBrowserStrategy(self.config, self.logger) + + async def start(self): + """Start the browser instance and set up the default context. + + Returns: + self: For method chaining + """ + # Start the strategy + await self._strategy.start() + + # Update legacy references + self.browser = self._strategy.browser + self.default_context = self._strategy.default_context + + # Set browser process reference (for CDP strategy) + if hasattr(self._strategy, 'browser_process'): + self.managed_browser = self._strategy + + # Set Playwright reference + self.playwright = self._strategy.playwright + + # Sync sessions if needed + if hasattr(self._strategy, 'sessions'): + self.sessions = self._strategy.sessions + self.session_ttl = self._strategy.session_ttl + + return self + + async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: + """Get a page for the given configuration. + + Args: + crawlerRunConfig: Configuration object for the crawler run + + Returns: + Tuple of (Page, BrowserContext) + """ + # Delegate to strategy + page, context = await self._strategy.get_page(crawlerRunConfig) + + # Sync sessions if needed + if hasattr(self._strategy, 'sessions'): + self.sessions = self._strategy.sessions + + return page, context + + async def kill_session(self, session_id: str): + """Kill a browser session and clean up resources. 
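+
+        Delegates to the strategy's ``_kill_session`` when available;
+        otherwise the page is closed directly, and the context is closed too
+        unless the browser is managed, CDP-connected, or builtin.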
+ + Args: + session_id: The session ID to kill + """ + # Handle kill_session via our strategy if it supports it + if hasattr(self._strategy, '_kill_session'): + await self._strategy._kill_session(session_id) + elif session_id in self.sessions: + context, page, _ = self.sessions[session_id] + await page.close() + # Only close context if not using CDP + if not self.config.use_managed_browser and not self.config.cdp_url and not self.config.browser_mode == "builtin": + await context.close() + del self.sessions[session_id] + + def _cleanup_expired_sessions(self): + """Clean up expired sessions based on TTL.""" + # Use strategy's implementation if available + if hasattr(self._strategy, '_cleanup_expired_sessions'): + self._strategy._cleanup_expired_sessions() + return + + # Otherwise use our own implementation + current_time = time.time() + expired_sessions = [ + sid + for sid, (_, _, last_used) in self.sessions.items() + if current_time - last_used > self.session_ttl + ] + for sid in expired_sessions: + asyncio.create_task(self.kill_session(sid)) + + async def close(self): + """Close the browser and clean up resources.""" + # Delegate to strategy + await self._strategy.close() + + # Reset legacy references + self.browser = None + self.default_context = None + self.managed_browser = None + self.playwright = None + self.sessions = {} diff --git a/crawl4ai/browser/models.py b/crawl4ai/browser/models.py new file mode 100644 index 00000000..e69de29b diff --git a/crawl4ai/browser/profiles.py b/crawl4ai/browser/profiles.py new file mode 100644 index 00000000..58a8bff2 --- /dev/null +++ b/crawl4ai/browser/profiles.py @@ -0,0 +1,458 @@ +"""Browser profile management module for Crawl4AI. + +This module provides functionality for creating and managing browser profiles +that can be used for authenticated browsing. +""" + +import os +import asyncio +import signal +import sys +import datetime +import uuid +import shutil +from typing import List, Dict, Optional, Any +from colorama import Fore, Style, init + +from ..async_configs import BrowserConfig +from ..async_logger import AsyncLogger, AsyncLoggerBase +from ..utils import get_home_folder +from .strategies import is_windows + +class BrowserProfileManager: + """Manages browser profiles for Crawl4AI. + + This class provides functionality to create and manage browser profiles + that can be used for authenticated browsing with Crawl4AI. + + Profiles are stored by default in ~/.crawl4ai/profiles/ + """ + + def __init__(self, logger: Optional[AsyncLoggerBase] = None): + """Initialize the BrowserProfileManager. + + Args: + logger: Logger for outputting messages. If None, a default AsyncLogger is created. + """ + # Initialize colorama for colorful terminal output + init() + + # Create a logger if not provided + if logger is None: + self.logger = AsyncLogger(verbose=True) + elif not isinstance(logger, AsyncLoggerBase): + self.logger = AsyncLogger(verbose=True) + else: + self.logger = logger + + # Ensure profiles directory exists + self.profiles_dir = os.path.join(get_home_folder(), "profiles") + os.makedirs(self.profiles_dir, exist_ok=True) + + async def create_profile(self, + profile_name: Optional[str] = None, + browser_config: Optional[BrowserConfig] = None) -> Optional[str]: + """Create a browser profile interactively. + + Args: + profile_name: Name for the profile. If None, a name is generated. + browser_config: Configuration for the browser. If None, a default configuration is used. 
+ + Returns: + Path to the created profile directory, or None if creation failed + """ + # Create default browser config if none provided + if browser_config is None: + browser_config = BrowserConfig( + browser_type="chromium", + headless=False, # Must be visible for user interaction + verbose=True + ) + else: + # Ensure headless is False for user interaction + browser_config.headless = False + + # Generate profile name if not provided + if not profile_name: + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + profile_name = f"profile_{timestamp}_{uuid.uuid4().hex[:6]}" + + # Sanitize profile name (replace spaces and special chars) + profile_name = "".join(c if c.isalnum() or c in "-_" else "_" for c in profile_name) + + # Set user data directory + profile_path = os.path.join(self.profiles_dir, profile_name) + os.makedirs(profile_path, exist_ok=True) + + # Print instructions for the user with colorama formatting + border = f"{Fore.CYAN}{'='*80}{Style.RESET_ALL}" + self.logger.info(f"\n{border}", tag="PROFILE") + self.logger.info(f"Creating browser profile: {Fore.GREEN}{profile_name}{Style.RESET_ALL}", tag="PROFILE") + self.logger.info(f"Profile directory: {Fore.YELLOW}{profile_path}{Style.RESET_ALL}", tag="PROFILE") + + self.logger.info("\nInstructions:", tag="PROFILE") + self.logger.info("1. A browser window will open for you to set up your profile.", tag="PROFILE") + self.logger.info(f"2. {Fore.CYAN}Log in to websites{Style.RESET_ALL}, configure settings, etc. as needed.", tag="PROFILE") + self.logger.info(f"3. When you're done, {Fore.YELLOW}press 'q' in this terminal{Style.RESET_ALL} to close the browser.", tag="PROFILE") + self.logger.info("4. The profile will be saved and ready to use with Crawl4AI.", tag="PROFILE") + self.logger.info(f"{border}\n", tag="PROFILE") + + # Import the necessary classes with local imports to avoid circular references + from .strategies import CDPBrowserStrategy + + # Set browser config to use the profile path + browser_config.user_data_dir = profile_path + + # Create a CDP browser strategy for the profile creation + browser_strategy = CDPBrowserStrategy(browser_config, self.logger) + + # Set up signal handlers to ensure cleanup on interrupt + original_sigint = signal.getsignal(signal.SIGINT) + original_sigterm = signal.getsignal(signal.SIGTERM) + + # Define cleanup handler for signals + async def cleanup_handler(sig, frame): + self.logger.warning("\nCleaning up browser process...", tag="PROFILE") + await browser_strategy.close() + # Restore original signal handlers + signal.signal(signal.SIGINT, original_sigint) + signal.signal(signal.SIGTERM, original_sigterm) + if sig == signal.SIGINT: + self.logger.error("Profile creation interrupted. 
Profile may be incomplete.", tag="PROFILE") + sys.exit(1) + + # Set signal handlers + def sigint_handler(sig, frame): + asyncio.create_task(cleanup_handler(sig, frame)) + + signal.signal(signal.SIGINT, sigint_handler) + signal.signal(signal.SIGTERM, sigint_handler) + + # Event to signal when user is done with the browser + user_done_event = asyncio.Event() + + # Run keyboard input loop in a separate task + async def listen_for_quit_command(): + import termios + import tty + import select + + # First output the prompt + self.logger.info(f"{Fore.CYAN}Press '{Fore.WHITE}q{Fore.CYAN}' when you've finished using the browser...{Style.RESET_ALL}", tag="PROFILE") + + # Save original terminal settings + fd = sys.stdin.fileno() + old_settings = termios.tcgetattr(fd) + + try: + # Switch to non-canonical mode (no line buffering) + tty.setcbreak(fd) + + while True: + # Check if input is available (non-blocking) + readable, _, _ = select.select([sys.stdin], [], [], 0.5) + if readable: + key = sys.stdin.read(1) + if key.lower() == 'q': + self.logger.info(f"{Fore.GREEN}Closing browser and saving profile...{Style.RESET_ALL}", tag="PROFILE") + user_done_event.set() + return + + # Check if the browser process has already exited + if browser_strategy.browser_process and browser_strategy.browser_process.poll() is not None: + self.logger.info("Browser already closed. Ending input listener.", tag="PROFILE") + user_done_event.set() + return + + await asyncio.sleep(0.1) + + finally: + # Restore terminal settings + termios.tcsetattr(fd, termios.TCSADRAIN, old_settings) + + try: + # Start the browser + await browser_strategy.start() + + # Check if browser started successfully + if not browser_strategy.browser_process: + self.logger.error("Failed to start browser process.", tag="PROFILE") + return None + + self.logger.info(f"Browser launched. {Fore.CYAN}Waiting for you to finish...{Style.RESET_ALL}", tag="PROFILE") + + # Start listening for keyboard input + listener_task = asyncio.create_task(listen_for_quit_command()) + + # Wait for either the user to press 'q' or for the browser process to exit naturally + while not user_done_event.is_set() and browser_strategy.browser_process.poll() is None: + await asyncio.sleep(0.5) + + # Cancel the listener task if it's still running + if not listener_task.done(): + listener_task.cancel() + try: + await listener_task + except asyncio.CancelledError: + pass + + # If the browser is still running and the user pressed 'q', terminate it + if browser_strategy.browser_process.poll() is None and user_done_event.is_set(): + self.logger.info("Terminating browser process...", tag="PROFILE") + await browser_strategy.close() + + self.logger.success(f"Browser closed. Profile saved at: {Fore.GREEN}{profile_path}{Style.RESET_ALL}", tag="PROFILE") + + except Exception as e: + self.logger.error(f"Error creating profile: {str(e)}", tag="PROFILE") + await browser_strategy.close() + return None + finally: + # Restore original signal handlers + signal.signal(signal.SIGINT, original_sigint) + signal.signal(signal.SIGTERM, original_sigterm) + + # Make sure browser is fully cleaned up + await browser_strategy.close() + + # Return the profile path + return profile_path + + def list_profiles(self) -> List[Dict[str, Any]]: + """List all available browser profiles. 
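+
+        A directory is treated as a profile when it contains a Chromium
+        ``Preferences`` file or a Firefox ``prefs.js``; results are sorted
+        newest first.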
+ + Returns: + List of dictionaries containing profile information + """ + if not os.path.exists(self.profiles_dir): + return [] + + profiles = [] + + for name in os.listdir(self.profiles_dir): + profile_path = os.path.join(self.profiles_dir, name) + + # Skip if not a directory + if not os.path.isdir(profile_path): + continue + + # Check if this looks like a valid browser profile + # For Chromium: Look for Preferences file + # For Firefox: Look for prefs.js file + is_valid = False + + if os.path.exists(os.path.join(profile_path, "Preferences")) or \ + os.path.exists(os.path.join(profile_path, "Default", "Preferences")): + is_valid = "chromium" + elif os.path.exists(os.path.join(profile_path, "prefs.js")): + is_valid = "firefox" + + if is_valid: + # Get creation time + created = datetime.datetime.fromtimestamp( + os.path.getctime(profile_path) + ) + + profiles.append({ + "name": name, + "path": profile_path, + "created": created, + "type": is_valid + }) + + # Sort by creation time, newest first + profiles.sort(key=lambda x: x["created"], reverse=True) + + return profiles + + def get_profile_path(self, profile_name: str) -> Optional[str]: + """Get the full path to a profile by name. + + Args: + profile_name: Name of the profile (not the full path) + + Returns: + Full path to the profile directory, or None if not found + """ + profile_path = os.path.join(self.profiles_dir, profile_name) + + # Check if path exists and is a valid profile + if not os.path.isdir(profile_path): + # Check if profile_name itself is full path + if os.path.isabs(profile_name): + profile_path = profile_name + else: + return None + + # Look for profile indicators + is_profile = ( + os.path.exists(os.path.join(profile_path, "Preferences")) or + os.path.exists(os.path.join(profile_path, "Default", "Preferences")) or + os.path.exists(os.path.join(profile_path, "prefs.js")) + ) + + if not is_profile: + return None # Not a valid browser profile + + return profile_path + + def delete_profile(self, profile_name_or_path: str) -> bool: + """Delete a browser profile by name or path. + + Args: + profile_name_or_path: Name of the profile or full path to profile directory + + Returns: + True if the profile was deleted successfully, False otherwise + """ + # Determine if input is a name or a path + if os.path.isabs(profile_name_or_path): + # Full path provided + profile_path = profile_name_or_path + else: + # Just a name provided, construct path + profile_path = os.path.join(self.profiles_dir, profile_name_or_path) + + # Check if path exists and is a valid profile + if not os.path.isdir(profile_path): + return False + + # Look for profile indicators + is_profile = ( + os.path.exists(os.path.join(profile_path, "Preferences")) or + os.path.exists(os.path.join(profile_path, "Default", "Preferences")) or + os.path.exists(os.path.join(profile_path, "prefs.js")) + ) + + if not is_profile: + return False # Not a valid browser profile + + # Delete the profile directory + try: + shutil.rmtree(profile_path) + return True + except Exception: + return False + + async def interactive_manager(self, crawl_callback=None): + """Launch an interactive profile management console. + + Args: + crawl_callback: Function to call when selecting option to use + a profile for crawling. It will be called with (profile_path, url). + """ + while True: + self.logger.info(f"\n{Fore.CYAN}Profile Management Options:{Style.RESET_ALL}", tag="MENU") + self.logger.info(f"1. {Fore.GREEN}Create a new profile{Style.RESET_ALL}", tag="MENU") + self.logger.info(f"2. 
{Fore.YELLOW}List available profiles{Style.RESET_ALL}", tag="MENU") + self.logger.info(f"3. {Fore.RED}Delete a profile{Style.RESET_ALL}", tag="MENU") + + # Only show crawl option if callback provided + if crawl_callback: + self.logger.info(f"4. {Fore.CYAN}Use a profile to crawl a website{Style.RESET_ALL}", tag="MENU") + self.logger.info(f"5. {Fore.MAGENTA}Exit{Style.RESET_ALL}", tag="MENU") + exit_option = "5" + else: + self.logger.info(f"4. {Fore.MAGENTA}Exit{Style.RESET_ALL}", tag="MENU") + exit_option = "4" + + choice = input(f"\n{Fore.CYAN}Enter your choice (1-{exit_option}): {Style.RESET_ALL}") + + if choice == "1": + # Create new profile + name = input(f"{Fore.GREEN}Enter a name for the new profile (or press Enter for auto-generated name): {Style.RESET_ALL}") + await self.create_profile(name or None) + + elif choice == "2": + # List profiles + profiles = self.list_profiles() + + if not profiles: + self.logger.warning(" No profiles found. Create one first with option 1.", tag="PROFILES") + continue + + # Print profile information with colorama formatting + self.logger.info("\nAvailable profiles:", tag="PROFILES") + for i, profile in enumerate(profiles): + self.logger.info(f"[{i+1}] {Fore.CYAN}{profile['name']}{Style.RESET_ALL}", tag="PROFILES") + self.logger.info(f" Path: {Fore.YELLOW}{profile['path']}{Style.RESET_ALL}", tag="PROFILES") + self.logger.info(f" Created: {profile['created'].strftime('%Y-%m-%d %H:%M:%S')}", tag="PROFILES") + self.logger.info(f" Browser type: {profile['type']}", tag="PROFILES") + self.logger.info("", tag="PROFILES") # Empty line for spacing + + elif choice == "3": + # Delete profile + profiles = self.list_profiles() + if not profiles: + self.logger.warning("No profiles found to delete", tag="PROFILES") + continue + + # Display numbered list + self.logger.info(f"\n{Fore.YELLOW}Available profiles:{Style.RESET_ALL}", tag="PROFILES") + for i, profile in enumerate(profiles): + self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES") + + # Get profile to delete + profile_idx = input(f"{Fore.RED}Enter the number of the profile to delete (or 'c' to cancel): {Style.RESET_ALL}") + if profile_idx.lower() == 'c': + continue + + try: + idx = int(profile_idx) - 1 + if 0 <= idx < len(profiles): + profile_name = profiles[idx]["name"] + self.logger.info(f"Deleting profile: {Fore.YELLOW}{profile_name}{Style.RESET_ALL}", tag="PROFILES") + + # Confirm deletion + confirm = input(f"{Fore.RED}Are you sure you want to delete this profile? (y/n): {Style.RESET_ALL}") + if confirm.lower() == 'y': + success = self.delete_profile(profiles[idx]["path"]) + + if success: + self.logger.success(f"Profile {Fore.GREEN}{profile_name}{Style.RESET_ALL} deleted successfully", tag="PROFILES") + else: + self.logger.error(f"Failed to delete profile {Fore.RED}{profile_name}{Style.RESET_ALL}", tag="PROFILES") + else: + self.logger.error("Invalid profile number", tag="PROFILES") + except ValueError: + self.logger.error("Please enter a valid number", tag="PROFILES") + + elif choice == "4" and crawl_callback: + # Use profile to crawl a site + profiles = self.list_profiles() + if not profiles: + self.logger.warning("No profiles found. 
Create one first.", tag="PROFILES") + continue + + # Display numbered list + self.logger.info(f"\n{Fore.YELLOW}Available profiles:{Style.RESET_ALL}", tag="PROFILES") + for i, profile in enumerate(profiles): + self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES") + + # Get profile to use + profile_idx = input(f"{Fore.CYAN}Enter the number of the profile to use (or 'c' to cancel): {Style.RESET_ALL}") + if profile_idx.lower() == 'c': + continue + + try: + idx = int(profile_idx) - 1 + if 0 <= idx < len(profiles): + profile_path = profiles[idx]["path"] + url = input(f"{Fore.CYAN}Enter the URL to crawl: {Style.RESET_ALL}") + if url: + # Call the provided crawl callback + await crawl_callback(profile_path, url) + else: + self.logger.error("No URL provided", tag="CRAWL") + else: + self.logger.error("Invalid profile number", tag="PROFILES") + except ValueError: + self.logger.error("Please enter a valid number", tag="PROFILES") + + elif (choice == "4" and not crawl_callback) or (choice == "5" and crawl_callback): + # Exit + self.logger.info("Exiting profile management", tag="MENU") + break + + else: + self.logger.error(f"Invalid choice. Please enter a number between 1 and {exit_option}.", tag="MENU") diff --git a/crawl4ai/browser/strategies.py b/crawl4ai/browser/strategies.py new file mode 100644 index 00000000..fd47f30e --- /dev/null +++ b/crawl4ai/browser/strategies.py @@ -0,0 +1,1048 @@ +"""Browser strategies module for Crawl4AI. + +This module implements the browser strategy pattern for different +browser implementations, including Playwright, CDP, and builtin browsers. +""" + +from abc import ABC, abstractmethod +import asyncio +import os +import time +import json +import hashlib +import subprocess +import sys +import shutil +import signal +from typing import Optional, Dict, Tuple, List, Any + +from playwright.async_api import Browser, BrowserContext, Page, ProxySettings + +from ..async_logger import AsyncLogger +from ..async_configs import BrowserConfig, CrawlerRunConfig +from ..config import DOWNLOAD_PAGE_TIMEOUT +from ..js_snippet import load_js_script +from ..utils import get_home_folder +from .utils import get_playwright, get_browser_executable, get_browser_disable_options, create_temp_directory, is_windows + +from playwright_stealth import StealthConfig + +stealth_config = StealthConfig( + webdriver=True, + chrome_app=True, + chrome_csi=True, + chrome_load_times=True, + chrome_runtime=True, + navigator_languages=True, + navigator_plugins=True, + navigator_permissions=True, + webgl_vendor=True, + outerdimensions=True, + navigator_hardware_concurrency=True, + media_codecs=True, +) + +class BaseBrowserStrategy(ABC): + """Base class for all browser strategies. + + This abstract class defines the interface that all browser strategies + must implement. It handles common functionality like context caching. + """ + + def __init__(self, config: BrowserConfig, logger: Optional[AsyncLogger] = None): + """Initialize the strategy with configuration and logger. + + Args: + config: Browser configuration + logger: Logger for recording events and errors + """ + self.config = config + self.logger = logger + self.browser = None + self.default_context = None + self.contexts_by_config = {} + self._contexts_lock = asyncio.Lock() + self.playwright = None + + @abstractmethod + async def start(self): + """Start the browser. 
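+
+        Concrete strategies are expected to set ``self.browser`` and
+        ``self.default_context`` before returning.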
+ + Returns: + self: For method chaining + """ + pass + + @abstractmethod + async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: + """Get a page with specified configuration. + + Args: + crawlerRunConfig: Crawler run configuration + + Returns: + Tuple of (Page, BrowserContext) + """ + pass + + @abstractmethod + async def close(self): + """Close the browser and clean up resources.""" + pass + + def _make_config_signature(self, crawlerRunConfig: CrawlerRunConfig) -> str: + """Create a signature hash from configuration for context caching. + + Args: + crawlerRunConfig: Crawler run configuration + + Returns: + str: Unique hash for this configuration + """ + config_dict = crawlerRunConfig.__dict__.copy() + # Exclude items that do not affect browser-level setup + ephemeral_keys = [ + "session_id", + "js_code", + "scraping_strategy", + "extraction_strategy", + "chunking_strategy", + "cache_mode", + "content_filter", + "semaphore_count", + "url" + ] + for key in ephemeral_keys: + if key in config_dict: + del config_dict[key] + + # Convert to canonical JSON string + signature_json = json.dumps(config_dict, sort_keys=True, default=str) + + # Hash the JSON so we get a compact, unique string + signature_hash = hashlib.sha256(signature_json.encode("utf-8")).hexdigest() + return signature_hash + + async def setup_context(self, context: BrowserContext, crawlerRunConfig: Optional[CrawlerRunConfig] = None): + """Set up a browser context with the configured options. + + Args: + context: The browser context to set up + crawlerRunConfig: Configuration object containing all browser settings + """ + if self.config.headers: + await context.set_extra_http_headers(self.config.headers) + + if self.config.cookies: + await context.add_cookies(self.config.cookies) + + if self.config.storage_state: + await context.storage_state(path=None) + + if self.config.accept_downloads: + context.set_default_timeout(DOWNLOAD_PAGE_TIMEOUT) + context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT) + if self.config.downloads_path: + context._impl_obj._options["accept_downloads"] = True + context._impl_obj._options["downloads_path"] = self.config.downloads_path + + # Handle user agent and browser hints + if self.config.user_agent: + combined_headers = { + "User-Agent": self.config.user_agent, + "sec-ch-ua": self.config.browser_hint, + } + combined_headers.update(self.config.headers) + await context.set_extra_http_headers(combined_headers) + + # Add default cookie + await context.add_cookies( + [ + { + "name": "cookiesEnabled", + "value": "true", + "url": crawlerRunConfig.url if crawlerRunConfig else "https://crawl4ai.com/", + } + ] + ) + + # Handle navigator overrides + if crawlerRunConfig: + if ( + crawlerRunConfig.override_navigator + or crawlerRunConfig.simulate_user + or crawlerRunConfig.magic + ): + await context.add_init_script(load_js_script("navigator_overrider")) + +class PlaywrightBrowserStrategy(BaseBrowserStrategy): + """Standard Playwright browser strategy. + + This strategy launches a new browser instance using Playwright + and manages browser contexts. + """ + + def __init__(self, config: BrowserConfig, logger: Optional[AsyncLogger] = None): + """Initialize the Playwright browser strategy. + + Args: + config: Browser configuration + logger: Logger for recording events and errors + """ + super().__init__(config, logger) + # Add session management + self.sessions = {} + self.session_ttl = 1800 # 30 minutes + + async def start(self): + """Start the browser instance. 
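+
+        Launches Chromium, Firefox, or WebKit via Playwright according to
+        ``config.browser_type``.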
+ + Returns: + self: For method chaining + """ + self.playwright = await get_playwright() + browser_args = self._build_browser_args() + + # Launch appropriate browser type + if self.config.browser_type == "firefox": + self.browser = await self.playwright.firefox.launch(**browser_args) + elif self.config.browser_type == "webkit": + self.browser = await self.playwright.webkit.launch(**browser_args) + else: + self.browser = await self.playwright.chromium.launch(**browser_args) + + self.default_context = self.browser + return self + + def _build_browser_args(self) -> dict: + """Build browser launch arguments from config. + + Returns: + dict: Browser launch arguments + """ + args = [ + "--disable-gpu", + "--disable-gpu-compositing", + "--disable-software-rasterizer", + "--no-sandbox", + "--disable-dev-shm-usage", + "--no-first-run", + "--no-default-browser-check", + "--disable-infobars", + "--window-position=0,0", + "--ignore-certificate-errors", + "--ignore-certificate-errors-spki-list", + "--disable-blink-features=AutomationControlled", + "--window-position=400,0", + "--disable-renderer-backgrounding", + "--disable-ipc-flooding-protection", + "--force-color-profile=srgb", + "--mute-audio", + "--disable-background-timer-throttling", + f"--window-size={self.config.viewport_width},{self.config.viewport_height}", + ] + + if self.config.light_mode: + args.extend(get_browser_disable_options()) + + if self.config.text_mode: + args.extend( + [ + "--blink-settings=imagesEnabled=false", + "--disable-remote-fonts", + "--disable-images", + "--disable-javascript", + "--disable-software-rasterizer", + "--disable-dev-shm-usage", + ] + ) + + if self.config.extra_args: + args.extend(self.config.extra_args) + + browser_args = {"headless": self.config.headless, "args": args} + + if self.config.chrome_channel: + browser_args["channel"] = self.config.chrome_channel + + if self.config.accept_downloads: + browser_args["downloads_path"] = self.config.downloads_path or os.path.join( + os.getcwd(), "downloads" + ) + os.makedirs(browser_args["downloads_path"], exist_ok=True) + + if self.config.proxy or self.config.proxy_config: + proxy_settings = ( + ProxySettings(server=self.config.proxy) + if self.config.proxy + else ProxySettings( + server=self.config.proxy_config.server, + username=self.config.proxy_config.username, + password=self.config.proxy_config.password, + ) + ) + browser_args["proxy"] = proxy_settings + + return browser_args + + async def create_browser_context(self, crawlerRunConfig: Optional[CrawlerRunConfig] = None) -> BrowserContext: + """Creates and returns a new browser context with configured settings. 
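+
+        Applies user agent, viewport, proxy, storage state, and download
+        settings from the browser config; in text mode, requests for static
+        assets (images, fonts, media, and similar) are aborted via routes.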
+ + Args: + crawlerRunConfig: Configuration object for the crawler run + + Returns: + BrowserContext: Browser context object with the specified configurations + """ + # Base settings + user_agent = self.config.headers.get("User-Agent", self.config.user_agent) + viewport_settings = { + "width": self.config.viewport_width, + "height": self.config.viewport_height, + } + proxy_settings = {"server": self.config.proxy} if self.config.proxy else None + + blocked_extensions = [ + # Images + "jpg", "jpeg", "png", "gif", "webp", "svg", "ico", "bmp", "tiff", "psd", + # Fonts + "woff", "woff2", "ttf", "otf", "eot", + # Media + "mp4", "webm", "ogg", "avi", "mov", "wmv", "flv", "m4v", "mp3", "wav", "aac", + "m4a", "opus", "flac", + # Documents + "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx", + # Archives + "zip", "rar", "7z", "tar", "gz", + # Scripts and data + "xml", "swf", "wasm", + ] + + # Common context settings + context_settings = { + "user_agent": user_agent, + "viewport": viewport_settings, + "proxy": proxy_settings, + "accept_downloads": self.config.accept_downloads, + "storage_state": self.config.storage_state, + "ignore_https_errors": self.config.ignore_https_errors, + "device_scale_factor": 1.0, + "java_script_enabled": self.config.java_script_enabled, + } + + if crawlerRunConfig: + # Check if there is value for crawlerRunConfig.proxy_config set add that to context + if crawlerRunConfig.proxy_config: + proxy_settings = { + "server": crawlerRunConfig.proxy_config.server, + } + if crawlerRunConfig.proxy_config.username: + proxy_settings.update({ + "username": crawlerRunConfig.proxy_config.username, + "password": crawlerRunConfig.proxy_config.password, + }) + context_settings["proxy"] = proxy_settings + + if self.config.text_mode: + text_mode_settings = { + "has_touch": False, + "is_mobile": False, + } + # Update context settings with text mode settings + context_settings.update(text_mode_settings) + + # Create and return the context with all settings + context = await self.browser.new_context(**context_settings) + + # Apply text mode settings if enabled + if self.config.text_mode: + # Create and apply route patterns for each extension + for ext in blocked_extensions: + await context.route(f"**/*.{ext}", lambda route: route.abort()) + return context + + def _cleanup_expired_sessions(self): + """Clean up expired sessions based on TTL.""" + current_time = time.time() + expired_sessions = [ + sid + for sid, (_, _, last_used) in self.sessions.items() + if current_time - last_used > self.session_ttl + ] + for sid in expired_sessions: + asyncio.create_task(self._kill_session(sid)) + + async def _kill_session(self, session_id: str): + """Kill a browser session and clean up resources. + + Args: + session_id: The session ID to kill + """ + if session_id in self.sessions: + context, page, _ = self.sessions[session_id] + await page.close() + del self.sessions[session_id] + + async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: + """Get a page for the given configuration. 
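+
+        Reuses the page bound to ``crawlerRunConfig.session_id`` when one
+        exists; otherwise a context is fetched from (or added to) the
+        config-signature cache and a fresh page is opened on it.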
+ + Args: + crawlerRunConfig: Configuration object for the crawler run + + Returns: + Tuple of (Page, BrowserContext) + """ + # Clean up expired sessions first + self._cleanup_expired_sessions() + + # If a session_id is provided and we already have it, reuse that page + context + if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions: + context, page, _ = self.sessions[crawlerRunConfig.session_id] + # Update last-used timestamp + self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) + return page, context + + # Otherwise, check if we have an existing context for this config + config_signature = self._make_config_signature(crawlerRunConfig) + + async with self._contexts_lock: + if config_signature in self.contexts_by_config: + context = self.contexts_by_config[config_signature] + else: + # Create and setup a new context + context = await self.create_browser_context(crawlerRunConfig) + await self.setup_context(context, crawlerRunConfig) + self.contexts_by_config[config_signature] = context + + # Create a new page from the chosen context + page = await context.new_page() + + # If a session_id is specified, store this session so we can reuse later + if crawlerRunConfig.session_id: + self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) + + return page, context + + async def close(self): + """Close the browser and clean up resources.""" + if self.config.sleep_on_close: + await asyncio.sleep(0.5) + + # Close all sessions + session_ids = list(self.sessions.keys()) + for session_id in session_ids: + await self._kill_session(session_id) + + # Close all contexts we created + for ctx in self.contexts_by_config.values(): + try: + await ctx.close() + except Exception as e: + if self.logger: + self.logger.error( + message="Error closing context: {error}", + tag="ERROR", + params={"error": str(e)} + ) + self.contexts_by_config.clear() + + if self.browser: + await self.browser.close() + self.browser = None + + if self.playwright: + await self.playwright.stop() + self.playwright = None + +class CDPBrowserStrategy(BaseBrowserStrategy): + """CDP-based browser strategy. + + This strategy connects to an existing browser using CDP protocol or + launches and connects to a browser using CDP. + """ + + def __init__(self, config: BrowserConfig, logger: Optional[AsyncLogger] = None): + """Initialize the CDP browser strategy. + + Args: + config: Browser configuration + logger: Logger for recording events and errors + """ + super().__init__(config, logger) + self.sessions = {} + self.session_ttl = 1800 # 30 minutes + self.browser_process = None + self.temp_dir = None + self.shutting_down = False + + async def start(self): + """Start or connect to the browser using CDP. + + Returns: + self: For method chaining + """ + self.playwright = await get_playwright() + + # Get or create CDP URL + cdp_url = await self._get_or_create_cdp_url() + + # Connect to the browser using CDP + self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) + + # Get or create default context + contexts = self.browser.contexts + if contexts: + self.default_context = contexts[0] + else: + self.default_context = await self.create_browser_context() + + await self.setup_context(self.default_context) + return self + + async def _get_or_create_cdp_url(self) -> str: + """Get existing CDP URL or launch a browser and return its CDP URL. 
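+
+        If ``config.cdp_url`` is set it is returned as-is; otherwise a browser
+        process is launched with remote debugging enabled on
+        ``config.debugging_port``.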
+ + Returns: + str: CDP URL for connecting to the browser + """ + # If CDP URL is provided, just return it + if self.config.cdp_url: + return self.config.cdp_url + + # Create temp dir if needed + if not self.config.user_data_dir: + self.temp_dir = create_temp_directory() + user_data_dir = self.temp_dir + else: + user_data_dir = self.config.user_data_dir + + # Get browser args based on OS and browser type + args = await self._get_browser_args(user_data_dir) + + # Start browser process + try: + # Use DETACHED_PROCESS flag on Windows to fully detach the process + # On Unix, we'll use preexec_fn=os.setpgrp to start the process in a new process group + if is_windows(): + self.browser_process = subprocess.Popen( + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + creationflags=subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP + ) + else: + self.browser_process = subprocess.Popen( + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + preexec_fn=os.setpgrp # Start in a new process group + ) + + # Monitor for a short time to make sure it starts properly + await asyncio.sleep(0.5) # Give browser time to start + await self._initial_startup_check() + await asyncio.sleep(2) # Give browser more time to start + return f"http://localhost:{self.config.debugging_port}" + except Exception as e: + await self._cleanup_process() + raise Exception(f"Failed to start browser: {e}") + + async def _initial_startup_check(self): + """Perform a quick check to make sure the browser started successfully.""" + if not self.browser_process: + return + + # Check that process started without immediate termination + await asyncio.sleep(0.5) + if self.browser_process.poll() is not None: + # Process already terminated + stdout, stderr = b"", b"" + try: + stdout, stderr = self.browser_process.communicate(timeout=0.5) + except subprocess.TimeoutExpired: + pass + + if self.logger: + self.logger.error( + message="Browser process terminated during startup | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}", + tag="ERROR", + params={ + "code": self.browser_process.returncode, + "stdout": stdout.decode() if stdout else "", + "stderr": stderr.decode() if stderr else "", + }, + ) + + async def _get_browser_args(self, user_data_dir: str) -> List[str]: + """Returns browser-specific command line arguments. 
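+
+        Only Chromium and Firefox are supported here; other browser types
+        raise ``NotImplementedError``.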
+
+        Args:
+            user_data_dir: Path to user data directory
+
+        Returns:
+            List of command-line arguments for the browser
+        """
+        browser_path = get_browser_executable(self.config.browser_type)
+        base_args = [browser_path]
+
+        if self.config.browser_type == "chromium":
+            args = [
+                f"--remote-debugging-port={self.config.debugging_port}",
+                f"--user-data-dir={user_data_dir}",
+            ]
+            if self.config.headless:
+                args.append("--headless=new")
+        elif self.config.browser_type == "firefox":
+            args = [
+                "--remote-debugging-port",
+                str(self.config.debugging_port),
+                "--profile",
+                user_data_dir,
+            ]
+            if self.config.headless:
+                args.append("--headless")
+        else:
+            raise NotImplementedError(f"Browser type {self.config.browser_type} not supported")
+
+        return base_args + args
+
+    async def _cleanup_process(self):
+        """Cleanup browser process and temporary directory."""
+        # Set shutting_down flag BEFORE any termination actions
+        self.shutting_down = True
+
+        if self.browser_process:
+            try:
+                # Only terminate if the process is still running; poll()
+                # returns None while the process is alive
+                if self.browser_process.poll() is None:
+                    self.browser_process.terminate()
+                    # Wait for process to end gracefully
+                    for _ in range(10):  # 10 attempts, 100ms each
+                        if self.browser_process.poll() is not None:
+                            break
+                        await asyncio.sleep(0.1)
+
+                    # Force kill if still running
+                    if self.browser_process.poll() is None:
+                        if is_windows():
+                            # On Windows we might need taskkill for detached processes
+                            try:
+                                subprocess.run(["taskkill", "/F", "/PID", str(self.browser_process.pid)])
+                            except Exception:
+                                self.browser_process.kill()
+                        else:
+                            self.browser_process.kill()
+                        await asyncio.sleep(0.1)  # Brief wait for kill to take effect
+
+            except Exception as e:
+                if self.logger:
+                    self.logger.error(
+                        message="Error terminating browser: {error}",
+                        tag="ERROR",
+                        params={"error": str(e)},
+                    )
+
+        if self.temp_dir and os.path.exists(self.temp_dir):
+            try:
+                shutil.rmtree(self.temp_dir)
+            except Exception as e:
+                if self.logger:
+                    self.logger.error(
+                        message="Error removing temporary directory: {error}",
+                        tag="ERROR",
+                        params={"error": str(e)},
+                    )
+
+    async def create_browser_context(self, crawlerRunConfig: Optional[CrawlerRunConfig] = None) -> BrowserContext:
+        """Create a new browser context.
+
+        Args:
+            crawlerRunConfig: Configuration object for the crawler run
+
+        Returns:
+            BrowserContext: Browser context object
+        """
+        return await self.browser.new_context()
+
+    def _cleanup_expired_sessions(self):
+        """Clean up expired sessions based on TTL."""
+        current_time = time.time()
+        expired_sessions = [
+            sid
+            for sid, (_, _, last_used) in self.sessions.items()
+            if current_time - last_used > self.session_ttl
+        ]
+        for sid in expired_sessions:
+            asyncio.create_task(self._kill_session(sid))
+
+    async def _kill_session(self, session_id: str):
+        """Kill a browser session and clean up resources.
+
+        Args:
+            session_id: The session ID to kill
+        """
+        if session_id in self.sessions:
+            context, page, _ = self.sessions[session_id]
+            await page.close()
+            del self.sessions[session_id]
+
+    async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]:
+        """Get a page for the given configuration.
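+
+        CDP pages share the default browser context; an existing page whose
+        URL already matches the request is reused when possible.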
+ + Args: + crawlerRunConfig: Configuration object for the crawler run + + Returns: + Tuple of (Page, BrowserContext) + """ + self._cleanup_expired_sessions() + + # If a session_id is provided and we already have it, reuse that page + context + if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions: + context, page, _ = self.sessions[crawlerRunConfig.session_id] + # Update last-used timestamp + self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) + return page, context + + # For CDP, we typically use the shared default_context + context = self.default_context + pages = context.pages + page = next((p for p in pages if p.url == crawlerRunConfig.url), None) + if not page: + page = await context.new_page() + + # If a session_id is specified, store this session so we can reuse later + if crawlerRunConfig.session_id: + self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) + + return page, context + + async def close(self): + """Close the browser and clean up resources.""" + # Skip cleanup if using external CDP URL and not launched by us + if self.config.cdp_url and not self.browser_process: + return + + if self.config.sleep_on_close: + await asyncio.sleep(0.5) + + # Close all sessions + session_ids = list(self.sessions.keys()) + for session_id in session_ids: + await self._kill_session(session_id) + + # Close browser + if self.browser: + await self.browser.close() + self.browser = None + + # Clean up managed browser if we created it + if self.browser_process: + await asyncio.sleep(0.5) + await self._cleanup_process() + self.browser_process = None + + # Close temporary directory + if self.temp_dir and os.path.exists(self.temp_dir): + try: + shutil.rmtree(self.temp_dir) + self.temp_dir = None + except Exception as e: + if self.logger: + self.logger.error( + message="Error removing temporary directory: {error}", + tag="ERROR", + params={"error": str(e)}, + ) + + # Stop playwright + if self.playwright: + await self.playwright.stop() + self.playwright = None + +class BuiltinBrowserStrategy(CDPBrowserStrategy): + """Built-in browser strategy. + + This strategy extends the CDP strategy to use the built-in browser. + """ + + def __init__(self, config: BrowserConfig, logger: Optional[AsyncLogger] = None): + """Initialize the built-in browser strategy. + + Args: + config: Browser configuration + logger: Logger for recording events and errors + """ + super().__init__(config, logger) + self.builtin_browser_dir = os.path.join(get_home_folder(), "builtin-browser") + self.builtin_config_file = os.path.join(self.builtin_browser_dir, "browser_config.json") + os.makedirs(self.builtin_browser_dir, exist_ok=True) + + async def start(self): + """Start or connect to the built-in browser. 
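+
+        Reuses a running built-in browser recorded in the config file when its
+        PID is still alive; otherwise a new instance is launched and its CDP
+        URL is adopted before delegating to the CDP strategy.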
+ + Returns: + self: For method chaining + """ + # Check for existing built-in browser + browser_info = self.get_builtin_browser_info() + if browser_info and self._is_browser_running(browser_info.get('pid')): + if self.logger: + self.logger.info(f"Using existing built-in browser at {browser_info.get('cdp_url')}", tag="BROWSER") + self.config.cdp_url = browser_info.get('cdp_url') + else: + if self.logger: + self.logger.info("Built-in browser not found, launching new instance...", tag="BROWSER") + cdp_url = await self.launch_builtin_browser( + browser_type=self.config.browser_type, + debugging_port=self.config.debugging_port, + headless=self.config.headless + ) + if not cdp_url: + if self.logger: + self.logger.warning("Failed to launch built-in browser, falling back to regular CDP strategy", tag="BROWSER") + return await super().start() + self.config.cdp_url = cdp_url + + # Call parent class implementation with updated CDP URL + return await super().start() + + def get_builtin_browser_info(self) -> Optional[Dict[str, Any]]: + """Get information about the built-in browser. + + Returns: + dict: Browser information or None if no built-in browser is configured + """ + if not os.path.exists(self.builtin_config_file): + return None + + try: + with open(self.builtin_config_file, 'r') as f: + browser_info = json.load(f) + + # Check if the browser is still running + if not self._is_browser_running(browser_info.get('pid')): + if self.logger: + self.logger.warning("Built-in browser is not running", tag="BUILTIN") + return None + + return browser_info + except Exception as e: + if self.logger: + self.logger.error(f"Error reading built-in browser config: {str(e)}", tag="BUILTIN") + return None + + def _is_browser_running(self, pid: Optional[int]) -> bool: + """Check if a process with the given PID is running. + + Args: + pid: Process ID to check + + Returns: + bool: True if the process is running, False otherwise + """ + if not pid: + return False + + try: + # Check if the process exists + if is_windows(): + process = subprocess.run(["tasklist", "/FI", f"PID eq {pid}"], + capture_output=True, text=True) + return str(pid) in process.stdout + else: + # Unix-like systems + os.kill(pid, 0) # This doesn't actually kill the process, just checks if it exists + return True + except (ProcessLookupError, PermissionError, OSError): + return False + + async def launch_builtin_browser(self, + browser_type: str = "chromium", + debugging_port: int = 9222, + headless: bool = True) -> Optional[str]: + """Launch a browser in the background for use as the built-in browser. 
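+
+        The process is fully detached so it can outlive the Python script;
+        its PID and CDP URL are persisted to ``browser_config.json``.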
+ + Args: + browser_type: Type of browser to launch ('chromium' or 'firefox') + debugging_port: Port to use for CDP debugging + headless: Whether to run in headless mode + + Returns: + str: CDP URL for the browser, or None if launch failed + """ + # Check if there's an existing browser still running + browser_info = self.get_builtin_browser_info() + if browser_info and self._is_browser_running(browser_info.get('pid')): + if self.logger: + self.logger.info("Built-in browser is already running", tag="BUILTIN") + return browser_info.get('cdp_url') + + # Create a user data directory for the built-in browser + user_data_dir = os.path.join(self.builtin_browser_dir, "user_data") + os.makedirs(user_data_dir, exist_ok=True) + + # Prepare browser launch arguments + browser_path = get_browser_executable(browser_type) + if browser_type == "chromium": + args = [ + browser_path, + f"--remote-debugging-port={debugging_port}", + f"--user-data-dir={user_data_dir}", + ] + if headless: + args.append("--headless=new") + elif browser_type == "firefox": + args = [ + browser_path, + "--remote-debugging-port", + str(debugging_port), + "--profile", + user_data_dir, + ] + if headless: + args.append("--headless") + else: + if self.logger: + self.logger.error(f"Browser type {browser_type} not supported for built-in browser", tag="BUILTIN") + return None + + try: + # Start the browser process detached + if is_windows(): + process = subprocess.Popen( + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + creationflags=subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP + ) + else: + process = subprocess.Popen( + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + preexec_fn=os.setpgrp # Start in a new process group + ) + + # Wait briefly to ensure the process starts successfully + await asyncio.sleep(2.0) + + # Check if the process is still running + if process.poll() is not None: + if self.logger: + self.logger.error(f"Browser process exited immediately with code {process.returncode}", tag="BUILTIN") + return None + + # Construct CDP URL + cdp_url = f"http://localhost:{debugging_port}" + + # Try to verify browser is responsive by fetching version info + import aiohttp + json_url = f"{cdp_url}/json/version" + config_json = None + + try: + async with aiohttp.ClientSession() as session: + for _ in range(10): # Try multiple times + try: + async with session.get(json_url) as response: + if response.status == 200: + config_json = await response.json() + break + except Exception: + pass + await asyncio.sleep(0.5) + except Exception as e: + if self.logger: + self.logger.warning(f"Could not verify browser: {str(e)}", tag="BUILTIN") + + # Save browser info + browser_info = { + 'pid': process.pid, + 'cdp_url': cdp_url, + 'user_data_dir': user_data_dir, + 'browser_type': browser_type, + 'debugging_port': debugging_port, + 'start_time': time.time(), + 'config': config_json + } + + with open(self.builtin_config_file, 'w') as f: + json.dump(browser_info, f, indent=2) + + # Detach from the browser process - don't keep any references + # This is important to allow the Python script to exit while the browser continues running + process = None + + if self.logger: + self.logger.success(f"Built-in browser launched at CDP URL: {cdp_url}", tag="BUILTIN") + return cdp_url + + except Exception as e: + if self.logger: + self.logger.error(f"Error launching built-in browser: {str(e)}", tag="BUILTIN") + return None + + async def kill_builtin_browser(self) -> bool: + """Kill the built-in browser if it's running. 
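+
+        Uses ``taskkill`` on Windows and SIGTERM (escalating to SIGKILL) on
+        Unix-like systems, then removes the stored config file.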
+
+        Returns:
+            bool: True if the browser was killed, False otherwise
+        """
+        browser_info = self.get_builtin_browser_info()
+        if not browser_info:
+            if self.logger:
+                self.logger.warning("No built-in browser found", tag="BUILTIN")
+            return False
+
+        pid = browser_info.get('pid')
+        if not pid:
+            return False
+
+        try:
+            if is_windows():
+                subprocess.run(["taskkill", "/F", "/PID", str(pid)], check=True)
+            else:
+                os.kill(pid, signal.SIGTERM)
+                # Wait for termination
+                for _ in range(5):
+                    if not self._is_browser_running(pid):
+                        break
+                    await asyncio.sleep(0.5)
+                else:
+                    # Force kill if still running
+                    os.kill(pid, signal.SIGKILL)
+
+            # Remove config file
+            if os.path.exists(self.builtin_config_file):
+                os.unlink(self.builtin_config_file)
+
+            if self.logger:
+                self.logger.success("Built-in browser terminated", tag="BUILTIN")
+            return True
+        except Exception as e:
+            if self.logger:
+                self.logger.error(f"Error killing built-in browser: {str(e)}", tag="BUILTIN")
+            return False
+
+    async def get_builtin_browser_status(self) -> Dict[str, Any]:
+        """Get status information about the built-in browser.
+
+        Returns:
+            dict: Status information with running, cdp_url, and info fields
+        """
+        browser_info = self.get_builtin_browser_info()
+
+        if not browser_info:
+            return {
+                'running': False,
+                'cdp_url': None,
+                'info': None
+            }
+
+        return {
+            'running': True,
+            'cdp_url': browser_info.get('cdp_url'),
+            'info': browser_info
+        }
diff --git a/crawl4ai/browser/utils.py b/crawl4ai/browser/utils.py
new file mode 100644
index 00000000..2dff0924
--- /dev/null
+++ b/crawl4ai/browser/utils.py
@@ -0,0 +1,105 @@
+"""Browser utilities module for Crawl4AI.
+
+This module provides utility functions for browser management,
+including process management, CDP connection utilities,
+and Playwright instance management.
+"""
+
+import asyncio
+import os
+import sys
+import platform
+import tempfile
+from typing import Optional, Any
+
+from playwright.async_api import async_playwright
+
+from ..async_logger import AsyncLogger
+from ..utils import get_chromium_path
+
+_playwright_instance = None
+
+async def get_playwright():
+    """Start and return a Playwright instance.
+
+    Note: a fresh instance is started on every call, since strategies stop
+    their Playwright instance in ``close()`` and a stopped instance cannot
+    be reused; the module-level reference is kept for future reuse.
+
+    Returns:
+        Playwright: The Playwright instance
+    """
+    global _playwright_instance
+    _playwright_instance = await async_playwright().start()
+    return _playwright_instance
+
+def get_browser_executable(browser_type: str) -> str:
+    """Get the path to browser executable, with platform-specific handling.
+
+    Args:
+        browser_type: Type of browser (chromium, firefox, webkit)
+
+    Returns:
+        Path to browser executable
+    """
+    return get_chromium_path(browser_type)
+
+def create_temp_directory(prefix="browser-profile-") -> str:
+    """Create a temporary directory for browser data.
+
+    Args:
+        prefix: Prefix for the temporary directory name
+
+    Returns:
+        Path to the created temporary directory
+    """
+    return tempfile.mkdtemp(prefix=prefix)
+
+def is_windows() -> bool:
+    """Check if the current platform is Windows.
+
+    Returns:
+        True if Windows, False otherwise
+    """
+    return sys.platform == "win32"
+
+def is_macos() -> bool:
+    """Check if the current platform is macOS.
+
+    Returns:
+        True if macOS, False otherwise
+    """
+    return sys.platform == "darwin"
+
+def is_linux() -> bool:
+    """Check if the current platform is Linux.
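+
+        Any platform that is neither Windows nor macOS is treated as
+        Linux-like.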
+ + Returns: + True if Linux, False otherwise + """ + return not (is_windows() or is_macos()) + +def get_browser_disable_options() -> list: + """Get standard list of browser disable options for performance. + + Returns: + List of command-line options to disable various browser features + """ + return [ + "--disable-background-networking", + "--disable-background-timer-throttling", + "--disable-backgrounding-occluded-windows", + "--disable-breakpad", + "--disable-client-side-phishing-detection", + "--disable-component-extensions-with-background-pages", + "--disable-default-apps", + "--disable-extensions", + "--disable-features=TranslateUI", + "--disable-hang-monitor", + "--disable-ipc-flooding-protection", + "--disable-popup-blocking", + "--disable-prompt-on-repost", + "--disable-sync", + "--force-color-profile=srgb", + "--metrics-recording-only", + "--no-first-run", + "--password-store=basic", + "--use-mock-keychain", + ] diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index f40efbbc..df0886c7 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -163,6 +163,7 @@ class ManagedBrowser: ) # We'll monitor for a short time to make sure it starts properly, but won't keep monitoring + await asyncio.sleep(0.5) # Give browser time to start await self._initial_startup_check() await asyncio.sleep(2) # Give browser time to start return f"http://{self.host}:{self.debugging_port}" diff --git a/crawl4ai/browser_profiler.py b/crawl4ai/browser_profiler.py index 1fd76ddc..2291faa2 100644 --- a/crawl4ai/browser_profiler.py +++ b/crawl4ai/browser_profiler.py @@ -555,7 +555,6 @@ class BrowserProfiler: else: self.logger.error(f"Invalid choice. Please enter a number between 1 and {exit_option}.", tag="MENU") - async def launch_standalone_browser(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, diff --git a/docs/examples/hello_world.py b/docs/examples/hello_world.py index c44908d5..fbdd5283 100644 --- a/docs/examples/hello_world.py +++ b/docs/examples/hello_world.py @@ -9,6 +9,26 @@ from crawl4ai import ( CrawlResult ) +async def example_cdp(): + browser_conf = BrowserConfig( + headless=False, + cdp_url="http://localhost:9223" + ) + crawler_config = CrawlerRunConfig( + session_id="test", + js_code = """(() => { return {"result": "Hello World!"} })()""", + js_only=True + ) + async with AsyncWebCrawler( + config=browser_conf, + verbose=True, + ) as crawler: + result : CrawlResult = await crawler.arun( + url="https://www.helloworld.org", + config=crawler_config, + ) + print(result.js_execution_result) + async def main(): browser_config = BrowserConfig(headless=True, verbose=True) @@ -16,18 +36,15 @@ async def main(): crawler_config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS, markdown_generator=DefaultMarkdownGenerator( - # content_filter=PruningContentFilter( - # threshold=0.48, threshold_type="fixed", min_word_threshold=0 - # ) + content_filter=PruningContentFilter( + threshold=0.48, threshold_type="fixed", min_word_threshold=0 + ) ), ) result : CrawlResult = await crawler.arun( - # url="https://www.helloworld.org", config=crawler_config - url="https://www.kidocode.com", config=crawler_config + url="https://www.helloworld.org", config=crawler_config ) print(result.markdown.raw_markdown[:500]) - # print(result.model_dump()) - if __name__ == "__main__": asyncio.run(main()) diff --git a/tests/browser/test_browser_manager.py b/tests/browser/test_browser_manager.py new file mode 100644 index 00000000..2293b90d --- /dev/null +++ 
b/tests/browser/test_browser_manager.py @@ -0,0 +1,190 @@ +"""Test examples for BrowserManager. + +These examples demonstrate the functionality of BrowserManager +and serve as functional tests. +""" + +import asyncio +import os +import sys +from typing import List + +# Add the project root to Python path if running directly +if __name__ == "__main__": + sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) + +from crawl4ai.browser import BrowserManager +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig +from crawl4ai.async_logger import AsyncLogger + +# Create a logger for clear terminal output +logger = AsyncLogger(verbose=True, log_file=None) + +async def test_basic_browser_manager(): + """Test basic BrowserManager functionality with default configuration.""" + logger.info("Starting test_basic_browser_manager", tag="TEST") + + try: + # Create a browser manager with default config + manager = BrowserManager(logger=logger) + + # Start the browser + await manager.start() + logger.info("Browser started successfully", tag="TEST") + + # Get a page + crawler_config = CrawlerRunConfig(url="https://example.com") + page, context = await manager.get_page(crawler_config) + logger.info("Page created successfully", tag="TEST") + + # Navigate to a website + await page.goto("https://example.com") + title = await page.title() + logger.info(f"Page title: {title}", tag="TEST") + + # Clean up + await manager.close() + logger.success("test_basic_browser_manager completed successfully", tag="TEST") + return True + except Exception as e: + logger.error(f"test_basic_browser_manager failed: {str(e)}", tag="TEST") + return False + +async def test_custom_browser_config(): + """Test BrowserManager with custom browser configuration.""" + logger.info("Starting test_custom_browser_config", tag="TEST") + + try: + # Create a custom browser config + browser_config = BrowserConfig( + browser_type="chromium", + headless=True, + viewport_width=1280, + viewport_height=800, + light_mode=True + ) + + # Create browser manager with the config + manager = BrowserManager(browser_config=browser_config, logger=logger) + + # Start the browser + await manager.start() + logger.info("Browser started successfully with custom config", tag="TEST") + + # Get a page + crawler_config = CrawlerRunConfig(url="https://example.com") + page, context = await manager.get_page(crawler_config) + + # Navigate to a website + await page.goto("https://example.com") + title = await page.title() + logger.info(f"Page title: {title}", tag="TEST") + + # Verify viewport size + viewport_size = await page.evaluate("() => ({ width: window.innerWidth, height: window.innerHeight })") + logger.info(f"Viewport size: {viewport_size}", tag="TEST") + + # Clean up + await manager.close() + logger.success("test_custom_browser_config completed successfully", tag="TEST") + return True + except Exception as e: + logger.error(f"test_custom_browser_config failed: {str(e)}", tag="TEST") + return False + +async def test_multiple_pages(): + """Test BrowserManager with multiple pages.""" + logger.info("Starting test_multiple_pages", tag="TEST") + + try: + # Create browser manager + manager = BrowserManager(logger=logger) + + # Start the browser + await manager.start() + logger.info("Browser started successfully", tag="TEST") + + # Create multiple pages + pages = [] + urls = ["https://example.com", "https://example.org", "https://mozilla.org"] + + for i, url in enumerate(urls): + crawler_config = CrawlerRunConfig(url=url) + page, 
context = await manager.get_page(crawler_config)
+            await page.goto(url)
+            pages.append((page, url))
+            logger.info(f"Created page {i+1} for {url}", tag="TEST")
+
+        # Verify all pages are loaded correctly
+        for i, (page, url) in enumerate(pages):
+            title = await page.title()
+            logger.info(f"Page {i+1} title: {title}", tag="TEST")
+
+        # Clean up
+        await manager.close()
+        logger.success("test_multiple_pages completed successfully", tag="TEST")
+        return True
+    except Exception as e:
+        logger.error(f"test_multiple_pages failed: {str(e)}", tag="TEST")
+        return False
+
+async def test_session_management():
+    """Test session management in BrowserManager."""
+    logger.info("Starting test_session_management", tag="TEST")
+
+    try:
+        # Create browser manager
+        manager = BrowserManager(logger=logger)
+
+        # Start the browser
+        await manager.start()
+        logger.info("Browser started successfully", tag="TEST")
+
+        # Create a session
+        session_id = "test_session_1"
+        crawler_config = CrawlerRunConfig(url="https://example.com", session_id=session_id)
+        page1, context1 = await manager.get_page(crawler_config)
+        await page1.goto("https://example.com")
+        logger.info(f"Created session with ID: {session_id}", tag="TEST")
+
+        # Get the same session again
+        page2, context2 = await manager.get_page(crawler_config)
+
+        # Verify it's the same page/context
+        is_same_page = page1 == page2
+        is_same_context = context1 == context2
+        logger.info(f"Same page: {is_same_page}, Same context: {is_same_context}", tag="TEST")
+
+        # Kill the session
+        await manager.kill_session(session_id)
+        logger.info(f"Killed session with ID: {session_id}", tag="TEST")
+
+        # Clean up
+        await manager.close()
+        logger.success("test_session_management completed successfully", tag="TEST")
+        return True
+    except Exception as e:
+        logger.error(f"test_session_management failed: {str(e)}", tag="TEST")
+        return False
+
+async def run_tests():
+    """Run all tests sequentially."""
+    results = []
+
+    results.append(await test_basic_browser_manager())
+    results.append(await test_custom_browser_config())
+    results.append(await test_multiple_pages())
+    results.append(await test_session_management())
+
+    # Print summary
+    total = len(results)
+    passed = sum(results)
+    logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
+
+    if passed == total:
+        logger.success("All tests passed!", tag="SUMMARY")
+    else:
+        logger.error(f"{total - passed} tests failed", tag="SUMMARY")
+
+if __name__ == "__main__":
+    asyncio.run(run_tests())
diff --git a/tests/browser/test_builtin_strategy.py b/tests/browser/test_builtin_strategy.py
new file mode 100644
index 00000000..7c435b3d
--- /dev/null
+++ b/tests/browser/test_builtin_strategy.py
@@ -0,0 +1,160 @@
+"""Test examples for BuiltinBrowserStrategy.
+
+These examples demonstrate the functionality of BuiltinBrowserStrategy
+and serve as functional tests.
+""" + +import asyncio +import os +import sys + +# Add the project root to Python path if running directly +if __name__ == "__main__": + sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) + +from crawl4ai.browser import BrowserManager +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig +from crawl4ai.async_logger import AsyncLogger + +# Create a logger for clear terminal output +logger = AsyncLogger(verbose=True, log_file=None) + +async def test_builtin_browser(): + """Test using a builtin browser that persists between sessions.""" + logger.info("Testing builtin browser", tag="TEST") + + browser_config = BrowserConfig( + browser_mode="builtin", + headless=True + ) + + manager = BrowserManager(browser_config=browser_config, logger=logger) + + try: + # Start should connect to existing builtin browser or create one + await manager.start() + logger.info("Connected to builtin browser", tag="TEST") + + # Test page creation + crawler_config = CrawlerRunConfig() + page, context = await manager.get_page(crawler_config) + + # Test navigation + await page.goto("https://example.com") + title = await page.title() + logger.info(f"Page title: {title}", tag="TEST") + + # Close manager (should not close the builtin browser) + await manager.close() + logger.info("First session closed", tag="TEST") + + # Create a second manager to verify browser persistence + logger.info("Creating second session to verify persistence", tag="TEST") + manager2 = BrowserManager(browser_config=browser_config, logger=logger) + + await manager2.start() + logger.info("Connected to existing builtin browser", tag="TEST") + + page2, context2 = await manager2.get_page(crawler_config) + await page2.goto("https://example.org") + title2 = await page2.title() + logger.info(f"Second session page title: {title2}", tag="TEST") + + await manager2.close() + logger.info("Second session closed successfully", tag="TEST") + + return True + except Exception as e: + logger.error(f"Test failed: {str(e)}", tag="TEST") + try: + await manager.close() + except: + pass + return False + +async def test_builtin_browser_status(): + """Test getting status of the builtin browser.""" + logger.info("Testing builtin browser status", tag="TEST") + + from crawl4ai.browser.strategies import BuiltinBrowserStrategy + + browser_config = BrowserConfig( + browser_mode="builtin", + headless=True + ) + + # Create strategy directly to access its status methods + strategy = BuiltinBrowserStrategy(browser_config, logger) + + try: + # Get status before starting (should be not running) + status_before = await strategy.get_builtin_browser_status() + logger.info(f"Initial status: {status_before}", tag="TEST") + + # Start the browser + await strategy.start() + logger.info("Browser started successfully", tag="TEST") + + # Get status after starting + status_after = await strategy.get_builtin_browser_status() + logger.info(f"Status after start: {status_after}", tag="TEST") + + # Create a page to verify functionality + crawler_config = CrawlerRunConfig() + page, context = await strategy.get_page(crawler_config) + await page.goto("https://example.com") + title = await page.title() + logger.info(f"Page title: {title}", tag="TEST") + + # Close strategy (should not kill the builtin browser) + await strategy.close() + logger.info("Strategy closed successfully", tag="TEST") + + # Create a new strategy object + strategy2 = BuiltinBrowserStrategy(browser_config, logger) + + # Get status again (should still be running) + status_final = await 
strategy2.get_builtin_browser_status() + logger.info(f"Final status: {status_final}", tag="TEST") + + # Verify that the status shows the browser is running + is_running = status_final.get('running', False) + logger.info(f"Builtin browser persistence confirmed: {is_running}", tag="TEST") + + # Kill the builtin browser to clean up + logger.info("Killing builtin browser", tag="TEST") + success = await strategy2.kill_builtin_browser() + logger.info(f"Killed builtin browser successfully: {success}", tag="TEST") + + return is_running and success + except Exception as e: + logger.error(f"Test failed: {str(e)}", tag="TEST") + try: + await strategy.close() + + # Try to kill the builtin browser to clean up + strategy2 = BuiltinBrowserStrategy(browser_config, logger) + await strategy2.kill_builtin_browser() + except: + pass + return False + +async def run_tests(): + """Run all tests sequentially.""" + results = [] + + results.append(await test_builtin_browser()) + results.append(await test_builtin_browser_status()) + + # Print summary + total = len(results) + passed = sum(results) + logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY") + + if passed == total: + logger.success("All tests passed!", tag="SUMMARY") + else: + logger.error(f"{total - passed} tests failed", tag="SUMMARY") + +if __name__ == "__main__": + asyncio.run(run_tests()) diff --git a/tests/browser/test_cdp_strategy.py b/tests/browser/test_cdp_strategy.py new file mode 100644 index 00000000..4ec1f7f1 --- /dev/null +++ b/tests/browser/test_cdp_strategy.py @@ -0,0 +1,227 @@ +"""Test examples for CDPBrowserStrategy. + +These examples demonstrate the functionality of CDPBrowserStrategy +and serve as functional tests. +""" + +import asyncio +import os +import sys + +# Add the project root to Python path if running directly +if __name__ == "__main__": + sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) + +from crawl4ai.browser import BrowserManager +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig +from crawl4ai.async_logger import AsyncLogger + +# Create a logger for clear terminal output +logger = AsyncLogger(verbose=True, log_file=None) + +async def test_cdp_launch_connect(): + """Test launching a browser and connecting via CDP.""" + logger.info("Testing launch and connect via CDP", tag="TEST") + + browser_config = BrowserConfig( + use_managed_browser=True, + headless=True + ) + + manager = BrowserManager(browser_config=browser_config, logger=logger) + + try: + await manager.start() + logger.info("Browser launched and connected via CDP", tag="TEST") + + # Test with multiple pages + pages = [] + for i in range(3): + crawler_config = CrawlerRunConfig() + page, context = await manager.get_page(crawler_config) + await page.goto(f"https://example.com?test={i}") + pages.append(page) + logger.info(f"Created page {i+1}", tag="TEST") + + # Verify all pages are working + for i, page in enumerate(pages): + title = await page.title() + logger.info(f"Page {i+1} title: {title}", tag="TEST") + + await manager.close() + logger.info("Browser closed successfully", tag="TEST") + + return True + except Exception as e: + logger.error(f"Test failed: {str(e)}", tag="TEST") + try: + await manager.close() + except: + pass + return False + +async def test_cdp_with_user_data_dir(): + """Test CDP browser with a user data directory.""" + logger.info("Testing CDP browser with user data directory", tag="TEST") + + # Create a temporary user data directory + import tempfile + user_data_dir = 
tempfile.mkdtemp(prefix="crawl4ai-test-") + logger.info(f"Created temporary user data directory: {user_data_dir}", tag="TEST") + + browser_config = BrowserConfig( + use_managed_browser=True, + headless=True, + user_data_dir=user_data_dir + ) + + manager = BrowserManager(browser_config=browser_config, logger=logger) + + try: + await manager.start() + logger.info("Browser launched with user data directory", tag="TEST") + + # Navigate to a page and store some data + crawler_config = CrawlerRunConfig() + page, context = await manager.get_page(crawler_config) + + # Set a cookie + await context.add_cookies([{ + "name": "test_cookie", + "value": "test_value", + "url": "https://example.com" + }]) + + # Visit the site + await page.goto("https://example.com") + + # Verify cookie was set + cookies = await context.cookies(["https://example.com"]) + has_test_cookie = any(cookie["name"] == "test_cookie" for cookie in cookies) + logger.info(f"Cookie set successfully: {has_test_cookie}", tag="TEST") + + # Close the browser + await manager.close() + logger.info("First browser session closed", tag="TEST") + + # Start a new browser with the same user data directory + logger.info("Starting second browser session with same user data directory", tag="TEST") + manager2 = BrowserManager(browser_config=browser_config, logger=logger) + await manager2.start() + + # Get a new page and check if the cookie persists + page2, context2 = await manager2.get_page(crawler_config) + await page2.goto("https://example.com") + + # Verify cookie persisted + cookies2 = await context2.cookies(["https://example.com"]) + has_test_cookie2 = any(cookie["name"] == "test_cookie" for cookie in cookies2) + logger.info(f"Cookie persisted across sessions: {has_test_cookie2}", tag="TEST") + + # Clean up + await manager2.close() + + # Remove temporary directory + import shutil + shutil.rmtree(user_data_dir, ignore_errors=True) + logger.info(f"Removed temporary user data directory", tag="TEST") + + return has_test_cookie and has_test_cookie2 + except Exception as e: + logger.error(f"Test failed: {str(e)}", tag="TEST") + try: + await manager.close() + except: + pass + + # Clean up temporary directory + try: + import shutil + shutil.rmtree(user_data_dir, ignore_errors=True) + except: + pass + + return False + +async def test_cdp_session_management(): + """Test session management with CDP browser.""" + logger.info("Testing session management with CDP browser", tag="TEST") + + browser_config = BrowserConfig( + use_managed_browser=True, + headless=True + ) + + manager = BrowserManager(browser_config=browser_config, logger=logger) + + try: + await manager.start() + logger.info("Browser launched successfully", tag="TEST") + + # Create two sessions + session1_id = "test_session_1" + session2_id = "test_session_2" + + # Set up first session + crawler_config1 = CrawlerRunConfig(session_id=session1_id) + page1, context1 = await manager.get_page(crawler_config1) + await page1.goto("https://example.com") + await page1.evaluate("localStorage.setItem('session1_data', 'test_value')") + logger.info(f"Set up session 1 with ID: {session1_id}", tag="TEST") + + # Set up second session + crawler_config2 = CrawlerRunConfig(session_id=session2_id) + page2, context2 = await manager.get_page(crawler_config2) + await page2.goto("https://example.org") + await page2.evaluate("localStorage.setItem('session2_data', 'test_value2')") + logger.info(f"Set up session 2 with ID: {session2_id}", tag="TEST") + + # Get first session again + page1_again, _ = await 
manager.get_page(crawler_config1) + + # Verify it's the same page and data persists + is_same_page = page1 == page1_again + data1 = await page1_again.evaluate("localStorage.getItem('session1_data')") + logger.info(f"Session 1 reuse successful: {is_same_page}, data: {data1}", tag="TEST") + + # Kill first session + await manager.kill_session(session1_id) + logger.info(f"Killed session 1", tag="TEST") + + # Verify second session still works + data2 = await page2.evaluate("localStorage.getItem('session2_data')") + logger.info(f"Session 2 still functional after killing session 1, data: {data2}", tag="TEST") + + # Clean up + await manager.close() + logger.info("Browser closed successfully", tag="TEST") + + return is_same_page and data1 == "test_value" and data2 == "test_value2" + except Exception as e: + logger.error(f"Test failed: {str(e)}", tag="TEST") + try: + await manager.close() + except: + pass + return False + +async def run_tests(): + """Run all tests sequentially.""" + results = [] + + results.append(await test_cdp_launch_connect()) + results.append(await test_cdp_with_user_data_dir()) + results.append(await test_cdp_session_management()) + + # Print summary + total = len(results) + passed = sum(results) + logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY") + + if passed == total: + logger.success("All tests passed!", tag="SUMMARY") + else: + logger.error(f"{total - passed} tests failed", tag="SUMMARY") + +if __name__ == "__main__": + asyncio.run(run_tests()) diff --git a/tests/browser/test_combined.py b/tests/browser/test_combined.py new file mode 100644 index 00000000..b5bce3cd --- /dev/null +++ b/tests/browser/test_combined.py @@ -0,0 +1,77 @@ +"""Combined test runner for all browser module tests. + +This script runs all the browser module tests in sequence and +provides a comprehensive summary. 
+""" + +import asyncio +import os +import sys +import time + +# Add the project root to Python path if running directly +if __name__ == "__main__": + sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) + +from crawl4ai.async_logger import AsyncLogger + +# Create a logger for clear terminal output +logger = AsyncLogger(verbose=True, log_file=None) + +async def run_test_module(module_name, header): + """Run all tests in a module and return results.""" + logger.info(f"\n{'-'*30}", tag="TEST") + logger.info(f"RUNNING: {header}", tag="TEST") + logger.info(f"{'-'*30}", tag="TEST") + + # Import the module dynamically + module = __import__(f"tests.browser.{module_name}", fromlist=["run_tests"]) + + # Track time for performance measurement + start_time = time.time() + + # Run the tests + await module.run_tests() + + # Calculate time taken + time_taken = time.time() - start_time + logger.info(f"Time taken: {time_taken:.2f} seconds", tag="TIMING") + + return time_taken + +async def main(): + """Run all test modules.""" + logger.info("STARTING COMPREHENSIVE BROWSER MODULE TESTS", tag="MAIN") + + # List of test modules to run + test_modules = [ + ("test_browser_manager", "Browser Manager Tests"), + ("test_playwright_strategy", "Playwright Strategy Tests"), + ("test_cdp_strategy", "CDP Strategy Tests"), + ("test_builtin_strategy", "Builtin Browser Strategy Tests"), + ("test_profiles", "Profile Management Tests") + ] + + # Run each test module + timings = {} + for module_name, header in test_modules: + try: + time_taken = await run_test_module(module_name, header) + timings[module_name] = time_taken + except Exception as e: + logger.error(f"Error running {module_name}: {str(e)}", tag="ERROR") + + # Print summary + logger.info("\n\nTEST SUMMARY:", tag="SUMMARY") + logger.info(f"{'-'*50}", tag="SUMMARY") + for module_name, header in test_modules: + if module_name in timings: + logger.info(f"{header}: {timings[module_name]:.2f} seconds", tag="SUMMARY") + else: + logger.error(f"{header}: FAILED TO RUN", tag="SUMMARY") + logger.info(f"{'-'*50}", tag="SUMMARY") + total_time = sum(timings.values()) + logger.info(f"Total time: {total_time:.2f} seconds", tag="SUMMARY") + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/tests/browser/test_playwright_strategy.py b/tests/browser/test_playwright_strategy.py new file mode 100644 index 00000000..1d897bcf --- /dev/null +++ b/tests/browser/test_playwright_strategy.py @@ -0,0 +1,275 @@ +"""Test examples for PlaywrightBrowserStrategy. + +These examples demonstrate the functionality of PlaywrightBrowserStrategy +and serve as functional tests. 
+""" + +import asyncio +import os +import sys + +# Add the project root to Python path if running directly +if __name__ == "__main__": + sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) + +from crawl4ai.browser import BrowserManager +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig +from crawl4ai.async_logger import AsyncLogger + +# Create a logger for clear terminal output +logger = AsyncLogger(verbose=True, log_file=None) + +async def test_playwright_basic(): + """Test basic Playwright browser functionality.""" + logger.info("Testing standard Playwright browser", tag="TEST") + + # Create browser config for standard Playwright + browser_config = BrowserConfig( + headless=True, + viewport_width=1280, + viewport_height=800 + ) + + # Create browser manager with the config + manager = BrowserManager(browser_config=browser_config, logger=logger) + + try: + # Start the browser + await manager.start() + logger.info("Browser started successfully", tag="TEST") + + # Create crawler config + crawler_config = CrawlerRunConfig(url="https://example.com") + + # Get a page + page, context = await manager.get_page(crawler_config) + logger.info("Got page successfully", tag="TEST") + + # Navigate to a website + await page.goto("https://example.com") + logger.info("Navigated to example.com", tag="TEST") + + # Get page title + title = await page.title() + logger.info(f"Page title: {title}", tag="TEST") + + # Clean up + await manager.close() + logger.info("Browser closed successfully", tag="TEST") + + return True + except Exception as e: + logger.error(f"Test failed: {str(e)}", tag="TEST") + # Ensure cleanup + try: + await manager.close() + except: + pass + return False + +async def test_playwright_text_mode(): + """Test Playwright browser in text-only mode.""" + logger.info("Testing Playwright text mode", tag="TEST") + + # Create browser config with text mode enabled + browser_config = BrowserConfig( + headless=True, + text_mode=True # Enable text-only mode + ) + + # Create browser manager with the config + manager = BrowserManager(browser_config=browser_config, logger=logger) + + try: + # Start the browser + await manager.start() + logger.info("Browser started successfully in text mode", tag="TEST") + + # Get a page + crawler_config = CrawlerRunConfig(url="https://example.com") + page, context = await manager.get_page(crawler_config) + + # Navigate to a website + await page.goto("https://example.com") + logger.info("Navigated to example.com", tag="TEST") + + # Get page title + title = await page.title() + logger.info(f"Page title: {title}", tag="TEST") + + # Check if images are blocked in text mode + # We'll check if any image requests were made + has_images = False + async with page.expect_request("**/*.{png,jpg,jpeg,gif,webp,svg}", timeout=1000) as request_info: + try: + # Try to load a page with images + await page.goto("https://picsum.photos/", wait_until="domcontentloaded") + request = await request_info.value + has_images = True + except: + # Timeout without image requests means text mode is working + has_images = False + + logger.info(f"Text mode image blocking working: {not has_images}", tag="TEST") + + # Clean up + await manager.close() + logger.info("Browser closed successfully", tag="TEST") + + return True + except Exception as e: + logger.error(f"Test failed: {str(e)}", tag="TEST") + # Ensure cleanup + try: + await manager.close() + except: + pass + return False + +async def test_playwright_context_reuse(): + """Test context caching and reuse 
with identical configurations.""" + logger.info("Testing context reuse with identical configurations", tag="TEST") + + # Create browser config + browser_config = BrowserConfig(headless=True) + + # Create browser manager + manager = BrowserManager(browser_config=browser_config, logger=logger) + + try: + # Start the browser + await manager.start() + logger.info("Browser started successfully", tag="TEST") + + # Create identical crawler configs + crawler_config1 = CrawlerRunConfig( + url="https://example.com", + viewport_width=1280, + viewport_height=800 + ) + + crawler_config2 = CrawlerRunConfig( + url="https://example.org", # Different URL but same browser parameters + viewport_width=1280, + viewport_height=800 + ) + + # Get pages with these configs + page1, context1 = await manager.get_page(crawler_config1) + page2, context2 = await manager.get_page(crawler_config2) + + # Check if contexts are reused + is_same_context = context1 == context2 + logger.info(f"Contexts reused: {is_same_context}", tag="TEST") + + # Now try with a different config + crawler_config3 = CrawlerRunConfig( + url="https://example.net", + viewport_width=800, # Different viewport size + viewport_height=600 + ) + + page3, context3 = await manager.get_page(crawler_config3) + + # This should be a different context + is_different_context = context1 != context3 + logger.info(f"Different contexts for different configs: {is_different_context}", tag="TEST") + + # Clean up + await manager.close() + logger.info("Browser closed successfully", tag="TEST") + + # Both tests should pass for success + return is_same_context and is_different_context + except Exception as e: + logger.error(f"Test failed: {str(e)}", tag="TEST") + # Ensure cleanup + try: + await manager.close() + except: + pass + return False + +async def test_playwright_session_management(): + """Test session management with Playwright browser.""" + logger.info("Testing session management with Playwright browser", tag="TEST") + + browser_config = BrowserConfig( + headless=True + ) + + manager = BrowserManager(browser_config=browser_config, logger=logger) + + try: + await manager.start() + logger.info("Browser launched successfully", tag="TEST") + + # Create two sessions + session1_id = "playwright_session_1" + session2_id = "playwright_session_2" + + # Set up first session + crawler_config1 = CrawlerRunConfig(session_id=session1_id, url="https://example.com") + page1, context1 = await manager.get_page(crawler_config1) + await page1.goto("https://example.com") + await page1.evaluate("localStorage.setItem('playwright_session1_data', 'test_value1')") + logger.info(f"Set up session 1 with ID: {session1_id}", tag="TEST") + + # Set up second session + crawler_config2 = CrawlerRunConfig(session_id=session2_id, url="https://example.org") + page2, context2 = await manager.get_page(crawler_config2) + await page2.goto("https://example.org") + await page2.evaluate("localStorage.setItem('playwright_session2_data', 'test_value2')") + logger.info(f"Set up session 2 with ID: {session2_id}", tag="TEST") + + # Get first session again + page1_again, context1_again = await manager.get_page(crawler_config1) + + # Verify it's the same page and data persists + is_same_page = page1 == page1_again + is_same_context = context1 == context1_again + data1 = await page1_again.evaluate("localStorage.getItem('playwright_session1_data')") + logger.info(f"Session 1 reuse successful: {is_same_page}, data: {data1}", tag="TEST") + + # Kill first session + await manager.kill_session(session1_id) + 
logger.info(f"Killed session 1", tag="TEST") + + # Verify second session still works + data2 = await page2.evaluate("localStorage.getItem('playwright_session2_data')") + logger.info(f"Session 2 still functional after killing session 1, data: {data2}", tag="TEST") + + # Clean up + await manager.close() + logger.info("Browser closed successfully", tag="TEST") + + return is_same_page and is_same_context and data1 == "test_value1" and data2 == "test_value2" + except Exception as e: + logger.error(f"Test failed: {str(e)}", tag="TEST") + try: + await manager.close() + except: + pass + return False + +async def run_tests(): + """Run all tests sequentially.""" + results = [] + + results.append(await test_playwright_basic()) + results.append(await test_playwright_text_mode()) + results.append(await test_playwright_context_reuse()) + results.append(await test_playwright_session_management()) + + # Print summary + total = len(results) + passed = sum(results) + logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY") + + if passed == total: + logger.success("All tests passed!", tag="SUMMARY") + else: + logger.error(f"{total - passed} tests failed", tag="SUMMARY") + +if __name__ == "__main__": + asyncio.run(run_tests()) diff --git a/tests/browser/test_profiles.py b/tests/browser/test_profiles.py new file mode 100644 index 00000000..8325b561 --- /dev/null +++ b/tests/browser/test_profiles.py @@ -0,0 +1,176 @@ +"""Test examples for BrowserProfileManager. + +These examples demonstrate the functionality of BrowserProfileManager +and serve as functional tests. +""" + +import asyncio +import os +import sys +import uuid +import shutil + +# Add the project root to Python path if running directly +if __name__ == "__main__": + sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) + +from crawl4ai.browser import BrowserManager, BrowserProfileManager +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig +from crawl4ai.async_logger import AsyncLogger + +# Create a logger for clear terminal output +logger = AsyncLogger(verbose=True, log_file=None) + +async def test_profile_creation(): + """Test creating and managing browser profiles.""" + logger.info("Testing profile creation and management", tag="TEST") + + profile_manager = BrowserProfileManager(logger=logger) + + try: + # List existing profiles + profiles = profile_manager.list_profiles() + logger.info(f"Found {len(profiles)} existing profiles", tag="TEST") + + # Generate a unique profile name for testing + test_profile_name = f"test-profile-{uuid.uuid4().hex[:8]}" + + # Create a test profile directory + profile_path = os.path.join(profile_manager.profiles_dir, test_profile_name) + os.makedirs(os.path.join(profile_path, "Default"), exist_ok=True) + + # Create a dummy Preferences file to simulate a Chrome profile + with open(os.path.join(profile_path, "Default", "Preferences"), "w") as f: + f.write("{\"test\": true}") + + logger.info(f"Created test profile at: {profile_path}", tag="TEST") + + # Verify the profile is now in the list + profiles = profile_manager.list_profiles() + profile_found = any(p["name"] == test_profile_name for p in profiles) + logger.info(f"Profile found in list: {profile_found}", tag="TEST") + + # Try to get the profile path + retrieved_path = profile_manager.get_profile_path(test_profile_name) + path_match = retrieved_path == profile_path + logger.info(f"Retrieved correct profile path: {path_match}", tag="TEST") + + # Delete the profile + success = 
profile_manager.delete_profile(test_profile_name) + logger.info(f"Profile deletion successful: {success}", tag="TEST") + + # Verify it's gone + profiles_after = profile_manager.list_profiles() + profile_removed = not any(p["name"] == test_profile_name for p in profiles_after) + logger.info(f"Profile removed from list: {profile_removed}", tag="TEST") + + # Clean up just in case + if os.path.exists(profile_path): + shutil.rmtree(profile_path, ignore_errors=True) + + return profile_found and path_match and success and profile_removed + except Exception as e: + logger.error(f"Test failed: {str(e)}", tag="TEST") + # Clean up test directory + try: + if os.path.exists(profile_path): + shutil.rmtree(profile_path, ignore_errors=True) + except: + pass + return False + +async def test_profile_with_browser(): + """Test using a profile with a browser.""" + logger.info("Testing using a profile with a browser", tag="TEST") + + profile_manager = BrowserProfileManager(logger=logger) + test_profile_name = f"test-browser-profile-{uuid.uuid4().hex[:8]}" + profile_path = None + + try: + # Create a test profile directory + profile_path = os.path.join(profile_manager.profiles_dir, test_profile_name) + os.makedirs(os.path.join(profile_path, "Default"), exist_ok=True) + + # Create a dummy Preferences file to simulate a Chrome profile + with open(os.path.join(profile_path, "Default", "Preferences"), "w") as f: + f.write("{\"test\": true}") + + logger.info(f"Created test profile at: {profile_path}", tag="TEST") + + # Now use this profile with a browser + browser_config = BrowserConfig( + user_data_dir=profile_path, + headless=True + ) + + manager = BrowserManager(browser_config=browser_config, logger=logger) + + # Start the browser with the profile + await manager.start() + logger.info("Browser started with profile", tag="TEST") + + # Create a page + crawler_config = CrawlerRunConfig() + page, context = await manager.get_page(crawler_config) + + # Navigate and set some data to verify profile works + await page.goto("https://example.com") + await page.evaluate("localStorage.setItem('test_data', 'profile_value')") + + # Close browser + await manager.close() + logger.info("First browser session closed", tag="TEST") + + # Create a new browser with the same profile + manager2 = BrowserManager(browser_config=browser_config, logger=logger) + await manager2.start() + logger.info("Second browser session started with same profile", tag="TEST") + + # Get a page and check if the data persists + page2, context2 = await manager2.get_page(crawler_config) + await page2.goto("https://example.com") + data = await page2.evaluate("localStorage.getItem('test_data')") + + # Verify data persisted + data_persisted = data == "profile_value" + logger.info(f"Data persisted across sessions: {data_persisted}", tag="TEST") + + # Clean up + await manager2.close() + logger.info("Second browser session closed", tag="TEST") + + # Delete the test profile + success = profile_manager.delete_profile(test_profile_name) + logger.info(f"Test profile deleted: {success}", tag="TEST") + + return data_persisted and success + except Exception as e: + logger.error(f"Test failed: {str(e)}", tag="TEST") + # Clean up + try: + if profile_path and os.path.exists(profile_path): + shutil.rmtree(profile_path, ignore_errors=True) + except: + pass + return False + +async def run_tests(): + """Run all tests sequentially.""" + results = [] + + results.append(await test_profile_creation()) + results.append(await test_profile_with_browser()) + + # Print summary + total = 
len(results) + passed = sum(results) + logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY") + + if passed == total: + logger.success("All tests passed!", tag="SUMMARY") + else: + logger.error(f"{total - passed} tests failed", tag="SUMMARY") + +if __name__ == "__main__": + asyncio.run(run_tests()) From 0094cac6756d13676dcbd83fa69e2670cc316eca Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 23 Mar 2025 18:53:24 +0800 Subject: [PATCH 2/5] refactor(browser): improve parallel crawling and browser management Remove PagePoolConfig in favor of direct page management in browser strategies. Add get_pages() method for efficient parallel page creation. Improve storage state handling and persistence. Add comprehensive parallel crawling tests and performance analysis. BREAKING CHANGE: Removed PagePoolConfig class and related functionality. --- crawl4ai/async_configs.py | 50 +- crawl4ai/browser/manager.py | 27 +- crawl4ai/browser/strategies.py | 292 ++++++-- crawl4ai/browser/utils.py | 233 +++++- tests/browser/test_browser_manager.py | 6 +- tests/browser/test_builtin_browser.py | 956 ++++++++++++++++++------ tests/browser/test_parallel_crawling.py | 902 ++++++++++++++++++++++ 7 files changed, 2115 insertions(+), 351 deletions(-) create mode 100644 tests/browser/test_parallel_crawling.py diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 0606c656..2306a0a6 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -156,41 +156,6 @@ def is_empty_value(value: Any) -> bool: return False -class PagePoolConfig: - """Configuration for browser page pooling. - - This class configures the page pooling mechanism that maintains pre-warmed - browser pages ready for immediate use, improving performance for scenarios - where multiple URLs need to be processed in sequence. - - Attributes: - mode (str): Pooling mode - "static" or "adaptive". - "static" uses a fixed pool size defined by static_size. - "adaptive" calculates optimal size based on available system memory. - Default: "static". - static_size (int): Number of pages to maintain in the pool when mode is "static". - Default: 10. - memory_per_page (int): Estimated memory used by a single page in MB. - Used for "adaptive" mode calculations. - Default: 200. - memory_threshold (float): Maximum percentage of system memory to use in "adaptive" mode. - Default: 0.7 (70% of available memory). - timeout (float): Seconds to wait for a page from the pool before creating a new one. - Default: 5.0. - """ - - def __init__(self, - mode="static", - static_size=10, - memory_per_page=200, - memory_threshold=0.7, - timeout=5.0): - self.mode = mode - self.static_size = static_size - self.memory_per_page = memory_per_page - self.memory_threshold = memory_threshold - self.timeout = timeout - class BrowserConfig: """ Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy. @@ -235,7 +200,7 @@ class BrowserConfig: Default: False. downloads_path (str or None): Directory to store downloaded files. If None and accept_downloads is True, a default path will be created. Default: None. - storage_state (str or dict or None): Path or object describing storage state (cookies, localStorage). + storage_state (str or dict or None): An in-memory storage state (cookies, localStorage). Default: None. ignore_https_errors (bool): Ignore HTTPS certificate errors. Default: True. java_script_enabled (bool): Enable JavaScript execution in pages. Default: True. 
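Note: per the type hint in the hunk above, storage_state still accepts either a string path or a dict. A minimal sketch of the two call shapes, assuming Playwright's storage-state schema for the dict form; every cookie and localStorage value below is a placeholder:

from crawl4ai.async_configs import BrowserConfig

# Form 1: path to a JSON file previously produced by context.storage_state(path=...)
config_from_file = BrowserConfig(storage_state="./my_storage_state.json")

# Form 2: in-memory dict using Playwright's storage-state shape
config_inline = BrowserConfig(storage_state={
    "cookies": [
        # Placeholder cookie; real entries come from a prior browser session.
        {"name": "session", "value": "placeholder", "domain": "example.com", "path": "/"},
    ],
    "origins": [
        {
            "origin": "https://example.com",
            "localStorage": [{"name": "theme", "value": "dark"}],
        },
    ],
})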
@@ -255,9 +220,6 @@ class BrowserConfig: light_mode (bool): Disables certain background features for performance gains. Default: False. extra_args (list): Additional command-line arguments passed to the browser. Default: []. - page_pool_config (PagePoolConfig or None): Configuration for page pooling mechanism. - If None, page pooling is disabled. - Default: None. """ def __init__( @@ -298,7 +260,6 @@ class BrowserConfig: extra_args: list = None, debugging_port: int = 9222, host: str = "localhost", - page_pool_config: Optional[PagePoolConfig] = None, ): self.browser_type = browser_type self.headless = headless @@ -337,7 +298,6 @@ class BrowserConfig: self.verbose = verbose self.debugging_port = debugging_port self.host = host - self.page_pool_config = page_pool_config fa_user_agenr_generator = ValidUAGenerator() if self.user_agent_mode == "random": @@ -368,12 +328,6 @@ class BrowserConfig: @staticmethod def from_kwargs(kwargs: dict) -> "BrowserConfig": - # Handle page_pool_config - page_pool_config = kwargs.get("page_pool_config") - if isinstance(page_pool_config, dict): - # If it's a dict, convert to PagePoolConfig - page_pool_config = PagePoolConfig(**page_pool_config) - return BrowserConfig( browser_type=kwargs.get("browser_type", "chromium"), headless=kwargs.get("headless", True), @@ -407,7 +361,6 @@ class BrowserConfig: extra_args=kwargs.get("extra_args", []), debugging_port=kwargs.get("debugging_port", 9222), host=kwargs.get("host", "localhost"), - page_pool_config=page_pool_config, ) def to_dict(self): @@ -442,7 +395,6 @@ class BrowserConfig: "verbose": self.verbose, "debugging_port": self.debugging_port, "host": self.host, - "page_pool_config": self.page_pool_config, } def clone(self, **kwargs): diff --git a/crawl4ai/browser/manager.py b/crawl4ai/browser/manager.py index 4ebee637..3a37efcb 100644 --- a/crawl4ai/browser/manager.py +++ b/crawl4ai/browser/manager.py @@ -2,11 +2,14 @@ This module provides a central browser management class that uses the strategy pattern internally while maintaining the existing API. +It also implements a page pooling mechanism for improved performance. """ import asyncio import time -from typing import Optional, Tuple, Dict, Any +import os +import psutil +from typing import Optional, Tuple, Dict, Any, List, Set from playwright.async_api import Page, BrowserContext @@ -117,6 +120,28 @@ class BrowserManager: self.sessions = self._strategy.sessions return page, context + + async def get_pages(self, crawlerRunConfig: CrawlerRunConfig, count: int = 1) -> List[Tuple[Page, BrowserContext]]: + """Get multiple pages with the same configuration. + + This method efficiently creates multiple browser pages using the same configuration, + which is useful for parallel crawling of multiple URLs. + + Args: + crawlerRunConfig: Configuration for the pages + count: Number of pages to create + + Returns: + List of (Page, Context) tuples + """ + # Delegate to strategy + pages = await self._strategy.get_pages(crawlerRunConfig, count) + + # Sync sessions if needed + if hasattr(self._strategy, 'sessions'): + self.sessions = self._strategy.sessions + + return pages async def kill_session(self, session_id: str): """Kill a browser session and clean up resources. 
diff --git a/crawl4ai/browser/strategies.py b/crawl4ai/browser/strategies.py index fd47f30e..85feef36 100644 --- a/crawl4ai/browser/strategies.py +++ b/crawl4ai/browser/strategies.py @@ -23,7 +23,7 @@ from ..async_configs import BrowserConfig, CrawlerRunConfig from ..config import DOWNLOAD_PAGE_TIMEOUT from ..js_snippet import load_js_script from ..utils import get_home_folder -from .utils import get_playwright, get_browser_executable, get_browser_disable_options, create_temp_directory, is_windows +from .utils import get_playwright, get_browser_executable, get_browser_disable_options, create_temp_directory, is_windows, is_browser_running from playwright_stealth import StealthConfig @@ -85,6 +85,22 @@ class BaseBrowserStrategy(ABC): """ pass + async def get_pages(self, crawlerRunConfig: CrawlerRunConfig, count: int = 1) -> List[Tuple[Page, BrowserContext]]: + """Get multiple pages with the same configuration. + + Args: + crawlerRunConfig: Configuration for the pages + count: Number of pages to create + + Returns: + List of (Page, Context) tuples + """ + pages = [] + for _ in range(count): + page, context = await self.get_page(crawlerRunConfig) + pages.append((page, context)) + return pages + @abstractmethod async def close(self): """Close the browser and clean up resources.""" @@ -136,9 +152,6 @@ class BaseBrowserStrategy(ABC): if self.config.cookies: await context.add_cookies(self.config.cookies) - if self.config.storage_state: - await context.storage_state(path=None) - if self.config.accept_downloads: context.set_default_timeout(DOWNLOAD_PAGE_TIMEOUT) context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT) @@ -161,7 +174,7 @@ class BaseBrowserStrategy(ABC): { "name": "cookiesEnabled", "value": "true", - "url": crawlerRunConfig.url if crawlerRunConfig else "https://crawl4ai.com/", + "url": crawlerRunConfig and crawlerRunConfig.url or "https://crawl4ai.com/", } ] ) @@ -324,12 +337,31 @@ class PlaywrightBrowserStrategy(BaseBrowserStrategy): "viewport": viewport_settings, "proxy": proxy_settings, "accept_downloads": self.config.accept_downloads, - "storage_state": self.config.storage_state, "ignore_https_errors": self.config.ignore_https_errors, "device_scale_factor": 1.0, "java_script_enabled": self.config.java_script_enabled, } + # Handle storage state properly - this is key for persistence + if self.config.storage_state: + context_settings["storage_state"] = self.config.storage_state + if self.logger: + if isinstance(self.config.storage_state, str): + self.logger.debug(f"Using storage state from file: {self.config.storage_state}", tag="BROWSER") + else: + self.logger.debug("Using storage state from config object", tag="BROWSER") + + if self.config.user_data_dir: + context_settings["storage_state"] = os.path.join( + self.config.user_data_dir, "Default", "storage_state.json" + ) + # Create the file if it doesn't exist + if not os.path.exists(context_settings["storage_state"]): + os.makedirs(os.path.dirname(context_settings["storage_state"]), exist_ok=True) + with open(context_settings["storage_state"], "w") as f: + json.dump({}, f) + + if crawlerRunConfig: # Check if there is value for crawlerRunConfig.proxy_config set add that to context if crawlerRunConfig.proxy_config: @@ -428,6 +460,21 @@ class PlaywrightBrowserStrategy(BaseBrowserStrategy): if self.config.sleep_on_close: await asyncio.sleep(0.5) + # If we have a user_data_dir configured, ensure persistence of storage state + if self.config.user_data_dir and self.browser and self.default_context: + for context in 
self.browser.contexts: + try: + await context.storage_state(path=os.path.join(self.config.user_data_dir, "Default", "storage_state.json")) + if self.logger: + self.logger.debug("Ensuring storage state is persisted before closing browser", tag="BROWSER") + except Exception as e: + if self.logger: + self.logger.warning( + message="Failed to ensure storage persistence: {error}", + tag="BROWSER", + params={"error": str(e)} + ) + # Close all sessions session_ids = list(self.sessions.keys()) for session_id in session_ids: @@ -582,7 +629,7 @@ class CDPBrowserStrategy(BaseBrowserStrategy): Returns: List of command-line arguments for the browser """ - browser_path = get_browser_executable(self.config.browser_type) + browser_path = await get_browser_executable(self.config.browser_type) base_args = [browser_path] if self.config.browser_type == "chromium": @@ -727,6 +774,22 @@ class CDPBrowserStrategy(BaseBrowserStrategy): if self.config.sleep_on_close: await asyncio.sleep(0.5) + # If we have a user_data_dir configured, ensure persistence of storage state + if self.config.user_data_dir and self.browser: + try: + # Create a brief sleep to allow the browser to flush any pending operations + # This helps ensure all storage state (localStorage, cookies, etc.) gets saved + await asyncio.sleep(0.3) + if self.logger: + self.logger.debug("Ensuring storage state is persisted before closing CDP browser", tag="BROWSER") + except Exception as e: + if self.logger: + self.logger.warning( + message="Failed to ensure storage persistence: {error}", + tag="BROWSER", + params={"error": str(e)} + ) + # Close all sessions session_ids = list(self.sessions.keys()) for session_id in session_ids: @@ -775,19 +838,46 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): logger: Logger for recording events and errors """ super().__init__(config, logger) - self.builtin_browser_dir = os.path.join(get_home_folder(), "builtin-browser") + self.builtin_browser_dir = os.path.join(get_home_folder(), "builtin-browser") if not self.config.user_data_dir else self.config.user_data_dir self.builtin_config_file = os.path.join(self.builtin_browser_dir, "browser_config.json") + + # Raise error if user data dir is already engaged + if self._check_user_dir_is_engaged(self.builtin_browser_dir): + raise Exception(f"User data directory {self.builtin_browser_dir} is already engaged by another browser instance.") + os.makedirs(self.builtin_browser_dir, exist_ok=True) + def _check_user_dir_is_engaged(self, user_data_dir: str) -> bool: + """Check if the user data directory is already in use. + + Returns: + bool: True if the directory is engaged, False otherwise + """ + # Load browser config file, then iterate in port_map values, check "user_data_dir" key if it matches + # the current user data directory + if os.path.exists(self.builtin_config_file): + try: + with open(self.builtin_config_file, 'r') as f: + browser_info_dict = json.load(f) + + # Check if user data dir is already engaged + for port_str, browser_info in browser_info_dict.get("port_map", {}).items(): + if browser_info.get("user_data_dir") == user_data_dir: + return True + except Exception as e: + if self.logger: + self.logger.error(f"Error reading built-in browser config: {str(e)}", tag="BUILTIN") + return False + async def start(self): """Start or connect to the built-in browser. 
Returns: self: For method chaining """ - # Check for existing built-in browser - browser_info = self.get_builtin_browser_info() - if browser_info and self._is_browser_running(browser_info.get('pid')): + # Check for existing built-in browser (get_browser_info already checks if running) + browser_info = self.get_browser_info() + if browser_info: if self.logger: self.logger.info(f"Using existing built-in browser at {browser_info.get('cdp_url')}", tag="BROWSER") self.config.cdp_url = browser_info.get('cdp_url') @@ -797,7 +887,7 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): cdp_url = await self.launch_builtin_browser( browser_type=self.config.browser_type, debugging_port=self.config.debugging_port, - headless=self.config.headless + headless=self.config.headless, ) if not cdp_url: if self.logger: @@ -808,55 +898,62 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): # Call parent class implementation with updated CDP URL return await super().start() - def get_builtin_browser_info(self) -> Optional[Dict[str, Any]]: - """Get information about the built-in browser. - - Returns: - dict: Browser information or None if no built-in browser is configured - """ - if not os.path.exists(self.builtin_config_file): - return None - - try: - with open(self.builtin_config_file, 'r') as f: - browser_info = json.load(f) - - # Check if the browser is still running - if not self._is_browser_running(browser_info.get('pid')): - if self.logger: - self.logger.warning("Built-in browser is not running", tag="BUILTIN") - return None - - return browser_info - except Exception as e: - if self.logger: - self.logger.error(f"Error reading built-in browser config: {str(e)}", tag="BUILTIN") - return None - - def _is_browser_running(self, pid: Optional[int]) -> bool: - """Check if a process with the given PID is running. + @classmethod + def get_builtin_browser_info(cls, debugging_port: int, config_file: str, logger: Optional[AsyncLogger] = None) -> Optional[Dict[str, Any]]: + """Get information about the built-in browser for a specific debugging port. 
Args: - pid: Process ID to check + debugging_port: The debugging port to look for + config_file: Path to the config file + logger: Optional logger for recording events Returns: - bool: True if the process is running, False otherwise + dict: Browser information or None if no running browser is configured for this port """ - if not pid: - return False + if not os.path.exists(config_file): + return None try: - # Check if the process exists - if is_windows(): - process = subprocess.run(["tasklist", "/FI", f"PID eq {pid}"], - capture_output=True, text=True) - return str(pid) in process.stdout - else: - # Unix-like systems - os.kill(pid, 0) # This doesn't actually kill the process, just checks if it exists - return True - except (ProcessLookupError, PermissionError, OSError): - return False + with open(config_file, 'r') as f: + browser_info_dict = json.load(f) + + # Get browser info from port map + if isinstance(browser_info_dict, dict) and "port_map" in browser_info_dict: + port_str = str(debugging_port) + if port_str in browser_info_dict["port_map"]: + browser_info = browser_info_dict["port_map"][port_str] + + # Check if the browser is still running + if not is_browser_running(browser_info.get('pid')): + if logger: + logger.warning(f"Built-in browser on port {debugging_port} is not running", tag="BUILTIN") + # Remove this port from the dictionary + del browser_info_dict["port_map"][port_str] + with open(config_file, 'w') as f: + json.dump(browser_info_dict, f, indent=2) + return None + + return browser_info + + return None + + except Exception as e: + if logger: + logger.error(f"Error reading built-in browser config: {str(e)}", tag="BUILTIN") + return None + + def get_browser_info(self) -> Optional[Dict[str, Any]]: + """Get information about the current built-in browser instance. 
+
+
+        Returns:
+            dict: Browser information or None if no running browser is configured
+        """
+        return self.get_builtin_browser_info(
+            debugging_port=self.config.debugging_port,
+            config_file=self.builtin_config_file,
+            logger=self.logger
+        )
+
     async def launch_builtin_browser(self, 
                                     browser_type: str = "chromium",
@@ -873,18 +970,27 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
             str: CDP URL for the browser, or None if launch failed
         """
         # Check if there's an existing browser still running
-        browser_info = self.get_builtin_browser_info()
-        if browser_info and self._is_browser_running(browser_info.get('pid')):
+        browser_info = self.get_builtin_browser_info(
+            debugging_port=debugging_port,
+            config_file=self.builtin_config_file,
+            logger=self.logger
+        )
+        if browser_info:
             if self.logger:
-                self.logger.info("Built-in browser is already running", tag="BUILTIN")
+                self.logger.info(f"Built-in browser is already running on port {debugging_port}", tag="BUILTIN")
             return browser_info.get('cdp_url')
 
         # Create a user data directory for the built-in browser
         user_data_dir = os.path.join(self.builtin_browser_dir, "user_data")
+        # Raise error if user data dir is already engaged
+        if self._check_user_dir_is_engaged(user_data_dir):
+            raise Exception(f"User data directory {user_data_dir} is already engaged by another browser instance.")
+
+        # Create the user data directory if it doesn't exist
         os.makedirs(user_data_dir, exist_ok=True)
 
         # Prepare browser launch arguments
-        browser_path = get_browser_executable(browser_type)
+        browser_path = await get_browser_executable(browser_type)
         if browser_type == "chromium":
             args = [
                 browser_path,
@@ -957,7 +1063,7 @@
             if self.logger:
                 self.logger.warning(f"Could not verify browser: {str(e)}", tag="BUILTIN")
 
-        # Save browser info
+        # Create browser info
         browser_info = {
             'pid': process.pid,
             'cdp_url': cdp_url,
@@ -968,8 +1074,31 @@
             'config': config_json
         }
 
+        # Read existing config file if it exists
+        port_map = {}
+        if os.path.exists(self.builtin_config_file):
+            try:
+                with open(self.builtin_config_file, 'r') as f:
+                    existing_data = json.load(f)
+
+                # Check if it already uses port mapping
+                if isinstance(existing_data, dict) and "port_map" in existing_data:
+                    port_map = existing_data["port_map"]
+                # Convert legacy format to port mapping
+                elif isinstance(existing_data, dict) and "debugging_port" in existing_data:
+                    old_port = str(existing_data.get("debugging_port"))
+                    if is_browser_running(existing_data.get("pid")):
+                        port_map[old_port] = existing_data
+            except Exception as e:
+                if self.logger:
+                    self.logger.warning(f"Could not read existing config: {str(e)}", tag="BUILTIN")
+
+        # Add/update this browser in the port map
+        port_map[str(debugging_port)] = browser_info
+
+        # Write updated config
         with open(self.builtin_config_file, 'w') as f:
-            json.dump(browser_info, f, indent=2)
+            json.dump({"port_map": port_map}, f, indent=2)
 
         # Detach from the browser process - don't keep any references
         # This is important to allow the Python script to exit while the browser continues running
@@ -990,10 +1119,10 @@
         Returns:
             bool: True if the browser was killed, False otherwise
         """
-        browser_info = self.get_builtin_browser_info()
+        browser_info = self.get_browser_info()
         if not browser_info:
             if self.logger:
-                self.logger.warning("No built-in browser found", tag="BUILTIN")
+                self.logger.warning(f"No built-in browser found on port 
{self.config.debugging_port}", tag="BUILTIN") return False pid = browser_info.get('pid') @@ -1007,16 +1136,29 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): os.kill(pid, signal.SIGTERM) # Wait for termination for _ in range(5): - if not self._is_browser_running(pid): + if not is_browser_running(pid): break await asyncio.sleep(0.5) else: # Force kill if still running os.kill(pid, signal.SIGKILL) - # Remove config file - if os.path.exists(self.builtin_config_file): - os.unlink(self.builtin_config_file) + # Update config file to remove this browser + with open(self.builtin_config_file, 'r') as f: + browser_info_dict = json.load(f) + # Remove this port from the dictionary + port_str = str(self.config.debugging_port) + if port_str in browser_info_dict.get("port_map", {}): + del browser_info_dict["port_map"][port_str] + with open(self.builtin_config_file, 'w') as f: + json.dump(browser_info_dict, f, indent=2) + # Remove user data directory if it exists + if os.path.exists(self.builtin_browser_dir): + shutil.rmtree(self.builtin_browser_dir) + # Clear the browser info cache + self.browser = None + self.temp_dir = None + self.shutting_down = True if self.logger: self.logger.success("Built-in browser terminated", tag="BUILTIN") @@ -1032,17 +1174,29 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy): Returns: dict: Status information with running, cdp_url, and info fields """ - browser_info = self.get_builtin_browser_info() + browser_info = self.get_browser_info() if not browser_info: return { 'running': False, 'cdp_url': None, - 'info': None + 'info': None, + 'port': self.config.debugging_port } return { 'running': True, 'cdp_url': browser_info.get('cdp_url'), - 'info': browser_info + 'info': browser_info, + 'port': self.config.debugging_port } + + # Override the close method to handle built-in browser cleanup + async def close(self): + """Close the built-in browser and clean up resources.""" + # Call parent class close method + await super().close() + + # Clean up built-in browser if we created it + if self.shutting_down: + await self.kill_builtin_browser() diff --git a/crawl4ai/browser/utils.py b/crawl4ai/browser/utils.py index 2dff0924..74d2ea12 100644 --- a/crawl4ai/browser/utils.py +++ b/crawl4ai/browser/utils.py @@ -8,14 +8,18 @@ and Playwright instance management. import asyncio import os import sys -import platform +import time import tempfile -from typing import Optional, Any +import subprocess +from typing import Optional from playwright.async_api import async_playwright -from ..async_logger import AsyncLogger from ..utils import get_chromium_path +from ..async_configs import BrowserConfig, CrawlerRunConfig + +from ..async_logger import AsyncLogger + _playwright_instance = None @@ -30,7 +34,7 @@ async def get_playwright(): _playwright_instance = await async_playwright().start() return _playwright_instance -def get_browser_executable(browser_type: str) -> str: +async def get_browser_executable(browser_type: str) -> str: """Get the path to browser executable, with platform-specific handling. Args: @@ -39,7 +43,7 @@ def get_browser_executable(browser_type: str) -> str: Returns: Path to browser executable """ - return get_chromium_path(browser_type) + return await get_chromium_path(browser_type) def create_temp_directory(prefix="browser-profile-") -> str: """Create a temporary directory for browser data. 
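+# Note that get_browser_executable() above is now a coroutine; a minimal usage
+# sketch, assuming an event loop is already running (as in the strategies that
+# call it):
+#
+#     browser_path = await get_browser_executable("chromium")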
@@ -75,6 +79,31 @@ def is_linux() -> bool: True if Linux, False otherwise """ return not (is_windows() or is_macos()) + +def is_browser_running(pid: Optional[int]) -> bool: + """Check if a process with the given PID is running. + + Args: + pid: Process ID to check + + Returns: + bool: True if the process is running, False otherwise + """ + if not pid: + return False + + try: + # Check if the process exists + if is_windows(): + process = subprocess.run(["tasklist", "/FI", f"PID eq {pid}"], + capture_output=True, text=True) + return str(pid) in process.stdout + else: + # Unix-like systems + os.kill(pid, 0) # This doesn't actually kill the process, just checks if it exists + return True + except (ProcessLookupError, PermissionError, OSError): + return False def get_browser_disable_options() -> list: """Get standard list of browser disable options for performance. @@ -103,3 +132,197 @@ def get_browser_disable_options() -> list: "--password-store=basic", "--use-mock-keychain", ] + + +async def find_optimal_browser_config(total_urls=50, verbose=True, rate_limit_delay=0.2): + """Find optimal browser configuration for crawling a specific number of URLs. + + Args: + total_urls: Number of URLs to crawl + verbose: Whether to print progress + rate_limit_delay: Delay between page loads to avoid rate limiting + + Returns: + dict: Contains fastest, lowest_memory, and optimal configurations + """ + from .manager import BrowserManager + if verbose: + print(f"\n=== Finding optimal configuration for crawling {total_urls} URLs ===\n") + + # Generate test URLs with timestamp to avoid caching + timestamp = int(time.time()) + urls = [f"https://example.com/page_{i}?t={timestamp}" for i in range(total_urls)] + + # Limit browser configurations to test (1 browser to max 10) + max_browsers = min(10, total_urls) + configs_to_test = [] + + # Generate configurations (browser count, pages distribution) + for num_browsers in range(1, max_browsers + 1): + base_pages = total_urls // num_browsers + remainder = total_urls % num_browsers + + # Create distribution array like [3, 3, 2, 2] (some browsers get one more page) + if remainder > 0: + distribution = [base_pages + 1] * remainder + [base_pages] * (num_browsers - remainder) + else: + distribution = [base_pages] * num_browsers + + configs_to_test.append((num_browsers, distribution)) + + results = [] + + # Test each configuration + for browser_count, page_distribution in configs_to_test: + if verbose: + print(f"Testing {browser_count} browsers with distribution {tuple(page_distribution)}") + + try: + # Track memory if possible + try: + import psutil + process = psutil.Process() + start_memory = process.memory_info().rss / (1024 * 1024) # MB + except ImportError: + if verbose: + print("Memory tracking not available (psutil not installed)") + start_memory = 0 + + # Start browsers in parallel + managers = [] + start_tasks = [] + start_time = time.time() + + logger = AsyncLogger(verbose=True, log_file=None) + + for i in range(browser_count): + config = BrowserConfig(headless=True) + manager = BrowserManager(browser_config=config, logger=logger) + start_tasks.append(manager.start()) + managers.append(manager) + + await asyncio.gather(*start_tasks) + + # Distribute URLs among browsers + urls_per_manager = {} + url_index = 0 + + for i, manager in enumerate(managers): + pages_for_this_browser = page_distribution[i] + end_index = url_index + pages_for_this_browser + urls_per_manager[manager] = urls[url_index:end_index] + url_index = end_index + + # Create pages for each browser + 
all_pages = [] + for manager, manager_urls in urls_per_manager.items(): + if not manager_urls: + continue + pages = await manager.get_pages(CrawlerRunConfig(), count=len(manager_urls)) + all_pages.extend(zip(pages, manager_urls)) + + # Crawl pages with delay to avoid rate limiting + async def crawl_page(page_ctx, url): + page, _ = page_ctx + try: + await page.goto(url) + if rate_limit_delay > 0: + await asyncio.sleep(rate_limit_delay) + title = await page.title() + return title + finally: + await page.close() + + crawl_start = time.time() + crawl_tasks = [crawl_page(page_ctx, url) for page_ctx, url in all_pages] + await asyncio.gather(*crawl_tasks) + crawl_time = time.time() - crawl_start + total_time = time.time() - start_time + + # Measure final memory usage + if start_memory > 0: + end_memory = process.memory_info().rss / (1024 * 1024) + memory_used = end_memory - start_memory + else: + memory_used = 0 + + # Close all browsers + for manager in managers: + await manager.close() + + # Calculate metrics + pages_per_second = total_urls / crawl_time + + # Calculate efficiency score (higher is better) + # This balances speed vs memory + if memory_used > 0: + efficiency = pages_per_second / (memory_used + 1) + else: + efficiency = pages_per_second + + # Store result + result = { + "browser_count": browser_count, + "distribution": tuple(page_distribution), + "crawl_time": crawl_time, + "total_time": total_time, + "memory_used": memory_used, + "pages_per_second": pages_per_second, + "efficiency": efficiency + } + + results.append(result) + + if verbose: + print(f" ✓ Crawled {total_urls} pages in {crawl_time:.2f}s ({pages_per_second:.1f} pages/sec)") + if memory_used > 0: + print(f" ✓ Memory used: {memory_used:.1f}MB ({memory_used/total_urls:.1f}MB per page)") + print(f" ✓ Efficiency score: {efficiency:.4f}") + + except Exception as e: + if verbose: + print(f" ✗ Error: {str(e)}") + + # Clean up + for manager in managers: + try: + await manager.close() + except: + pass + + # If no successful results, return None + if not results: + return None + + # Find best configurations + fastest = sorted(results, key=lambda x: x["crawl_time"])[0] + + # Only consider memory if available + memory_results = [r for r in results if r["memory_used"] > 0] + if memory_results: + lowest_memory = sorted(memory_results, key=lambda x: x["memory_used"])[0] + else: + lowest_memory = fastest + + # Find most efficient (balanced speed vs memory) + optimal = sorted(results, key=lambda x: x["efficiency"], reverse=True)[0] + + # Print summary + if verbose: + print("\n=== OPTIMAL CONFIGURATIONS ===") + print(f"⚡ Fastest: {fastest['browser_count']} browsers {fastest['distribution']}") + print(f" {fastest['crawl_time']:.2f}s, {fastest['pages_per_second']:.1f} pages/sec") + + print(f"💾 Memory-efficient: {lowest_memory['browser_count']} browsers {lowest_memory['distribution']}") + if lowest_memory["memory_used"] > 0: + print(f" {lowest_memory['memory_used']:.1f}MB, {lowest_memory['memory_used']/total_urls:.2f}MB per page") + + print(f"🌟 Balanced optimal: {optimal['browser_count']} browsers {optimal['distribution']}") + print(f" {optimal['crawl_time']:.2f}s, {optimal['pages_per_second']:.1f} pages/sec, score: {optimal['efficiency']:.4f}") + + return { + "fastest": fastest, + "lowest_memory": lowest_memory, + "optimal": optimal, + "all_configs": results + } diff --git a/tests/browser/test_browser_manager.py b/tests/browser/test_browser_manager.py index 2293b90d..d8f9376d 100644 --- a/tests/browser/test_browser_manager.py +++ 
b/tests/browser/test_browser_manager.py @@ -171,9 +171,9 @@ async def run_tests(): """Run all tests sequentially.""" results = [] - # results.append(await test_basic_browser_manager()) - # results.append(await test_custom_browser_config()) - # results.append(await test_multiple_pages()) + results.append(await test_basic_browser_manager()) + results.append(await test_custom_browser_config()) + results.append(await test_multiple_pages()) results.append(await test_session_management()) # Print summary diff --git a/tests/browser/test_builtin_browser.py b/tests/browser/test_builtin_browser.py index 9a273ef7..013da637 100644 --- a/tests/browser/test_builtin_browser.py +++ b/tests/browser/test_builtin_browser.py @@ -1,12 +1,12 @@ """ -Test script for browser_profiler and builtin browser functionality. +Test script for builtin browser functionality in the browser module. This script tests: 1. Creating a builtin browser 2. Getting browser information 3. Killing the browser 4. Restarting the browser -5. Testing crawling with different browser modes +5. Testing operations with different browser strategies 6. Testing edge cases """ @@ -14,13 +14,20 @@ import asyncio import os import sys import time -from colorama import Fore, init +from typing import List, Dict, Any +from colorama import Fore, Style, init # Add the project root to the path for imports -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))) -from crawl4ai.browser_profiler import BrowserProfiler -from crawl4ai.async_webcrawler import AsyncWebCrawler +from rich.console import Console +from rich.table import Table +from rich.panel import Panel +from rich.text import Text +from rich.box import Box, SIMPLE + +from crawl4ai.browser import BrowserManager +from crawl4ai.browser.strategies import BuiltinBrowserStrategy from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig from crawl4ai.async_logger import AsyncLogger @@ -37,264 +44,765 @@ RESET = Fore.RESET # Create logger logger = AsyncLogger(verbose=True) -async def test_browser_profiler(): - """Test the BrowserProfiler class functionality""" - print(f"\n{INFO}========== Testing BrowserProfiler =========={RESET}") - - # Initialize browser profiler - profiler = BrowserProfiler(logger=logger) - - # Step 1: Check if builtin browser exists and kill it if it does - print(f"\n{INFO}1. Checking if builtin browser exists{RESET}") - browser_info = profiler.get_builtin_browser_info() - if browser_info: - print(f"{SUCCESS}Builtin browser found: {browser_info['cdp_url']}{RESET}") - # Kill it to start with a clean state - print(f"{INFO}Killing existing browser...{RESET}") - await profiler.kill_builtin_browser() - browser_info = profiler.get_builtin_browser_info() - if not browser_info: - print(f"{SUCCESS}Browser successfully killed{RESET}") - else: - print(f"{ERROR}Failed to kill browser{RESET}") + +async def test_builtin_browser_creation(): + """Test creating a builtin browser using the BrowserManager with BuiltinBrowserStrategy""" + print(f"\n{INFO}========== Testing Builtin Browser Creation =========={RESET}") + + # Step 1: Create a BrowserManager with builtin mode + print(f"\n{INFO}1. Creating BrowserManager with builtin mode{RESET}") + browser_config = BrowserConfig(browser_mode="builtin", headless=True, verbose=True) + manager = BrowserManager(browser_config=browser_config, logger=logger) + + # Step 2: Check if we have a BuiltinBrowserStrategy + print(f"\n{INFO}2. 
Checking if we have a BuiltinBrowserStrategy{RESET}")
+    if isinstance(manager._strategy, BuiltinBrowserStrategy):
+        print(
+            f"{SUCCESS}Correct strategy type: {manager._strategy.__class__.__name__}{RESET}"
+        )
     else:
-        print(f"{WARNING}No builtin browser found{RESET}")
-    
-    # Step 2: Launch a new builtin browser
-    print(f"\n{INFO}2. Launching new builtin browser{RESET}")
-    cdp_url = await profiler.launch_builtin_browser(headless=True)
-    if cdp_url:
-        print(f"{SUCCESS}Builtin browser launched at: {cdp_url}{RESET}")
-    else:
-        print(f"{ERROR}Failed to launch builtin browser{RESET}")
-        return
-    
-    # Step 3: Get and display browser information
-    print(f"\n{INFO}3. Getting browser information{RESET}")
-    browser_info = profiler.get_builtin_browser_info()
+        print(
+            f"{ERROR}Wrong strategy type: {manager._strategy.__class__.__name__}{RESET}"
+        )
+        return None
+
+    # Step 3: Start the manager to launch or connect to builtin browser
+    print(f"\n{INFO}3. Starting the browser manager{RESET}")
+    try:
+        await manager.start()
+        print(f"{SUCCESS}Browser manager started successfully{RESET}")
+    except Exception as e:
+        print(f"{ERROR}Failed to start browser manager: {str(e)}{RESET}")
+        return None
+
+    # Step 4: Get browser info from the strategy
+    print(f"\n{INFO}4. Getting browser information{RESET}")
+    browser_info = manager._strategy.get_browser_info()
     if browser_info:
         print(f"{SUCCESS}Browser info retrieved:{RESET}")
         for key, value in browser_info.items():
-            if key != 'config':  # Skip the verbose config section
+            if key != "config":  # Skip the verbose config section
                 print(f"  {key}: {value}")
+
+        cdp_url = browser_info.get("cdp_url")
+        print(f"{SUCCESS}CDP URL: {cdp_url}{RESET}")
     else:
         print(f"{ERROR}Failed to get browser information{RESET}")
-    
-    # Step 4: Get browser status
-    print(f"\n{INFO}4. Getting browser status{RESET}")
-    status = await profiler.get_builtin_browser_status()
-    print(f"Running: {status['running']}")
-    print(f"CDP URL: {status['cdp_url']}")
-    
-    # Pause to let the browser run for a moment
-    print(f"\n{INFO}Waiting for 2 seconds...{RESET}")
-    await asyncio.sleep(2)
-    
-    return cdp_url  # Return the CDP URL for the crawling tests
+        cdp_url = None
 
-async def test_crawling_with_builtin_browser(cdp_url):
-    """Test crawling with the builtin browser"""
-    print(f"\n{INFO}========== Testing Crawling with Builtin Browser =========={RESET}")
-    
-    # Step 1: Create a crawler with 'builtin' browser mode
-    print(f"\n{INFO}1. Creating crawler with 'builtin' browser mode{RESET}")
-    browser_config = BrowserConfig(
-        browser_mode="builtin",
-        headless=True
+    # Save manager for later tests
+    return manager, cdp_url
+
+
+async def test_page_operations(manager: BrowserManager):
+    """Test page operations with the builtin browser"""
+    print(
+        f"\n{INFO}========== Testing Page Operations with Builtin Browser =========={RESET}"
     )
-    crawler = AsyncWebCrawler(config=browser_config)
-    
-    # Step 2: Test crawling without explicitly starting (should auto-start)
-    print(f"\n{INFO}2. Testing auto-start with arun{RESET}")
+
+    # Step 1: Get a single page
+    print(f"\n{INFO}1. 
Getting a single page{RESET}") try: - result = await crawler.arun("https://crawl4ai.com") - print(f"{SUCCESS}Auto-start crawling successful!{RESET}") - print(f" Got {len(result.markdown.raw_markdown)} chars of markdown content") + crawler_config = CrawlerRunConfig() + page, context = await manager.get_page(crawler_config) + print(f"{SUCCESS}Got page successfully{RESET}") + + # Navigate to a test URL + await page.goto("https://example.com") + title = await page.title() + print(f"{SUCCESS}Page title: {title}{RESET}") + + # Close the page + await page.close() + print(f"{SUCCESS}Page closed successfully{RESET}") except Exception as e: - print(f"{ERROR}Auto-start crawling failed: {str(e)}{RESET}") - - # Close the crawler - await crawler.close() - - # Step 3: Test with explicit start - print(f"\n{INFO}3. Testing with explicit start{RESET}") - crawler = AsyncWebCrawler(config=browser_config) + print(f"{ERROR}Page operation failed: {str(e)}{RESET}") + return False + + # Step 2: Get multiple pages + print(f"\n{INFO}2. Getting multiple pages with get_pages(){RESET}") try: - await crawler.start() - print(f"{SUCCESS}Explicit start successful!{RESET}") - result = await crawler.arun("https://example.com") - print(f" Got {len(result.markdown.raw_markdown)} chars of markdown content") - # Try second time, no start needed - print(f"{INFO}Testing second arun call without start{RESET}") - result = await crawler.arun("https://example.com") - print(f" Got {len(result.markdown.raw_markdown)} chars of markdown content") + # Request 3 pages + crawler_config = CrawlerRunConfig() + pages = await manager.get_pages(crawler_config, count=3) + print(f"{SUCCESS}Got {len(pages)} pages{RESET}") + + # Test each page + for i, (page, context) in enumerate(pages): + await page.goto(f"https://example.com?test={i}") + title = await page.title() + print(f"{SUCCESS}Page {i + 1} title: {title}{RESET}") + await page.close() + + print(f"{SUCCESS}All pages tested and closed successfully{RESET}") except Exception as e: - print(f"{ERROR}Explicit start crawling failed: {str(e)}{RESET}") - - # Close the crawler - await crawler.close() - - # Step 4: Test with context manager - print(f"\n{INFO}4. Testing with context manager{RESET}") - try: - async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun("https://httpbin.org/html") - print(f"{SUCCESS}Context manager crawling successful!{RESET}") - print(f" Got {len(result.markdown.raw_markdown)} chars of markdown content") - except Exception as e: - print(f"{ERROR}Context manager crawling failed: {str(e)}{RESET}") - + print(f"{ERROR}Multiple page operation failed: {str(e)}{RESET}") + return False + return True -async def test_crawling_without_builtin_browser(): - """Test crawling after killing the builtin browser""" - print(f"\n{INFO}========== Testing Crawling Without Builtin Browser =========={RESET}") - - # Step 1: Kill the builtin browser - print(f"\n{INFO}1. Killing the builtin browser{RESET}") - profiler = BrowserProfiler(logger=logger) - await profiler.kill_builtin_browser() - - # Step 2: Create a crawler with 'builtin' mode (should fall back to dedicated) - print(f"\n{INFO}2. 
Creating crawler with 'builtin' mode (should fall back){RESET}") - browser_config = BrowserConfig( - browser_mode="builtin", - headless=True - ) - + +async def test_browser_status_management(manager: BrowserManager): + """Test browser status and management operations""" + print(f"\n{INFO}========== Testing Browser Status and Management =========={RESET}") + + # Step 1: Get browser status + print(f"\n{INFO}1. Getting browser status{RESET}") try: - async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun("https://httpbin.org/get") - print(f"{SUCCESS}Fallback to dedicated browser successful!{RESET}") - print(f" Got {len(result.markdown.raw_markdown)} chars of markdown content") + status = await manager._strategy.get_builtin_browser_status() + print(f"{SUCCESS}Browser status:{RESET}") + print(f" Running: {status['running']}") + print(f" CDP URL: {status['cdp_url']}") except Exception as e: - print(f"{ERROR}Fallback crawler failed: {str(e)}{RESET}") - - # Step 3: Test with direct CDP URL - print(f"\n{INFO}3. Testing with direct CDP URL connection{RESET}") - - # Launch a standalone browser to get a CDP URL - print(f"{INFO}Launching standalone browser...{RESET}") - cdp_url = await profiler.launch_standalone_browser(headless=True) - if not cdp_url: - print(f"{ERROR}Failed to launch standalone browser{RESET}") - return - - print(f"{SUCCESS}Got CDP URL: {cdp_url}{RESET}") - - # Create a crawler with the CDP URL - browser_config = BrowserConfig( - browser_mode="dedicated", - cdp_url=cdp_url, - use_managed_browser=True, - headless=True - ) - + print(f"{ERROR}Failed to get browser status: {str(e)}{RESET}") + return False + + # Step 2: Test killing the browser + print(f"\n{INFO}2. Testing killing the browser{RESET}") try: - async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun("https://httpbin.org/ip") - print(f"{SUCCESS}Direct CDP URL crawling successful!{RESET}") - print(f" Got {len(result.markdown.raw_markdown)} chars of markdown content") + result = await manager._strategy.kill_builtin_browser() + if result: + print(f"{SUCCESS}Browser killed successfully{RESET}") + else: + print(f"{ERROR}Failed to kill browser{RESET}") except Exception as e: - print(f"{ERROR}Direct CDP URL crawling failed: {str(e)}{RESET}") - + print(f"{ERROR}Browser kill operation failed: {str(e)}{RESET}") + return False + + # Step 3: Check status after kill + print(f"\n{INFO}3. Checking status after kill{RESET}") + try: + status = await manager._strategy.get_builtin_browser_status() + if not status["running"]: + print(f"{SUCCESS}Browser is correctly reported as not running{RESET}") + else: + print(f"{ERROR}Browser is incorrectly reported as still running{RESET}") + except Exception as e: + print(f"{ERROR}Failed to get browser status: {str(e)}{RESET}") + return False + + # Step 4: Launch a new browser + print(f"\n{INFO}4. 
Launching a new browser{RESET}")
+    try:
+        cdp_url = await manager._strategy.launch_builtin_browser(
+            browser_type="chromium", headless=True
+        )
+        if cdp_url:
+            print(f"{SUCCESS}New browser launched at: {cdp_url}{RESET}")
+        else:
+            print(f"{ERROR}Failed to launch new browser{RESET}")
+            return False
+    except Exception as e:
+        print(f"{ERROR}Browser launch failed: {str(e)}{RESET}")
+        return False
+
+    return True
+
+
+async def test_multiple_managers():
+    """Test creating multiple BrowserManagers that use the same builtin browser"""
+    print(f"\n{INFO}========== Testing Multiple Browser Managers =========={RESET}")
+
+    # Step 1: Create first manager
+    print(f"\n{INFO}1. Creating first browser manager{RESET}")
+    browser_config1 = BrowserConfig(browser_mode="builtin", headless=True)
+    manager1 = BrowserManager(browser_config=browser_config1, logger=logger)
+
+    # Step 2: Create second manager
+    print(f"\n{INFO}2. Creating second browser manager{RESET}")
+    browser_config2 = BrowserConfig(browser_mode="builtin", headless=True)
+    manager2 = BrowserManager(browser_config=browser_config2, logger=logger)
+
+    # Step 3: Start both managers (should connect to the same builtin browser)
+    print(f"\n{INFO}3. Starting both managers{RESET}")
+    try:
+        await manager1.start()
+        print(f"{SUCCESS}First manager started{RESET}")
+
+        await manager2.start()
+        print(f"{SUCCESS}Second manager started{RESET}")
+
+        # Check if they got the same CDP URL
+        cdp_url1 = manager1._strategy.config.cdp_url
+        cdp_url2 = manager2._strategy.config.cdp_url
+
+        if cdp_url1 == cdp_url2:
+            print(
+                f"{SUCCESS}Both managers connected to the same browser: {cdp_url1}{RESET}"
+            )
+        else:
+            print(
+                f"{WARNING}Managers connected to different browsers: {cdp_url1} and {cdp_url2}{RESET}"
+            )
+    except Exception as e:
+        print(f"{ERROR}Failed to start managers: {str(e)}{RESET}")
+        return False
+
+    # Step 4: Test using both managers
+    print(f"\n{INFO}4. Testing operations with both managers{RESET}")
+    try:
+        # First manager creates a page
+        page1, ctx1 = await manager1.get_page(CrawlerRunConfig())
+        await page1.goto("https://example.com")
+        title1 = await page1.title()
+        print(f"{SUCCESS}Manager 1 page title: {title1}{RESET}")
+
+        # Second manager creates a page
+        page2, ctx2 = await manager2.get_page(CrawlerRunConfig())
+        await page2.goto("https://example.org")
+        title2 = await page2.title()
+        print(f"{SUCCESS}Manager 2 page title: {title2}{RESET}")
+
+        # Clean up
+        await page1.close()
+        await page2.close()
+    except Exception as e:
+        print(f"{ERROR}Failed to use both managers: {str(e)}{RESET}")
+        return False
+
+    # Step 5: Close both managers
+    print(f"\n{INFO}5. Closing both managers{RESET}")
+    try:
+        await manager1.close()
+        print(f"{SUCCESS}First manager closed{RESET}")
+
+        await manager2.close()
+        print(f"{SUCCESS}Second manager closed{RESET}")
+    except Exception as e:
+        print(f"{ERROR}Failed to close managers: {str(e)}{RESET}")
+        return False
+
+    return True
+
+
 async def test_edge_cases():
-    """Test edge cases like multiple starts, killing browser during crawl, etc."""
+    """Test edge cases like multiple starts, killing browser during operations, etc."""
     print(f"\n{INFO}========== Testing Edge Cases =========={RESET}")
-    
-    # Step 1: Launch the builtin browser if it doesn't exist
-    print(f"\n{INFO}1. 
Ensuring builtin browser exists{RESET}") - profiler = BrowserProfiler(logger=logger) - browser_info = profiler.get_builtin_browser_info() - if not browser_info: - cdp_url = await profiler.launch_builtin_browser(headless=True) - if cdp_url: - print(f"{SUCCESS}Builtin browser launched at: {cdp_url}{RESET}") - else: - print(f"{ERROR}Failed to launch builtin browser{RESET}") - return - else: - print(f"{SUCCESS}Using existing builtin browser: {browser_info['cdp_url']}{RESET}") - - # Step 2: Test multiple starts with the same crawler - print(f"\n{INFO}2. Testing multiple starts with the same crawler{RESET}") + + # Step 1: Test multiple starts with the same manager + print(f"\n{INFO}1. Testing multiple starts with the same manager{RESET}") browser_config = BrowserConfig(browser_mode="builtin", headless=True) - crawler = AsyncWebCrawler(config=browser_config) - - await crawler.start() - print(f"{SUCCESS}First start successful!{RESET}") - + manager = BrowserManager(browser_config=browser_config, logger=logger) + try: - await crawler.start() - print(f"{SUCCESS}Second start didn't cause errors!{RESET}") + await manager.start() + print(f"{SUCCESS}First start successful{RESET}") + + # Try to start again + await manager.start() + print(f"{SUCCESS}Second start completed without errors{RESET}") + + # Test if it's still functional + page, context = await manager.get_page(CrawlerRunConfig()) + await page.goto("https://example.com") + title = await page.title() + print( + f"{SUCCESS}Page operations work after multiple starts. Title: {title}{RESET}" + ) + await page.close() except Exception as e: - print(f"{ERROR}Second start failed: {str(e)}{RESET}") - - # Run a crawl to verify functionality + print(f"{ERROR}Multiple starts test failed: {str(e)}{RESET}") + return False + finally: + await manager.close() + + # Step 2: Test killing the browser while manager is active + print(f"\n{INFO}2. Testing killing the browser while manager is active{RESET}") + manager = BrowserManager(browser_config=browser_config, logger=logger) + try: - result = await crawler.arun("https://httpbin.org/user-agent") - print(f"{SUCCESS}Crawling after multiple starts successful!{RESET}") - print(f" Got {len(result.markdown.raw_markdown)} chars of markdown content") + await manager.start() + print(f"{SUCCESS}Manager started{RESET}") + + # Kill the browser directly + print(f"{INFO}Killing the browser...{RESET}") + await manager._strategy.kill_builtin_browser() + print(f"{SUCCESS}Browser killed{RESET}") + + # Try to get a page (should fail or launch a new browser) + try: + page, context = await manager.get_page(CrawlerRunConfig()) + print( + f"{WARNING}Page request succeeded despite killed browser (might have auto-restarted){RESET}" + ) + title = await page.title() + print(f"{SUCCESS}Got page title: {title}{RESET}") + await page.close() + except Exception as e: + print( + f"{SUCCESS}Page request failed as expected after browser was killed: {str(e)}{RESET}" + ) except Exception as e: - print(f"{ERROR}Crawling after multiple starts failed: {str(e)}{RESET}") - - await crawler.close() - - # Step 3: Test killing browser while crawler is active - print(f"\n{INFO}3. 
Testing killing browser while crawler is active{RESET}") - - # Create and start a crawler - browser_config = BrowserConfig(browser_mode="builtin", headless=True) - crawler = AsyncWebCrawler(config=browser_config) - await crawler.start() - - # Kill the browser - print(f"{INFO}Killing the browser...{RESET}") - await profiler.kill_builtin_browser() - - # Try to crawl (should fail) - try: - result = await crawler.arun("https://httpbin.org/get") - print(f"{WARNING}Crawling succeeded despite killed browser!{RESET}") - except Exception as e: - print(f"{SUCCESS}Crawling failed as expected: {str(e)}{RESET}") - - await crawler.close() - + print(f"{ERROR}Kill during operation test failed: {str(e)}{RESET}") + return False + finally: + await manager.close() + return True + +async def cleanup_browsers(): + """Clean up any remaining builtin browsers""" + print(f"\n{INFO}========== Cleaning Up Builtin Browsers =========={RESET}") + + browser_config = BrowserConfig(browser_mode="builtin", headless=True) + manager = BrowserManager(browser_config=browser_config, logger=logger) + + try: + # No need to start, just access the strategy directly + strategy = manager._strategy + if isinstance(strategy, BuiltinBrowserStrategy): + result = await strategy.kill_builtin_browser() + if result: + print(f"{SUCCESS}Successfully killed all builtin browsers{RESET}") + else: + print(f"{WARNING}No builtin browsers found to kill{RESET}") + else: + print(f"{ERROR}Wrong strategy type: {strategy.__class__.__name__}{RESET}") + except Exception as e: + print(f"{ERROR}Cleanup failed: {str(e)}{RESET}") + finally: + # Just to be safe + try: + await manager.close() + except: + pass + + +async def test_performance_scaling(): + """Test performance with multiple browsers and pages. + + This test creates multiple browsers on different ports, + spawns multiple pages per browser, and measures performance metrics. 
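+
+    The summary metrics reported at the end are derived roughly as follows
+    (a sketch of the arithmetic used in the body below):
+
+        crawls_per_second = successes / total_test_time
+        crawls_per_minute = crawls_per_second * 60
+        memory_per_page   = peak_memory / successes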
+ """ + print(f"\n{INFO}========== Testing Performance Scaling =========={RESET}") + + # Configuration parameters + num_browsers = 10 + pages_per_browser = 10 + total_pages = num_browsers * pages_per_browser + base_port = 9222 + + # Set up a measuring mechanism for memory + import psutil + import gc + + # Force garbage collection before starting + gc.collect() + process = psutil.Process() + initial_memory = process.memory_info().rss / 1024 / 1024 # in MB + peak_memory = initial_memory + + # Report initial configuration + print( + f"{INFO}Test configuration: {num_browsers} browsers × {pages_per_browser} pages = {total_pages} total crawls{RESET}" + ) + + # List to track managers + managers: List[BrowserManager] = [] + all_pages = [] + + + + # Get crawl4ai home directory + crawl4ai_home = os.path.expanduser("~/.crawl4ai") + temp_dir = os.path.join(crawl4ai_home, "temp") + os.makedirs(temp_dir, exist_ok=True) + + # Create all managers but don't start them yet + manager_configs = [] + for i in range(num_browsers): + port = base_port + i + browser_config = BrowserConfig( + browser_mode="builtin", + headless=True, + debugging_port=port, + user_data_dir=os.path.join(temp_dir, f"browser_profile_{i}"), + ) + manager = BrowserManager(browser_config=browser_config, logger=logger) + manager._strategy.shutting_down = True + manager_configs.append((manager, i, port)) + + # Define async function to start a single manager + async def start_manager(manager, index, port): + try: + await manager.start() + return manager + except Exception as e: + print( + f"{ERROR}Failed to start browser {index + 1} on port {port}: {str(e)}{RESET}" + ) + return None + + # Start all managers in parallel + start_tasks = [ + start_manager(manager, i, port) for manager, i, port in manager_configs + ] + started_managers = await asyncio.gather(*start_tasks) + + # Filter out None values (failed starts) and add to managers list + managers = [m for m in started_managers if m is not None] + + if len(managers) == 0: + print(f"{ERROR}All browser managers failed to start. Aborting test.{RESET}") + return False + + if len(managers) < num_browsers: + print( + f"{WARNING}Only {len(managers)} out of {num_browsers} browser managers started successfully{RESET}" + ) + + # Create pages for each browser + for i, manager in enumerate(managers): + try: + pages = await manager.get_pages(CrawlerRunConfig(), count=pages_per_browser) + all_pages.extend(pages) + except Exception as e: + print(f"{ERROR}Failed to create pages for browser {i + 1}: {str(e)}{RESET}") + + # Check memory after page creation + gc.collect() + current_memory = process.memory_info().rss / 1024 / 1024 + peak_memory = max(peak_memory, current_memory) + + # Ask for confirmation before loading + confirmation = input( + f"{WARNING}Do you want to proceed with loading pages? 
(y/n): {RESET}" + ) + # Step 1: Create and start multiple browser managers in parallel + start_time = time.time() + + if confirmation.lower() == "y": + load_start_time = time.time() + + # Function to load a single page + async def load_page(page_ctx, index): + page, _ = page_ctx + try: + await page.goto(f"https://example.com/page{index}", timeout=30000) + title = await page.title() + return title + except Exception as e: + return f"Error: {str(e)}" + + # Load all pages concurrently + load_tasks = [load_page(page_ctx, i) for i, page_ctx in enumerate(all_pages)] + load_results = await asyncio.gather(*load_tasks, return_exceptions=True) + + # Count successes and failures + successes = sum( + 1 for r in load_results if isinstance(r, str) and not r.startswith("Error") + ) + failures = len(load_results) - successes + + load_time = time.time() - load_start_time + total_test_time = time.time() - start_time + + # Check memory after loading (peak memory) + gc.collect() + current_memory = process.memory_info().rss / 1024 / 1024 + peak_memory = max(peak_memory, current_memory) + + # Calculate key metrics + memory_per_page = peak_memory / successes if successes > 0 else 0 + time_per_crawl = total_test_time / successes if successes > 0 else 0 + crawls_per_second = successes / total_test_time if total_test_time > 0 else 0 + crawls_per_minute = crawls_per_second * 60 + crawls_per_hour = crawls_per_minute * 60 + + # Print simplified performance summary + from rich.console import Console + from rich.table import Table + + console = Console() + + # Create a simple summary table + table = Table(title="CRAWL4AI PERFORMANCE SUMMARY") + + table.add_column("Metric", style="cyan") + table.add_column("Value", style="green") + + table.add_row("Total Crawls Completed", f"{successes}") + table.add_row("Total Time", f"{total_test_time:.2f} seconds") + table.add_row("Time Per Crawl", f"{time_per_crawl:.2f} seconds") + table.add_row("Crawling Speed", f"{crawls_per_second:.2f} crawls/second") + table.add_row("Projected Rate (1 minute)", f"{crawls_per_minute:.0f} crawls") + table.add_row("Projected Rate (1 hour)", f"{crawls_per_hour:.0f} crawls") + table.add_row("Peak Memory Usage", f"{peak_memory:.2f} MB") + table.add_row("Memory Per Crawl", f"{memory_per_page:.2f} MB") + + # Display the table + console.print(table) + + # Ask confirmation before cleanup + confirmation = input( + f"{WARNING}Do you want to proceed with cleanup? (y/n): {RESET}" + ) + if confirmation.lower() != "y": + print(f"{WARNING}Cleanup aborted by user{RESET}") + return False + + # Close all pages + for page, _ in all_pages: + try: + await page.close() + except: + pass + + # Close all managers + for manager in managers: + try: + await manager.close() + except: + pass + + # Remove the temp directory + import shutil + + if os.path.exists(temp_dir): + shutil.rmtree(temp_dir) + + return True + + +async def test_performance_scaling_lab( num_browsers: int = 10, pages_per_browser: int = 10): + """Test performance with multiple browsers and pages. + + This test creates multiple browsers on different ports, + spawns multiple pages per browser, and measures performance metrics. 
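+
+    A minimal invocation sketch (the parameter values here are illustrative):
+
+        asyncio.run(test_performance_scaling_lab(num_browsers=4,
+                                                 pages_per_browser=5))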
+ """ + print(f"\n{INFO}========== Testing Performance Scaling =========={RESET}") + + # Configuration parameters + num_browsers = num_browsers + pages_per_browser = pages_per_browser + total_pages = num_browsers * pages_per_browser + base_port = 9222 + + # Set up a measuring mechanism for memory + import psutil + import gc + + # Force garbage collection before starting + gc.collect() + process = psutil.Process() + initial_memory = process.memory_info().rss / 1024 / 1024 # in MB + peak_memory = initial_memory + + # Report initial configuration + print( + f"{INFO}Test configuration: {num_browsers} browsers × {pages_per_browser} pages = {total_pages} total crawls{RESET}" + ) + + # List to track managers + managers: List[BrowserManager] = [] + all_pages = [] + + # Get crawl4ai home directory + crawl4ai_home = os.path.expanduser("~/.crawl4ai") + temp_dir = os.path.join(crawl4ai_home, "temp") + os.makedirs(temp_dir, exist_ok=True) + + # Create all managers but don't start them yet + manager_configs = [] + for i in range(num_browsers): + port = base_port + i + browser_config = BrowserConfig( + browser_mode="builtin", + headless=True, + debugging_port=port, + user_data_dir=os.path.join(temp_dir, f"browser_profile_{i}"), + ) + manager = BrowserManager(browser_config=browser_config, logger=logger) + manager._strategy.shutting_down = True + manager_configs.append((manager, i, port)) + + # Define async function to start a single manager + async def start_manager(manager, index, port): + try: + await manager.start() + return manager + except Exception as e: + print( + f"{ERROR}Failed to start browser {index + 1} on port {port}: {str(e)}{RESET}" + ) + return None + + # Start all managers in parallel + start_tasks = [ + start_manager(manager, i, port) for manager, i, port in manager_configs + ] + started_managers = await asyncio.gather(*start_tasks) + + # Filter out None values (failed starts) and add to managers list + managers = [m for m in started_managers if m is not None] + + if len(managers) == 0: + print(f"{ERROR}All browser managers failed to start. Aborting test.{RESET}") + return False + + if len(managers) < num_browsers: + print( + f"{WARNING}Only {len(managers)} out of {num_browsers} browser managers started successfully{RESET}" + ) + + # Create pages for each browser + for i, manager in enumerate(managers): + try: + pages = await manager.get_pages(CrawlerRunConfig(), count=pages_per_browser) + all_pages.extend(pages) + except Exception as e: + print(f"{ERROR}Failed to create pages for browser {i + 1}: {str(e)}{RESET}") + + # Check memory after page creation + gc.collect() + current_memory = process.memory_info().rss / 1024 / 1024 + peak_memory = max(peak_memory, current_memory) + + # Ask for confirmation before loading + confirmation = input( + f"{WARNING}Do you want to proceed with loading pages? 
(y/n): {RESET}" + ) + # Step 1: Create and start multiple browser managers in parallel + start_time = time.time() + + if confirmation.lower() == "y": + load_start_time = time.time() + + # Function to load a single page + async def load_page(page_ctx, index): + page, _ = page_ctx + try: + await page.goto(f"https://example.com/page{index}", timeout=30000) + title = await page.title() + return title + except Exception as e: + return f"Error: {str(e)}" + + # Load all pages concurrently + load_tasks = [load_page(page_ctx, i) for i, page_ctx in enumerate(all_pages)] + load_results = await asyncio.gather(*load_tasks, return_exceptions=True) + + # Count successes and failures + successes = sum( + 1 for r in load_results if isinstance(r, str) and not r.startswith("Error") + ) + failures = len(load_results) - successes + + load_time = time.time() - load_start_time + total_test_time = time.time() - start_time + + # Check memory after loading (peak memory) + gc.collect() + current_memory = process.memory_info().rss / 1024 / 1024 + peak_memory = max(peak_memory, current_memory) + + # Calculate key metrics + memory_per_page = peak_memory / successes if successes > 0 else 0 + time_per_crawl = total_test_time / successes if successes > 0 else 0 + crawls_per_second = successes / total_test_time if total_test_time > 0 else 0 + crawls_per_minute = crawls_per_second * 60 + crawls_per_hour = crawls_per_minute * 60 + + # Print simplified performance summary + from rich.console import Console + from rich.table import Table + + console = Console() + + # Create a simple summary table + table = Table(title="CRAWL4AI PERFORMANCE SUMMARY") + + table.add_column("Metric", style="cyan") + table.add_column("Value", style="green") + + table.add_row("Total Crawls Completed", f"{successes}") + table.add_row("Total Time", f"{total_test_time:.2f} seconds") + table.add_row("Time Per Crawl", f"{time_per_crawl:.2f} seconds") + table.add_row("Crawling Speed", f"{crawls_per_second:.2f} crawls/second") + table.add_row("Projected Rate (1 minute)", f"{crawls_per_minute:.0f} crawls") + table.add_row("Projected Rate (1 hour)", f"{crawls_per_hour:.0f} crawls") + table.add_row("Peak Memory Usage", f"{peak_memory:.2f} MB") + table.add_row("Memory Per Crawl", f"{memory_per_page:.2f} MB") + + # Display the table + console.print(table) + + # Ask confirmation before cleanup + confirmation = input( + f"{WARNING}Do you want to proceed with cleanup? 
(y/n): {RESET}" + ) + if confirmation.lower() != "y": + print(f"{WARNING}Cleanup aborted by user{RESET}") + return False + + # Close all pages + for page, _ in all_pages: + try: + await page.close() + except: + pass + + # Close all managers + for manager in managers: + try: + await manager.close() + except: + pass + + # Remove the temp directory + import shutil + + if os.path.exists(temp_dir): + shutil.rmtree(temp_dir) + + return True + + + async def main(): """Run all tests""" try: - print(f"{INFO}Starting browser_profiler and builtin browser tests{RESET}") - - # Run browser profiler tests - cdp_url = await test_browser_profiler() - - # Run crawling tests with builtin browser - if cdp_url: - await test_crawling_with_builtin_browser(cdp_url) - - # Run tests without builtin browser - # await test_crawling_without_builtin_browser() - - # Run edge case tests + print(f"{INFO}Starting builtin browser tests with browser module{RESET}") + + # # Run browser creation test + # manager, cdp_url = await test_builtin_browser_creation() + # if not manager: + # print(f"{ERROR}Browser creation failed, cannot continue tests{RESET}") + # return + + # # Run page operations test + # await test_page_operations(manager) + + # # Run browser status and management test + # await test_browser_status_management(manager) + + # # Close manager before multiple manager test + # await manager.close() + + # Run multiple managers test + # await test_multiple_managers() + + # Run performance scaling test + await test_performance_scaling() + # Run cleanup test + # await cleanup_browsers() + + # Run edge cases test # await test_edge_cases() - + print(f"\n{SUCCESS}All tests completed!{RESET}") - + except Exception as e: print(f"\n{ERROR}Test failed with error: {str(e)}{RESET}") import traceback + traceback.print_exc() finally: - # Clean up: kill any remaining builtin browser - print(f"\n{INFO}Cleaning up: killing any remaining builtin browser{RESET}") - profiler = BrowserProfiler(logger=logger) - await profiler.kill_builtin_browser() + # Clean up: kill any remaining builtin browsers + await cleanup_browsers() print(f"{SUCCESS}Test cleanup complete{RESET}") + if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file + asyncio.run(main()) diff --git a/tests/browser/test_parallel_crawling.py b/tests/browser/test_parallel_crawling.py new file mode 100644 index 00000000..9e72f06e --- /dev/null +++ b/tests/browser/test_parallel_crawling.py @@ -0,0 +1,902 @@ +""" +Test examples for parallel crawling with the browser module. + +These examples demonstrate the functionality of parallel page creation +and serve as functional tests for multi-page crawling performance. 
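+
+Each test follows the same shape: start one or more BrowserManager
+instances, fan the page work out with asyncio.gather, and compare
+wall-clock timings. A condensed sketch of that pattern (names as used
+in the tests below):
+
+    pages = await manager.get_pages(CrawlerRunConfig(), count=3)
+    titles = await asyncio.gather(*(page.title() for page, _ in pages))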
+""" + +import asyncio +import os +import sys +import time +from typing import List + +# Add the project root to Python path if running directly +if __name__ == "__main__": + sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) + +from crawl4ai.browser import BrowserManager +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig +from crawl4ai.async_logger import AsyncLogger + +# Create a logger for clear terminal output +logger = AsyncLogger(verbose=True, log_file=None) + +async def test_get_pages_basic(): + """Test basic functionality of get_pages method.""" + logger.info("Testing basic get_pages functionality", tag="TEST") + + browser_config = BrowserConfig(headless=True) + manager = BrowserManager(browser_config=browser_config, logger=logger) + + try: + await manager.start() + + # Request 3 pages + crawler_config = CrawlerRunConfig() + pages = await manager.get_pages(crawler_config, count=3) + + # Verify we got the correct number of pages + assert len(pages) == 3, f"Expected 3 pages, got {len(pages)}" + + # Verify each page is valid + for i, (page, context) in enumerate(pages): + await page.goto("https://example.com") + title = await page.title() + logger.info(f"Page {i+1} title: {title}", tag="TEST") + assert title, f"Page {i+1} has no title" + + await manager.close() + logger.success("Basic get_pages test completed successfully", tag="TEST") + return True + except Exception as e: + logger.error(f"Test failed: {str(e)}", tag="TEST") + try: + await manager.close() + except: + pass + return False + +async def test_parallel_approaches_comparison(): + """Compare two parallel crawling approaches: + 1. Create a page for each URL on-demand (get_page + gather) + 2. Get all pages upfront with get_pages, then use them (get_pages + gather) + """ + logger.info("Comparing different parallel crawling approaches", tag="TEST") + + urls = [ + "https://example.com/page1", + "https://crawl4ai.com", + "https://kidocode.com", + "https://bbc.com", + # "https://example.com/page1", + # "https://example.com/page2", + # "https://example.com/page3", + # "https://example.com/page4", + ] + + browser_config = BrowserConfig(headless=False) + manager = BrowserManager(browser_config=browser_config, logger=logger) + + try: + await manager.start() + + # Approach 1: Create a page for each URL on-demand and run in parallel + logger.info("Testing approach 1: get_page for each URL + gather", tag="TEST") + start_time = time.time() + + async def fetch_title_approach1(url): + """Create a new page for each URL, go to the URL, and get title""" + crawler_config = CrawlerRunConfig(url=url) + page, context = await manager.get_page(crawler_config) + try: + await page.goto(url) + title = await page.title() + return title + finally: + await page.close() + + # Run fetch_title_approach1 for each URL in parallel + tasks = [fetch_title_approach1(url) for url in urls] + approach1_results = await asyncio.gather(*tasks) + + approach1_time = time.time() - start_time + logger.info(f"Approach 1 time (get_page + gather): {approach1_time:.2f}s", tag="TEST") + + # Approach 2: Get all pages upfront with get_pages, then use them in parallel + logger.info("Testing approach 2: get_pages upfront + gather", tag="TEST") + start_time = time.time() + + # Get all pages upfront + crawler_config = CrawlerRunConfig() + pages = await manager.get_pages(crawler_config, count=len(urls)) + + async def fetch_title_approach2(page_ctx, url): + """Use a pre-created page to go to URL and get title""" + page, _ = page_ctx + 
try: + await page.goto(url) + title = await page.title() + return title + finally: + await page.close() + + # Use the pre-created pages to fetch titles in parallel + tasks = [fetch_title_approach2(page_ctx, url) for page_ctx, url in zip(pages, urls)] + approach2_results = await asyncio.gather(*tasks) + + approach2_time = time.time() - start_time + logger.info(f"Approach 2 time (get_pages + gather): {approach2_time:.2f}s", tag="TEST") + + # Compare results and performance + speedup = approach1_time / approach2_time if approach2_time > 0 else 0 + if speedup > 1: + logger.success(f"Approach 2 (get_pages upfront) was {speedup:.2f}x faster", tag="TEST") + else: + logger.info(f"Approach 1 (get_page + gather) was {1/speedup:.2f}x faster", tag="TEST") + + # Verify same content was retrieved in both approaches + assert len(approach1_results) == len(approach2_results), "Result count mismatch" + + # Sort results for comparison since parallel execution might complete in different order + assert sorted(approach1_results) == sorted(approach2_results), "Results content mismatch" + + await manager.close() + return True + + except Exception as e: + logger.error(f"Test failed: {str(e)}", tag="TEST") + try: + await manager.close() + except: + pass + return False + +async def test_multi_browser_scaling(num_browsers=3, pages_per_browser=5): + """Test performance with multiple browsers and pages per browser. + Compares two approaches: + 1. On-demand page creation (get_page + gather) + 2. Pre-created pages (get_pages + gather) + """ + logger.info(f"Testing multi-browser scaling with {num_browsers} browsers × {pages_per_browser} pages", tag="TEST") + + # Generate test URLs + total_pages = num_browsers * pages_per_browser + urls = [f"https://example.com/page_{i}" for i in range(total_pages)] + + # Create browser managers + managers = [] + base_port = 9222 + + try: + # Start all browsers in parallel + start_tasks = [] + for i in range(num_browsers): + browser_config = BrowserConfig( + headless=True # Using default browser mode like in test_parallel_approaches_comparison + ) + manager = BrowserManager(browser_config=browser_config, logger=logger) + start_tasks.append(manager.start()) + managers.append(manager) + + await asyncio.gather(*start_tasks) + + # Distribute URLs among managers + urls_per_manager = {} + for i, manager in enumerate(managers): + start_idx = i * pages_per_browser + end_idx = min(start_idx + pages_per_browser, len(urls)) + urls_per_manager[manager] = urls[start_idx:end_idx] + + # Approach 1: Create a page for each URL on-demand and run in parallel + logger.info("Testing approach 1: get_page for each URL + gather", tag="TEST") + start_time = time.time() + + async def fetch_title_approach1(manager, url): + """Create a new page for the URL, go to the URL, and get title""" + crawler_config = CrawlerRunConfig(url=url) + page, context = await manager.get_page(crawler_config) + try: + await page.goto(url) + title = await page.title() + return title + finally: + await page.close() + + # Run fetch_title_approach1 for each URL in parallel + tasks = [] + for manager, manager_urls in urls_per_manager.items(): + for url in manager_urls: + tasks.append(fetch_title_approach1(manager, url)) + + approach1_results = await asyncio.gather(*tasks) + + approach1_time = time.time() - start_time + logger.info(f"Approach 1 time (get_page + gather): {approach1_time:.2f}s", tag="TEST") + + # Approach 2: Get all pages upfront with get_pages, then use them in parallel + logger.info("Testing approach 2: get_pages upfront + 
gather", tag="TEST") + start_time = time.time() + + # Get all pages upfront for each manager + all_pages = [] + for manager, manager_urls in urls_per_manager.items(): + crawler_config = CrawlerRunConfig() + pages = await manager.get_pages(crawler_config, count=len(manager_urls)) + all_pages.extend(zip(pages, manager_urls)) + + async def fetch_title_approach2(page_ctx, url): + """Use a pre-created page to go to URL and get title""" + page, _ = page_ctx + try: + await page.goto(url) + title = await page.title() + return title + finally: + await page.close() + + # Use the pre-created pages to fetch titles in parallel + tasks = [fetch_title_approach2(page_ctx, url) for page_ctx, url in all_pages] + approach2_results = await asyncio.gather(*tasks) + + approach2_time = time.time() - start_time + logger.info(f"Approach 2 time (get_pages + gather): {approach2_time:.2f}s", tag="TEST") + + # Compare results and performance + speedup = approach1_time / approach2_time if approach2_time > 0 else 0 + pages_per_second = total_pages / approach2_time + + # Show a simple summary + logger.info(f"📊 Summary: {num_browsers} browsers × {pages_per_browser} pages = {total_pages} total crawls", tag="TEST") + logger.info(f"⚡ Performance: {pages_per_second:.1f} pages/second ({pages_per_second*60:.0f} pages/minute)", tag="TEST") + logger.info(f"🚀 Total crawl time: {approach2_time:.2f} seconds", tag="TEST") + + if speedup > 1: + logger.success(f"✅ Approach 2 (get_pages upfront) was {speedup:.2f}x faster", tag="TEST") + else: + logger.info(f"✅ Approach 1 (get_page + gather) was {1/speedup:.2f}x faster", tag="TEST") + + # Close all managers + for manager in managers: + await manager.close() + + return True + + except Exception as e: + logger.error(f"Test failed: {str(e)}", tag="TEST") + # Clean up + for manager in managers: + try: + await manager.close() + except: + pass + return False + +async def grid_search_optimal_configuration(total_urls=50): + """Perform a grid search to find the optimal balance between number of browsers and pages per browser. + + This function tests different combinations of browser count and pages per browser, + while keeping the total number of URLs constant. It measures performance metrics + for each configuration to find the "sweet spot" that provides the best speed + with reasonable memory usage. 
+ + Args: + total_urls: Total number of URLs to crawl (default: 50) + """ + logger.info(f"=== GRID SEARCH FOR OPTIMAL CRAWLING CONFIGURATION ({total_urls} URLs) ===", tag="TEST") + + # Generate test URLs once + urls = [f"https://example.com/page_{i}" for i in range(total_urls)] + + # Define grid search configurations + # We'll use more flexible approach: test all browser counts from 1 to min(20, total_urls) + # and distribute pages evenly (some browsers may have 1 more page than others) + configurations = [] + + # Maximum number of browsers to test + max_browsers_to_test = min(20, total_urls) + + # Try configurations with 1 to max_browsers_to_test browsers + for num_browsers in range(1, max_browsers_to_test + 1): + base_pages_per_browser = total_urls // num_browsers + remainder = total_urls % num_browsers + + # Generate exact page distribution array + if remainder > 0: + # First 'remainder' browsers get one more page + page_distribution = [base_pages_per_browser + 1] * remainder + [base_pages_per_browser] * (num_browsers - remainder) + pages_distribution = f"{base_pages_per_browser+1} pages × {remainder} browsers, {base_pages_per_browser} pages × {num_browsers - remainder} browsers" + else: + # All browsers get the same number of pages + page_distribution = [base_pages_per_browser] * num_browsers + pages_distribution = f"{base_pages_per_browser} pages × {num_browsers} browsers" + + # Format the distribution as a tuple string like (4, 4, 3, 3) + distribution_str = str(tuple(page_distribution)) + + configurations.append((num_browsers, base_pages_per_browser, pages_distribution, page_distribution, distribution_str)) + + # Track results + results = [] + + # Test each configuration + for num_browsers, pages_per_browser, pages_distribution, page_distribution, distribution_str in configurations: + logger.info("-" * 80, tag="TEST") + logger.info(f"Testing configuration: {num_browsers} browsers with distribution: {distribution_str}", tag="TEST") + logger.info(f"Details: {pages_distribution}", tag="TEST") + # Sleep a bit for randomness + await asyncio.sleep(0.5) + + try: + # Import psutil for memory tracking + try: + import psutil + process = psutil.Process() + initial_memory = process.memory_info().rss / (1024 * 1024) # MB + except ImportError: + logger.warning("psutil not available, memory metrics will not be tracked", tag="TEST") + initial_memory = 0 + + # Create and start browser managers + managers = [] + start_time = time.time() + + # Start all browsers in parallel + start_tasks = [] + for i in range(num_browsers): + browser_config = BrowserConfig( + headless=True + ) + manager = BrowserManager(browser_config=browser_config, logger=logger) + start_tasks.append(manager.start()) + managers.append(manager) + + await asyncio.gather(*start_tasks) + browser_startup_time = time.time() - start_time + + # Measure memory after browser startup + if initial_memory > 0: + browser_memory = process.memory_info().rss / (1024 * 1024) - initial_memory + else: + browser_memory = 0 + + # Distribute URLs among managers using the exact page distribution + urls_per_manager = {} + total_assigned = 0 + + for i, manager in enumerate(managers): + if i < len(page_distribution): + # Get the exact number of pages for this browser from our distribution + manager_pages = page_distribution[i] + + # Get the URL slice for this manager + start_idx = total_assigned + end_idx = start_idx + manager_pages + urls_per_manager[manager] = urls[start_idx:end_idx] + total_assigned += manager_pages + else: + # If we have more managers than 
our distribution (should never happen) + urls_per_manager[manager] = [] + + # Use the more efficient approach (pre-created pages) + logger.info("Running page crawling test...", tag="TEST") + crawl_start_time = time.time() + + # Get all pages upfront for each manager + all_pages = [] + for manager, manager_urls in urls_per_manager.items(): + if not manager_urls: # Skip managers with no URLs + continue + crawler_config = CrawlerRunConfig() + pages = await manager.get_pages(crawler_config, count=len(manager_urls)) + all_pages.extend(zip(pages, manager_urls)) + + # Measure memory after page creation + if initial_memory > 0: + pages_memory = process.memory_info().rss / (1024 * 1024) - browser_memory - initial_memory + else: + pages_memory = 0 + + # Function to crawl a URL with a pre-created page + async def fetch_title(page_ctx, url): + page, _ = page_ctx + try: + await page.goto(url) + title = await page.title() + return title + finally: + await page.close() + + # Use the pre-created pages to fetch titles in parallel + tasks = [fetch_title(page_ctx, url) for page_ctx, url in all_pages] + crawl_results = await asyncio.gather(*tasks) + + crawl_time = time.time() - crawl_start_time + total_time = time.time() - start_time + + # Final memory measurement + if initial_memory > 0: + peak_memory = max(browser_memory + pages_memory, process.memory_info().rss / (1024 * 1024) - initial_memory) + else: + peak_memory = 0 + + # Close all managers + for manager in managers: + await manager.close() + + # Calculate metrics + pages_per_second = total_urls / crawl_time + + # Store result metrics + result = { + "num_browsers": num_browsers, + "pages_per_browser": pages_per_browser, + "page_distribution": page_distribution, + "distribution_str": distribution_str, + "total_urls": total_urls, + "browser_startup_time": browser_startup_time, + "crawl_time": crawl_time, + "total_time": total_time, + "browser_memory": browser_memory, + "pages_memory": pages_memory, + "peak_memory": peak_memory, + "pages_per_second": pages_per_second, + # Calculate efficiency score (higher is better) + # This balances speed vs memory usage + "efficiency_score": pages_per_second / (peak_memory + 1) if peak_memory > 0 else pages_per_second, + } + + results.append(result) + + # Log the results + logger.info(f"Browser startup: {browser_startup_time:.2f}s", tag="TEST") + logger.info(f"Crawl time: {crawl_time:.2f}s", tag="TEST") + logger.info(f"Total time: {total_time:.2f}s", tag="TEST") + logger.info(f"Performance: {pages_per_second:.1f} pages/second", tag="TEST") + + if peak_memory > 0: + logger.info(f"Browser memory: {browser_memory:.1f}MB", tag="TEST") + logger.info(f"Pages memory: {pages_memory:.1f}MB", tag="TEST") + logger.info(f"Peak memory: {peak_memory:.1f}MB", tag="TEST") + logger.info(f"Efficiency score: {result['efficiency_score']:.6f}", tag="TEST") + + except Exception as e: + logger.error(f"Error testing configuration: {str(e)}", tag="TEST") + import traceback + traceback.print_exc() + + # Clean up + for manager in managers: + try: + await manager.close() + except: + pass + + # Print summary of all configurations + logger.info("=" * 100, tag="TEST") + logger.info("GRID SEARCH RESULTS SUMMARY", tag="TEST") + logger.info("=" * 100, tag="TEST") + + # Rank configurations by efficiency score + ranked_results = sorted(results, key=lambda x: x["efficiency_score"], reverse=True) + + # Also determine rankings by different metrics + fastest = sorted(results, key=lambda x: x["crawl_time"])[0] + lowest_memory = sorted(results, key=lambda x: 
x["peak_memory"] if x["peak_memory"] > 0 else float('inf'))[0] + most_efficient = ranked_results[0] + + # Print top performers by category + logger.info("🏆 TOP PERFORMERS BY CATEGORY:", tag="TEST") + logger.info(f"⚡ Fastest: {fastest['num_browsers']} browsers × ~{fastest['pages_per_browser']} pages " + + f"({fastest['crawl_time']:.2f}s, {fastest['pages_per_second']:.1f} pages/s)", tag="TEST") + + if lowest_memory["peak_memory"] > 0: + logger.info(f"💾 Lowest memory: {lowest_memory['num_browsers']} browsers × ~{lowest_memory['pages_per_browser']} pages " + + f"({lowest_memory['peak_memory']:.1f}MB)", tag="TEST") + + logger.info(f"🌟 Most efficient: {most_efficient['num_browsers']} browsers × ~{most_efficient['pages_per_browser']} pages " + + f"(score: {most_efficient['efficiency_score']:.6f})", tag="TEST") + + # Print result table header + logger.info("\n📊 COMPLETE RANKING TABLE (SORTED BY EFFICIENCY SCORE):", tag="TEST") + logger.info("-" * 120, tag="TEST") + + # Define table header + header = f"{'Rank':<5} | {'Browsers':<8} | {'Distribution':<55} | {'Total Time(s)':<12} | {'Speed(p/s)':<12} | {'Memory(MB)':<12} | {'Efficiency':<10} | {'Notes'}" + logger.info(header, tag="TEST") + logger.info("-" * 120, tag="TEST") + + # Print each configuration in ranked order + for rank, result in enumerate(ranked_results, 1): + # Add special notes for top performers + notes = [] + if result == fastest: + notes.append("⚡ Fastest") + if result == lowest_memory: + notes.append("💾 Lowest Memory") + if result == most_efficient: + notes.append("🌟 Most Efficient") + + notes_str = " | ".join(notes) if notes else "" + + # Format memory if available + memory_str = f"{result['peak_memory']:.1f}" if result['peak_memory'] > 0 else "N/A" + + # Get the distribution string + dist_str = result.get('distribution_str', str(tuple([result['pages_per_browser']] * result['num_browsers']))) + + # Build the row + row = f"{rank:<5} | {result['num_browsers']:<8} | {dist_str:<55} | {result['total_time']:.2f}s{' ':<7} | " + row += f"{result['pages_per_second']:.2f}{' ':<6} | {memory_str}{' ':<6} | {result['efficiency_score']:.4f}{' ':<4} | {notes_str}" + + logger.info(row, tag="TEST") + + logger.info("-" * 120, tag="TEST") + + # Generate visualization if matplotlib is available + try: + import matplotlib.pyplot as plt + import numpy as np + + # Extract data for plotting from ranked results + browser_counts = [r["num_browsers"] for r in ranked_results] + efficiency_scores = [r["efficiency_score"] for r in ranked_results] + crawl_times = [r["crawl_time"] for r in ranked_results] + total_times = [r["total_time"] for r in ranked_results] + + # Filter results with memory data + memory_results = [r for r in ranked_results if r["peak_memory"] > 0] + memory_browser_counts = [r["num_browsers"] for r in memory_results] + peak_memories = [r["peak_memory"] for r in memory_results] + + # Create figure with clean design + plt.figure(figsize=(14, 12), facecolor='white') + plt.style.use('ggplot') + + # Create grid for subplots + gs = plt.GridSpec(3, 1, height_ratios=[1, 1, 1], hspace=0.3) + + # Plot 1: Efficiency Score (higher is better) + ax1 = plt.subplot(gs[0]) + bar_colors = ['#3498db'] * len(browser_counts) + + # Highlight the most efficient + most_efficient_idx = browser_counts.index(most_efficient["num_browsers"]) + bar_colors[most_efficient_idx] = '#e74c3c' # Red for most efficient + + bars = ax1.bar(range(len(browser_counts)), efficiency_scores, color=bar_colors) + ax1.set_xticks(range(len(browser_counts))) + ax1.set_xticklabels([f"{bc}" 
for bc in browser_counts], rotation=45) + ax1.set_xlabel('Number of Browsers') + ax1.set_ylabel('Efficiency Score (higher is better)') + ax1.set_title('Browser Configuration Efficiency (higher is better)') + + # Add value labels on top of bars + for bar, score in zip(bars, efficiency_scores): + height = bar.get_height() + ax1.text(bar.get_x() + bar.get_width()/2., height + 0.02*max(efficiency_scores), + f'{score:.3f}', ha='center', va='bottom', rotation=90, fontsize=8) + + # Highlight best configuration + ax1.text(0.02, 0.90, f"🌟 Most Efficient: {most_efficient['num_browsers']} browsers with ~{most_efficient['pages_per_browser']} pages", + transform=ax1.transAxes, fontsize=12, verticalalignment='top', + bbox=dict(boxstyle='round,pad=0.5', facecolor='yellow', alpha=0.3)) + + # Plot 2: Time Performance + ax2 = plt.subplot(gs[1]) + + # Plot both total time and crawl time + ax2.plot(browser_counts, crawl_times, 'bo-', label='Crawl Time (s)', linewidth=2) + ax2.plot(browser_counts, total_times, 'go--', label='Total Time (s)', linewidth=2, alpha=0.6) + + # Mark the fastest configuration + fastest_idx = browser_counts.index(fastest["num_browsers"]) + ax2.plot(browser_counts[fastest_idx], crawl_times[fastest_idx], 'ro', ms=10, + label=f'Fastest: {fastest["num_browsers"]} browsers') + + ax2.set_xlabel('Number of Browsers') + ax2.set_ylabel('Time (seconds)') + ax2.set_title(f'Time Performance for {total_urls} URLs by Browser Count') + ax2.grid(True, linestyle='--', alpha=0.7) + ax2.legend(loc='upper right') + + # Plot pages per second on second y-axis + pages_per_second = [total_urls/t for t in crawl_times] + ax2_twin = ax2.twinx() + ax2_twin.plot(browser_counts, pages_per_second, 'r^--', label='Pages/second', alpha=0.5) + ax2_twin.set_ylabel('Pages per second') + + # Add note about the fastest configuration + ax2.text(0.02, 0.90, f"⚡ Fastest: {fastest['num_browsers']} browsers with ~{fastest['pages_per_browser']} pages" + + f"\n {fastest['crawl_time']:.2f}s ({fastest['pages_per_second']:.1f} pages/s)", + transform=ax2.transAxes, fontsize=12, verticalalignment='top', + bbox=dict(boxstyle='round,pad=0.5', facecolor='lightblue', alpha=0.3)) + + # Plot 3: Memory Usage (if available) + if memory_results: + ax3 = plt.subplot(gs[2]) + + # Prepare data for grouped bar chart + memory_per_browser = [m/n for m, n in zip(peak_memories, memory_browser_counts)] + memory_per_page = [m/(n*p) for m, n, p in zip( + [r["peak_memory"] for r in memory_results], + [r["num_browsers"] for r in memory_results], + [r["pages_per_browser"] for r in memory_results])] + + x = np.arange(len(memory_browser_counts)) + width = 0.35 + + # Create grouped bars + ax3.bar(x - width/2, peak_memories, width, label='Total Memory (MB)', color='#9b59b6') + ax3.bar(x + width/2, memory_per_browser, width, label='Memory per Browser (MB)', color='#3498db') + + # Configure axis + ax3.set_xticks(x) + ax3.set_xticklabels([f"{bc}" for bc in memory_browser_counts], rotation=45) + ax3.set_xlabel('Number of Browsers') + ax3.set_ylabel('Memory (MB)') + ax3.set_title('Memory Usage by Browser Configuration') + ax3.legend(loc='upper left') + ax3.grid(True, linestyle='--', alpha=0.7) + + # Add second y-axis for memory per page + ax3_twin = ax3.twinx() + ax3_twin.plot(x, memory_per_page, 'ro-', label='Memory per Page (MB)') + ax3_twin.set_ylabel('Memory per Page (MB)') + + # Get lowest memory configuration + lowest_memory_idx = memory_browser_counts.index(lowest_memory["num_browsers"]) + + # Add note about lowest memory configuration + ax3.text(0.02, 0.90, 
f"💾 Lowest Memory: {lowest_memory['num_browsers']} browsers with ~{lowest_memory['pages_per_browser']} pages" + + f"\n {lowest_memory['peak_memory']:.1f}MB ({lowest_memory['peak_memory']/total_urls:.2f}MB per page)", + transform=ax3.transAxes, fontsize=12, verticalalignment='top', + bbox=dict(boxstyle='round,pad=0.5', facecolor='lightgreen', alpha=0.3)) + + # Add overall title + plt.suptitle(f'Browser Scaling Grid Search Results for {total_urls} URLs', fontsize=16, y=0.98) + + # Add timestamp and info at the bottom + plt.figtext(0.5, 0.01, f"Generated by Crawl4AI at {time.strftime('%Y-%m-%d %H:%M:%S')}", + ha="center", fontsize=10, style='italic') + + # Get current directory and save the figure there + import os + __current_file = os.path.abspath(__file__) + current_dir = os.path.dirname(__current_file) + output_file = os.path.join(current_dir, 'browser_scaling_grid_search.png') + + # Adjust layout and save figure with high DPI + plt.tight_layout(rect=[0, 0.03, 1, 0.97]) + plt.savefig(output_file, dpi=200, bbox_inches='tight') + logger.success(f"Visualization saved to {output_file}", tag="TEST") + + except ImportError: + logger.warning("matplotlib not available, skipping visualization", tag="TEST") + + return most_efficient["num_browsers"], most_efficient["pages_per_browser"] + +async def find_optimal_browser_config(total_urls=50, verbose=True, rate_limit_delay=0.2): + """Find optimal browser configuration for crawling a specific number of URLs. + + Args: + total_urls: Number of URLs to crawl + verbose: Whether to print progress + rate_limit_delay: Delay between page loads to avoid rate limiting + + Returns: + dict: Contains fastest, lowest_memory, and optimal configurations + """ + if verbose: + print(f"\n=== Finding optimal configuration for crawling {total_urls} URLs ===\n") + + # Generate test URLs with timestamp to avoid caching + timestamp = int(time.time()) + urls = [f"https://example.com/page_{i}?t={timestamp}" for i in range(total_urls)] + + # Limit browser configurations to test (1 browser to max 10) + max_browsers = min(10, total_urls) + configs_to_test = [] + + # Generate configurations (browser count, pages distribution) + for num_browsers in range(1, max_browsers + 1): + base_pages = total_urls // num_browsers + remainder = total_urls % num_browsers + + # Create distribution array like [3, 3, 2, 2] (some browsers get one more page) + if remainder > 0: + distribution = [base_pages + 1] * remainder + [base_pages] * (num_browsers - remainder) + else: + distribution = [base_pages] * num_browsers + + configs_to_test.append((num_browsers, distribution)) + + results = [] + + # Test each configuration + for browser_count, page_distribution in configs_to_test: + if verbose: + print(f"Testing {browser_count} browsers with distribution {tuple(page_distribution)}") + + try: + # Track memory if possible + try: + import psutil + process = psutil.Process() + start_memory = process.memory_info().rss / (1024 * 1024) # MB + except ImportError: + if verbose: + print("Memory tracking not available (psutil not installed)") + start_memory = 0 + + # Start browsers in parallel + managers = [] + start_tasks = [] + start_time = time.time() + + for i in range(browser_count): + config = BrowserConfig(headless=True) + manager = BrowserManager(browser_config=config, logger=logger) + start_tasks.append(manager.start()) + managers.append(manager) + + await asyncio.gather(*start_tasks) + + # Distribute URLs among browsers + urls_per_manager = {} + url_index = 0 + + for i, manager in enumerate(managers): + 
pages_for_this_browser = page_distribution[i] + end_index = url_index + pages_for_this_browser + urls_per_manager[manager] = urls[url_index:end_index] + url_index = end_index + + # Create pages for each browser + all_pages = [] + for manager, manager_urls in urls_per_manager.items(): + if not manager_urls: + continue + pages = await manager.get_pages(CrawlerRunConfig(), count=len(manager_urls)) + all_pages.extend(zip(pages, manager_urls)) + + # Crawl pages with delay to avoid rate limiting + async def crawl_page(page_ctx, url): + page, _ = page_ctx + try: + await page.goto(url) + if rate_limit_delay > 0: + await asyncio.sleep(rate_limit_delay) + title = await page.title() + return title + finally: + await page.close() + + crawl_start = time.time() + crawl_tasks = [crawl_page(page_ctx, url) for page_ctx, url in all_pages] + await asyncio.gather(*crawl_tasks) + crawl_time = time.time() - crawl_start + total_time = time.time() - start_time + + # Measure final memory usage + if start_memory > 0: + end_memory = process.memory_info().rss / (1024 * 1024) + memory_used = end_memory - start_memory + else: + memory_used = 0 + + # Close all browsers + for manager in managers: + await manager.close() + + # Calculate metrics + pages_per_second = total_urls / crawl_time + + # Calculate efficiency score (higher is better) + # This balances speed vs memory + if memory_used > 0: + efficiency = pages_per_second / (memory_used + 1) + else: + efficiency = pages_per_second + + # Store result + result = { + "browser_count": browser_count, + "distribution": tuple(page_distribution), + "crawl_time": crawl_time, + "total_time": total_time, + "memory_used": memory_used, + "pages_per_second": pages_per_second, + "efficiency": efficiency + } + + results.append(result) + + if verbose: + print(f" ✓ Crawled {total_urls} pages in {crawl_time:.2f}s ({pages_per_second:.1f} pages/sec)") + if memory_used > 0: + print(f" ✓ Memory used: {memory_used:.1f}MB ({memory_used/total_urls:.1f}MB per page)") + print(f" ✓ Efficiency score: {efficiency:.4f}") + + except Exception as e: + if verbose: + print(f" ✗ Error: {str(e)}") + + # Clean up + for manager in managers: + try: + await manager.close() + except: + pass + + # If no successful results, return None + if not results: + return None + + # Find best configurations + fastest = sorted(results, key=lambda x: x["crawl_time"])[0] + + # Only consider memory if available + memory_results = [r for r in results if r["memory_used"] > 0] + if memory_results: + lowest_memory = sorted(memory_results, key=lambda x: x["memory_used"])[0] + else: + lowest_memory = fastest + + # Find most efficient (balanced speed vs memory) + optimal = sorted(results, key=lambda x: x["efficiency"], reverse=True)[0] + + # Print summary + if verbose: + print("\n=== OPTIMAL CONFIGURATIONS ===") + print(f"⚡ Fastest: {fastest['browser_count']} browsers {fastest['distribution']}") + print(f" {fastest['crawl_time']:.2f}s, {fastest['pages_per_second']:.1f} pages/sec") + + print(f"💾 Memory-efficient: {lowest_memory['browser_count']} browsers {lowest_memory['distribution']}") + if lowest_memory["memory_used"] > 0: + print(f" {lowest_memory['memory_used']:.1f}MB, {lowest_memory['memory_used']/total_urls:.2f}MB per page") + + print(f"🌟 Balanced optimal: {optimal['browser_count']} browsers {optimal['distribution']}") + print(f" {optimal['crawl_time']:.2f}s, {optimal['pages_per_second']:.1f} pages/sec, score: {optimal['efficiency']:.4f}") + + return { + "fastest": fastest, + "lowest_memory": lowest_memory, + "optimal": 
optimal, + "all_configs": results + } + +async def run_tests(): + """Run all tests sequentially.""" + results = [] + + # Find optimal configuration using our utility function + configs = await find_optimal_browser_config( + total_urls=20, # Use a small number for faster testing + verbose=True, + rate_limit_delay=0.2 # 200ms delay between page loads to avoid rate limiting + ) + + if configs: + # Show the optimal configuration + optimal = configs["optimal"] + print(f"\n🎯 Recommended configuration for production use:") + print(f" {optimal['browser_count']} browsers with distribution {optimal['distribution']}") + print(f" Estimated performance: {optimal['pages_per_second']:.1f} pages/second") + results.append(True) + else: + print("\n❌ Failed to find optimal configuration") + results.append(False) + + # Print summary + total = len(results) + passed = sum(results) + print(f"\nTests complete: {passed}/{total} passed") + + if passed == total: + print("All tests passed!") + else: + print(f"{total - passed} tests failed") + +if __name__ == "__main__": + asyncio.run(run_tests()) \ No newline at end of file From 6eeb2e4076d9822b429e71081d34f64875a92b5d Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 23 Mar 2025 19:07:13 +0800 Subject: [PATCH 3/5] feat(browser): enhance browser context creation with user data directory support and improved storage state handling --- crawl4ai/browser/strategies.py | 222 +++++++++++++++++++++------------ 1 file changed, 139 insertions(+), 83 deletions(-) diff --git a/crawl4ai/browser/strategies.py b/crawl4ai/browser/strategies.py index 85feef36..68d2d97d 100644 --- a/crawl4ai/browser/strategies.py +++ b/crawl4ai/browser/strategies.py @@ -139,6 +139,112 @@ class BaseBrowserStrategy(ABC): signature_hash = hashlib.sha256(signature_json.encode("utf-8")).hexdigest() return signature_hash + async def create_browser_context(self, crawlerRunConfig: Optional[CrawlerRunConfig] = None) -> BrowserContext: + """Creates and returns a new browser context with configured settings. 
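+
+        This shared base implementation builds the context settings from
+        BrowserConfig; PlaywrightBrowserStrategy and CDPBrowserStrategy extend
+        it and delegate back via super() for the common handling.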
+
+        Args:
+            crawlerRunConfig: Configuration object for the crawler run
+
+        Returns:
+            BrowserContext: Browser context object with the specified configurations
+        """
+        if not self.browser:
+            raise ValueError("Browser must be initialized before creating context")
+
+        # Base settings
+        user_agent = self.config.headers.get("User-Agent", self.config.user_agent)
+        viewport_settings = {
+            "width": self.config.viewport_width,
+            "height": self.config.viewport_height,
+        }
+        proxy_settings = {"server": self.config.proxy} if self.config.proxy else None
+
+        # Define blocked extensions for resource optimization
+        blocked_extensions = [
+            # Images
+            "jpg", "jpeg", "png", "gif", "webp", "svg", "ico", "bmp", "tiff", "psd",
+            # Fonts
+            "woff", "woff2", "ttf", "otf", "eot",
+            # Media
+            "mp4", "webm", "ogg", "avi", "mov", "wmv", "flv", "m4v", "mp3", "wav", "aac",
+            "m4a", "opus", "flac",
+            # Documents
+            "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",
+            # Archives
+            "zip", "rar", "7z", "tar", "gz",
+            # Scripts and data
+            "xml", "swf", "wasm",
+        ]
+
+        # Common context settings
+        context_settings = {
+            "user_agent": user_agent,
+            "viewport": viewport_settings,
+            "proxy": proxy_settings,
+            "accept_downloads": self.config.accept_downloads,
+            "ignore_https_errors": self.config.ignore_https_errors,
+            "device_scale_factor": 1.0,
+            "java_script_enabled": self.config.java_script_enabled,
+        }
+
+        # Apply text mode settings if enabled
+        if self.config.text_mode:
+            text_mode_settings = {
+                "has_touch": False,
+                "is_mobile": False,
+                # Disable JavaScript in text mode
+                "java_script_enabled": False
+            }
+            # Update context settings with text mode settings
+            context_settings.update(text_mode_settings)
+            if self.logger:
+                self.logger.debug("Text mode enabled for browser context", tag="BROWSER")
+
+        # Handle storage state properly - this is key for persistence
+        if self.config.storage_state:
+            context_settings["storage_state"] = self.config.storage_state
+            if self.logger:
+                if isinstance(self.config.storage_state, str):
+                    self.logger.debug(f"Using storage state from file: {self.config.storage_state}", tag="BROWSER")
+                else:
+                    self.logger.debug("Using storage state from config object", tag="BROWSER")
+
+        # If user_data_dir is specified, browser persistence should be automatic
+        if self.config.user_data_dir and self.logger:
+            self.logger.debug(f"Using user data directory: {self.config.user_data_dir}", tag="BROWSER")
+
+        # Apply crawler-specific configurations if provided
+        if crawlerRunConfig:
+            # If crawlerRunConfig.proxy_config is set, add it to the context settings
+            if crawlerRunConfig.proxy_config:
+                proxy_settings = {
+                    "server": crawlerRunConfig.proxy_config.server,
+                }
+                if crawlerRunConfig.proxy_config.username:
+                    proxy_settings.update({
+                        "username": crawlerRunConfig.proxy_config.username,
+                        "password": crawlerRunConfig.proxy_config.password,
+                    })
+                context_settings["proxy"] = proxy_settings
+
+        # Create and return the context
+        try:
+            # Create the context with appropriate settings
+            context = await self.browser.new_context(**context_settings)
+
+            # Apply text mode resource blocking if enabled
+            if self.config.text_mode:
+                # Create and apply route patterns for each extension
+                for ext in blocked_extensions:
+                    await context.route(f"**/*.{ext}", lambda route: route.abort())
+
+            return context
+        except Exception as e:
+            if self.logger:
+                self.logger.error(f"Error creating browser context: {str(e)}", tag="BROWSER")
+            # Fall back to basic context creation if the advanced settings fail
+            return await
self.browser.new_context() + async def setup_context(self, context: BrowserContext, crawlerRunConfig: Optional[CrawlerRunConfig] = None): """Set up a browser context with the configured options. @@ -301,97 +407,32 @@ class PlaywrightBrowserStrategy(BaseBrowserStrategy): async def create_browser_context(self, crawlerRunConfig: Optional[CrawlerRunConfig] = None) -> BrowserContext: """Creates and returns a new browser context with configured settings. + This implementation extends the base class version to handle user_data_dir specifically. + Args: crawlerRunConfig: Configuration object for the crawler run Returns: BrowserContext: Browser context object with the specified configurations """ - # Base settings - user_agent = self.config.headers.get("User-Agent", self.config.user_agent) - viewport_settings = { - "width": self.config.viewport_width, - "height": self.config.viewport_height, - } - proxy_settings = {"server": self.config.proxy} if self.config.proxy else None - - blocked_extensions = [ - # Images - "jpg", "jpeg", "png", "gif", "webp", "svg", "ico", "bmp", "tiff", "psd", - # Fonts - "woff", "woff2", "ttf", "otf", "eot", - # Media - "mp4", "webm", "ogg", "avi", "mov", "wmv", "flv", "m4v", "mp3", "wav", "aac", - "m4a", "opus", "flac", - # Documents - "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx", - # Archives - "zip", "rar", "7z", "tar", "gz", - # Scripts and data - "xml", "swf", "wasm", - ] - - # Common context settings - context_settings = { - "user_agent": user_agent, - "viewport": viewport_settings, - "proxy": proxy_settings, - "accept_downloads": self.config.accept_downloads, - "ignore_https_errors": self.config.ignore_https_errors, - "device_scale_factor": 1.0, - "java_script_enabled": self.config.java_script_enabled, - } - - # Handle storage state properly - this is key for persistence - if self.config.storage_state: - context_settings["storage_state"] = self.config.storage_state - if self.logger: - if isinstance(self.config.storage_state, str): - self.logger.debug(f"Using storage state from file: {self.config.storage_state}", tag="BROWSER") - else: - self.logger.debug("Using storage state from config object", tag="BROWSER") - + # Handle user_data_dir explicitly to ensure storage persistence if self.config.user_data_dir: - context_settings["storage_state"] = os.path.join( - self.config.user_data_dir, "Default", "storage_state.json" - ) + # Create a storage state file path if none exists + storage_path = os.path.join(self.config.user_data_dir, "Default", "storage_state.json") + # Create the file if it doesn't exist - if not os.path.exists(context_settings["storage_state"]): - os.makedirs(os.path.dirname(context_settings["storage_state"]), exist_ok=True) - with open(context_settings["storage_state"], "w") as f: + if not os.path.exists(storage_path): + os.makedirs(os.path.dirname(storage_path), exist_ok=True) + with open(storage_path, "w") as f: json.dump({}, f) - - - if crawlerRunConfig: - # Check if there is value for crawlerRunConfig.proxy_config set add that to context - if crawlerRunConfig.proxy_config: - proxy_settings = { - "server": crawlerRunConfig.proxy_config.server, - } - if crawlerRunConfig.proxy_config.username: - proxy_settings.update({ - "username": crawlerRunConfig.proxy_config.username, - "password": crawlerRunConfig.proxy_config.password, - }) - context_settings["proxy"] = proxy_settings - - if self.config.text_mode: - text_mode_settings = { - "has_touch": False, - "is_mobile": False, - } - # Update context settings with text mode settings - 
context_settings.update(text_mode_settings) - - # Create and return the context with all settings - context = await self.browser.new_context(**context_settings) - - # Apply text mode settings if enabled - if self.config.text_mode: - # Create and apply route patterns for each extension - for ext in blocked_extensions: - await context.route(f"**/*.{ext}", lambda route: route.abort()) - return context + + # Override storage_state with our specific path + self.config.storage_state = storage_path + if self.logger: + self.logger.debug(f"Using persistent storage state at: {storage_path}", tag="BROWSER") + + # Now call the base class implementation which handles everything else + return await super().create_browser_context(crawlerRunConfig) def _cleanup_expired_sessions(self): """Clean up expired sessions based on TTL.""" @@ -704,13 +745,28 @@ class CDPBrowserStrategy(BaseBrowserStrategy): async def create_browser_context(self, crawlerRunConfig: Optional[CrawlerRunConfig] = None) -> BrowserContext: """Create a new browser context. + Uses the base class implementation which handles all configurations. + Args: crawlerRunConfig: Configuration object for the crawler run Returns: BrowserContext: Browser context object """ - return await self.browser.new_context() + # Handle user_data_dir for CDP browsers + if self.config.user_data_dir: + # For CDP-based browsers, storage persistence is typically handled by the user_data_dir + # at the browser level, but we'll create a storage_state location for Playwright as well + storage_path = os.path.join(self.config.user_data_dir, "storage_state.json") + if not os.path.exists(storage_path): + # Create parent directory if it doesn't exist + os.makedirs(os.path.dirname(storage_path), exist_ok=True) + with open(storage_path, "w") as f: + json.dump({}, f) + self.config.storage_state = storage_path + + # Use the base class implementation + return await super().create_browser_context(crawlerRunConfig) def _cleanup_expired_sessions(self): """Clean up expired sessions based on TTL.""" From 462d5765e29293170fb9d320f0090a4061985883 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 23 Mar 2025 21:06:41 +0800 Subject: [PATCH 4/5] fix(browser): improve storage state persistence in CDP strategy Enhance storage state persistence mechanism in CDP browser strategy by: - Explicitly saving storage state for each browser context - Using proper file path for storage state - Removing unnecessary sleep delay Also includes test improvements: - Simplified test configurations in playwright tests - Temporarily disabled some CDP tests --- crawl4ai/browser/manager.py | 4 +-- crawl4ai/browser/profiles.py | 1 - crawl4ai/browser/strategies.py | 32 +++++++++++------------ tests/browser/test_cdp_strategy.py | 4 +-- tests/browser/test_playwright_strategy.py | 14 +++------- 5 files changed, 21 insertions(+), 34 deletions(-) diff --git a/crawl4ai/browser/manager.py b/crawl4ai/browser/manager.py index 3a37efcb..9b0cf073 100644 --- a/crawl4ai/browser/manager.py +++ b/crawl4ai/browser/manager.py @@ -7,9 +7,7 @@ It also implements a page pooling mechanism for improved performance. 
import asyncio import time -import os -import psutil -from typing import Optional, Tuple, Dict, Any, List, Set +from typing import Optional, Tuple, List from playwright.async_api import Page, BrowserContext diff --git a/crawl4ai/browser/profiles.py b/crawl4ai/browser/profiles.py index 58a8bff2..afd0d78a 100644 --- a/crawl4ai/browser/profiles.py +++ b/crawl4ai/browser/profiles.py @@ -17,7 +17,6 @@ from colorama import Fore, Style, init from ..async_configs import BrowserConfig from ..async_logger import AsyncLogger, AsyncLoggerBase from ..utils import get_home_folder -from .strategies import is_windows class BrowserProfileManager: """Manages browser profiles for Crawl4AI. diff --git a/crawl4ai/browser/strategies.py b/crawl4ai/browser/strategies.py index 68d2d97d..f2a9525e 100644 --- a/crawl4ai/browser/strategies.py +++ b/crawl4ai/browser/strategies.py @@ -11,12 +11,11 @@ import time import json import hashlib import subprocess -import sys import shutil import signal from typing import Optional, Dict, Tuple, List, Any -from playwright.async_api import Browser, BrowserContext, Page, ProxySettings +from playwright.async_api import BrowserContext, Page, ProxySettings from ..async_logger import AsyncLogger from ..async_configs import BrowserConfig, CrawlerRunConfig @@ -831,26 +830,25 @@ class CDPBrowserStrategy(BaseBrowserStrategy): await asyncio.sleep(0.5) # If we have a user_data_dir configured, ensure persistence of storage state - if self.config.user_data_dir and self.browser: - try: - # Create a brief sleep to allow the browser to flush any pending operations - # This helps ensure all storage state (localStorage, cookies, etc.) gets saved - await asyncio.sleep(0.3) - if self.logger: - self.logger.debug("Ensuring storage state is persisted before closing CDP browser", tag="BROWSER") - except Exception as e: - if self.logger: - self.logger.warning( - message="Failed to ensure storage persistence: {error}", - tag="BROWSER", - params={"error": str(e)} - ) + if self.config.user_data_dir and self.browser and self.default_context: + for context in self.browser.contexts: + try: + await context.storage_state(path=os.path.join(self.config.user_data_dir, "Default", "storage_state.json")) + if self.logger: + self.logger.debug("Ensuring storage state is persisted before closing browser", tag="BROWSER") + except Exception as e: + if self.logger: + self.logger.warning( + message="Failed to ensure storage persistence: {error}", + tag="BROWSER", + params={"error": str(e)} + ) # Close all sessions session_ids = list(self.sessions.keys()) for session_id in session_ids: await self._kill_session(session_id) - + # Close browser if self.browser: await self.browser.close() diff --git a/tests/browser/test_cdp_strategy.py b/tests/browser/test_cdp_strategy.py index 4ec1f7f1..abadf42a 100644 --- a/tests/browser/test_cdp_strategy.py +++ b/tests/browser/test_cdp_strategy.py @@ -209,8 +209,8 @@ async def run_tests(): """Run all tests sequentially.""" results = [] - results.append(await test_cdp_launch_connect()) - results.append(await test_cdp_with_user_data_dir()) + # results.append(await test_cdp_launch_connect()) + # results.append(await test_cdp_with_user_data_dir()) results.append(await test_cdp_session_management()) # Print summary diff --git a/tests/browser/test_playwright_strategy.py b/tests/browser/test_playwright_strategy.py index 1d897bcf..2344c9ba 100644 --- a/tests/browser/test_playwright_strategy.py +++ b/tests/browser/test_playwright_strategy.py @@ -143,15 +143,11 @@ async def 
test_playwright_context_reuse(): # Create identical crawler configs crawler_config1 = CrawlerRunConfig( - url="https://example.com", - viewport_width=1280, - viewport_height=800 + css_selector="body", ) crawler_config2 = CrawlerRunConfig( - url="https://example.org", # Different URL but same browser parameters - viewport_width=1280, - viewport_height=800 + css_selector="body", ) # Get pages with these configs @@ -163,11 +159,7 @@ async def test_playwright_context_reuse(): logger.info(f"Contexts reused: {is_same_context}", tag="TEST") # Now try with a different config - crawler_config3 = CrawlerRunConfig( - url="https://example.net", - viewport_width=800, # Different viewport size - viewport_height=600 - ) + crawler_config3 = CrawlerRunConfig() page3, context3 = await manager.get_page(crawler_config3) From 8c085213017c0a898c81e1be38968666e5eff90d Mon Sep 17 00:00:00 2001 From: UncleCode Date: Mon, 24 Mar 2025 21:36:58 +0800 Subject: [PATCH 5/5] feat(browser): add Docker-based browser automation strategy Implements a new browser strategy that runs Chrome in Docker containers, providing better isolation and cross-platform consistency. Features include: - Connect and launch modes for different container configurations - Persistent storage support for maintaining browser state - Container registry for efficient reuse - Comprehensive test suite for Docker browser functionality This addition allows users to run browser automation workloads in isolated containers, improving security and resource management. --- crawl4ai/async_configs.py | 30 +- crawl4ai/browser/docker/connect.Dockerfile | 61 ++ crawl4ai/browser/docker/launch.Dockerfile | 57 ++ crawl4ai/browser/docker_config.py | 133 ++++ crawl4ai/browser/docker_registry.py | 174 ++++++ crawl4ai/browser/docker_strategy.py | 286 +++++++++ crawl4ai/browser/docker_utils.py | 582 +++++++++++++++++ crawl4ai/browser/manager.py | 16 + tests/browser/docker/__init__.py | 4 + tests/browser/docker/test_docker_browser.py | 653 ++++++++++++++++++++ 10 files changed, 1995 insertions(+), 1 deletion(-) create mode 100644 crawl4ai/browser/docker/connect.Dockerfile create mode 100644 crawl4ai/browser/docker/launch.Dockerfile create mode 100644 crawl4ai/browser/docker_config.py create mode 100644 crawl4ai/browser/docker_registry.py create mode 100644 crawl4ai/browser/docker_strategy.py create mode 100644 crawl4ai/browser/docker_utils.py create mode 100644 tests/browser/docker/__init__.py create mode 100644 tests/browser/docker/test_docker_browser.py diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 2306a0a6..c7f9e739 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -28,6 +28,10 @@ from typing import Any, Dict, Optional from enum import Enum from .proxy_strategy import ProxyConfig +try: + from .browser.docker_config import DockerConfig +except ImportError: + DockerConfig = None def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict: @@ -173,6 +177,7 @@ class BrowserConfig: "builtin" - use the builtin CDP browser running in background "dedicated" - create a new dedicated browser instance each time "custom" - use explicit CDP settings provided in cdp_url + "docker" - run browser in Docker container with isolation Default: "dedicated" use_managed_browser (bool): Launch the browser using a managed approach (e.g., via CDP), allowing advanced manipulation. Default: False. @@ -190,6 +195,8 @@ class BrowserConfig: Default: None. 
proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}. If None, no additional proxy config. Default: None. + docker_config (DockerConfig or dict or None): Configuration for Docker-based browser automation. + Contains settings for Docker container operation. Default: None. viewport_width (int): Default viewport width for pages. Default: 1080. viewport_height (int): Default viewport height for pages. Default: 600. viewport (dict): Default viewport dimensions for pages. If set, overrides viewport_width and viewport_height. @@ -235,6 +242,7 @@ class BrowserConfig: channel: str = "chromium", proxy: str = None, proxy_config: Union[ProxyConfig, dict, None] = None, + docker_config: Union["DockerConfig", dict, None] = None, viewport_width: int = 1080, viewport_height: int = 600, viewport: dict = None, @@ -275,6 +283,12 @@ class BrowserConfig: self.chrome_channel = "" self.proxy = proxy self.proxy_config = proxy_config + + # Handle docker configuration + if isinstance(docker_config, dict) and DockerConfig is not None: + self.docker_config = DockerConfig.from_kwargs(docker_config) + else: + self.docker_config = docker_config self.viewport_width = viewport_width self.viewport_height = viewport_height self.viewport = viewport @@ -315,6 +329,10 @@ class BrowserConfig: # Builtin mode uses managed browser connecting to builtin CDP endpoint self.use_managed_browser = True # cdp_url will be set later by browser_manager + elif self.browser_mode == "docker": + # Docker mode uses managed browser with CDP to connect to browser in container + self.use_managed_browser = True + # cdp_url will be set later by docker browser strategy elif self.browser_mode == "custom" and self.cdp_url: # Custom mode with explicit CDP URL self.use_managed_browser = True @@ -340,6 +358,7 @@ class BrowserConfig: channel=kwargs.get("channel", "chromium"), proxy=kwargs.get("proxy"), proxy_config=kwargs.get("proxy_config", None), + docker_config=kwargs.get("docker_config", None), viewport_width=kwargs.get("viewport_width", 1080), viewport_height=kwargs.get("viewport_height", 600), accept_downloads=kwargs.get("accept_downloads", False), @@ -364,7 +383,7 @@ class BrowserConfig: ) def to_dict(self): - return { + result = { "browser_type": self.browser_type, "headless": self.headless, "browser_mode": self.browser_mode, @@ -396,6 +415,15 @@ class BrowserConfig: "debugging_port": self.debugging_port, "host": self.host, } + + # Include docker_config if it exists + if hasattr(self, "docker_config") and self.docker_config is not None: + if hasattr(self.docker_config, "to_dict"): + result["docker_config"] = self.docker_config.to_dict() + else: + result["docker_config"] = self.docker_config + + return result def clone(self, **kwargs): """Create a copy of this configuration with updated values. 
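A minimal usage sketch for the Docker mode introduced above. Module and
parameter names follow this patch; the target URL and host profile path are
illustrative, and it is assumed that BrowserManager dispatches
browser_mode="docker" to DockerBrowserStrategy via the manager.py changes in
this commit:

    import asyncio

    from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
    from crawl4ai.browser import BrowserManager
    from crawl4ai.browser.docker_config import DockerConfig

    async def main():
        # Launch Chrome inside a container, keeping its profile on the host
        docker_config = DockerConfig(
            mode="launch",
            persistent=True,
            user_data_dir="/tmp/crawl4ai-docker-profile",  # illustrative host path
        )
        browser_config = BrowserConfig(
            browser_mode="docker",
            headless=True,
            docker_config=docker_config,
        )
        manager = BrowserManager(browser_config=browser_config)
        await manager.start()
        try:
            page, context = await manager.get_page(CrawlerRunConfig())
            await page.goto("https://example.com")  # illustrative URL
            print(await page.title())
        finally:
            await manager.close()

    asyncio.run(main())

Because persistent=True, the container keeps running after close() and the
registry can hand it back to the next session with a matching configuration.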
diff --git a/crawl4ai/browser/docker/connect.Dockerfile b/crawl4ai/browser/docker/connect.Dockerfile new file mode 100644 index 00000000..d2d955b6 --- /dev/null +++ b/crawl4ai/browser/docker/connect.Dockerfile @@ -0,0 +1,61 @@ +FROM ubuntu:22.04 + +# Install dependencies with comprehensive Chromium support +RUN apt-get update && apt-get install -y --no-install-recommends \ + wget \ + gnupg \ + ca-certificates \ + fonts-liberation \ + # Sound support + libasound2 \ + # Accessibility support + libatspi2.0-0 \ + libatk1.0-0 \ + libatk-bridge2.0-0 \ + # Graphics and rendering + libdrm2 \ + libgbm1 \ + libgtk-3-0 \ + libxcomposite1 \ + libxdamage1 \ + libxext6 \ + libxfixes3 \ + libxrandr2 \ + # X11 and window system + libx11-6 \ + libxcb1 \ + libxkbcommon0 \ + # Text and internationalization + libpango-1.0-0 \ + libcairo2 \ + # Printing support + libcups2 \ + # System libraries + libdbus-1-3 \ + libnss3 \ + libnspr4 \ + libglib2.0-0 \ + # Utilities + xdg-utils \ + socat \ + # Process management + procps \ + # Clean up + && rm -rf /var/lib/apt/lists/* + +# Install Chrome +RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \ + echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list && \ + apt-get update && \ + apt-get install -y google-chrome-stable && \ + rm -rf /var/lib/apt/lists/* + +# Create data directory for user data +RUN mkdir -p /data && chmod 777 /data + +# Add a startup script +COPY start.sh /start.sh +RUN chmod +x /start.sh + +# Set entrypoint +ENTRYPOINT ["/start.sh"] \ No newline at end of file diff --git a/crawl4ai/browser/docker/launch.Dockerfile b/crawl4ai/browser/docker/launch.Dockerfile new file mode 100644 index 00000000..042f724d --- /dev/null +++ b/crawl4ai/browser/docker/launch.Dockerfile @@ -0,0 +1,57 @@ +FROM ubuntu:22.04 + +# Install dependencies with comprehensive Chromium support +RUN apt-get update && apt-get install -y --no-install-recommends \ + wget \ + gnupg \ + ca-certificates \ + fonts-liberation \ + # Sound support + libasound2 \ + # Accessibility support + libatspi2.0-0 \ + libatk1.0-0 \ + libatk-bridge2.0-0 \ + # Graphics and rendering + libdrm2 \ + libgbm1 \ + libgtk-3-0 \ + libxcomposite1 \ + libxdamage1 \ + libxext6 \ + libxfixes3 \ + libxrandr2 \ + # X11 and window system + libx11-6 \ + libxcb1 \ + libxkbcommon0 \ + # Text and internationalization + libpango-1.0-0 \ + libcairo2 \ + # Printing support + libcups2 \ + # System libraries + libdbus-1-3 \ + libnss3 \ + libnspr4 \ + libglib2.0-0 \ + # Utilities + xdg-utils \ + socat \ + # Process management + procps \ + # Clean up + && rm -rf /var/lib/apt/lists/* + +# Install Chrome +RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \ + echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list && \ + apt-get update && \ + apt-get install -y google-chrome-stable && \ + rm -rf /var/lib/apt/lists/* + +# Create data directory for user data +RUN mkdir -p /data && chmod 777 /data + +# Keep container running without starting Chrome +CMD ["tail", "-f", "/dev/null"] \ No newline at end of file diff --git a/crawl4ai/browser/docker_config.py b/crawl4ai/browser/docker_config.py new file mode 100644 index 00000000..a63c480c --- /dev/null +++ b/crawl4ai/browser/docker_config.py @@ -0,0 +1,133 @@ +"""Docker configuration module for Crawl4AI browser automation. 
+
+This module provides configuration classes for Docker-based browser automation,
+allowing flexible configuration of Docker containers for browsing.
+"""
+
+from typing import Dict, List, Optional, Union
+
+
+class DockerConfig:
+    """Configuration for Docker-based browser automation.
+
+    This class contains Docker-specific settings to avoid cluttering BrowserConfig.
+
+    Attributes:
+        mode (str): Docker operation mode - "connect" or "launch".
+            - "connect": Uses a container with Chrome already running
+            - "launch": Dynamically configures and starts Chrome in container
+        image (str): Docker image to use. If None, defaults from DockerUtils are used.
+        registry_file (str): Path to container registry file for persistence.
+        persistent (bool): Keep container running after browser closes.
+        remove_on_exit (bool): Remove container on exit when not persistent.
+        network (str): Docker network to use.
+        volumes (List[str]): Volume mappings (e.g., ["host_path:container_path"]).
+        env_vars (Dict[str, str]): Environment variables to set in container.
+        extra_args (List[str]): Additional docker run arguments.
+        host_port (int): Host port to map to the container's internal port 9223.
+        user_data_dir (str): Path to user data directory on host.
+        container_user_data_dir (str): Path to user data directory in container.
+    """
+
+    def __init__(
+        self,
+        mode: str = "connect",  # "connect" or "launch"
+        image: Optional[str] = None,  # Docker image to use
+        registry_file: Optional[str] = None,  # Path to registry file
+        persistent: bool = False,  # Keep container running after browser closes
+        remove_on_exit: bool = True,  # Remove container on exit when not persistent
+        network: Optional[str] = None,  # Docker network to use
+        volumes: List[str] = None,  # Volume mappings
+        env_vars: Dict[str, str] = None,  # Environment variables
+        extra_args: List[str] = None,  # Additional docker run arguments
+        host_port: Optional[int] = None,  # Host port to map to the container's port 9223
+        user_data_dir: Optional[str] = None,  # Path to user data directory on host
+        container_user_data_dir: str = "/data",  # Path to user data directory in container
+    ):
+        """Initialize Docker configuration.
+
+        Args:
+            mode: Docker operation mode ("connect" or "launch")
+            image: Docker image to use
+            registry_file: Path to container registry file
+            persistent: Whether to keep container running after browser closes
+            remove_on_exit: Whether to remove container on exit when not persistent
+            network: Docker network to use
+            volumes: Volume mappings as list of strings
+            env_vars: Environment variables as dictionary
+            extra_args: Additional docker run arguments
+            host_port: Host port to map to the container's port 9223
+            user_data_dir: Path to user data directory on host
+            container_user_data_dir: Path to user data directory in container
+        """
+        self.mode = mode
+        self.image = image  # If None, defaults will be used from DockerUtils
+        self.registry_file = registry_file
+        self.persistent = persistent
+        self.remove_on_exit = remove_on_exit
+        self.network = network
+        self.volumes = volumes or []
+        self.env_vars = env_vars or {}
+        self.extra_args = extra_args or []
+        self.host_port = host_port
+        self.user_data_dir = user_data_dir
+        self.container_user_data_dir = container_user_data_dir
+
+    def to_dict(self) -> Dict:
+        """Convert this configuration to a dictionary.
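+
+        The result round-trips through from_kwargs(); clone() relies on this
+        to produce an updated copy.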
+ + Returns: + Dictionary representation of this configuration + """ + return { + "mode": self.mode, + "image": self.image, + "registry_file": self.registry_file, + "persistent": self.persistent, + "remove_on_exit": self.remove_on_exit, + "network": self.network, + "volumes": self.volumes, + "env_vars": self.env_vars, + "extra_args": self.extra_args, + "host_port": self.host_port, + "user_data_dir": self.user_data_dir, + "container_user_data_dir": self.container_user_data_dir + } + + @staticmethod + def from_kwargs(kwargs: Dict) -> "DockerConfig": + """Create a DockerConfig from a dictionary of keyword arguments. + + Args: + kwargs: Dictionary of configuration options + + Returns: + New DockerConfig instance + """ + return DockerConfig( + mode=kwargs.get("mode", "connect"), + image=kwargs.get("image"), + registry_file=kwargs.get("registry_file"), + persistent=kwargs.get("persistent", False), + remove_on_exit=kwargs.get("remove_on_exit", True), + network=kwargs.get("network"), + volumes=kwargs.get("volumes"), + env_vars=kwargs.get("env_vars"), + extra_args=kwargs.get("extra_args"), + host_port=kwargs.get("host_port"), + user_data_dir=kwargs.get("user_data_dir"), + container_user_data_dir=kwargs.get("container_user_data_dir", "/data") + ) + + def clone(self, **kwargs) -> "DockerConfig": + """Create a copy of this configuration with updated values. + + Args: + **kwargs: Key-value pairs of configuration options to update + + Returns: + DockerConfig: A new instance with the specified updates + """ + config_dict = self.to_dict() + config_dict.update(kwargs) + return DockerConfig.from_kwargs(config_dict) \ No newline at end of file diff --git a/crawl4ai/browser/docker_registry.py b/crawl4ai/browser/docker_registry.py new file mode 100644 index 00000000..91f81c5e --- /dev/null +++ b/crawl4ai/browser/docker_registry.py @@ -0,0 +1,174 @@ +"""Docker registry module for Crawl4AI. + +This module provides a registry system for tracking and reusing Docker containers +across browser sessions, improving performance and resource utilization. +""" + +import os +import json +import time +from typing import Dict, Optional + +from ..utils import get_home_folder + + +class DockerRegistry: + """Manages a registry of Docker containers used for browser automation. + + This registry tracks containers by configuration hash, allowing reuse of appropriately + configured containers instead of creating new ones for each session. + + Attributes: + registry_file (str): Path to the registry file + containers (dict): Dictionary of container information + port_map (dict): Map of host ports to container IDs + last_port (int): Last port assigned + """ + + def __init__(self, registry_file: Optional[str] = None): + """Initialize the registry with an optional path to the registry file. + + Args: + registry_file: Path to the registry file. If None, uses default path. 
+ """ + self.registry_file = registry_file or os.path.join(get_home_folder(), "docker_browser_registry.json") + self.containers = {} + self.port_map = {} + self.last_port = 9222 + self.load() + + def load(self): + """Load container registry from file.""" + if os.path.exists(self.registry_file): + try: + with open(self.registry_file, 'r') as f: + registry_data = json.load(f) + self.containers = registry_data.get("containers", {}) + self.port_map = registry_data.get("ports", {}) + self.last_port = registry_data.get("last_port", 9222) + except Exception: + # Reset to defaults on error + self.containers = {} + self.port_map = {} + self.last_port = 9222 + else: + # Initialize with defaults if file doesn't exist + self.containers = {} + self.port_map = {} + self.last_port = 9222 + + def save(self): + """Save container registry to file.""" + os.makedirs(os.path.dirname(self.registry_file), exist_ok=True) + with open(self.registry_file, 'w') as f: + json.dump({ + "containers": self.containers, + "ports": self.port_map, + "last_port": self.last_port + }, f, indent=2) + + def register_container(self, container_id: str, host_port: int, config_hash: str): + """Register a container with its configuration hash and port mapping. + + Args: + container_id: Docker container ID + host_port: Host port mapped to container + config_hash: Hash of configuration used to create container + """ + self.containers[container_id] = { + "host_port": host_port, + "config_hash": config_hash, + "created_at": time.time() + } + self.port_map[str(host_port)] = container_id + self.save() + + def unregister_container(self, container_id: str): + """Unregister a container. + + Args: + container_id: Docker container ID to unregister + """ + if container_id in self.containers: + host_port = self.containers[container_id]["host_port"] + if str(host_port) in self.port_map: + del self.port_map[str(host_port)] + del self.containers[container_id] + self.save() + + def find_container_by_config(self, config_hash: str, docker_utils) -> Optional[str]: + """Find a container that matches the given configuration hash. + + Args: + config_hash: Hash of configuration to match + docker_utils: DockerUtils instance to check running containers + + Returns: + Container ID if found, None otherwise + """ + for container_id, data in self.containers.items(): + if data["config_hash"] == config_hash and docker_utils.is_container_running(container_id): + return container_id + return None + + def get_container_host_port(self, container_id: str) -> Optional[int]: + """Get the host port mapped to the container. + + Args: + container_id: Docker container ID + + Returns: + Host port if container is registered, None otherwise + """ + if container_id in self.containers: + return self.containers[container_id]["host_port"] + return None + + def get_next_available_port(self, docker_utils) -> int: + """Get the next available host port for Docker mapping. + + Args: + docker_utils: DockerUtils instance to check port availability + + Returns: + Available port number + """ + # Start from last port + 1 + port = self.last_port + 1 + + # Check if port is in use (either in our registry or system-wide) + while port in self.port_map or docker_utils.is_port_in_use(port): + port += 1 + + # Update last port + self.last_port = port + self.save() + + return port + + def get_container_config_hash(self, container_id: str) -> Optional[str]: + """Get the configuration hash for a container. 
+ + Args: + container_id: Docker container ID + + Returns: + Configuration hash if container is registered, None otherwise + """ + if container_id in self.containers: + return self.containers[container_id]["config_hash"] + return None + + def cleanup_stale_containers(self, docker_utils): + """Clean up containers that are no longer running. + + Args: + docker_utils: DockerUtils instance to check container status + """ + to_remove = [] + for container_id in self.containers: + if not docker_utils.is_container_running(container_id): + to_remove.append(container_id) + + for container_id in to_remove: + self.unregister_container(container_id) \ No newline at end of file diff --git a/crawl4ai/browser/docker_strategy.py b/crawl4ai/browser/docker_strategy.py new file mode 100644 index 00000000..639abd84 --- /dev/null +++ b/crawl4ai/browser/docker_strategy.py @@ -0,0 +1,286 @@ +"""Docker browser strategy module for Crawl4AI. + +This module provides browser strategies for running browsers in Docker containers, +which offers better isolation, consistency across platforms, and easy scaling. +""" + +import os +import uuid +import asyncio +from typing import Dict, List, Optional, Tuple, Union +from pathlib import Path + +from playwright.async_api import Page, BrowserContext + +from ..async_logger import AsyncLogger +from ..async_configs import BrowserConfig, CrawlerRunConfig +from .docker_config import DockerConfig +from .docker_registry import DockerRegistry +from .docker_utils import DockerUtils +from .strategies import BuiltinBrowserStrategy + + +class DockerBrowserStrategy(BuiltinBrowserStrategy): + """Docker-based browser strategy. + + Extends the BuiltinBrowserStrategy to run browsers in Docker containers. + Supports two modes: + 1. "connect" - Uses a Docker image with Chrome already running + 2. "launch" - Starts Chrome within the container with custom settings + + Attributes: + docker_config: Docker-specific configuration options + container_id: ID of current Docker container + container_name: Name assigned to the container + registry: Registry for tracking and reusing containers + docker_utils: Utilities for Docker operations + chrome_process_id: Process ID of Chrome within container + socat_process_id: Process ID of socat within container + internal_cdp_port: Chrome's internal CDP port + internal_mapped_port: Port that socat maps to internally + """ + + def __init__(self, config: BrowserConfig, logger: Optional[AsyncLogger] = None): + """Initialize the Docker browser strategy. + + Args: + config: Browser configuration including Docker-specific settings + logger: Logger for recording events and errors + """ + super().__init__(config, logger) + + # Initialize Docker-specific attributes + self.docker_config = self.config.docker_config or DockerConfig() + self.container_id = None + self.container_name = f"crawl4ai-browser-{uuid.uuid4().hex[:8]}" + self.registry = DockerRegistry(self.docker_config.registry_file) + self.docker_utils = DockerUtils(logger) + self.chrome_process_id = None + self.socat_process_id = None + self.internal_cdp_port = 9222 # Chrome's internal CDP port + self.internal_mapped_port = 9223 # Port that socat maps to internally + self.shutting_down = False + + async def _generate_config_hash(self) -> str: + """Generate a hash of the configuration for container matching. 
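+
+        Configurations that hash to the same value are treated as
+        interchangeable, letting the registry hand back a matching running
+        container instead of creating a new one.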
+ + Returns: + Hash string uniquely identifying this configuration + """ + # Create a dict with the relevant parts of the config + config_dict = { + "image": self.docker_config.image, + "mode": self.docker_config.mode, + "browser_type": self.config.browser_type, + "headless": self.config.headless, + } + + # Add browser-specific config if in launch mode + if self.docker_config.mode == "launch": + config_dict.update({ + "text_mode": self.config.text_mode, + "light_mode": self.config.light_mode, + "viewport_width": self.config.viewport_width, + "viewport_height": self.config.viewport_height, + }) + + # Use the utility method to generate the hash + return self.docker_utils.generate_config_hash(config_dict) + + async def _get_or_create_cdp_url(self) -> str: + """Get CDP URL by either creating a new container or using an existing one. + + Returns: + CDP URL for connecting to the browser + + Raises: + Exception: If container creation or browser launch fails + """ + # If CDP URL is explicitly provided, use it + if self.config.cdp_url: + return self.config.cdp_url + + # Ensure Docker image exists (will build if needed) + image_name = await self.docker_utils.ensure_docker_image_exists( + self.docker_config.image, + self.docker_config.mode + ) + + # Generate config hash for container matching + config_hash = await self._generate_config_hash() + + # Look for existing container with matching config + container_id = self.registry.find_container_by_config(config_hash, self.docker_utils) + + if container_id: + # Use existing container + self.container_id = container_id + host_port = self.registry.get_container_host_port(container_id) + if self.logger: + self.logger.info(f"Using existing Docker container: {container_id[:12]}", tag="DOCKER") + else: + # Get a port for the new container + host_port = self.docker_config.host_port or self.registry.get_next_available_port(self.docker_utils) + + # Prepare volumes list + volumes = list(self.docker_config.volumes) + + # Add user data directory if specified + if self.docker_config.user_data_dir: + # Ensure user data directory exists + os.makedirs(self.docker_config.user_data_dir, exist_ok=True) + volumes.append(f"{self.docker_config.user_data_dir}:{self.docker_config.container_user_data_dir}") + + # Update config user_data_dir to point to container path + self.config.user_data_dir = self.docker_config.container_user_data_dir + + # Create a new container + container_id = await self.docker_utils.create_container( + image_name=image_name, + host_port=host_port, + container_name=self.container_name, + volumes=volumes, + network=self.docker_config.network, + env_vars=self.docker_config.env_vars, + extra_args=self.docker_config.extra_args + ) + + if not container_id: + raise Exception("Failed to create Docker container") + + self.container_id = container_id + + # Register the container + self.registry.register_container(container_id, host_port, config_hash) + + # Wait for container to be ready + await self.docker_utils.wait_for_container_ready(container_id) + + # Handle specific setup based on mode + if self.docker_config.mode == "launch": + # In launch mode, we need to start socat and Chrome + await self.docker_utils.start_socat_in_container(container_id) + + # Build browser arguments + browser_args = self._build_browser_args() + + # Launch Chrome + await self.docker_utils.launch_chrome_in_container(container_id, browser_args) + + # Get PIDs for later cleanup + self.chrome_process_id = await self.docker_utils.get_process_id_in_container( + container_id, "chrome" + ) + 
self.socat_process_id = await self.docker_utils.get_process_id_in_container( + container_id, "socat" + ) + + # Wait for CDP to be ready + await self.docker_utils.wait_for_cdp_ready(host_port) + + if self.logger: + self.logger.success(f"Docker container ready: {container_id[:12]} on port {host_port}", tag="DOCKER") + + # Return CDP URL + return f"http://localhost:{host_port}" + + def _build_browser_args(self) -> List[str]: + """Build Chrome command line arguments based on BrowserConfig. + + Returns: + List of command line arguments for Chrome + """ + args = [ + "--no-sandbox", + "--disable-gpu", + f"--remote-debugging-port={self.internal_cdp_port}", + "--remote-debugging-address=0.0.0.0", # Allow external connections + "--disable-dev-shm-usage", + ] + + if self.config.headless: + args.append("--headless=new") + + if self.config.viewport_width and self.config.viewport_height: + args.append(f"--window-size={self.config.viewport_width},{self.config.viewport_height}") + + if self.config.user_agent: + args.append(f"--user-agent={self.config.user_agent}") + + if self.config.text_mode: + args.extend([ + "--blink-settings=imagesEnabled=false", + "--disable-remote-fonts", + "--disable-images", + "--disable-javascript", + ]) + + if self.config.light_mode: + # Import here to avoid circular import + from .utils import get_browser_disable_options + args.extend(get_browser_disable_options()) + + if self.config.user_data_dir: + args.append(f"--user-data-dir={self.config.user_data_dir}") + + if self.config.extra_args: + args.extend(self.config.extra_args) + + return args + + async def close(self): + """Close the browser and clean up Docker container if needed.""" + # Set shutting_down flag to prevent race conditions + self.shutting_down = True + + # Store state if needed before closing + if self.browser and self.docker_config.user_data_dir and self.docker_config.persistent: + for context in self.browser.contexts: + try: + storage_path = os.path.join(self.docker_config.user_data_dir, "storage_state.json") + await context.storage_state(path=storage_path) + if self.logger: + self.logger.debug("Persisted storage state before closing browser", tag="DOCKER") + except Exception as e: + if self.logger: + self.logger.warning( + message="Failed to persist storage state: {error}", + tag="DOCKER", + params={"error": str(e)} + ) + + # Close browser connection (but not container) + if self.browser: + await self.browser.close() + self.browser = None + + # Only clean up container if not persistent + if self.container_id and not self.docker_config.persistent: + # Stop Chrome process in "launch" mode + if self.docker_config.mode == "launch" and self.chrome_process_id: + await self.docker_utils.stop_process_in_container( + self.container_id, self.chrome_process_id + ) + + # Stop socat process in "launch" mode + if self.docker_config.mode == "launch" and self.socat_process_id: + await self.docker_utils.stop_process_in_container( + self.container_id, self.socat_process_id + ) + + # Remove or stop container based on configuration + if self.docker_config.remove_on_exit: + await self.docker_utils.remove_container(self.container_id) + # Unregister from registry + self.registry.unregister_container(self.container_id) + else: + await self.docker_utils.stop_container(self.container_id) + + self.container_id = None + + # Close Playwright + if self.playwright: + await self.playwright.stop() + self.playwright = None + + self.shutting_down = False \ No newline at end of file diff --git a/crawl4ai/browser/docker_utils.py 
b/crawl4ai/browser/docker_utils.py new file mode 100644 index 00000000..0597c2d5 --- /dev/null +++ b/crawl4ai/browser/docker_utils.py @@ -0,0 +1,582 @@ +import os +import json +import asyncio +import hashlib +import tempfile +import shutil +import socket +import subprocess +from typing import Dict, List, Optional, Tuple, Union + +class DockerUtils: + """Utility class for Docker operations in browser automation. + + This class provides methods for managing Docker images, containers, + and related operations needed for browser automation. It handles + image building, container lifecycle, port management, and registry operations. + + Attributes: + DOCKER_FOLDER (str): Path to folder containing Docker files + DOCKER_CONNECT_FILE (str): Path to Dockerfile for connect mode + DOCKER_LAUNCH_FILE (str): Path to Dockerfile for launch mode + DOCKER_START_SCRIPT (str): Path to startup script for connect mode + DEFAULT_CONNECT_IMAGE (str): Default image name for connect mode + DEFAULT_LAUNCH_IMAGE (str): Default image name for launch mode + logger: Optional logger instance + """ + + # File paths for Docker resources + DOCKER_FOLDER = os.path.join(os.path.dirname(__file__), "docker") + DOCKER_CONNECT_FILE = os.path.join(DOCKER_FOLDER, "connect.Dockerfile") + DOCKER_LAUNCH_FILE = os.path.join(DOCKER_FOLDER, "launch.Dockerfile") + DOCKER_START_SCRIPT = os.path.join(DOCKER_FOLDER, "start.sh") + + # Default image names + DEFAULT_CONNECT_IMAGE = "crawl4ai/browser-connect:latest" + DEFAULT_LAUNCH_IMAGE = "crawl4ai/browser-launch:latest" + + def __init__(self, logger=None): + """Initialize Docker utilities. + + Args: + logger: Optional logger for recording operations + """ + self.logger = logger + + # Image Management Methods + + async def check_image_exists(self, image_name: str) -> bool: + """Check if a Docker image exists. + + Args: + image_name: Name of the Docker image to check + + Returns: + bool: True if the image exists, False otherwise + """ + cmd = ["docker", "image", "inspect", image_name] + + try: + process = await asyncio.create_subprocess_exec( + *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE + ) + _, _ = await process.communicate() + return process.returncode == 0 + except Exception as e: + if self.logger: + self.logger.debug(f"Error checking if image exists: {str(e)}", tag="DOCKER") + return False + + async def build_docker_image(self, image_name: str, dockerfile_path: str, + files_to_copy: Dict[str, str] = None) -> bool: + """Build a Docker image from a Dockerfile. 
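+
+        Usage sketch (the image tag is illustrative; the file constants
+        are the class attributes defined above)::
+
+            utils = DockerUtils()
+            ok = await utils.build_docker_image(
+                "crawl4ai/browser-connect:latest",
+                DockerUtils.DOCKER_CONNECT_FILE,
+                files_to_copy={"start.sh": DockerUtils.DOCKER_START_SCRIPT},
+            )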
+ + Args: + image_name: Name to give the built image + dockerfile_path: Path to the Dockerfile + files_to_copy: Dict of {dest_name: source_path} for files to copy to build context + + Returns: + bool: True if image was built successfully, False otherwise + """ + # Create a temporary build context + with tempfile.TemporaryDirectory() as temp_dir: + # Copy the Dockerfile + shutil.copy(dockerfile_path, os.path.join(temp_dir, "Dockerfile")) + + # Copy any additional files needed + if files_to_copy: + for dest_name, source_path in files_to_copy.items(): + shutil.copy(source_path, os.path.join(temp_dir, dest_name)) + + # Build the image + cmd = [ + "docker", "build", + "-t", image_name, + temp_dir + ] + + if self.logger: + self.logger.debug(f"Building Docker image with command: {' '.join(cmd)}", tag="DOCKER") + + process = await asyncio.create_subprocess_exec( + *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE + ) + stdout, stderr = await process.communicate() + + if process.returncode != 0: + if self.logger: + self.logger.error( + message="Failed to build Docker image: {error}", + tag="DOCKER", + params={"error": stderr.decode()} + ) + return False + + if self.logger: + self.logger.success(f"Successfully built Docker image: {image_name}", tag="DOCKER") + return True + + async def ensure_docker_image_exists(self, image_name: str, mode: str = "connect") -> str: + """Ensure the required Docker image exists, creating it if necessary. + + Args: + image_name: Name of the Docker image + mode: Either "connect" or "launch" to determine which image to build + + Returns: + str: Name of the available Docker image + + Raises: + Exception: If image doesn't exist and can't be built + """ + # If image name is not specified, use default based on mode + if not image_name: + image_name = self.DEFAULT_CONNECT_IMAGE if mode == "connect" else self.DEFAULT_LAUNCH_IMAGE + + # Check if the image already exists + if await self.check_image_exists(image_name): + if self.logger: + self.logger.debug(f"Docker image {image_name} already exists", tag="DOCKER") + return image_name + + # If we're using a custom image that doesn't exist, warn and fail + if (image_name != self.DEFAULT_CONNECT_IMAGE and image_name != self.DEFAULT_LAUNCH_IMAGE): + if self.logger: + self.logger.warning( + f"Custom Docker image {image_name} not found and cannot be automatically created", + tag="DOCKER" + ) + raise Exception(f"Docker image {image_name} not found") + + # Build the appropriate default image + if self.logger: + self.logger.info(f"Docker image {image_name} not found, creating it now...", tag="DOCKER") + + if mode == "connect": + success = await self.build_docker_image( + image_name, + self.DOCKER_CONNECT_FILE, + {"start.sh": self.DOCKER_START_SCRIPT} + ) + else: + success = await self.build_docker_image( + image_name, + self.DOCKER_LAUNCH_FILE + ) + + if not success: + raise Exception(f"Failed to create Docker image {image_name}") + + return image_name + + # Container Management Methods + + async def create_container(self, image_name: str, host_port: int, + container_name: Optional[str] = None, + volumes: List[str] = None, + network: Optional[str] = None, + env_vars: Dict[str, str] = None, + extra_args: List[str] = None) -> Optional[str]: + """Create a new Docker container. 
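+
+        Usage sketch (all values illustrative)::
+
+            container_id = await utils.create_container(
+                image_name="crawl4ai/browser-launch:latest",
+                host_port=9223,
+                container_name="crawl4ai-demo",
+                volumes=["/host/profile:/data"],
+                env_vars={"LANG": "C.UTF-8"},
+            )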
+ + Args: + image_name: Docker image to use + host_port: Port on host to map to container port 9223 + container_name: Optional name for the container + volumes: List of volume mappings (e.g., ["host_path:container_path"]) + network: Optional Docker network to use + env_vars: Dictionary of environment variables + extra_args: Additional docker run arguments + + Returns: + str: Container ID if successful, None otherwise + """ + # Prepare container command + cmd = [ + "docker", "run", + "--detach", + ] + + # Add container name if specified + if container_name: + cmd.extend(["--name", container_name]) + + # Add port mapping + cmd.extend(["-p", f"{host_port}:9223"]) + + # Add volumes + if volumes: + for volume in volumes: + cmd.extend(["-v", volume]) + + # Add network if specified + if network: + cmd.extend(["--network", network]) + + # Add environment variables + if env_vars: + for key, value in env_vars.items(): + cmd.extend(["-e", f"{key}={value}"]) + + # Add extra args + if extra_args: + cmd.extend(extra_args) + + # Add image + cmd.append(image_name) + + if self.logger: + self.logger.debug(f"Creating Docker container with command: {' '.join(cmd)}", tag="DOCKER") + + # Run docker command + try: + process = await asyncio.create_subprocess_exec( + *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE + ) + stdout, stderr = await process.communicate() + + if process.returncode != 0: + if self.logger: + self.logger.error( + message="Failed to create Docker container: {error}", + tag="DOCKER", + params={"error": stderr.decode()} + ) + return None + + # Get container ID + container_id = stdout.decode().strip() + + if self.logger: + self.logger.success(f"Created Docker container: {container_id[:12]}", tag="DOCKER") + + return container_id + + except Exception as e: + if self.logger: + self.logger.error( + message="Error creating Docker container: {error}", + tag="DOCKER", + params={"error": str(e)} + ) + return None + + async def is_container_running(self, container_id: str) -> bool: + """Check if a container is running. + + Args: + container_id: ID of the container to check + + Returns: + bool: True if the container is running, False otherwise + """ + cmd = ["docker", "inspect", "--format", "{{.State.Running}}", container_id] + + try: + process = await asyncio.create_subprocess_exec( + *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE + ) + stdout, _ = await process.communicate() + + return process.returncode == 0 and stdout.decode().strip() == "true" + except Exception as e: + if self.logger: + self.logger.debug(f"Error checking if container is running: {str(e)}", tag="DOCKER") + return False + + async def wait_for_container_ready(self, container_id: str, timeout: int = 30) -> bool: + """Wait for the container to be in running state. + + Args: + container_id: ID of the container to wait for + timeout: Maximum time to wait in seconds + + Returns: + bool: True if container is ready, False if timeout occurred + """ + for _ in range(timeout): + if await self.is_container_running(container_id): + return True + await asyncio.sleep(1) + + if self.logger: + self.logger.warning(f"Container {container_id[:12]} not ready after {timeout}s timeout", tag="DOCKER") + return False + + async def stop_container(self, container_id: str) -> bool: + """Stop a Docker container. 
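+
+        Equivalent to ``docker stop <container_id>``. Usage sketch::
+
+            if not await utils.stop_container(container_id):
+                # fall back to forced removal if a clean stop fails
+                await utils.remove_container(container_id, force=True)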
+ + Args: + container_id: ID of the container to stop + + Returns: + bool: True if stopped successfully, False otherwise + """ + cmd = ["docker", "stop", container_id] + + try: + process = await asyncio.create_subprocess_exec(*cmd) + await process.communicate() + + if self.logger: + self.logger.debug(f"Stopped container: {container_id[:12]}", tag="DOCKER") + + return process.returncode == 0 + except Exception as e: + if self.logger: + self.logger.warning( + message="Failed to stop container: {error}", + tag="DOCKER", + params={"error": str(e)} + ) + return False + + async def remove_container(self, container_id: str, force: bool = True) -> bool: + """Remove a Docker container. + + Args: + container_id: ID of the container to remove + force: Whether to force removal + + Returns: + bool: True if removed successfully, False otherwise + """ + cmd = ["docker", "rm"] + if force: + cmd.append("-f") + cmd.append(container_id) + + try: + process = await asyncio.create_subprocess_exec(*cmd) + await process.communicate() + + if self.logger: + self.logger.debug(f"Removed container: {container_id[:12]}", tag="DOCKER") + + return process.returncode == 0 + except Exception as e: + if self.logger: + self.logger.warning( + message="Failed to remove container: {error}", + tag="DOCKER", + params={"error": str(e)} + ) + return False + + # Container Command Execution Methods + + async def exec_in_container(self, container_id: str, command: List[str], + detach: bool = False) -> Tuple[int, str, str]: + """Execute a command in a running container. + + Args: + container_id: ID of the container + command: Command to execute as a list of strings + detach: Whether to run the command in detached mode + + Returns: + Tuple of (return_code, stdout, stderr) + """ + cmd = ["docker", "exec"] + if detach: + cmd.append("-d") + cmd.append(container_id) + cmd.extend(command) + + try: + process = await asyncio.create_subprocess_exec( + *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE + ) + stdout, stderr = await process.communicate() + + return process.returncode, stdout.decode(), stderr.decode() + except Exception as e: + if self.logger: + self.logger.error( + message="Error executing command in container: {error}", + tag="DOCKER", + params={"error": str(e)} + ) + return -1, "", str(e) + + async def start_socat_in_container(self, container_id: str) -> bool: + """Start socat in the container to map port 9222 to 9223. + + Args: + container_id: ID of the container + + Returns: + bool: True if socat started successfully, False otherwise + """ + # Command to run socat as a background process + cmd = ["socat", "TCP-LISTEN:9223,fork", "TCP:localhost:9222"] + + returncode, _, stderr = await self.exec_in_container(container_id, cmd, detach=True) + + if returncode != 0: + if self.logger: + self.logger.error( + message="Failed to start socat in container: {error}", + tag="DOCKER", + params={"error": stderr} + ) + return False + + if self.logger: + self.logger.debug(f"Started socat in container: {container_id[:12]}", tag="DOCKER") + + # Wait a moment for socat to start + await asyncio.sleep(1) + return True + + async def launch_chrome_in_container(self, container_id: str, browser_args: List[str]) -> bool: + """Launch Chrome inside the container with specified arguments. 
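+
+        Usage sketch (in practice the argument list comes from
+        DockerBrowserStrategy._build_browser_args())::
+
+            args = [
+                "--no-sandbox",
+                "--headless=new",
+                "--remote-debugging-port=9222",
+                "--remote-debugging-address=0.0.0.0",
+            ]
+            ok = await utils.launch_chrome_in_container(container_id, args)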
+ + Args: + container_id: ID of the container + browser_args: Chrome command line arguments + + Returns: + bool: True if Chrome started successfully, False otherwise + """ + # Build Chrome command + chrome_cmd = ["google-chrome"] + chrome_cmd.extend(browser_args) + + returncode, _, stderr = await self.exec_in_container(container_id, chrome_cmd, detach=True) + + if returncode != 0: + if self.logger: + self.logger.error( + message="Failed to launch Chrome in container: {error}", + tag="DOCKER", + params={"error": stderr} + ) + return False + + if self.logger: + self.logger.debug(f"Launched Chrome in container: {container_id[:12]}", tag="DOCKER") + + return True + + async def get_process_id_in_container(self, container_id: str, process_name: str) -> Optional[int]: + """Get the process ID for a process in the container. + + Args: + container_id: ID of the container + process_name: Name pattern to search for + + Returns: + int: Process ID if found, None otherwise + """ + cmd = ["pgrep", "-f", process_name] + + returncode, stdout, _ = await self.exec_in_container(container_id, cmd) + + if returncode == 0 and stdout.strip(): + pid = int(stdout.strip().split("\n")[0]) + return pid + + return None + + async def stop_process_in_container(self, container_id: str, pid: int) -> bool: + """Stop a process in the container by PID. + + Args: + container_id: ID of the container + pid: Process ID to stop + + Returns: + bool: True if process was stopped, False otherwise + """ + cmd = ["kill", "-TERM", str(pid)] + + returncode, _, stderr = await self.exec_in_container(container_id, cmd) + + if returncode != 0: + if self.logger: + self.logger.warning( + message="Failed to stop process in container: {error}", + tag="DOCKER", + params={"error": stderr} + ) + return False + + if self.logger: + self.logger.debug(f"Stopped process {pid} in container: {container_id[:12]}", tag="DOCKER") + + return True + + # Network and Port Methods + + async def wait_for_cdp_ready(self, host_port: int, timeout: int = 30) -> bool: + """Wait for the CDP endpoint to be ready. + + Args: + host_port: Port to check for CDP endpoint + timeout: Maximum time to wait in seconds + + Returns: + bool: True if CDP endpoint is ready, False if timeout occurred + """ + import aiohttp + + url = f"http://localhost:{host_port}/json/version" + + for _ in range(timeout): + try: + async with aiohttp.ClientSession() as session: + async with session.get(url, timeout=1) as response: + if response.status == 200: + if self.logger: + self.logger.debug(f"CDP endpoint ready on port {host_port}", tag="DOCKER") + return True + except Exception: + pass + await asyncio.sleep(1) + + if self.logger: + self.logger.warning(f"CDP endpoint not ready on port {host_port} after {timeout}s timeout", tag="DOCKER") + return False + + def is_port_in_use(self, port: int) -> bool: + """Check if a port is already in use on the host. + + Args: + port: Port number to check + + Returns: + bool: True if port is in use, False otherwise + """ + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + return s.connect_ex(('localhost', port)) == 0 + + def get_next_available_port(self, start_port: int = 9223) -> int: + """Get the next available port starting from a given port. 
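+
+        Usage sketch::
+
+            port = utils.get_next_available_port(9223)
+            # linear scan: returns 9223 if free, otherwise 9224, 9225, ...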
+ + Args: + start_port: Port number to start checking from + + Returns: + int: First available port number + """ + port = start_port + while self.is_port_in_use(port): + port += 1 + return port + + # Configuration Hash Methods + + def generate_config_hash(self, config_dict: Dict) -> str: + """Generate a hash of the configuration for container matching. + + Args: + config_dict: Dictionary of configuration parameters + + Returns: + str: Hash string uniquely identifying this configuration + """ + # Convert to canonical JSON string and hash + config_json = json.dumps(config_dict, sort_keys=True) + return hashlib.sha256(config_json.encode()).hexdigest() \ No newline at end of file diff --git a/crawl4ai/browser/manager.py b/crawl4ai/browser/manager.py index 9b0cf073..31411844 100644 --- a/crawl4ai/browser/manager.py +++ b/crawl4ai/browser/manager.py @@ -21,6 +21,12 @@ from .strategies import ( BuiltinBrowserStrategy ) +# Import DockerBrowserStrategy if available +try: + from .docker_strategy import DockerBrowserStrategy +except ImportError: + DockerBrowserStrategy = None + class BrowserManager: """Main interface for browser management in Crawl4AI. @@ -69,6 +75,16 @@ class BrowserManager: """ if self.config.browser_mode == "builtin": return BuiltinBrowserStrategy(self.config, self.logger) + elif self.config.browser_mode == "docker": + if DockerBrowserStrategy is None: + if self.logger: + self.logger.error( + "Docker browser strategy requested but not available. " + "Falling back to PlaywrightBrowserStrategy.", + tag="BROWSER" + ) + return PlaywrightBrowserStrategy(self.config, self.logger) + return DockerBrowserStrategy(self.config, self.logger) elif self.config.cdp_url or self.config.use_managed_browser: return CDPBrowserStrategy(self.config, self.logger) else: diff --git a/tests/browser/docker/__init__.py b/tests/browser/docker/__init__.py new file mode 100644 index 00000000..b86e573c --- /dev/null +++ b/tests/browser/docker/__init__.py @@ -0,0 +1,4 @@ +"""Docker browser strategy tests. + +This package contains tests for the Docker browser strategy implementation. +""" \ No newline at end of file diff --git a/tests/browser/docker/test_docker_browser.py b/tests/browser/docker/test_docker_browser.py new file mode 100644 index 00000000..65f0b649 --- /dev/null +++ b/tests/browser/docker/test_docker_browser.py @@ -0,0 +1,653 @@ +"""Test examples for Docker Browser Strategy. + +These examples demonstrate the functionality of Docker Browser Strategy +and serve as functional tests. +""" + +import asyncio +import os +import sys +import shutil +import uuid +import json +from typing import List, Dict, Any, Optional, Tuple + +# Add the project root to Python path if running directly +if __name__ == "__main__": + sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..'))) + +from crawl4ai.browser import BrowserManager +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig +from crawl4ai.async_logger import AsyncLogger +from crawl4ai.browser.docker_config import DockerConfig +from crawl4ai.browser.docker_registry import DockerRegistry +from crawl4ai.browser.docker_utils import DockerUtils + +# Create a logger for clear terminal output +logger = AsyncLogger(verbose=True, log_file=None) + +# Global Docker utils instance +docker_utils = DockerUtils(logger) + +async def test_docker_components(): + """Test Docker utilities, registry, and image building. + + This function tests the core Docker components before running the browser tests. 
+ It validates DockerRegistry, DockerUtils, and builds test images to ensure + everything is functioning correctly. + """ + logger.info("Testing Docker components", tag="SETUP") + + # Create a test registry directory + registry_dir = os.path.join(os.path.dirname(__file__), "test_registry") + registry_file = os.path.join(registry_dir, "test_registry.json") + os.makedirs(registry_dir, exist_ok=True) + + try: + # 1. Test DockerRegistry + logger.info("Testing DockerRegistry...", tag="SETUP") + registry = DockerRegistry(registry_file) + + # Test saving and loading registry + test_container_id = "test-container-123" + registry.register_container(test_container_id, 9876, "test-hash-123") + registry.save() + + # Create a new registry instance that loads from the file + registry2 = DockerRegistry(registry_file) + port = registry2.get_container_host_port(test_container_id) + hash_value = registry2.get_container_config_hash(test_container_id) + + if port != 9876 or hash_value != "test-hash-123": + logger.error("DockerRegistry persistence failed", tag="SETUP") + return False + + # Clean up test container from registry + registry2.unregister_container(test_container_id) + logger.success("DockerRegistry works correctly", tag="SETUP") + + # 2. Test DockerUtils + logger.info("Testing DockerUtils...", tag="SETUP") + + # Test port detection + in_use = docker_utils.is_port_in_use(22) # SSH port is usually in use + logger.info(f"Port 22 in use: {in_use}", tag="SETUP") + + # Get next available port + available_port = docker_utils.get_next_available_port(9000) + logger.info(f"Next available port: {available_port}", tag="SETUP") + + # Test config hash generation + config_dict = {"mode": "connect", "headless": True} + config_hash = docker_utils.generate_config_hash(config_dict) + logger.info(f"Generated config hash: {config_hash[:8]}...", tag="SETUP") + + # 3. Test Docker is available + logger.info("Checking Docker availability...", tag="SETUP") + if not await check_docker_available(): + logger.error("Docker is not available - cannot continue tests", tag="SETUP") + return False + + # 4. Test building connect image + logger.info("Building connect mode Docker image...", tag="SETUP") + connect_image = await docker_utils.ensure_docker_image_exists(None, "connect") + if not connect_image: + logger.error("Failed to build connect mode image", tag="SETUP") + return False + logger.success(f"Successfully built connect image: {connect_image}", tag="SETUP") + + # 5. Test building launch image + logger.info("Building launch mode Docker image...", tag="SETUP") + launch_image = await docker_utils.ensure_docker_image_exists(None, "launch") + if not launch_image: + logger.error("Failed to build launch mode image", tag="SETUP") + return False + logger.success(f"Successfully built launch image: {launch_image}", tag="SETUP") + + # 6. 
Test creating and removing container + logger.info("Testing container creation and removal...", tag="SETUP") + container_id = await docker_utils.create_container( + image_name=launch_image, + host_port=available_port, + container_name="crawl4ai-test-container" + ) + + if not container_id: + logger.error("Failed to create test container", tag="SETUP") + return False + + logger.info(f"Created test container: {container_id[:12]}", tag="SETUP") + + # Verify container is running + running = await docker_utils.is_container_running(container_id) + if not running: + logger.error("Test container is not running", tag="SETUP") + await docker_utils.remove_container(container_id) + return False + + # Test commands in container + logger.info("Testing command execution in container...", tag="SETUP") + returncode, stdout, stderr = await docker_utils.exec_in_container( + container_id, ["ls", "-la", "/"] + ) + + if returncode != 0: + logger.error(f"Command execution failed: {stderr}", tag="SETUP") + await docker_utils.remove_container(container_id) + return False + + # Verify Chrome is installed in the container + returncode, stdout, stderr = await docker_utils.exec_in_container( + container_id, ["which", "google-chrome"] + ) + + if returncode != 0: + logger.error("Chrome not found in container", tag="SETUP") + await docker_utils.remove_container(container_id) + return False + + chrome_path = stdout.strip() + logger.info(f"Chrome found at: {chrome_path}", tag="SETUP") + + # Test Chrome version + returncode, stdout, stderr = await docker_utils.exec_in_container( + container_id, ["google-chrome", "--version"] + ) + + if returncode != 0: + logger.error(f"Failed to get Chrome version: {stderr}", tag="SETUP") + await docker_utils.remove_container(container_id) + return False + + logger.info(f"Chrome version: {stdout.strip()}", tag="SETUP") + + # Remove test container + removed = await docker_utils.remove_container(container_id) + if not removed: + logger.error("Failed to remove test container", tag="SETUP") + return False + + logger.success("Test container removed successfully", tag="SETUP") + + # All components tested successfully + logger.success("All Docker components tested successfully", tag="SETUP") + return True + + except Exception as e: + logger.error(f"Docker component tests failed: {str(e)}", tag="SETUP") + return False + finally: + # Clean up registry test directory + if os.path.exists(registry_dir): + shutil.rmtree(registry_dir) + +async def test_docker_connect_mode(): + """Test Docker browser in connect mode. + + This tests the basic functionality of creating a browser in Docker + connect mode and using it for navigation. 
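+
+    Connect mode assumes an image in which Chrome is already running, so
+    the strategy only attaches over CDP. Configuration sketch::
+
+        docker_config = DockerConfig(mode="connect", remove_on_exit=True)
+        browser_config = BrowserConfig(browser_mode="docker",
+                                       docker_config=docker_config)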
+ """ + logger.info("Testing Docker browser in connect mode", tag="TEST") + + # Create temp directory for user data + temp_dir = os.path.join(os.path.dirname(__file__), "tmp_user_data") + os.makedirs(temp_dir, exist_ok=True) + + try: + # Create Docker configuration + docker_config = DockerConfig( + mode="connect", + persistent=False, + remove_on_exit=True, + user_data_dir=temp_dir + ) + + # Create browser configuration + browser_config = BrowserConfig( + browser_mode="docker", + headless=True, + docker_config=docker_config + ) + + # Create browser manager + manager = BrowserManager(browser_config=browser_config, logger=logger) + + # Start the browser + await manager.start() + logger.info("Browser started successfully", tag="TEST") + + # Create crawler config + crawler_config = CrawlerRunConfig(url="https://example.com") + + # Get a page + page, context = await manager.get_page(crawler_config) + logger.info("Got page successfully", tag="TEST") + + # Navigate to a website + await page.goto("https://example.com") + logger.info("Navigated to example.com", tag="TEST") + + # Get page title + title = await page.title() + logger.info(f"Page title: {title}", tag="TEST") + + # Clean up + await manager.close() + logger.info("Browser closed successfully", tag="TEST") + + return True + except Exception as e: + logger.error(f"Test failed: {str(e)}", tag="TEST") + # Ensure cleanup + try: + await manager.close() + except: + pass + return False + finally: + # Clean up the temp directory + if os.path.exists(temp_dir): + shutil.rmtree(temp_dir) + +async def test_docker_launch_mode(): + """Test Docker browser in launch mode. + + This tests launching a Chrome browser within a Docker container + on demand with custom settings. + """ + logger.info("Testing Docker browser in launch mode", tag="TEST") + + # Create temp directory for user data + temp_dir = os.path.join(os.path.dirname(__file__), "tmp_user_data_launch") + os.makedirs(temp_dir, exist_ok=True) + + try: + # Create Docker configuration + docker_config = DockerConfig( + mode="launch", + persistent=False, + remove_on_exit=True, + user_data_dir=temp_dir + ) + + # Create browser configuration + browser_config = BrowserConfig( + browser_mode="docker", + headless=True, + text_mode=True, # Enable text mode for faster operation + docker_config=docker_config + ) + + # Create browser manager + manager = BrowserManager(browser_config=browser_config, logger=logger) + + # Start the browser + await manager.start() + logger.info("Browser started successfully", tag="TEST") + + # Create crawler config + crawler_config = CrawlerRunConfig(url="https://example.com") + + # Get a page + page, context = await manager.get_page(crawler_config) + logger.info("Got page successfully", tag="TEST") + + # Navigate to a website + await page.goto("https://example.com") + logger.info("Navigated to example.com", tag="TEST") + + # Get page title + title = await page.title() + logger.info(f"Page title: {title}", tag="TEST") + + # Clean up + await manager.close() + logger.info("Browser closed successfully", tag="TEST") + + return True + except Exception as e: + logger.error(f"Test failed: {str(e)}", tag="TEST") + # Ensure cleanup + try: + await manager.close() + except: + pass + return False + finally: + # Clean up the temp directory + if os.path.exists(temp_dir): + shutil.rmtree(temp_dir) + +async def test_docker_persistent_storage(): + """Test Docker browser with persistent storage. 
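+
+    Persistence sketch (localStorage survives sessions via the mounted
+    user data directory)::
+
+        await page1.evaluate("localStorage.setItem('k', 'v')")
+        # close the first browser, start a second with the same config
+        assert await page2.evaluate("localStorage.getItem('k')") == "v"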
+ + This tests creating localStorage data in one session and verifying + it persists to another session when using persistent storage. + """ + logger.info("Testing Docker browser with persistent storage", tag="TEST") + + # Create a unique temp directory + test_id = uuid.uuid4().hex[:8] + temp_dir = os.path.join(os.path.dirname(__file__), f"tmp_user_data_persist_{test_id}") + os.makedirs(temp_dir, exist_ok=True) + + manager1 = None + manager2 = None + + try: + # Create Docker configuration with persistence + docker_config = DockerConfig( + mode="connect", + persistent=True, # Keep container running between sessions + user_data_dir=temp_dir, + container_user_data_dir="/data" + ) + + # Create browser configuration + browser_config = BrowserConfig( + browser_mode="docker", + headless=True, + docker_config=docker_config + ) + + # Create first browser manager + manager1 = BrowserManager(browser_config=browser_config, logger=logger) + + # Start the browser + await manager1.start() + logger.info("First browser started successfully", tag="TEST") + + # Create crawler config + crawler_config = CrawlerRunConfig() + + # Get a page + page1, context1 = await manager1.get_page(crawler_config) + + # Navigate to example.com + await page1.goto("https://example.com") + + # Set localStorage item + test_value = f"test_value_{test_id}" + await page1.evaluate(f"localStorage.setItem('test_key', '{test_value}')") + logger.info(f"Set localStorage test_key = {test_value}", tag="TEST") + + # Close the first browser manager + await manager1.close() + logger.info("First browser closed", tag="TEST") + + # Create second browser manager with same config + manager2 = BrowserManager(browser_config=browser_config, logger=logger) + + # Start the browser + await manager2.start() + logger.info("Second browser started successfully", tag="TEST") + + # Get a page + page2, context2 = await manager2.get_page(crawler_config) + + # Navigate to same site + await page2.goto("https://example.com") + + # Get localStorage item + value = await page2.evaluate("localStorage.getItem('test_key')") + logger.info(f"Retrieved localStorage test_key = {value}", tag="TEST") + + # Check if persistence worked + if value == test_value: + logger.success("Storage persistence verified!", tag="TEST") + else: + logger.error(f"Storage persistence failed! Expected {test_value}, got {value}", tag="TEST") + + # Clean up + await manager2.close() + logger.info("Second browser closed successfully", tag="TEST") + + return value == test_value + except Exception as e: + logger.error(f"Test failed: {str(e)}", tag="TEST") + # Ensure cleanup + try: + if manager1: + await manager1.close() + if manager2: + await manager2.close() + except: + pass + return False + finally: + # Clean up the temp directory + if os.path.exists(temp_dir): + shutil.rmtree(temp_dir) + +async def test_docker_parallel_pages(): + """Test Docker browser with parallel page creation. + + This tests the ability to create and use multiple pages in parallel + from a single Docker browser instance. 
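+
+    Core pattern exercised below (sketch)::
+
+        pages = await manager.get_pages(crawler_config, count=3)
+        await asyncio.gather(*(page.goto(url) for page, _ in pages))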
+ """ + logger.info("Testing Docker browser with parallel pages", tag="TEST") + + try: + # Create Docker configuration + docker_config = DockerConfig( + mode="connect", + persistent=False, + remove_on_exit=True + ) + + # Create browser configuration + browser_config = BrowserConfig( + browser_mode="docker", + headless=True, + docker_config=docker_config + ) + + # Create browser manager + manager = BrowserManager(browser_config=browser_config, logger=logger) + + # Start the browser + await manager.start() + logger.info("Browser started successfully", tag="TEST") + + # Create crawler config + crawler_config = CrawlerRunConfig() + + # Get multiple pages + page_count = 3 + pages = await manager.get_pages(crawler_config, count=page_count) + logger.info(f"Got {len(pages)} pages successfully", tag="TEST") + + if len(pages) != page_count: + logger.error(f"Expected {page_count} pages, got {len(pages)}", tag="TEST") + await manager.close() + return False + + # Navigate to different sites with each page + tasks = [] + for i, (page, _) in enumerate(pages): + tasks.append(page.goto(f"https://example.com?page={i}")) + + # Wait for all navigations to complete + await asyncio.gather(*tasks) + logger.info("All pages navigated successfully", tag="TEST") + + # Get titles from all pages + titles = [] + for i, (page, _) in enumerate(pages): + title = await page.title() + titles.append(title) + logger.info(f"Page {i+1} title: {title}", tag="TEST") + + # Clean up + await manager.close() + logger.info("Browser closed successfully", tag="TEST") + + return True + except Exception as e: + logger.error(f"Test failed: {str(e)}", tag="TEST") + # Ensure cleanup + try: + await manager.close() + except: + pass + return False + +async def test_docker_registry_reuse(): + """Test Docker container reuse via registry. + + This tests that containers with matching configurations + are reused rather than creating new ones. 
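+
+    Reuse hinges on the configuration hash (sketch)::
+
+        # equal DockerConfig/BrowserConfig values -> equal config hash
+        # -> registry.find_container_by_config() returns the live container
+        # -> no new `docker run`; the existing CDP endpoint is attached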
+ """ + logger.info("Testing Docker container reuse via registry", tag="TEST") + + # Create registry for this test + registry_dir = os.path.join(os.path.dirname(__file__), "registry_reuse_test") + registry_file = os.path.join(registry_dir, "registry.json") + os.makedirs(registry_dir, exist_ok=True) + + manager1 = None + manager2 = None + container_id1 = None + + try: + # Create identical Docker configurations with custom registry + docker_config1 = DockerConfig( + mode="connect", + persistent=True, # Keep container running after closing + registry_file=registry_file + ) + + # Create first browser configuration + browser_config1 = BrowserConfig( + browser_mode="docker", + headless=True, + docker_config=docker_config1 + ) + + # Create first browser manager + manager1 = BrowserManager(browser_config=browser_config1, logger=logger) + + # Start the first browser + await manager1.start() + logger.info("First browser started successfully", tag="TEST") + + # Get container ID from the strategy + docker_strategy1 = manager1._strategy + container_id1 = docker_strategy1.container_id + logger.info(f"First browser container ID: {container_id1[:12]}", tag="TEST") + + # Close the first manager but keep container running + await manager1.close() + logger.info("First browser closed", tag="TEST") + + # Create second Docker configuration identical to first + docker_config2 = DockerConfig( + mode="connect", + persistent=True, + registry_file=registry_file + ) + + # Create second browser configuration + browser_config2 = BrowserConfig( + browser_mode="docker", + headless=True, + docker_config=docker_config2 + ) + + # Create second browser manager + manager2 = BrowserManager(browser_config=browser_config2, logger=logger) + + # Start the second browser - should reuse existing container + await manager2.start() + logger.info("Second browser started successfully", tag="TEST") + + # Get container ID from the second strategy + docker_strategy2 = manager2._strategy + container_id2 = docker_strategy2.container_id + logger.info(f"Second browser container ID: {container_id2[:12]}", tag="TEST") + + # Verify container reuse + if container_id1 == container_id2: + logger.success("Container reuse successful - using same container!", tag="TEST") + else: + logger.error("Container reuse failed - new container created!", tag="TEST") + + # Clean up + docker_strategy2.docker_config.persistent = False + docker_strategy2.docker_config.remove_on_exit = True + await manager2.close() + logger.info("Second browser closed and container removed", tag="TEST") + + return container_id1 == container_id2 + except Exception as e: + logger.error(f"Test failed: {str(e)}", tag="TEST") + # Ensure cleanup + try: + if manager1: + await manager1.close() + if manager2: + await manager2.close() + # Make sure container is removed + if container_id1: + await docker_utils.remove_container(container_id1, force=True) + except: + pass + return False + finally: + # Clean up registry directory + if os.path.exists(registry_dir): + shutil.rmtree(registry_dir) + +async def run_tests(): + """Run all tests sequentially.""" + results = [] + + logger.info("Starting Docker Browser Strategy tests", tag="TEST") + + # Check if Docker is available + if not await check_docker_available(): + logger.error("Docker is not available - skipping tests", tag="TEST") + return + + # First test Docker components + setup_result = await test_docker_components() + if not setup_result: + logger.error("Docker component tests failed - skipping browser tests", tag="TEST") + return + + # Run 
browser tests
+    results.append(await test_docker_connect_mode())
+    results.append(await test_docker_launch_mode())
+    results.append(await test_docker_persistent_storage())
+    results.append(await test_docker_parallel_pages())
+    results.append(await test_docker_registry_reuse())
+
+    # Print summary
+    total = len(results)
+    passed = sum(1 for r in results if r)
+    logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
+
+    if passed == total:
+        logger.success("All tests passed!", tag="SUMMARY")
+    else:
+        logger.error(f"{total - passed} tests failed", tag="SUMMARY")
+
+async def check_docker_available() -> bool:
+    """Check if Docker is available on the system.
+
+    Returns:
+        bool: True if Docker is available, False otherwise
+    """
+    try:
+        proc = await asyncio.create_subprocess_exec(
+            "docker", "--version",
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE
+        )
+        stdout, _ = await proc.communicate()
+        # Coerce stdout to bool so the function matches its annotation
+        return proc.returncode == 0 and bool(stdout)
+    except Exception:
+        return False
+
+if __name__ == "__main__":
+    asyncio.run(run_tests())
\ No newline at end of file