From 4ab0893ffb7d6308d8ccdaf29cedb9ae8bdb919f Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 21 Mar 2025 22:50:00 +0800 Subject: [PATCH] feat(browser): implement modular browser management system Adds a new browser management system with strategy pattern implementation: - Introduces BrowserManager class with strategy pattern support - Adds PlaywrightBrowserStrategy, CDPBrowserStrategy, and BuiltinBrowserStrategy - Implements BrowserProfileManager for profile management - Adds PagePoolConfig for browser page pooling - Includes comprehensive test suite for all browser strategies BREAKING CHANGE: Browser management has been moved to browser/ module. Direct usage of browser_manager.py and browser_profiler.py is deprecated. --- crawl4ai/async_configs.py | 48 + crawl4ai/browser/__init__.py | 10 + crawl4ai/browser/manager.py | 165 ++++ crawl4ai/browser/models.py | 0 crawl4ai/browser/profiles.py | 458 +++++++++ crawl4ai/browser/strategies.py | 1048 +++++++++++++++++++++ crawl4ai/browser/utils.py | 105 +++ crawl4ai/browser_manager.py | 1 + crawl4ai/browser_profiler.py | 1 - docs/examples/hello_world.py | 31 +- tests/browser/test_browser_manager.py | 190 ++++ tests/browser/test_builtin_strategy.py | 160 ++++ tests/browser/test_cdp_strategy.py | 227 +++++ tests/browser/test_combined.py | 77 ++ tests/browser/test_playwright_strategy.py | 275 ++++++ tests/browser/test_profiles.py | 176 ++++ 16 files changed, 2964 insertions(+), 8 deletions(-) create mode 100644 crawl4ai/browser/__init__.py create mode 100644 crawl4ai/browser/manager.py create mode 100644 crawl4ai/browser/models.py create mode 100644 crawl4ai/browser/profiles.py create mode 100644 crawl4ai/browser/strategies.py create mode 100644 crawl4ai/browser/utils.py create mode 100644 tests/browser/test_browser_manager.py create mode 100644 tests/browser/test_builtin_strategy.py create mode 100644 tests/browser/test_cdp_strategy.py create mode 100644 tests/browser/test_combined.py create mode 100644 tests/browser/test_playwright_strategy.py create mode 100644 tests/browser/test_profiles.py diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index fa0b97f4..0606c656 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -156,6 +156,41 @@ def is_empty_value(value: Any) -> bool: return False +class PagePoolConfig: + """Configuration for browser page pooling. + + This class configures the page pooling mechanism that maintains pre-warmed + browser pages ready for immediate use, improving performance for scenarios + where multiple URLs need to be processed in sequence. + + Attributes: + mode (str): Pooling mode - "static" or "adaptive". + "static" uses a fixed pool size defined by static_size. + "adaptive" calculates optimal size based on available system memory. + Default: "static". + static_size (int): Number of pages to maintain in the pool when mode is "static". + Default: 10. + memory_per_page (int): Estimated memory used by a single page in MB. + Used for "adaptive" mode calculations. + Default: 200. + memory_threshold (float): Maximum percentage of system memory to use in "adaptive" mode. + Default: 0.7 (70% of available memory). + timeout (float): Seconds to wait for a page from the pool before creating a new one. + Default: 5.0. 
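+
+    Example:
+        A minimal sketch (values are arbitrary) enabling an adaptive pool and
+        attaching it to a browser configuration:
+
+            pool_config = PagePoolConfig(mode="adaptive", memory_per_page=250)
+            browser_config = BrowserConfig(page_pool_config=pool_config)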
+ """ + + def __init__(self, + mode="static", + static_size=10, + memory_per_page=200, + memory_threshold=0.7, + timeout=5.0): + self.mode = mode + self.static_size = static_size + self.memory_per_page = memory_per_page + self.memory_threshold = memory_threshold + self.timeout = timeout + class BrowserConfig: """ Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy. @@ -220,6 +255,9 @@ class BrowserConfig: light_mode (bool): Disables certain background features for performance gains. Default: False. extra_args (list): Additional command-line arguments passed to the browser. Default: []. + page_pool_config (PagePoolConfig or None): Configuration for page pooling mechanism. + If None, page pooling is disabled. + Default: None. """ def __init__( @@ -260,6 +298,7 @@ class BrowserConfig: extra_args: list = None, debugging_port: int = 9222, host: str = "localhost", + page_pool_config: Optional[PagePoolConfig] = None, ): self.browser_type = browser_type self.headless = headless @@ -298,6 +337,7 @@ class BrowserConfig: self.verbose = verbose self.debugging_port = debugging_port self.host = host + self.page_pool_config = page_pool_config fa_user_agenr_generator = ValidUAGenerator() if self.user_agent_mode == "random": @@ -328,6 +368,12 @@ class BrowserConfig: @staticmethod def from_kwargs(kwargs: dict) -> "BrowserConfig": + # Handle page_pool_config + page_pool_config = kwargs.get("page_pool_config") + if isinstance(page_pool_config, dict): + # If it's a dict, convert to PagePoolConfig + page_pool_config = PagePoolConfig(**page_pool_config) + return BrowserConfig( browser_type=kwargs.get("browser_type", "chromium"), headless=kwargs.get("headless", True), @@ -361,6 +407,7 @@ class BrowserConfig: extra_args=kwargs.get("extra_args", []), debugging_port=kwargs.get("debugging_port", 9222), host=kwargs.get("host", "localhost"), + page_pool_config=page_pool_config, ) def to_dict(self): @@ -395,6 +442,7 @@ class BrowserConfig: "verbose": self.verbose, "debugging_port": self.debugging_port, "host": self.host, + "page_pool_config": self.page_pool_config, } def clone(self, **kwargs): diff --git a/crawl4ai/browser/__init__.py b/crawl4ai/browser/__init__.py new file mode 100644 index 00000000..fb14b59d --- /dev/null +++ b/crawl4ai/browser/__init__.py @@ -0,0 +1,10 @@ +"""Browser management module for Crawl4AI. + +This module provides browser management capabilities using different strategies +for browser creation and interaction. +""" + +from .manager import BrowserManager +from .profiles import BrowserProfileManager + +__all__ = ['BrowserManager', 'BrowserProfileManager'] \ No newline at end of file diff --git a/crawl4ai/browser/manager.py b/crawl4ai/browser/manager.py new file mode 100644 index 00000000..4ebee637 --- /dev/null +++ b/crawl4ai/browser/manager.py @@ -0,0 +1,165 @@ +"""Browser manager module for Crawl4AI. + +This module provides a central browser management class that uses the +strategy pattern internally while maintaining the existing API. +""" + +import asyncio +import time +from typing import Optional, Tuple, Dict, Any + +from playwright.async_api import Page, BrowserContext + +from ..async_logger import AsyncLogger +from ..async_configs import BrowserConfig, CrawlerRunConfig + +from .strategies import ( + BaseBrowserStrategy, + PlaywrightBrowserStrategy, + CDPBrowserStrategy, + BuiltinBrowserStrategy +) + +class BrowserManager: + """Main interface for browser management in Crawl4AI. 
+ + This class maintains backward compatibility with the existing implementation + while using the strategy pattern internally for different browser types. + + Attributes: + config (BrowserConfig): Configuration object containing all browser settings + logger: Logger instance for recording events and errors + browser: The browser instance + default_context: The default browser context + managed_browser: The managed browser instance + playwright: The Playwright instance + sessions: Dictionary to store session information + session_ttl: Session timeout in seconds + """ + + def __init__(self, browser_config: Optional[BrowserConfig] = None, logger: Optional[AsyncLogger] = None): + """Initialize the BrowserManager with a browser configuration. + + Args: + browser_config: Configuration object containing all browser settings + logger: Logger instance for recording events and errors + """ + self.config = browser_config or BrowserConfig() + self.logger = logger + + # Create strategy based on configuration + self._strategy = self._create_strategy() + + # Initialize state variables for compatibility with existing code + self.browser = None + self.default_context = None + self.managed_browser = None + self.playwright = None + + # For session management (from existing implementation) + self.sessions = {} + self.session_ttl = 1800 # 30 minutes + + def _create_strategy(self) -> BaseBrowserStrategy: + """Create appropriate browser strategy based on configuration. + + Returns: + BaseBrowserStrategy: The selected browser strategy + """ + if self.config.browser_mode == "builtin": + return BuiltinBrowserStrategy(self.config, self.logger) + elif self.config.cdp_url or self.config.use_managed_browser: + return CDPBrowserStrategy(self.config, self.logger) + else: + return PlaywrightBrowserStrategy(self.config, self.logger) + + async def start(self): + """Start the browser instance and set up the default context. + + Returns: + self: For method chaining + """ + # Start the strategy + await self._strategy.start() + + # Update legacy references + self.browser = self._strategy.browser + self.default_context = self._strategy.default_context + + # Set browser process reference (for CDP strategy) + if hasattr(self._strategy, 'browser_process'): + self.managed_browser = self._strategy + + # Set Playwright reference + self.playwright = self._strategy.playwright + + # Sync sessions if needed + if hasattr(self._strategy, 'sessions'): + self.sessions = self._strategy.sessions + self.session_ttl = self._strategy.session_ttl + + return self + + async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: + """Get a page for the given configuration. + + Args: + crawlerRunConfig: Configuration object for the crawler run + + Returns: + Tuple of (Page, BrowserContext) + """ + # Delegate to strategy + page, context = await self._strategy.get_page(crawlerRunConfig) + + # Sync sessions if needed + if hasattr(self._strategy, 'sessions'): + self.sessions = self._strategy.sessions + + return page, context + + async def kill_session(self, session_id: str): + """Kill a browser session and clean up resources. 
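+
+        Example:
+            Illustrative only; assumes a session was created earlier by passing
+            CrawlerRunConfig(session_id="my-session") to get_page:
+
+                await manager.kill_session("my-session")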
+ + Args: + session_id: The session ID to kill + """ + # Handle kill_session via our strategy if it supports it + if hasattr(self._strategy, '_kill_session'): + await self._strategy._kill_session(session_id) + elif session_id in self.sessions: + context, page, _ = self.sessions[session_id] + await page.close() + # Only close context if not using CDP + if not self.config.use_managed_browser and not self.config.cdp_url and not self.config.browser_mode == "builtin": + await context.close() + del self.sessions[session_id] + + def _cleanup_expired_sessions(self): + """Clean up expired sessions based on TTL.""" + # Use strategy's implementation if available + if hasattr(self._strategy, '_cleanup_expired_sessions'): + self._strategy._cleanup_expired_sessions() + return + + # Otherwise use our own implementation + current_time = time.time() + expired_sessions = [ + sid + for sid, (_, _, last_used) in self.sessions.items() + if current_time - last_used > self.session_ttl + ] + for sid in expired_sessions: + asyncio.create_task(self.kill_session(sid)) + + async def close(self): + """Close the browser and clean up resources.""" + # Delegate to strategy + await self._strategy.close() + + # Reset legacy references + self.browser = None + self.default_context = None + self.managed_browser = None + self.playwright = None + self.sessions = {} diff --git a/crawl4ai/browser/models.py b/crawl4ai/browser/models.py new file mode 100644 index 00000000..e69de29b diff --git a/crawl4ai/browser/profiles.py b/crawl4ai/browser/profiles.py new file mode 100644 index 00000000..58a8bff2 --- /dev/null +++ b/crawl4ai/browser/profiles.py @@ -0,0 +1,458 @@ +"""Browser profile management module for Crawl4AI. + +This module provides functionality for creating and managing browser profiles +that can be used for authenticated browsing. +""" + +import os +import asyncio +import signal +import sys +import datetime +import uuid +import shutil +from typing import List, Dict, Optional, Any +from colorama import Fore, Style, init + +from ..async_configs import BrowserConfig +from ..async_logger import AsyncLogger, AsyncLoggerBase +from ..utils import get_home_folder +from .strategies import is_windows + +class BrowserProfileManager: + """Manages browser profiles for Crawl4AI. + + This class provides functionality to create and manage browser profiles + that can be used for authenticated browsing with Crawl4AI. + + Profiles are stored by default in ~/.crawl4ai/profiles/ + """ + + def __init__(self, logger: Optional[AsyncLoggerBase] = None): + """Initialize the BrowserProfileManager. + + Args: + logger: Logger for outputting messages. If None, a default AsyncLogger is created. + """ + # Initialize colorama for colorful terminal output + init() + + # Create a logger if not provided + if logger is None: + self.logger = AsyncLogger(verbose=True) + elif not isinstance(logger, AsyncLoggerBase): + self.logger = AsyncLogger(verbose=True) + else: + self.logger = logger + + # Ensure profiles directory exists + self.profiles_dir = os.path.join(get_home_folder(), "profiles") + os.makedirs(self.profiles_dir, exist_ok=True) + + async def create_profile(self, + profile_name: Optional[str] = None, + browser_config: Optional[BrowserConfig] = None) -> Optional[str]: + """Create a browser profile interactively. + + Args: + profile_name: Name for the profile. If None, a name is generated. + browser_config: Configuration for the browser. If None, a default configuration is used. 
+ + Returns: + Path to the created profile directory, or None if creation failed + """ + # Create default browser config if none provided + if browser_config is None: + browser_config = BrowserConfig( + browser_type="chromium", + headless=False, # Must be visible for user interaction + verbose=True + ) + else: + # Ensure headless is False for user interaction + browser_config.headless = False + + # Generate profile name if not provided + if not profile_name: + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + profile_name = f"profile_{timestamp}_{uuid.uuid4().hex[:6]}" + + # Sanitize profile name (replace spaces and special chars) + profile_name = "".join(c if c.isalnum() or c in "-_" else "_" for c in profile_name) + + # Set user data directory + profile_path = os.path.join(self.profiles_dir, profile_name) + os.makedirs(profile_path, exist_ok=True) + + # Print instructions for the user with colorama formatting + border = f"{Fore.CYAN}{'='*80}{Style.RESET_ALL}" + self.logger.info(f"\n{border}", tag="PROFILE") + self.logger.info(f"Creating browser profile: {Fore.GREEN}{profile_name}{Style.RESET_ALL}", tag="PROFILE") + self.logger.info(f"Profile directory: {Fore.YELLOW}{profile_path}{Style.RESET_ALL}", tag="PROFILE") + + self.logger.info("\nInstructions:", tag="PROFILE") + self.logger.info("1. A browser window will open for you to set up your profile.", tag="PROFILE") + self.logger.info(f"2. {Fore.CYAN}Log in to websites{Style.RESET_ALL}, configure settings, etc. as needed.", tag="PROFILE") + self.logger.info(f"3. When you're done, {Fore.YELLOW}press 'q' in this terminal{Style.RESET_ALL} to close the browser.", tag="PROFILE") + self.logger.info("4. The profile will be saved and ready to use with Crawl4AI.", tag="PROFILE") + self.logger.info(f"{border}\n", tag="PROFILE") + + # Import the necessary classes with local imports to avoid circular references + from .strategies import CDPBrowserStrategy + + # Set browser config to use the profile path + browser_config.user_data_dir = profile_path + + # Create a CDP browser strategy for the profile creation + browser_strategy = CDPBrowserStrategy(browser_config, self.logger) + + # Set up signal handlers to ensure cleanup on interrupt + original_sigint = signal.getsignal(signal.SIGINT) + original_sigterm = signal.getsignal(signal.SIGTERM) + + # Define cleanup handler for signals + async def cleanup_handler(sig, frame): + self.logger.warning("\nCleaning up browser process...", tag="PROFILE") + await browser_strategy.close() + # Restore original signal handlers + signal.signal(signal.SIGINT, original_sigint) + signal.signal(signal.SIGTERM, original_sigterm) + if sig == signal.SIGINT: + self.logger.error("Profile creation interrupted. 
Profile may be incomplete.", tag="PROFILE") + sys.exit(1) + + # Set signal handlers + def sigint_handler(sig, frame): + asyncio.create_task(cleanup_handler(sig, frame)) + + signal.signal(signal.SIGINT, sigint_handler) + signal.signal(signal.SIGTERM, sigint_handler) + + # Event to signal when user is done with the browser + user_done_event = asyncio.Event() + + # Run keyboard input loop in a separate task + async def listen_for_quit_command(): + import termios + import tty + import select + + # First output the prompt + self.logger.info(f"{Fore.CYAN}Press '{Fore.WHITE}q{Fore.CYAN}' when you've finished using the browser...{Style.RESET_ALL}", tag="PROFILE") + + # Save original terminal settings + fd = sys.stdin.fileno() + old_settings = termios.tcgetattr(fd) + + try: + # Switch to non-canonical mode (no line buffering) + tty.setcbreak(fd) + + while True: + # Check if input is available (non-blocking) + readable, _, _ = select.select([sys.stdin], [], [], 0.5) + if readable: + key = sys.stdin.read(1) + if key.lower() == 'q': + self.logger.info(f"{Fore.GREEN}Closing browser and saving profile...{Style.RESET_ALL}", tag="PROFILE") + user_done_event.set() + return + + # Check if the browser process has already exited + if browser_strategy.browser_process and browser_strategy.browser_process.poll() is not None: + self.logger.info("Browser already closed. Ending input listener.", tag="PROFILE") + user_done_event.set() + return + + await asyncio.sleep(0.1) + + finally: + # Restore terminal settings + termios.tcsetattr(fd, termios.TCSADRAIN, old_settings) + + try: + # Start the browser + await browser_strategy.start() + + # Check if browser started successfully + if not browser_strategy.browser_process: + self.logger.error("Failed to start browser process.", tag="PROFILE") + return None + + self.logger.info(f"Browser launched. {Fore.CYAN}Waiting for you to finish...{Style.RESET_ALL}", tag="PROFILE") + + # Start listening for keyboard input + listener_task = asyncio.create_task(listen_for_quit_command()) + + # Wait for either the user to press 'q' or for the browser process to exit naturally + while not user_done_event.is_set() and browser_strategy.browser_process.poll() is None: + await asyncio.sleep(0.5) + + # Cancel the listener task if it's still running + if not listener_task.done(): + listener_task.cancel() + try: + await listener_task + except asyncio.CancelledError: + pass + + # If the browser is still running and the user pressed 'q', terminate it + if browser_strategy.browser_process.poll() is None and user_done_event.is_set(): + self.logger.info("Terminating browser process...", tag="PROFILE") + await browser_strategy.close() + + self.logger.success(f"Browser closed. Profile saved at: {Fore.GREEN}{profile_path}{Style.RESET_ALL}", tag="PROFILE") + + except Exception as e: + self.logger.error(f"Error creating profile: {str(e)}", tag="PROFILE") + await browser_strategy.close() + return None + finally: + # Restore original signal handlers + signal.signal(signal.SIGINT, original_sigint) + signal.signal(signal.SIGTERM, original_sigterm) + + # Make sure browser is fully cleaned up + await browser_strategy.close() + + # Return the profile path + return profile_path + + def list_profiles(self) -> List[Dict[str, Any]]: + """List all available browser profiles. 
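+
+        Example:
+            Illustrative sketch:
+
+                for profile in BrowserProfileManager().list_profiles():
+                    print(profile["name"], profile["type"], profile["path"])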
+ + Returns: + List of dictionaries containing profile information + """ + if not os.path.exists(self.profiles_dir): + return [] + + profiles = [] + + for name in os.listdir(self.profiles_dir): + profile_path = os.path.join(self.profiles_dir, name) + + # Skip if not a directory + if not os.path.isdir(profile_path): + continue + + # Check if this looks like a valid browser profile + # For Chromium: Look for Preferences file + # For Firefox: Look for prefs.js file + is_valid = False + + if os.path.exists(os.path.join(profile_path, "Preferences")) or \ + os.path.exists(os.path.join(profile_path, "Default", "Preferences")): + is_valid = "chromium" + elif os.path.exists(os.path.join(profile_path, "prefs.js")): + is_valid = "firefox" + + if is_valid: + # Get creation time + created = datetime.datetime.fromtimestamp( + os.path.getctime(profile_path) + ) + + profiles.append({ + "name": name, + "path": profile_path, + "created": created, + "type": is_valid + }) + + # Sort by creation time, newest first + profiles.sort(key=lambda x: x["created"], reverse=True) + + return profiles + + def get_profile_path(self, profile_name: str) -> Optional[str]: + """Get the full path to a profile by name. + + Args: + profile_name: Name of the profile (not the full path) + + Returns: + Full path to the profile directory, or None if not found + """ + profile_path = os.path.join(self.profiles_dir, profile_name) + + # Check if path exists and is a valid profile + if not os.path.isdir(profile_path): + # Check if profile_name itself is full path + if os.path.isabs(profile_name): + profile_path = profile_name + else: + return None + + # Look for profile indicators + is_profile = ( + os.path.exists(os.path.join(profile_path, "Preferences")) or + os.path.exists(os.path.join(profile_path, "Default", "Preferences")) or + os.path.exists(os.path.join(profile_path, "prefs.js")) + ) + + if not is_profile: + return None # Not a valid browser profile + + return profile_path + + def delete_profile(self, profile_name_or_path: str) -> bool: + """Delete a browser profile by name or path. + + Args: + profile_name_or_path: Name of the profile or full path to profile directory + + Returns: + True if the profile was deleted successfully, False otherwise + """ + # Determine if input is a name or a path + if os.path.isabs(profile_name_or_path): + # Full path provided + profile_path = profile_name_or_path + else: + # Just a name provided, construct path + profile_path = os.path.join(self.profiles_dir, profile_name_or_path) + + # Check if path exists and is a valid profile + if not os.path.isdir(profile_path): + return False + + # Look for profile indicators + is_profile = ( + os.path.exists(os.path.join(profile_path, "Preferences")) or + os.path.exists(os.path.join(profile_path, "Default", "Preferences")) or + os.path.exists(os.path.join(profile_path, "prefs.js")) + ) + + if not is_profile: + return False # Not a valid browser profile + + # Delete the profile directory + try: + shutil.rmtree(profile_path) + return True + except Exception: + return False + + async def interactive_manager(self, crawl_callback=None): + """Launch an interactive profile management console. + + Args: + crawl_callback: Function to call when selecting option to use + a profile for crawling. It will be called with (profile_path, url). + """ + while True: + self.logger.info(f"\n{Fore.CYAN}Profile Management Options:{Style.RESET_ALL}", tag="MENU") + self.logger.info(f"1. {Fore.GREEN}Create a new profile{Style.RESET_ALL}", tag="MENU") + self.logger.info(f"2. 
{Fore.YELLOW}List available profiles{Style.RESET_ALL}", tag="MENU") + self.logger.info(f"3. {Fore.RED}Delete a profile{Style.RESET_ALL}", tag="MENU") + + # Only show crawl option if callback provided + if crawl_callback: + self.logger.info(f"4. {Fore.CYAN}Use a profile to crawl a website{Style.RESET_ALL}", tag="MENU") + self.logger.info(f"5. {Fore.MAGENTA}Exit{Style.RESET_ALL}", tag="MENU") + exit_option = "5" + else: + self.logger.info(f"4. {Fore.MAGENTA}Exit{Style.RESET_ALL}", tag="MENU") + exit_option = "4" + + choice = input(f"\n{Fore.CYAN}Enter your choice (1-{exit_option}): {Style.RESET_ALL}") + + if choice == "1": + # Create new profile + name = input(f"{Fore.GREEN}Enter a name for the new profile (or press Enter for auto-generated name): {Style.RESET_ALL}") + await self.create_profile(name or None) + + elif choice == "2": + # List profiles + profiles = self.list_profiles() + + if not profiles: + self.logger.warning(" No profiles found. Create one first with option 1.", tag="PROFILES") + continue + + # Print profile information with colorama formatting + self.logger.info("\nAvailable profiles:", tag="PROFILES") + for i, profile in enumerate(profiles): + self.logger.info(f"[{i+1}] {Fore.CYAN}{profile['name']}{Style.RESET_ALL}", tag="PROFILES") + self.logger.info(f" Path: {Fore.YELLOW}{profile['path']}{Style.RESET_ALL}", tag="PROFILES") + self.logger.info(f" Created: {profile['created'].strftime('%Y-%m-%d %H:%M:%S')}", tag="PROFILES") + self.logger.info(f" Browser type: {profile['type']}", tag="PROFILES") + self.logger.info("", tag="PROFILES") # Empty line for spacing + + elif choice == "3": + # Delete profile + profiles = self.list_profiles() + if not profiles: + self.logger.warning("No profiles found to delete", tag="PROFILES") + continue + + # Display numbered list + self.logger.info(f"\n{Fore.YELLOW}Available profiles:{Style.RESET_ALL}", tag="PROFILES") + for i, profile in enumerate(profiles): + self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES") + + # Get profile to delete + profile_idx = input(f"{Fore.RED}Enter the number of the profile to delete (or 'c' to cancel): {Style.RESET_ALL}") + if profile_idx.lower() == 'c': + continue + + try: + idx = int(profile_idx) - 1 + if 0 <= idx < len(profiles): + profile_name = profiles[idx]["name"] + self.logger.info(f"Deleting profile: {Fore.YELLOW}{profile_name}{Style.RESET_ALL}", tag="PROFILES") + + # Confirm deletion + confirm = input(f"{Fore.RED}Are you sure you want to delete this profile? (y/n): {Style.RESET_ALL}") + if confirm.lower() == 'y': + success = self.delete_profile(profiles[idx]["path"]) + + if success: + self.logger.success(f"Profile {Fore.GREEN}{profile_name}{Style.RESET_ALL} deleted successfully", tag="PROFILES") + else: + self.logger.error(f"Failed to delete profile {Fore.RED}{profile_name}{Style.RESET_ALL}", tag="PROFILES") + else: + self.logger.error("Invalid profile number", tag="PROFILES") + except ValueError: + self.logger.error("Please enter a valid number", tag="PROFILES") + + elif choice == "4" and crawl_callback: + # Use profile to crawl a site + profiles = self.list_profiles() + if not profiles: + self.logger.warning("No profiles found. 
Create one first.", tag="PROFILES") + continue + + # Display numbered list + self.logger.info(f"\n{Fore.YELLOW}Available profiles:{Style.RESET_ALL}", tag="PROFILES") + for i, profile in enumerate(profiles): + self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES") + + # Get profile to use + profile_idx = input(f"{Fore.CYAN}Enter the number of the profile to use (or 'c' to cancel): {Style.RESET_ALL}") + if profile_idx.lower() == 'c': + continue + + try: + idx = int(profile_idx) - 1 + if 0 <= idx < len(profiles): + profile_path = profiles[idx]["path"] + url = input(f"{Fore.CYAN}Enter the URL to crawl: {Style.RESET_ALL}") + if url: + # Call the provided crawl callback + await crawl_callback(profile_path, url) + else: + self.logger.error("No URL provided", tag="CRAWL") + else: + self.logger.error("Invalid profile number", tag="PROFILES") + except ValueError: + self.logger.error("Please enter a valid number", tag="PROFILES") + + elif (choice == "4" and not crawl_callback) or (choice == "5" and crawl_callback): + # Exit + self.logger.info("Exiting profile management", tag="MENU") + break + + else: + self.logger.error(f"Invalid choice. Please enter a number between 1 and {exit_option}.", tag="MENU") diff --git a/crawl4ai/browser/strategies.py b/crawl4ai/browser/strategies.py new file mode 100644 index 00000000..fd47f30e --- /dev/null +++ b/crawl4ai/browser/strategies.py @@ -0,0 +1,1048 @@ +"""Browser strategies module for Crawl4AI. + +This module implements the browser strategy pattern for different +browser implementations, including Playwright, CDP, and builtin browsers. +""" + +from abc import ABC, abstractmethod +import asyncio +import os +import time +import json +import hashlib +import subprocess +import sys +import shutil +import signal +from typing import Optional, Dict, Tuple, List, Any + +from playwright.async_api import Browser, BrowserContext, Page, ProxySettings + +from ..async_logger import AsyncLogger +from ..async_configs import BrowserConfig, CrawlerRunConfig +from ..config import DOWNLOAD_PAGE_TIMEOUT +from ..js_snippet import load_js_script +from ..utils import get_home_folder +from .utils import get_playwright, get_browser_executable, get_browser_disable_options, create_temp_directory, is_windows + +from playwright_stealth import StealthConfig + +stealth_config = StealthConfig( + webdriver=True, + chrome_app=True, + chrome_csi=True, + chrome_load_times=True, + chrome_runtime=True, + navigator_languages=True, + navigator_plugins=True, + navigator_permissions=True, + webgl_vendor=True, + outerdimensions=True, + navigator_hardware_concurrency=True, + media_codecs=True, +) + +class BaseBrowserStrategy(ABC): + """Base class for all browser strategies. + + This abstract class defines the interface that all browser strategies + must implement. It handles common functionality like context caching. + """ + + def __init__(self, config: BrowserConfig, logger: Optional[AsyncLogger] = None): + """Initialize the strategy with configuration and logger. + + Args: + config: Browser configuration + logger: Logger for recording events and errors + """ + self.config = config + self.logger = logger + self.browser = None + self.default_context = None + self.contexts_by_config = {} + self._contexts_lock = asyncio.Lock() + self.playwright = None + + @abstractmethod + async def start(self): + """Start the browser. 
+ + Returns: + self: For method chaining + """ + pass + + @abstractmethod + async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: + """Get a page with specified configuration. + + Args: + crawlerRunConfig: Crawler run configuration + + Returns: + Tuple of (Page, BrowserContext) + """ + pass + + @abstractmethod + async def close(self): + """Close the browser and clean up resources.""" + pass + + def _make_config_signature(self, crawlerRunConfig: CrawlerRunConfig) -> str: + """Create a signature hash from configuration for context caching. + + Args: + crawlerRunConfig: Crawler run configuration + + Returns: + str: Unique hash for this configuration + """ + config_dict = crawlerRunConfig.__dict__.copy() + # Exclude items that do not affect browser-level setup + ephemeral_keys = [ + "session_id", + "js_code", + "scraping_strategy", + "extraction_strategy", + "chunking_strategy", + "cache_mode", + "content_filter", + "semaphore_count", + "url" + ] + for key in ephemeral_keys: + if key in config_dict: + del config_dict[key] + + # Convert to canonical JSON string + signature_json = json.dumps(config_dict, sort_keys=True, default=str) + + # Hash the JSON so we get a compact, unique string + signature_hash = hashlib.sha256(signature_json.encode("utf-8")).hexdigest() + return signature_hash + + async def setup_context(self, context: BrowserContext, crawlerRunConfig: Optional[CrawlerRunConfig] = None): + """Set up a browser context with the configured options. + + Args: + context: The browser context to set up + crawlerRunConfig: Configuration object containing all browser settings + """ + if self.config.headers: + await context.set_extra_http_headers(self.config.headers) + + if self.config.cookies: + await context.add_cookies(self.config.cookies) + + if self.config.storage_state: + await context.storage_state(path=None) + + if self.config.accept_downloads: + context.set_default_timeout(DOWNLOAD_PAGE_TIMEOUT) + context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT) + if self.config.downloads_path: + context._impl_obj._options["accept_downloads"] = True + context._impl_obj._options["downloads_path"] = self.config.downloads_path + + # Handle user agent and browser hints + if self.config.user_agent: + combined_headers = { + "User-Agent": self.config.user_agent, + "sec-ch-ua": self.config.browser_hint, + } + combined_headers.update(self.config.headers) + await context.set_extra_http_headers(combined_headers) + + # Add default cookie + await context.add_cookies( + [ + { + "name": "cookiesEnabled", + "value": "true", + "url": crawlerRunConfig.url if crawlerRunConfig else "https://crawl4ai.com/", + } + ] + ) + + # Handle navigator overrides + if crawlerRunConfig: + if ( + crawlerRunConfig.override_navigator + or crawlerRunConfig.simulate_user + or crawlerRunConfig.magic + ): + await context.add_init_script(load_js_script("navigator_overrider")) + +class PlaywrightBrowserStrategy(BaseBrowserStrategy): + """Standard Playwright browser strategy. + + This strategy launches a new browser instance using Playwright + and manages browser contexts. + """ + + def __init__(self, config: BrowserConfig, logger: Optional[AsyncLogger] = None): + """Initialize the Playwright browser strategy. + + Args: + config: Browser configuration + logger: Logger for recording events and errors + """ + super().__init__(config, logger) + # Add session management + self.sessions = {} + self.session_ttl = 1800 # 30 minutes + + async def start(self): + """Start the browser instance. 
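+
+        Example:
+            Illustrative sketch; the strategy is normally created for you by
+            BrowserManager rather than instantiated directly:
+
+                strategy = PlaywrightBrowserStrategy(BrowserConfig(headless=True))
+                await strategy.start()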
+ + Returns: + self: For method chaining + """ + self.playwright = await get_playwright() + browser_args = self._build_browser_args() + + # Launch appropriate browser type + if self.config.browser_type == "firefox": + self.browser = await self.playwright.firefox.launch(**browser_args) + elif self.config.browser_type == "webkit": + self.browser = await self.playwright.webkit.launch(**browser_args) + else: + self.browser = await self.playwright.chromium.launch(**browser_args) + + self.default_context = self.browser + return self + + def _build_browser_args(self) -> dict: + """Build browser launch arguments from config. + + Returns: + dict: Browser launch arguments + """ + args = [ + "--disable-gpu", + "--disable-gpu-compositing", + "--disable-software-rasterizer", + "--no-sandbox", + "--disable-dev-shm-usage", + "--no-first-run", + "--no-default-browser-check", + "--disable-infobars", + "--window-position=0,0", + "--ignore-certificate-errors", + "--ignore-certificate-errors-spki-list", + "--disable-blink-features=AutomationControlled", + "--window-position=400,0", + "--disable-renderer-backgrounding", + "--disable-ipc-flooding-protection", + "--force-color-profile=srgb", + "--mute-audio", + "--disable-background-timer-throttling", + f"--window-size={self.config.viewport_width},{self.config.viewport_height}", + ] + + if self.config.light_mode: + args.extend(get_browser_disable_options()) + + if self.config.text_mode: + args.extend( + [ + "--blink-settings=imagesEnabled=false", + "--disable-remote-fonts", + "--disable-images", + "--disable-javascript", + "--disable-software-rasterizer", + "--disable-dev-shm-usage", + ] + ) + + if self.config.extra_args: + args.extend(self.config.extra_args) + + browser_args = {"headless": self.config.headless, "args": args} + + if self.config.chrome_channel: + browser_args["channel"] = self.config.chrome_channel + + if self.config.accept_downloads: + browser_args["downloads_path"] = self.config.downloads_path or os.path.join( + os.getcwd(), "downloads" + ) + os.makedirs(browser_args["downloads_path"], exist_ok=True) + + if self.config.proxy or self.config.proxy_config: + proxy_settings = ( + ProxySettings(server=self.config.proxy) + if self.config.proxy + else ProxySettings( + server=self.config.proxy_config.server, + username=self.config.proxy_config.username, + password=self.config.proxy_config.password, + ) + ) + browser_args["proxy"] = proxy_settings + + return browser_args + + async def create_browser_context(self, crawlerRunConfig: Optional[CrawlerRunConfig] = None) -> BrowserContext: + """Creates and returns a new browser context with configured settings. 
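+
+        Example:
+            Illustrative only; get_page normally calls this internally:
+
+                context = await strategy.create_browser_context(run_config)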
+ + Args: + crawlerRunConfig: Configuration object for the crawler run + + Returns: + BrowserContext: Browser context object with the specified configurations + """ + # Base settings + user_agent = self.config.headers.get("User-Agent", self.config.user_agent) + viewport_settings = { + "width": self.config.viewport_width, + "height": self.config.viewport_height, + } + proxy_settings = {"server": self.config.proxy} if self.config.proxy else None + + blocked_extensions = [ + # Images + "jpg", "jpeg", "png", "gif", "webp", "svg", "ico", "bmp", "tiff", "psd", + # Fonts + "woff", "woff2", "ttf", "otf", "eot", + # Media + "mp4", "webm", "ogg", "avi", "mov", "wmv", "flv", "m4v", "mp3", "wav", "aac", + "m4a", "opus", "flac", + # Documents + "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx", + # Archives + "zip", "rar", "7z", "tar", "gz", + # Scripts and data + "xml", "swf", "wasm", + ] + + # Common context settings + context_settings = { + "user_agent": user_agent, + "viewport": viewport_settings, + "proxy": proxy_settings, + "accept_downloads": self.config.accept_downloads, + "storage_state": self.config.storage_state, + "ignore_https_errors": self.config.ignore_https_errors, + "device_scale_factor": 1.0, + "java_script_enabled": self.config.java_script_enabled, + } + + if crawlerRunConfig: + # Check if there is value for crawlerRunConfig.proxy_config set add that to context + if crawlerRunConfig.proxy_config: + proxy_settings = { + "server": crawlerRunConfig.proxy_config.server, + } + if crawlerRunConfig.proxy_config.username: + proxy_settings.update({ + "username": crawlerRunConfig.proxy_config.username, + "password": crawlerRunConfig.proxy_config.password, + }) + context_settings["proxy"] = proxy_settings + + if self.config.text_mode: + text_mode_settings = { + "has_touch": False, + "is_mobile": False, + } + # Update context settings with text mode settings + context_settings.update(text_mode_settings) + + # Create and return the context with all settings + context = await self.browser.new_context(**context_settings) + + # Apply text mode settings if enabled + if self.config.text_mode: + # Create and apply route patterns for each extension + for ext in blocked_extensions: + await context.route(f"**/*.{ext}", lambda route: route.abort()) + return context + + def _cleanup_expired_sessions(self): + """Clean up expired sessions based on TTL.""" + current_time = time.time() + expired_sessions = [ + sid + for sid, (_, _, last_used) in self.sessions.items() + if current_time - last_used > self.session_ttl + ] + for sid in expired_sessions: + asyncio.create_task(self._kill_session(sid)) + + async def _kill_session(self, session_id: str): + """Kill a browser session and clean up resources. + + Args: + session_id: The session ID to kill + """ + if session_id in self.sessions: + context, page, _ = self.sessions[session_id] + await page.close() + del self.sessions[session_id] + + async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]: + """Get a page for the given configuration. 
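+
+        Example:
+            Illustrative sketch; passing a session_id makes later calls reuse
+            the same page and context:
+
+                run_config = CrawlerRunConfig(url="https://example.com",
+                                              session_id="crawl-1")
+                page, context = await strategy.get_page(run_config)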
+ + Args: + crawlerRunConfig: Configuration object for the crawler run + + Returns: + Tuple of (Page, BrowserContext) + """ + # Clean up expired sessions first + self._cleanup_expired_sessions() + + # If a session_id is provided and we already have it, reuse that page + context + if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions: + context, page, _ = self.sessions[crawlerRunConfig.session_id] + # Update last-used timestamp + self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) + return page, context + + # Otherwise, check if we have an existing context for this config + config_signature = self._make_config_signature(crawlerRunConfig) + + async with self._contexts_lock: + if config_signature in self.contexts_by_config: + context = self.contexts_by_config[config_signature] + else: + # Create and setup a new context + context = await self.create_browser_context(crawlerRunConfig) + await self.setup_context(context, crawlerRunConfig) + self.contexts_by_config[config_signature] = context + + # Create a new page from the chosen context + page = await context.new_page() + + # If a session_id is specified, store this session so we can reuse later + if crawlerRunConfig.session_id: + self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) + + return page, context + + async def close(self): + """Close the browser and clean up resources.""" + if self.config.sleep_on_close: + await asyncio.sleep(0.5) + + # Close all sessions + session_ids = list(self.sessions.keys()) + for session_id in session_ids: + await self._kill_session(session_id) + + # Close all contexts we created + for ctx in self.contexts_by_config.values(): + try: + await ctx.close() + except Exception as e: + if self.logger: + self.logger.error( + message="Error closing context: {error}", + tag="ERROR", + params={"error": str(e)} + ) + self.contexts_by_config.clear() + + if self.browser: + await self.browser.close() + self.browser = None + + if self.playwright: + await self.playwright.stop() + self.playwright = None + +class CDPBrowserStrategy(BaseBrowserStrategy): + """CDP-based browser strategy. + + This strategy connects to an existing browser using CDP protocol or + launches and connects to a browser using CDP. + """ + + def __init__(self, config: BrowserConfig, logger: Optional[AsyncLogger] = None): + """Initialize the CDP browser strategy. + + Args: + config: Browser configuration + logger: Logger for recording events and errors + """ + super().__init__(config, logger) + self.sessions = {} + self.session_ttl = 1800 # 30 minutes + self.browser_process = None + self.temp_dir = None + self.shutting_down = False + + async def start(self): + """Start or connect to the browser using CDP. + + Returns: + self: For method chaining + """ + self.playwright = await get_playwright() + + # Get or create CDP URL + cdp_url = await self._get_or_create_cdp_url() + + # Connect to the browser using CDP + self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) + + # Get or create default context + contexts = self.browser.contexts + if contexts: + self.default_context = contexts[0] + else: + self.default_context = await self.create_browser_context() + + await self.setup_context(self.default_context) + return self + + async def _get_or_create_cdp_url(self) -> str: + """Get existing CDP URL or launch a browser and return its CDP URL. 
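+
+        Example:
+            Illustrative only; with cdp_url set, the strategy connects to an
+            already-running browser instead of launching one:
+
+                config = BrowserConfig(cdp_url="http://localhost:9222")
+                strategy = CDPBrowserStrategy(config)
+                await strategy.start()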
+
+        Returns:
+            str: CDP URL for connecting to the browser
+        """
+        # If CDP URL is provided, just return it
+        if self.config.cdp_url:
+            return self.config.cdp_url
+
+        # Create temp dir if needed
+        if not self.config.user_data_dir:
+            self.temp_dir = create_temp_directory()
+            user_data_dir = self.temp_dir
+        else:
+            user_data_dir = self.config.user_data_dir
+
+        # Get browser args based on OS and browser type
+        args = await self._get_browser_args(user_data_dir)
+
+        # Start browser process
+        try:
+            # Use DETACHED_PROCESS flag on Windows to fully detach the process
+            # On Unix, we'll use preexec_fn=os.setpgrp to start the process in a new process group
+            if is_windows():
+                self.browser_process = subprocess.Popen(
+                    args,
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.PIPE,
+                    creationflags=subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP
+                )
+            else:
+                self.browser_process = subprocess.Popen(
+                    args,
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.PIPE,
+                    preexec_fn=os.setpgrp  # Start in a new process group
+                )
+
+            # Monitor for a short time to make sure it starts properly
+            await asyncio.sleep(0.5)  # Give browser time to start
+            await self._initial_startup_check()
+            await asyncio.sleep(2)  # Give browser more time to start
+            return f"http://{self.config.host}:{self.config.debugging_port}"
+        except Exception as e:
+            await self._cleanup_process()
+            raise Exception(f"Failed to start browser: {e}")
+
+    async def _initial_startup_check(self):
+        """Perform a quick check to make sure the browser started successfully."""
+        if not self.browser_process:
+            return
+
+        # Check that process started without immediate termination
+        await asyncio.sleep(0.5)
+        if self.browser_process.poll() is not None:
+            # Process already terminated
+            stdout, stderr = b"", b""
+            try:
+                stdout, stderr = self.browser_process.communicate(timeout=0.5)
+            except subprocess.TimeoutExpired:
+                pass
+
+            if self.logger:
+                self.logger.error(
+                    message="Browser process terminated during startup | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}",
+                    tag="ERROR",
+                    params={
+                        "code": self.browser_process.returncode,
+                        "stdout": stdout.decode() if stdout else "",
+                        "stderr": stderr.decode() if stderr else "",
+                    },
+                )
+
+    async def _get_browser_args(self, user_data_dir: str) -> List[str]:
+        """Returns browser-specific command line arguments.
+
+        Args:
+            user_data_dir: Path to user data directory
+
+        Returns:
+            List of command-line arguments for the browser
+        """
+        browser_path = get_browser_executable(self.config.browser_type)
+        base_args = [browser_path]
+
+        if self.config.browser_type == "chromium":
+            args = [
+                f"--remote-debugging-port={self.config.debugging_port}",
+                f"--user-data-dir={user_data_dir}",
+            ]
+            if self.config.headless:
+                args.append("--headless=new")
+        elif self.config.browser_type == "firefox":
+            args = [
+                "--remote-debugging-port",
+                str(self.config.debugging_port),
+                "--profile",
+                user_data_dir,
+            ]
+            if self.config.headless:
+                args.append("--headless")
+        else:
+            raise NotImplementedError(f"Browser type {self.config.browser_type} not supported")
+
+        return base_args + args
+
+    async def _cleanup_process(self):
+        """Cleanup browser process and temporary directory."""
+        # Set shutting_down flag BEFORE any termination actions
+        self.shutting_down = True
+
+        if self.browser_process:
+            try:
+                # Only terminate if we have proper control over the process
+                if self.browser_process.poll() is None:
+                    # Process is still running
+                    self.browser_process.terminate()
+                    # Wait for process to end gracefully
+                    for _ in range(10):  # 10 attempts, 100ms each
+                        if self.browser_process.poll() is not None:
+                            break
+                        await asyncio.sleep(0.1)
+
+                    # Force kill if still running
+                    if self.browser_process.poll() is None:
+                        if is_windows():
+                            # On Windows we might need taskkill for detached processes
+                            try:
+                                subprocess.run(["taskkill", "/F", "/PID", str(self.browser_process.pid)])
+                            except Exception:
+                                self.browser_process.kill()
+                        else:
+                            self.browser_process.kill()
+                        await asyncio.sleep(0.1)  # Brief wait for kill to take effect
+
+            except Exception as e:
+                if self.logger:
+                    self.logger.error(
+                        message="Error terminating browser: {error}",
+                        tag="ERROR",
+                        params={"error": str(e)},
+                    )
+
+        if self.temp_dir and os.path.exists(self.temp_dir):
+            try:
+                shutil.rmtree(self.temp_dir)
+            except Exception as e:
+                if self.logger:
+                    self.logger.error(
+                        message="Error removing temporary directory: {error}",
+                        tag="ERROR",
+                        params={"error": str(e)},
+                    )
+
+    async def create_browser_context(self, crawlerRunConfig: Optional[CrawlerRunConfig] = None) -> BrowserContext:
+        """Create a new browser context.
+
+        Args:
+            crawlerRunConfig: Configuration object for the crawler run
+
+        Returns:
+            BrowserContext: Browser context object
+        """
+        return await self.browser.new_context()
+
+    def _cleanup_expired_sessions(self):
+        """Clean up expired sessions based on TTL."""
+        current_time = time.time()
+        expired_sessions = [
+            sid
+            for sid, (_, _, last_used) in self.sessions.items()
+            if current_time - last_used > self.session_ttl
+        ]
+        for sid in expired_sessions:
+            asyncio.create_task(self._kill_session(sid))
+
+    async def _kill_session(self, session_id: str):
+        """Kill a browser session and clean up resources.
+
+        Args:
+            session_id: The session ID to kill
+        """
+        if session_id in self.sessions:
+            context, page, _ = self.sessions[session_id]
+            await page.close()
+            del self.sessions[session_id]
+
+    async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]:
+        """Get a page for the given configuration.
+ + Args: + crawlerRunConfig: Configuration object for the crawler run + + Returns: + Tuple of (Page, BrowserContext) + """ + self._cleanup_expired_sessions() + + # If a session_id is provided and we already have it, reuse that page + context + if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions: + context, page, _ = self.sessions[crawlerRunConfig.session_id] + # Update last-used timestamp + self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) + return page, context + + # For CDP, we typically use the shared default_context + context = self.default_context + pages = context.pages + page = next((p for p in pages if p.url == crawlerRunConfig.url), None) + if not page: + page = await context.new_page() + + # If a session_id is specified, store this session so we can reuse later + if crawlerRunConfig.session_id: + self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) + + return page, context + + async def close(self): + """Close the browser and clean up resources.""" + # Skip cleanup if using external CDP URL and not launched by us + if self.config.cdp_url and not self.browser_process: + return + + if self.config.sleep_on_close: + await asyncio.sleep(0.5) + + # Close all sessions + session_ids = list(self.sessions.keys()) + for session_id in session_ids: + await self._kill_session(session_id) + + # Close browser + if self.browser: + await self.browser.close() + self.browser = None + + # Clean up managed browser if we created it + if self.browser_process: + await asyncio.sleep(0.5) + await self._cleanup_process() + self.browser_process = None + + # Close temporary directory + if self.temp_dir and os.path.exists(self.temp_dir): + try: + shutil.rmtree(self.temp_dir) + self.temp_dir = None + except Exception as e: + if self.logger: + self.logger.error( + message="Error removing temporary directory: {error}", + tag="ERROR", + params={"error": str(e)}, + ) + + # Stop playwright + if self.playwright: + await self.playwright.stop() + self.playwright = None + +class BuiltinBrowserStrategy(CDPBrowserStrategy): + """Built-in browser strategy. + + This strategy extends the CDP strategy to use the built-in browser. + """ + + def __init__(self, config: BrowserConfig, logger: Optional[AsyncLogger] = None): + """Initialize the built-in browser strategy. + + Args: + config: Browser configuration + logger: Logger for recording events and errors + """ + super().__init__(config, logger) + self.builtin_browser_dir = os.path.join(get_home_folder(), "builtin-browser") + self.builtin_config_file = os.path.join(self.builtin_browser_dir, "browser_config.json") + os.makedirs(self.builtin_browser_dir, exist_ok=True) + + async def start(self): + """Start or connect to the built-in browser. 
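+
+        Example:
+            Illustrative sketch; BrowserManager picks this strategy when the
+            config's browser_mode is "builtin" (see BrowserManager._create_strategy):
+
+                config = BrowserConfig(browser_mode="builtin", headless=True)
+                manager = BrowserManager(browser_config=config)
+                await manager.start()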
+ + Returns: + self: For method chaining + """ + # Check for existing built-in browser + browser_info = self.get_builtin_browser_info() + if browser_info and self._is_browser_running(browser_info.get('pid')): + if self.logger: + self.logger.info(f"Using existing built-in browser at {browser_info.get('cdp_url')}", tag="BROWSER") + self.config.cdp_url = browser_info.get('cdp_url') + else: + if self.logger: + self.logger.info("Built-in browser not found, launching new instance...", tag="BROWSER") + cdp_url = await self.launch_builtin_browser( + browser_type=self.config.browser_type, + debugging_port=self.config.debugging_port, + headless=self.config.headless + ) + if not cdp_url: + if self.logger: + self.logger.warning("Failed to launch built-in browser, falling back to regular CDP strategy", tag="BROWSER") + return await super().start() + self.config.cdp_url = cdp_url + + # Call parent class implementation with updated CDP URL + return await super().start() + + def get_builtin_browser_info(self) -> Optional[Dict[str, Any]]: + """Get information about the built-in browser. + + Returns: + dict: Browser information or None if no built-in browser is configured + """ + if not os.path.exists(self.builtin_config_file): + return None + + try: + with open(self.builtin_config_file, 'r') as f: + browser_info = json.load(f) + + # Check if the browser is still running + if not self._is_browser_running(browser_info.get('pid')): + if self.logger: + self.logger.warning("Built-in browser is not running", tag="BUILTIN") + return None + + return browser_info + except Exception as e: + if self.logger: + self.logger.error(f"Error reading built-in browser config: {str(e)}", tag="BUILTIN") + return None + + def _is_browser_running(self, pid: Optional[int]) -> bool: + """Check if a process with the given PID is running. + + Args: + pid: Process ID to check + + Returns: + bool: True if the process is running, False otherwise + """ + if not pid: + return False + + try: + # Check if the process exists + if is_windows(): + process = subprocess.run(["tasklist", "/FI", f"PID eq {pid}"], + capture_output=True, text=True) + return str(pid) in process.stdout + else: + # Unix-like systems + os.kill(pid, 0) # This doesn't actually kill the process, just checks if it exists + return True + except (ProcessLookupError, PermissionError, OSError): + return False + + async def launch_builtin_browser(self, + browser_type: str = "chromium", + debugging_port: int = 9222, + headless: bool = True) -> Optional[str]: + """Launch a browser in the background for use as the built-in browser. 
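+
+        Example:
+            Illustrative only; the argument values mirror the defaults:
+
+                cdp_url = await strategy.launch_builtin_browser(
+                    browser_type="chromium", debugging_port=9222, headless=True)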
+ + Args: + browser_type: Type of browser to launch ('chromium' or 'firefox') + debugging_port: Port to use for CDP debugging + headless: Whether to run in headless mode + + Returns: + str: CDP URL for the browser, or None if launch failed + """ + # Check if there's an existing browser still running + browser_info = self.get_builtin_browser_info() + if browser_info and self._is_browser_running(browser_info.get('pid')): + if self.logger: + self.logger.info("Built-in browser is already running", tag="BUILTIN") + return browser_info.get('cdp_url') + + # Create a user data directory for the built-in browser + user_data_dir = os.path.join(self.builtin_browser_dir, "user_data") + os.makedirs(user_data_dir, exist_ok=True) + + # Prepare browser launch arguments + browser_path = get_browser_executable(browser_type) + if browser_type == "chromium": + args = [ + browser_path, + f"--remote-debugging-port={debugging_port}", + f"--user-data-dir={user_data_dir}", + ] + if headless: + args.append("--headless=new") + elif browser_type == "firefox": + args = [ + browser_path, + "--remote-debugging-port", + str(debugging_port), + "--profile", + user_data_dir, + ] + if headless: + args.append("--headless") + else: + if self.logger: + self.logger.error(f"Browser type {browser_type} not supported for built-in browser", tag="BUILTIN") + return None + + try: + # Start the browser process detached + if is_windows(): + process = subprocess.Popen( + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + creationflags=subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP + ) + else: + process = subprocess.Popen( + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + preexec_fn=os.setpgrp # Start in a new process group + ) + + # Wait briefly to ensure the process starts successfully + await asyncio.sleep(2.0) + + # Check if the process is still running + if process.poll() is not None: + if self.logger: + self.logger.error(f"Browser process exited immediately with code {process.returncode}", tag="BUILTIN") + return None + + # Construct CDP URL + cdp_url = f"http://localhost:{debugging_port}" + + # Try to verify browser is responsive by fetching version info + import aiohttp + json_url = f"{cdp_url}/json/version" + config_json = None + + try: + async with aiohttp.ClientSession() as session: + for _ in range(10): # Try multiple times + try: + async with session.get(json_url) as response: + if response.status == 200: + config_json = await response.json() + break + except Exception: + pass + await asyncio.sleep(0.5) + except Exception as e: + if self.logger: + self.logger.warning(f"Could not verify browser: {str(e)}", tag="BUILTIN") + + # Save browser info + browser_info = { + 'pid': process.pid, + 'cdp_url': cdp_url, + 'user_data_dir': user_data_dir, + 'browser_type': browser_type, + 'debugging_port': debugging_port, + 'start_time': time.time(), + 'config': config_json + } + + with open(self.builtin_config_file, 'w') as f: + json.dump(browser_info, f, indent=2) + + # Detach from the browser process - don't keep any references + # This is important to allow the Python script to exit while the browser continues running + process = None + + if self.logger: + self.logger.success(f"Built-in browser launched at CDP URL: {cdp_url}", tag="BUILTIN") + return cdp_url + + except Exception as e: + if self.logger: + self.logger.error(f"Error launching built-in browser: {str(e)}", tag="BUILTIN") + return None + + async def kill_builtin_browser(self) -> bool: + """Kill the built-in browser if it's running. 
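+
+        Example:
+            Illustrative only:
+
+                if await strategy.kill_builtin_browser():
+                    print("Built-in browser terminated")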
+
+        Returns:
+            bool: True if the browser was killed, False otherwise
+        """
+        browser_info = self.get_builtin_browser_info()
+        if not browser_info:
+            if self.logger:
+                self.logger.warning("No built-in browser found", tag="BUILTIN")
+            return False
+
+        pid = browser_info.get('pid')
+        if not pid:
+            return False
+
+        try:
+            if is_windows():
+                subprocess.run(["taskkill", "/F", "/PID", str(pid)], check=True)
+            else:
+                os.kill(pid, signal.SIGTERM)
+                # Wait for termination
+                for _ in range(5):
+                    if not self._is_browser_running(pid):
+                        break
+                    await asyncio.sleep(0.5)
+                else:
+                    # Force kill if still running
+                    os.kill(pid, signal.SIGKILL)
+
+            # Remove config file
+            if os.path.exists(self.builtin_config_file):
+                os.unlink(self.builtin_config_file)
+
+            if self.logger:
+                self.logger.success("Built-in browser terminated", tag="BUILTIN")
+            return True
+        except Exception as e:
+            if self.logger:
+                self.logger.error(f"Error killing built-in browser: {str(e)}", tag="BUILTIN")
+            return False
+
+    async def get_builtin_browser_status(self) -> Dict[str, Any]:
+        """Get status information about the built-in browser.
+
+        Returns:
+            dict: Status information with running, cdp_url, and info fields
+        """
+        browser_info = self.get_builtin_browser_info()
+
+        if not browser_info:
+            return {
+                'running': False,
+                'cdp_url': None,
+                'info': None
+            }
+
+        return {
+            'running': True,
+            'cdp_url': browser_info.get('cdp_url'),
+            'info': browser_info
+        }
diff --git a/crawl4ai/browser/utils.py b/crawl4ai/browser/utils.py
new file mode 100644
index 00000000..2dff0924
--- /dev/null
+++ b/crawl4ai/browser/utils.py
@@ -0,0 +1,105 @@
+"""Browser utilities module for Crawl4AI.
+
+This module provides utility functions for browser management,
+including process management, CDP connection utilities,
+and Playwright instance management.
+"""
+
+import asyncio
+import os
+import sys
+import platform
+import tempfile
+from typing import Optional, Any
+
+from playwright.async_api import async_playwright
+
+from ..async_logger import AsyncLogger
+from ..utils import get_chromium_path
+
+_playwright_instance = None
+
+async def get_playwright():
+    """Get or create the Playwright instance (singleton pattern).
+
+    Returns:
+        Playwright: The Playwright instance
+    """
+    global _playwright_instance
+    if _playwright_instance is None:
+        _playwright_instance = await async_playwright().start()
+    return _playwright_instance
+
+def get_browser_executable(browser_type: str) -> str:
+    """Get the path to browser executable, with platform-specific handling.
+
+    Args:
+        browser_type: Type of browser (chromium, firefox, webkit)
+
+    Returns:
+        Path to browser executable
+    """
+    return get_chromium_path(browser_type)
+
+def create_temp_directory(prefix="browser-profile-") -> str:
+    """Create a temporary directory for browser data.
+
+    Args:
+        prefix: Prefix for the temporary directory name
+
+    Returns:
+        Path to the created temporary directory
+    """
+    return tempfile.mkdtemp(prefix=prefix)
+
+def is_windows() -> bool:
+    """Check if the current platform is Windows.
+
+    Returns:
+        True if Windows, False otherwise
+    """
+    return sys.platform == "win32"
+
+def is_macos() -> bool:
+    """Check if the current platform is macOS.
+
+    Returns:
+        True if macOS, False otherwise
+    """
+    return sys.platform == "darwin"
+
+def is_linux() -> bool:
+    """Check if the current platform is Linux.
+
+    Returns:
+        True if Linux, False otherwise
+    """
+    return sys.platform.startswith("linux")
+
+def get_browser_disable_options() -> list:
+    """Get standard list of browser disable options for performance.
+
+    Returns:
+        List of command-line options to disable various browser features
+    """
+    return [
+        "--disable-background-networking",
+        "--disable-background-timer-throttling",
+        "--disable-backgrounding-occluded-windows",
+        "--disable-breakpad",
+        "--disable-client-side-phishing-detection",
+        "--disable-component-extensions-with-background-pages",
+        "--disable-default-apps",
+        "--disable-extensions",
+        "--disable-features=TranslateUI",
+        "--disable-hang-monitor",
+        "--disable-ipc-flooding-protection",
+        "--disable-popup-blocking",
+        "--disable-prompt-on-repost",
+        "--disable-sync",
+        "--force-color-profile=srgb",
+        "--metrics-recording-only",
+        "--no-first-run",
+        "--password-store=basic",
+        "--use-mock-keychain",
+    ]
diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py
index f40efbbc..df0886c7 100644
--- a/crawl4ai/browser_manager.py
+++ b/crawl4ai/browser_manager.py
@@ -163,6 +163,7 @@ class ManagedBrowser:
             )
 
             # We'll monitor for a short time to make sure it starts properly, but won't keep monitoring
+            await asyncio.sleep(0.5)  # Brief pause so the browser process can spawn before the check
             await self._initial_startup_check()
             await asyncio.sleep(2)  # Give browser time to start
             return f"http://{self.host}:{self.debugging_port}"
diff --git a/crawl4ai/browser_profiler.py b/crawl4ai/browser_profiler.py
index 1fd76ddc..2291faa2 100644
--- a/crawl4ai/browser_profiler.py
+++ b/crawl4ai/browser_profiler.py
@@ -555,7 +555,6 @@ class BrowserProfiler:
             else:
                 self.logger.error(f"Invalid choice. Please enter a number between 1 and {exit_option}.", tag="MENU")
 
-    
     async def launch_standalone_browser(self, 
                                       browser_type: str = "chromium", 
                                       user_data_dir: Optional[str] = None,
diff --git a/docs/examples/hello_world.py b/docs/examples/hello_world.py
index c44908d5..fbdd5283 100644
--- a/docs/examples/hello_world.py
+++ b/docs/examples/hello_world.py
@@ -9,6 +9,26 @@ from crawl4ai import (
     CrawlResult
 )
 
+async def example_cdp():
+    browser_conf = BrowserConfig(
+        headless=False,
+        cdp_url="http://localhost:9223"
+    )
+    crawler_config = CrawlerRunConfig(
+        session_id="test",
+        js_code="""(() => { return {"result": "Hello World!"} })()""",
+        js_only=True
+    )
+    async with AsyncWebCrawler(
+        config=browser_conf,
+        verbose=True,
+    ) as crawler:
+        result: CrawlResult = await crawler.arun(
+            url="https://www.helloworld.org",
+            config=crawler_config,
+        )
+        print(result.js_execution_result)
+
 async def main():
     browser_config = BrowserConfig(headless=True, verbose=True)
 
@@ -16,18 +36,15 @@
     crawler_config = CrawlerRunConfig(
         cache_mode=CacheMode.BYPASS,
         markdown_generator=DefaultMarkdownGenerator(
-            # content_filter=PruningContentFilter(
-            #     threshold=0.48, threshold_type="fixed", min_word_threshold=0
-            # )
+            content_filter=PruningContentFilter(
+                threshold=0.48, threshold_type="fixed", min_word_threshold=0
+            )
         ),
     )
     result : CrawlResult = await crawler.arun(
-        # url="https://www.helloworld.org", config=crawler_config
-        url="https://www.kidocode.com", config=crawler_config
+        url="https://www.helloworld.org", config=crawler_config
     )
     print(result.markdown.raw_markdown[:500])
-    # print(result.model_dump())
-
 if __name__ == "__main__":
     asyncio.run(main())
diff --git a/tests/browser/test_browser_manager.py b/tests/browser/test_browser_manager.py
new file mode 100644
index 00000000..2293b90d
--- /dev/null
+++ 
b/tests/browser/test_browser_manager.py @@ -0,0 +1,190 @@ +"""Test examples for BrowserManager. + +These examples demonstrate the functionality of BrowserManager +and serve as functional tests. +""" + +import asyncio +import os +import sys +from typing import List + +# Add the project root to Python path if running directly +if __name__ == "__main__": + sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) + +from crawl4ai.browser import BrowserManager +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig +from crawl4ai.async_logger import AsyncLogger + +# Create a logger for clear terminal output +logger = AsyncLogger(verbose=True, log_file=None) + +async def test_basic_browser_manager(): + """Test basic BrowserManager functionality with default configuration.""" + logger.info("Starting test_basic_browser_manager", tag="TEST") + + try: + # Create a browser manager with default config + manager = BrowserManager(logger=logger) + + # Start the browser + await manager.start() + logger.info("Browser started successfully", tag="TEST") + + # Get a page + crawler_config = CrawlerRunConfig(url="https://example.com") + page, context = await manager.get_page(crawler_config) + logger.info("Page created successfully", tag="TEST") + + # Navigate to a website + await page.goto("https://example.com") + title = await page.title() + logger.info(f"Page title: {title}", tag="TEST") + + # Clean up + await manager.close() + logger.success("test_basic_browser_manager completed successfully", tag="TEST") + return True + except Exception as e: + logger.error(f"test_basic_browser_manager failed: {str(e)}", tag="TEST") + return False + +async def test_custom_browser_config(): + """Test BrowserManager with custom browser configuration.""" + logger.info("Starting test_custom_browser_config", tag="TEST") + + try: + # Create a custom browser config + browser_config = BrowserConfig( + browser_type="chromium", + headless=True, + viewport_width=1280, + viewport_height=800, + light_mode=True + ) + + # Create browser manager with the config + manager = BrowserManager(browser_config=browser_config, logger=logger) + + # Start the browser + await manager.start() + logger.info("Browser started successfully with custom config", tag="TEST") + + # Get a page + crawler_config = CrawlerRunConfig(url="https://example.com") + page, context = await manager.get_page(crawler_config) + + # Navigate to a website + await page.goto("https://example.com") + title = await page.title() + logger.info(f"Page title: {title}", tag="TEST") + + # Verify viewport size + viewport_size = await page.evaluate("() => ({ width: window.innerWidth, height: window.innerHeight })") + logger.info(f"Viewport size: {viewport_size}", tag="TEST") + + # Clean up + await manager.close() + logger.success("test_custom_browser_config completed successfully", tag="TEST") + return True + except Exception as e: + logger.error(f"test_custom_browser_config failed: {str(e)}", tag="TEST") + return False + +async def test_multiple_pages(): + """Test BrowserManager with multiple pages.""" + logger.info("Starting test_multiple_pages", tag="TEST") + + try: + # Create browser manager + manager = BrowserManager(logger=logger) + + # Start the browser + await manager.start() + logger.info("Browser started successfully", tag="TEST") + + # Create multiple pages + pages = [] + urls = ["https://example.com", "https://example.org", "https://mozilla.org"] + + for i, url in enumerate(urls): + crawler_config = CrawlerRunConfig(url=url) + page, 
context = await manager.get_page(crawler_config)
+            await page.goto(url)
+            pages.append((page, url))
+            logger.info(f"Created page {i+1} for {url}", tag="TEST")
+
+        # Verify all pages are loaded correctly
+        for i, (page, url) in enumerate(pages):
+            title = await page.title()
+            logger.info(f"Page {i+1} title: {title}", tag="TEST")
+
+        # Clean up
+        await manager.close()
+        logger.success("test_multiple_pages completed successfully", tag="TEST")
+        return True
+    except Exception as e:
+        logger.error(f"test_multiple_pages failed: {str(e)}", tag="TEST")
+        return False
+
+async def test_session_management():
+    """Test session management in BrowserManager."""
+    logger.info("Starting test_session_management", tag="TEST")
+
+    try:
+        # Create browser manager
+        manager = BrowserManager(logger=logger)
+
+        # Start the browser
+        await manager.start()
+        logger.info("Browser started successfully", tag="TEST")
+
+        # Create a session
+        session_id = "test_session_1"
+        crawler_config = CrawlerRunConfig(url="https://example.com", session_id=session_id)
+        page1, context1 = await manager.get_page(crawler_config)
+        await page1.goto("https://example.com")
+        logger.info(f"Created session with ID: {session_id}", tag="TEST")
+
+        # Get the same session again
+        page2, context2 = await manager.get_page(crawler_config)
+
+        # Verify it's the same page/context
+        is_same_page = page1 == page2
+        is_same_context = context1 == context2
+        logger.info(f"Same page: {is_same_page}, Same context: {is_same_context}", tag="TEST")
+
+        # Kill the session
+        await manager.kill_session(session_id)
+        logger.info(f"Killed session with ID: {session_id}", tag="TEST")
+
+        # Clean up
+        await manager.close()
+        logger.success("test_session_management completed successfully", tag="TEST")
+        return True
+    except Exception as e:
+        logger.error(f"test_session_management failed: {str(e)}", tag="TEST")
+        return False
+
+async def run_tests():
+    """Run all tests sequentially."""
+    results = []
+
+    results.append(await test_basic_browser_manager())
+    results.append(await test_custom_browser_config())
+    results.append(await test_multiple_pages())
+    results.append(await test_session_management())
+
+    # Print summary
+    total = len(results)
+    passed = sum(results)
+    logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
+
+    if passed == total:
+        logger.success("All tests passed!", tag="SUMMARY")
+    else:
+        logger.error(f"{total - passed} tests failed", tag="SUMMARY")
+
+if __name__ == "__main__":
+    asyncio.run(run_tests())
diff --git a/tests/browser/test_builtin_strategy.py b/tests/browser/test_builtin_strategy.py
new file mode 100644
index 00000000..7c435b3d
--- /dev/null
+++ b/tests/browser/test_builtin_strategy.py
@@ -0,0 +1,160 @@
+"""Test examples for BuiltinBrowserStrategy.
+
+These examples demonstrate the functionality of BuiltinBrowserStrategy
+and serve as functional tests.
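+
+Run directly with "python tests/browser/test_builtin_strategy.py". The
+builtin browser is expected to survive manager.close(), so these tests
+verify reconnecting from a fresh session as well as explicit cleanup via
+kill_builtin_browser().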
+""" + +import asyncio +import os +import sys + +# Add the project root to Python path if running directly +if __name__ == "__main__": + sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) + +from crawl4ai.browser import BrowserManager +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig +from crawl4ai.async_logger import AsyncLogger + +# Create a logger for clear terminal output +logger = AsyncLogger(verbose=True, log_file=None) + +async def test_builtin_browser(): + """Test using a builtin browser that persists between sessions.""" + logger.info("Testing builtin browser", tag="TEST") + + browser_config = BrowserConfig( + browser_mode="builtin", + headless=True + ) + + manager = BrowserManager(browser_config=browser_config, logger=logger) + + try: + # Start should connect to existing builtin browser or create one + await manager.start() + logger.info("Connected to builtin browser", tag="TEST") + + # Test page creation + crawler_config = CrawlerRunConfig() + page, context = await manager.get_page(crawler_config) + + # Test navigation + await page.goto("https://example.com") + title = await page.title() + logger.info(f"Page title: {title}", tag="TEST") + + # Close manager (should not close the builtin browser) + await manager.close() + logger.info("First session closed", tag="TEST") + + # Create a second manager to verify browser persistence + logger.info("Creating second session to verify persistence", tag="TEST") + manager2 = BrowserManager(browser_config=browser_config, logger=logger) + + await manager2.start() + logger.info("Connected to existing builtin browser", tag="TEST") + + page2, context2 = await manager2.get_page(crawler_config) + await page2.goto("https://example.org") + title2 = await page2.title() + logger.info(f"Second session page title: {title2}", tag="TEST") + + await manager2.close() + logger.info("Second session closed successfully", tag="TEST") + + return True + except Exception as e: + logger.error(f"Test failed: {str(e)}", tag="TEST") + try: + await manager.close() + except: + pass + return False + +async def test_builtin_browser_status(): + """Test getting status of the builtin browser.""" + logger.info("Testing builtin browser status", tag="TEST") + + from crawl4ai.browser.strategies import BuiltinBrowserStrategy + + browser_config = BrowserConfig( + browser_mode="builtin", + headless=True + ) + + # Create strategy directly to access its status methods + strategy = BuiltinBrowserStrategy(browser_config, logger) + + try: + # Get status before starting (should be not running) + status_before = await strategy.get_builtin_browser_status() + logger.info(f"Initial status: {status_before}", tag="TEST") + + # Start the browser + await strategy.start() + logger.info("Browser started successfully", tag="TEST") + + # Get status after starting + status_after = await strategy.get_builtin_browser_status() + logger.info(f"Status after start: {status_after}", tag="TEST") + + # Create a page to verify functionality + crawler_config = CrawlerRunConfig() + page, context = await strategy.get_page(crawler_config) + await page.goto("https://example.com") + title = await page.title() + logger.info(f"Page title: {title}", tag="TEST") + + # Close strategy (should not kill the builtin browser) + await strategy.close() + logger.info("Strategy closed successfully", tag="TEST") + + # Create a new strategy object + strategy2 = BuiltinBrowserStrategy(browser_config, logger) + + # Get status again (should still be running) + status_final = await 
strategy2.get_builtin_browser_status() + logger.info(f"Final status: {status_final}", tag="TEST") + + # Verify that the status shows the browser is running + is_running = status_final.get('running', False) + logger.info(f"Builtin browser persistence confirmed: {is_running}", tag="TEST") + + # Kill the builtin browser to clean up + logger.info("Killing builtin browser", tag="TEST") + success = await strategy2.kill_builtin_browser() + logger.info(f"Killed builtin browser successfully: {success}", tag="TEST") + + return is_running and success + except Exception as e: + logger.error(f"Test failed: {str(e)}", tag="TEST") + try: + await strategy.close() + + # Try to kill the builtin browser to clean up + strategy2 = BuiltinBrowserStrategy(browser_config, logger) + await strategy2.kill_builtin_browser() + except: + pass + return False + +async def run_tests(): + """Run all tests sequentially.""" + results = [] + + results.append(await test_builtin_browser()) + results.append(await test_builtin_browser_status()) + + # Print summary + total = len(results) + passed = sum(results) + logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY") + + if passed == total: + logger.success("All tests passed!", tag="SUMMARY") + else: + logger.error(f"{total - passed} tests failed", tag="SUMMARY") + +if __name__ == "__main__": + asyncio.run(run_tests()) diff --git a/tests/browser/test_cdp_strategy.py b/tests/browser/test_cdp_strategy.py new file mode 100644 index 00000000..4ec1f7f1 --- /dev/null +++ b/tests/browser/test_cdp_strategy.py @@ -0,0 +1,227 @@ +"""Test examples for CDPBrowserStrategy. + +These examples demonstrate the functionality of CDPBrowserStrategy +and serve as functional tests. +""" + +import asyncio +import os +import sys + +# Add the project root to Python path if running directly +if __name__ == "__main__": + sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) + +from crawl4ai.browser import BrowserManager +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig +from crawl4ai.async_logger import AsyncLogger + +# Create a logger for clear terminal output +logger = AsyncLogger(verbose=True, log_file=None) + +async def test_cdp_launch_connect(): + """Test launching a browser and connecting via CDP.""" + logger.info("Testing launch and connect via CDP", tag="TEST") + + browser_config = BrowserConfig( + use_managed_browser=True, + headless=True + ) + + manager = BrowserManager(browser_config=browser_config, logger=logger) + + try: + await manager.start() + logger.info("Browser launched and connected via CDP", tag="TEST") + + # Test with multiple pages + pages = [] + for i in range(3): + crawler_config = CrawlerRunConfig() + page, context = await manager.get_page(crawler_config) + await page.goto(f"https://example.com?test={i}") + pages.append(page) + logger.info(f"Created page {i+1}", tag="TEST") + + # Verify all pages are working + for i, page in enumerate(pages): + title = await page.title() + logger.info(f"Page {i+1} title: {title}", tag="TEST") + + await manager.close() + logger.info("Browser closed successfully", tag="TEST") + + return True + except Exception as e: + logger.error(f"Test failed: {str(e)}", tag="TEST") + try: + await manager.close() + except: + pass + return False + +async def test_cdp_with_user_data_dir(): + """Test CDP browser with a user data directory.""" + logger.info("Testing CDP browser with user data directory", tag="TEST") + + # Create a temporary user data directory + import tempfile + user_data_dir = 
tempfile.mkdtemp(prefix="crawl4ai-test-") + logger.info(f"Created temporary user data directory: {user_data_dir}", tag="TEST") + + browser_config = BrowserConfig( + use_managed_browser=True, + headless=True, + user_data_dir=user_data_dir + ) + + manager = BrowserManager(browser_config=browser_config, logger=logger) + + try: + await manager.start() + logger.info("Browser launched with user data directory", tag="TEST") + + # Navigate to a page and store some data + crawler_config = CrawlerRunConfig() + page, context = await manager.get_page(crawler_config) + + # Set a cookie + await context.add_cookies([{ + "name": "test_cookie", + "value": "test_value", + "url": "https://example.com" + }]) + + # Visit the site + await page.goto("https://example.com") + + # Verify cookie was set + cookies = await context.cookies(["https://example.com"]) + has_test_cookie = any(cookie["name"] == "test_cookie" for cookie in cookies) + logger.info(f"Cookie set successfully: {has_test_cookie}", tag="TEST") + + # Close the browser + await manager.close() + logger.info("First browser session closed", tag="TEST") + + # Start a new browser with the same user data directory + logger.info("Starting second browser session with same user data directory", tag="TEST") + manager2 = BrowserManager(browser_config=browser_config, logger=logger) + await manager2.start() + + # Get a new page and check if the cookie persists + page2, context2 = await manager2.get_page(crawler_config) + await page2.goto("https://example.com") + + # Verify cookie persisted + cookies2 = await context2.cookies(["https://example.com"]) + has_test_cookie2 = any(cookie["name"] == "test_cookie" for cookie in cookies2) + logger.info(f"Cookie persisted across sessions: {has_test_cookie2}", tag="TEST") + + # Clean up + await manager2.close() + + # Remove temporary directory + import shutil + shutil.rmtree(user_data_dir, ignore_errors=True) + logger.info(f"Removed temporary user data directory", tag="TEST") + + return has_test_cookie and has_test_cookie2 + except Exception as e: + logger.error(f"Test failed: {str(e)}", tag="TEST") + try: + await manager.close() + except: + pass + + # Clean up temporary directory + try: + import shutil + shutil.rmtree(user_data_dir, ignore_errors=True) + except: + pass + + return False + +async def test_cdp_session_management(): + """Test session management with CDP browser.""" + logger.info("Testing session management with CDP browser", tag="TEST") + + browser_config = BrowserConfig( + use_managed_browser=True, + headless=True + ) + + manager = BrowserManager(browser_config=browser_config, logger=logger) + + try: + await manager.start() + logger.info("Browser launched successfully", tag="TEST") + + # Create two sessions + session1_id = "test_session_1" + session2_id = "test_session_2" + + # Set up first session + crawler_config1 = CrawlerRunConfig(session_id=session1_id) + page1, context1 = await manager.get_page(crawler_config1) + await page1.goto("https://example.com") + await page1.evaluate("localStorage.setItem('session1_data', 'test_value')") + logger.info(f"Set up session 1 with ID: {session1_id}", tag="TEST") + + # Set up second session + crawler_config2 = CrawlerRunConfig(session_id=session2_id) + page2, context2 = await manager.get_page(crawler_config2) + await page2.goto("https://example.org") + await page2.evaluate("localStorage.setItem('session2_data', 'test_value2')") + logger.info(f"Set up session 2 with ID: {session2_id}", tag="TEST") + + # Get first session again + page1_again, _ = await 
manager.get_page(crawler_config1) + + # Verify it's the same page and data persists + is_same_page = page1 == page1_again + data1 = await page1_again.evaluate("localStorage.getItem('session1_data')") + logger.info(f"Session 1 reuse successful: {is_same_page}, data: {data1}", tag="TEST") + + # Kill first session + await manager.kill_session(session1_id) + logger.info(f"Killed session 1", tag="TEST") + + # Verify second session still works + data2 = await page2.evaluate("localStorage.getItem('session2_data')") + logger.info(f"Session 2 still functional after killing session 1, data: {data2}", tag="TEST") + + # Clean up + await manager.close() + logger.info("Browser closed successfully", tag="TEST") + + return is_same_page and data1 == "test_value" and data2 == "test_value2" + except Exception as e: + logger.error(f"Test failed: {str(e)}", tag="TEST") + try: + await manager.close() + except: + pass + return False + +async def run_tests(): + """Run all tests sequentially.""" + results = [] + + results.append(await test_cdp_launch_connect()) + results.append(await test_cdp_with_user_data_dir()) + results.append(await test_cdp_session_management()) + + # Print summary + total = len(results) + passed = sum(results) + logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY") + + if passed == total: + logger.success("All tests passed!", tag="SUMMARY") + else: + logger.error(f"{total - passed} tests failed", tag="SUMMARY") + +if __name__ == "__main__": + asyncio.run(run_tests()) diff --git a/tests/browser/test_combined.py b/tests/browser/test_combined.py new file mode 100644 index 00000000..b5bce3cd --- /dev/null +++ b/tests/browser/test_combined.py @@ -0,0 +1,77 @@ +"""Combined test runner for all browser module tests. + +This script runs all the browser module tests in sequence and +provides a comprehensive summary. 
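+
+Each module is imported dynamically and its run_tests() coroutine is
+awaited in turn; per-module wall-clock timings are collected and printed
+in the final summary.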
+""" + +import asyncio +import os +import sys +import time + +# Add the project root to Python path if running directly +if __name__ == "__main__": + sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) + +from crawl4ai.async_logger import AsyncLogger + +# Create a logger for clear terminal output +logger = AsyncLogger(verbose=True, log_file=None) + +async def run_test_module(module_name, header): + """Run all tests in a module and return results.""" + logger.info(f"\n{'-'*30}", tag="TEST") + logger.info(f"RUNNING: {header}", tag="TEST") + logger.info(f"{'-'*30}", tag="TEST") + + # Import the module dynamically + module = __import__(f"tests.browser.{module_name}", fromlist=["run_tests"]) + + # Track time for performance measurement + start_time = time.time() + + # Run the tests + await module.run_tests() + + # Calculate time taken + time_taken = time.time() - start_time + logger.info(f"Time taken: {time_taken:.2f} seconds", tag="TIMING") + + return time_taken + +async def main(): + """Run all test modules.""" + logger.info("STARTING COMPREHENSIVE BROWSER MODULE TESTS", tag="MAIN") + + # List of test modules to run + test_modules = [ + ("test_browser_manager", "Browser Manager Tests"), + ("test_playwright_strategy", "Playwright Strategy Tests"), + ("test_cdp_strategy", "CDP Strategy Tests"), + ("test_builtin_strategy", "Builtin Browser Strategy Tests"), + ("test_profiles", "Profile Management Tests") + ] + + # Run each test module + timings = {} + for module_name, header in test_modules: + try: + time_taken = await run_test_module(module_name, header) + timings[module_name] = time_taken + except Exception as e: + logger.error(f"Error running {module_name}: {str(e)}", tag="ERROR") + + # Print summary + logger.info("\n\nTEST SUMMARY:", tag="SUMMARY") + logger.info(f"{'-'*50}", tag="SUMMARY") + for module_name, header in test_modules: + if module_name in timings: + logger.info(f"{header}: {timings[module_name]:.2f} seconds", tag="SUMMARY") + else: + logger.error(f"{header}: FAILED TO RUN", tag="SUMMARY") + logger.info(f"{'-'*50}", tag="SUMMARY") + total_time = sum(timings.values()) + logger.info(f"Total time: {total_time:.2f} seconds", tag="SUMMARY") + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/tests/browser/test_playwright_strategy.py b/tests/browser/test_playwright_strategy.py new file mode 100644 index 00000000..1d897bcf --- /dev/null +++ b/tests/browser/test_playwright_strategy.py @@ -0,0 +1,275 @@ +"""Test examples for PlaywrightBrowserStrategy. + +These examples demonstrate the functionality of PlaywrightBrowserStrategy +and serve as functional tests. 
+""" + +import asyncio +import os +import sys + +# Add the project root to Python path if running directly +if __name__ == "__main__": + sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) + +from crawl4ai.browser import BrowserManager +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig +from crawl4ai.async_logger import AsyncLogger + +# Create a logger for clear terminal output +logger = AsyncLogger(verbose=True, log_file=None) + +async def test_playwright_basic(): + """Test basic Playwright browser functionality.""" + logger.info("Testing standard Playwright browser", tag="TEST") + + # Create browser config for standard Playwright + browser_config = BrowserConfig( + headless=True, + viewport_width=1280, + viewport_height=800 + ) + + # Create browser manager with the config + manager = BrowserManager(browser_config=browser_config, logger=logger) + + try: + # Start the browser + await manager.start() + logger.info("Browser started successfully", tag="TEST") + + # Create crawler config + crawler_config = CrawlerRunConfig(url="https://example.com") + + # Get a page + page, context = await manager.get_page(crawler_config) + logger.info("Got page successfully", tag="TEST") + + # Navigate to a website + await page.goto("https://example.com") + logger.info("Navigated to example.com", tag="TEST") + + # Get page title + title = await page.title() + logger.info(f"Page title: {title}", tag="TEST") + + # Clean up + await manager.close() + logger.info("Browser closed successfully", tag="TEST") + + return True + except Exception as e: + logger.error(f"Test failed: {str(e)}", tag="TEST") + # Ensure cleanup + try: + await manager.close() + except: + pass + return False + +async def test_playwright_text_mode(): + """Test Playwright browser in text-only mode.""" + logger.info("Testing Playwright text mode", tag="TEST") + + # Create browser config with text mode enabled + browser_config = BrowserConfig( + headless=True, + text_mode=True # Enable text-only mode + ) + + # Create browser manager with the config + manager = BrowserManager(browser_config=browser_config, logger=logger) + + try: + # Start the browser + await manager.start() + logger.info("Browser started successfully in text mode", tag="TEST") + + # Get a page + crawler_config = CrawlerRunConfig(url="https://example.com") + page, context = await manager.get_page(crawler_config) + + # Navigate to a website + await page.goto("https://example.com") + logger.info("Navigated to example.com", tag="TEST") + + # Get page title + title = await page.title() + logger.info(f"Page title: {title}", tag="TEST") + + # Check if images are blocked in text mode + # We'll check if any image requests were made + has_images = False + async with page.expect_request("**/*.{png,jpg,jpeg,gif,webp,svg}", timeout=1000) as request_info: + try: + # Try to load a page with images + await page.goto("https://picsum.photos/", wait_until="domcontentloaded") + request = await request_info.value + has_images = True + except: + # Timeout without image requests means text mode is working + has_images = False + + logger.info(f"Text mode image blocking working: {not has_images}", tag="TEST") + + # Clean up + await manager.close() + logger.info("Browser closed successfully", tag="TEST") + + return True + except Exception as e: + logger.error(f"Test failed: {str(e)}", tag="TEST") + # Ensure cleanup + try: + await manager.close() + except: + pass + return False + +async def test_playwright_context_reuse(): + """Test context caching and reuse 
with identical configurations.""" + logger.info("Testing context reuse with identical configurations", tag="TEST") + + # Create browser config + browser_config = BrowserConfig(headless=True) + + # Create browser manager + manager = BrowserManager(browser_config=browser_config, logger=logger) + + try: + # Start the browser + await manager.start() + logger.info("Browser started successfully", tag="TEST") + + # Create identical crawler configs + crawler_config1 = CrawlerRunConfig( + url="https://example.com", + viewport_width=1280, + viewport_height=800 + ) + + crawler_config2 = CrawlerRunConfig( + url="https://example.org", # Different URL but same browser parameters + viewport_width=1280, + viewport_height=800 + ) + + # Get pages with these configs + page1, context1 = await manager.get_page(crawler_config1) + page2, context2 = await manager.get_page(crawler_config2) + + # Check if contexts are reused + is_same_context = context1 == context2 + logger.info(f"Contexts reused: {is_same_context}", tag="TEST") + + # Now try with a different config + crawler_config3 = CrawlerRunConfig( + url="https://example.net", + viewport_width=800, # Different viewport size + viewport_height=600 + ) + + page3, context3 = await manager.get_page(crawler_config3) + + # This should be a different context + is_different_context = context1 != context3 + logger.info(f"Different contexts for different configs: {is_different_context}", tag="TEST") + + # Clean up + await manager.close() + logger.info("Browser closed successfully", tag="TEST") + + # Both tests should pass for success + return is_same_context and is_different_context + except Exception as e: + logger.error(f"Test failed: {str(e)}", tag="TEST") + # Ensure cleanup + try: + await manager.close() + except: + pass + return False + +async def test_playwright_session_management(): + """Test session management with Playwright browser.""" + logger.info("Testing session management with Playwright browser", tag="TEST") + + browser_config = BrowserConfig( + headless=True + ) + + manager = BrowserManager(browser_config=browser_config, logger=logger) + + try: + await manager.start() + logger.info("Browser launched successfully", tag="TEST") + + # Create two sessions + session1_id = "playwright_session_1" + session2_id = "playwright_session_2" + + # Set up first session + crawler_config1 = CrawlerRunConfig(session_id=session1_id, url="https://example.com") + page1, context1 = await manager.get_page(crawler_config1) + await page1.goto("https://example.com") + await page1.evaluate("localStorage.setItem('playwright_session1_data', 'test_value1')") + logger.info(f"Set up session 1 with ID: {session1_id}", tag="TEST") + + # Set up second session + crawler_config2 = CrawlerRunConfig(session_id=session2_id, url="https://example.org") + page2, context2 = await manager.get_page(crawler_config2) + await page2.goto("https://example.org") + await page2.evaluate("localStorage.setItem('playwright_session2_data', 'test_value2')") + logger.info(f"Set up session 2 with ID: {session2_id}", tag="TEST") + + # Get first session again + page1_again, context1_again = await manager.get_page(crawler_config1) + + # Verify it's the same page and data persists + is_same_page = page1 == page1_again + is_same_context = context1 == context1_again + data1 = await page1_again.evaluate("localStorage.getItem('playwright_session1_data')") + logger.info(f"Session 1 reuse successful: {is_same_page}, data: {data1}", tag="TEST") + + # Kill first session + await manager.kill_session(session1_id) + 
logger.info(f"Killed session 1", tag="TEST") + + # Verify second session still works + data2 = await page2.evaluate("localStorage.getItem('playwright_session2_data')") + logger.info(f"Session 2 still functional after killing session 1, data: {data2}", tag="TEST") + + # Clean up + await manager.close() + logger.info("Browser closed successfully", tag="TEST") + + return is_same_page and is_same_context and data1 == "test_value1" and data2 == "test_value2" + except Exception as e: + logger.error(f"Test failed: {str(e)}", tag="TEST") + try: + await manager.close() + except: + pass + return False + +async def run_tests(): + """Run all tests sequentially.""" + results = [] + + results.append(await test_playwright_basic()) + results.append(await test_playwright_text_mode()) + results.append(await test_playwright_context_reuse()) + results.append(await test_playwright_session_management()) + + # Print summary + total = len(results) + passed = sum(results) + logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY") + + if passed == total: + logger.success("All tests passed!", tag="SUMMARY") + else: + logger.error(f"{total - passed} tests failed", tag="SUMMARY") + +if __name__ == "__main__": + asyncio.run(run_tests()) diff --git a/tests/browser/test_profiles.py b/tests/browser/test_profiles.py new file mode 100644 index 00000000..8325b561 --- /dev/null +++ b/tests/browser/test_profiles.py @@ -0,0 +1,176 @@ +"""Test examples for BrowserProfileManager. + +These examples demonstrate the functionality of BrowserProfileManager +and serve as functional tests. +""" + +import asyncio +import os +import sys +import uuid +import shutil + +# Add the project root to Python path if running directly +if __name__ == "__main__": + sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) + +from crawl4ai.browser import BrowserManager, BrowserProfileManager +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig +from crawl4ai.async_logger import AsyncLogger + +# Create a logger for clear terminal output +logger = AsyncLogger(verbose=True, log_file=None) + +async def test_profile_creation(): + """Test creating and managing browser profiles.""" + logger.info("Testing profile creation and management", tag="TEST") + + profile_manager = BrowserProfileManager(logger=logger) + + try: + # List existing profiles + profiles = profile_manager.list_profiles() + logger.info(f"Found {len(profiles)} existing profiles", tag="TEST") + + # Generate a unique profile name for testing + test_profile_name = f"test-profile-{uuid.uuid4().hex[:8]}" + + # Create a test profile directory + profile_path = os.path.join(profile_manager.profiles_dir, test_profile_name) + os.makedirs(os.path.join(profile_path, "Default"), exist_ok=True) + + # Create a dummy Preferences file to simulate a Chrome profile + with open(os.path.join(profile_path, "Default", "Preferences"), "w") as f: + f.write("{\"test\": true}") + + logger.info(f"Created test profile at: {profile_path}", tag="TEST") + + # Verify the profile is now in the list + profiles = profile_manager.list_profiles() + profile_found = any(p["name"] == test_profile_name for p in profiles) + logger.info(f"Profile found in list: {profile_found}", tag="TEST") + + # Try to get the profile path + retrieved_path = profile_manager.get_profile_path(test_profile_name) + path_match = retrieved_path == profile_path + logger.info(f"Retrieved correct profile path: {path_match}", tag="TEST") + + # Delete the profile + success = 
profile_manager.delete_profile(test_profile_name) + logger.info(f"Profile deletion successful: {success}", tag="TEST") + + # Verify it's gone + profiles_after = profile_manager.list_profiles() + profile_removed = not any(p["name"] == test_profile_name for p in profiles_after) + logger.info(f"Profile removed from list: {profile_removed}", tag="TEST") + + # Clean up just in case + if os.path.exists(profile_path): + shutil.rmtree(profile_path, ignore_errors=True) + + return profile_found and path_match and success and profile_removed + except Exception as e: + logger.error(f"Test failed: {str(e)}", tag="TEST") + # Clean up test directory + try: + if os.path.exists(profile_path): + shutil.rmtree(profile_path, ignore_errors=True) + except: + pass + return False + +async def test_profile_with_browser(): + """Test using a profile with a browser.""" + logger.info("Testing using a profile with a browser", tag="TEST") + + profile_manager = BrowserProfileManager(logger=logger) + test_profile_name = f"test-browser-profile-{uuid.uuid4().hex[:8]}" + profile_path = None + + try: + # Create a test profile directory + profile_path = os.path.join(profile_manager.profiles_dir, test_profile_name) + os.makedirs(os.path.join(profile_path, "Default"), exist_ok=True) + + # Create a dummy Preferences file to simulate a Chrome profile + with open(os.path.join(profile_path, "Default", "Preferences"), "w") as f: + f.write("{\"test\": true}") + + logger.info(f"Created test profile at: {profile_path}", tag="TEST") + + # Now use this profile with a browser + browser_config = BrowserConfig( + user_data_dir=profile_path, + headless=True + ) + + manager = BrowserManager(browser_config=browser_config, logger=logger) + + # Start the browser with the profile + await manager.start() + logger.info("Browser started with profile", tag="TEST") + + # Create a page + crawler_config = CrawlerRunConfig() + page, context = await manager.get_page(crawler_config) + + # Navigate and set some data to verify profile works + await page.goto("https://example.com") + await page.evaluate("localStorage.setItem('test_data', 'profile_value')") + + # Close browser + await manager.close() + logger.info("First browser session closed", tag="TEST") + + # Create a new browser with the same profile + manager2 = BrowserManager(browser_config=browser_config, logger=logger) + await manager2.start() + logger.info("Second browser session started with same profile", tag="TEST") + + # Get a page and check if the data persists + page2, context2 = await manager2.get_page(crawler_config) + await page2.goto("https://example.com") + data = await page2.evaluate("localStorage.getItem('test_data')") + + # Verify data persisted + data_persisted = data == "profile_value" + logger.info(f"Data persisted across sessions: {data_persisted}", tag="TEST") + + # Clean up + await manager2.close() + logger.info("Second browser session closed", tag="TEST") + + # Delete the test profile + success = profile_manager.delete_profile(test_profile_name) + logger.info(f"Test profile deleted: {success}", tag="TEST") + + return data_persisted and success + except Exception as e: + logger.error(f"Test failed: {str(e)}", tag="TEST") + # Clean up + try: + if profile_path and os.path.exists(profile_path): + shutil.rmtree(profile_path, ignore_errors=True) + except: + pass + return False + +async def run_tests(): + """Run all tests sequentially.""" + results = [] + + results.append(await test_profile_creation()) + results.append(await test_profile_with_browser()) + + # Print summary + total = 
len(results) + passed = sum(results) + logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY") + + if passed == total: + logger.success("All tests passed!", tag="SUMMARY") + else: + logger.error(f"{total - passed} tests failed", tag="SUMMARY") + +if __name__ == "__main__": + asyncio.run(run_tests())