feat(browser): implement modular browser management system

Adds a new browser management system with strategy pattern implementation:
- Introduces BrowserManager class with strategy pattern support
- Adds PlaywrightBrowserStrategy, CDPBrowserStrategy, and BuiltinBrowserStrategy
- Implements BrowserProfileManager for profile management
- Adds PagePoolConfig for browser page pooling
- Includes comprehensive test suite for all browser strategies

BREAKING CHANGE: Browser management has been moved to browser/ module. Direct usage of browser_manager.py and browser_profiler.py is deprecated.
This commit is contained in:
UncleCode
2025-03-21 22:50:00 +08:00
parent 6432ff1257
commit 4ab0893ffb
16 changed files with 2964 additions and 8 deletions

View File

@@ -156,6 +156,41 @@ def is_empty_value(value: Any) -> bool:
return False
class PagePoolConfig:
"""Configuration for browser page pooling.
This class configures the page pooling mechanism that maintains pre-warmed
browser pages ready for immediate use, improving performance for scenarios
where multiple URLs need to be processed in sequence.
Attributes:
mode (str): Pooling mode - "static" or "adaptive".
"static" uses a fixed pool size defined by static_size.
"adaptive" calculates optimal size based on available system memory.
Default: "static".
static_size (int): Number of pages to maintain in the pool when mode is "static".
Default: 10.
memory_per_page (int): Estimated memory used by a single page in MB.
Used for "adaptive" mode calculations.
Default: 200.
memory_threshold (float): Maximum percentage of system memory to use in "adaptive" mode.
Default: 0.7 (70% of available memory).
timeout (float): Seconds to wait for a page from the pool before creating a new one.
Default: 5.0.
"""
def __init__(self,
mode="static",
static_size=10,
memory_per_page=200,
memory_threshold=0.7,
timeout=5.0):
self.mode = mode
self.static_size = static_size
self.memory_per_page = memory_per_page
self.memory_threshold = memory_threshold
self.timeout = timeout
class BrowserConfig:
"""
Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy.
@@ -220,6 +255,9 @@ class BrowserConfig:
light_mode (bool): Disables certain background features for performance gains. Default: False.
extra_args (list): Additional command-line arguments passed to the browser.
Default: [].
page_pool_config (PagePoolConfig or None): Configuration for page pooling mechanism.
If None, page pooling is disabled.
Default: None.
"""
def __init__(
@@ -260,6 +298,7 @@ class BrowserConfig:
extra_args: list = None,
debugging_port: int = 9222,
host: str = "localhost",
page_pool_config: Optional[PagePoolConfig] = None,
):
self.browser_type = browser_type
self.headless = headless
@@ -298,6 +337,7 @@ class BrowserConfig:
self.verbose = verbose
self.debugging_port = debugging_port
self.host = host
self.page_pool_config = page_pool_config
fa_user_agenr_generator = ValidUAGenerator()
if self.user_agent_mode == "random":
@@ -328,6 +368,12 @@ class BrowserConfig:
@staticmethod
def from_kwargs(kwargs: dict) -> "BrowserConfig":
# Handle page_pool_config
page_pool_config = kwargs.get("page_pool_config")
if isinstance(page_pool_config, dict):
# If it's a dict, convert to PagePoolConfig
page_pool_config = PagePoolConfig(**page_pool_config)
return BrowserConfig(
browser_type=kwargs.get("browser_type", "chromium"),
headless=kwargs.get("headless", True),
@@ -361,6 +407,7 @@ class BrowserConfig:
extra_args=kwargs.get("extra_args", []),
debugging_port=kwargs.get("debugging_port", 9222),
host=kwargs.get("host", "localhost"),
page_pool_config=page_pool_config,
)
def to_dict(self):
@@ -395,6 +442,7 @@ class BrowserConfig:
"verbose": self.verbose,
"debugging_port": self.debugging_port,
"host": self.host,
"page_pool_config": self.page_pool_config,
}
def clone(self, **kwargs):

View File

@@ -0,0 +1,10 @@
"""Browser management module for Crawl4AI.
This module provides browser management capabilities using different strategies
for browser creation and interaction.
"""
from .manager import BrowserManager
from .profiles import BrowserProfileManager
__all__ = ['BrowserManager', 'BrowserProfileManager']

165
crawl4ai/browser/manager.py Normal file
View File

@@ -0,0 +1,165 @@
"""Browser manager module for Crawl4AI.
This module provides a central browser management class that uses the
strategy pattern internally while maintaining the existing API.
"""
import asyncio
import time
from typing import Optional, Tuple, Dict, Any
from playwright.async_api import Page, BrowserContext
from ..async_logger import AsyncLogger
from ..async_configs import BrowserConfig, CrawlerRunConfig
from .strategies import (
BaseBrowserStrategy,
PlaywrightBrowserStrategy,
CDPBrowserStrategy,
BuiltinBrowserStrategy
)
class BrowserManager:
"""Main interface for browser management in Crawl4AI.
This class maintains backward compatibility with the existing implementation
while using the strategy pattern internally for different browser types.
Attributes:
config (BrowserConfig): Configuration object containing all browser settings
logger: Logger instance for recording events and errors
browser: The browser instance
default_context: The default browser context
managed_browser: The managed browser instance
playwright: The Playwright instance
sessions: Dictionary to store session information
session_ttl: Session timeout in seconds
"""
def __init__(self, browser_config: Optional[BrowserConfig] = None, logger: Optional[AsyncLogger] = None):
"""Initialize the BrowserManager with a browser configuration.
Args:
browser_config: Configuration object containing all browser settings
logger: Logger instance for recording events and errors
"""
self.config = browser_config or BrowserConfig()
self.logger = logger
# Create strategy based on configuration
self._strategy = self._create_strategy()
# Initialize state variables for compatibility with existing code
self.browser = None
self.default_context = None
self.managed_browser = None
self.playwright = None
# For session management (from existing implementation)
self.sessions = {}
self.session_ttl = 1800 # 30 minutes
def _create_strategy(self) -> BaseBrowserStrategy:
"""Create appropriate browser strategy based on configuration.
Returns:
BaseBrowserStrategy: The selected browser strategy
"""
if self.config.browser_mode == "builtin":
return BuiltinBrowserStrategy(self.config, self.logger)
elif self.config.cdp_url or self.config.use_managed_browser:
return CDPBrowserStrategy(self.config, self.logger)
else:
return PlaywrightBrowserStrategy(self.config, self.logger)
async def start(self):
"""Start the browser instance and set up the default context.
Returns:
self: For method chaining
"""
# Start the strategy
await self._strategy.start()
# Update legacy references
self.browser = self._strategy.browser
self.default_context = self._strategy.default_context
# Set browser process reference (for CDP strategy)
if hasattr(self._strategy, 'browser_process'):
self.managed_browser = self._strategy
# Set Playwright reference
self.playwright = self._strategy.playwright
# Sync sessions if needed
if hasattr(self._strategy, 'sessions'):
self.sessions = self._strategy.sessions
self.session_ttl = self._strategy.session_ttl
return self
async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]:
"""Get a page for the given configuration.
Args:
crawlerRunConfig: Configuration object for the crawler run
Returns:
Tuple of (Page, BrowserContext)
"""
# Delegate to strategy
page, context = await self._strategy.get_page(crawlerRunConfig)
# Sync sessions if needed
if hasattr(self._strategy, 'sessions'):
self.sessions = self._strategy.sessions
return page, context
async def kill_session(self, session_id: str):
"""Kill a browser session and clean up resources.
Args:
session_id: The session ID to kill
"""
# Handle kill_session via our strategy if it supports it
if hasattr(self._strategy, '_kill_session'):
await self._strategy._kill_session(session_id)
elif session_id in self.sessions:
context, page, _ = self.sessions[session_id]
await page.close()
# Only close context if not using CDP
if not self.config.use_managed_browser and not self.config.cdp_url and not self.config.browser_mode == "builtin":
await context.close()
del self.sessions[session_id]
def _cleanup_expired_sessions(self):
"""Clean up expired sessions based on TTL."""
# Use strategy's implementation if available
if hasattr(self._strategy, '_cleanup_expired_sessions'):
self._strategy._cleanup_expired_sessions()
return
# Otherwise use our own implementation
current_time = time.time()
expired_sessions = [
sid
for sid, (_, _, last_used) in self.sessions.items()
if current_time - last_used > self.session_ttl
]
for sid in expired_sessions:
asyncio.create_task(self.kill_session(sid))
async def close(self):
"""Close the browser and clean up resources."""
# Delegate to strategy
await self._strategy.close()
# Reset legacy references
self.browser = None
self.default_context = None
self.managed_browser = None
self.playwright = None
self.sessions = {}

View File

View File

@@ -0,0 +1,458 @@
"""Browser profile management module for Crawl4AI.
This module provides functionality for creating and managing browser profiles
that can be used for authenticated browsing.
"""
import os
import asyncio
import signal
import sys
import datetime
import uuid
import shutil
from typing import List, Dict, Optional, Any
from colorama import Fore, Style, init
from ..async_configs import BrowserConfig
from ..async_logger import AsyncLogger, AsyncLoggerBase
from ..utils import get_home_folder
from .strategies import is_windows
class BrowserProfileManager:
"""Manages browser profiles for Crawl4AI.
This class provides functionality to create and manage browser profiles
that can be used for authenticated browsing with Crawl4AI.
Profiles are stored by default in ~/.crawl4ai/profiles/
"""
def __init__(self, logger: Optional[AsyncLoggerBase] = None):
"""Initialize the BrowserProfileManager.
Args:
logger: Logger for outputting messages. If None, a default AsyncLogger is created.
"""
# Initialize colorama for colorful terminal output
init()
# Create a logger if not provided
if logger is None:
self.logger = AsyncLogger(verbose=True)
elif not isinstance(logger, AsyncLoggerBase):
self.logger = AsyncLogger(verbose=True)
else:
self.logger = logger
# Ensure profiles directory exists
self.profiles_dir = os.path.join(get_home_folder(), "profiles")
os.makedirs(self.profiles_dir, exist_ok=True)
async def create_profile(self,
profile_name: Optional[str] = None,
browser_config: Optional[BrowserConfig] = None) -> Optional[str]:
"""Create a browser profile interactively.
Args:
profile_name: Name for the profile. If None, a name is generated.
browser_config: Configuration for the browser. If None, a default configuration is used.
Returns:
Path to the created profile directory, or None if creation failed
"""
# Create default browser config if none provided
if browser_config is None:
browser_config = BrowserConfig(
browser_type="chromium",
headless=False, # Must be visible for user interaction
verbose=True
)
else:
# Ensure headless is False for user interaction
browser_config.headless = False
# Generate profile name if not provided
if not profile_name:
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
profile_name = f"profile_{timestamp}_{uuid.uuid4().hex[:6]}"
# Sanitize profile name (replace spaces and special chars)
profile_name = "".join(c if c.isalnum() or c in "-_" else "_" for c in profile_name)
# Set user data directory
profile_path = os.path.join(self.profiles_dir, profile_name)
os.makedirs(profile_path, exist_ok=True)
# Print instructions for the user with colorama formatting
border = f"{Fore.CYAN}{'='*80}{Style.RESET_ALL}"
self.logger.info(f"\n{border}", tag="PROFILE")
self.logger.info(f"Creating browser profile: {Fore.GREEN}{profile_name}{Style.RESET_ALL}", tag="PROFILE")
self.logger.info(f"Profile directory: {Fore.YELLOW}{profile_path}{Style.RESET_ALL}", tag="PROFILE")
self.logger.info("\nInstructions:", tag="PROFILE")
self.logger.info("1. A browser window will open for you to set up your profile.", tag="PROFILE")
self.logger.info(f"2. {Fore.CYAN}Log in to websites{Style.RESET_ALL}, configure settings, etc. as needed.", tag="PROFILE")
self.logger.info(f"3. When you're done, {Fore.YELLOW}press 'q' in this terminal{Style.RESET_ALL} to close the browser.", tag="PROFILE")
self.logger.info("4. The profile will be saved and ready to use with Crawl4AI.", tag="PROFILE")
self.logger.info(f"{border}\n", tag="PROFILE")
# Import the necessary classes with local imports to avoid circular references
from .strategies import CDPBrowserStrategy
# Set browser config to use the profile path
browser_config.user_data_dir = profile_path
# Create a CDP browser strategy for the profile creation
browser_strategy = CDPBrowserStrategy(browser_config, self.logger)
# Set up signal handlers to ensure cleanup on interrupt
original_sigint = signal.getsignal(signal.SIGINT)
original_sigterm = signal.getsignal(signal.SIGTERM)
# Define cleanup handler for signals
async def cleanup_handler(sig, frame):
self.logger.warning("\nCleaning up browser process...", tag="PROFILE")
await browser_strategy.close()
# Restore original signal handlers
signal.signal(signal.SIGINT, original_sigint)
signal.signal(signal.SIGTERM, original_sigterm)
if sig == signal.SIGINT:
self.logger.error("Profile creation interrupted. Profile may be incomplete.", tag="PROFILE")
sys.exit(1)
# Set signal handlers
def sigint_handler(sig, frame):
asyncio.create_task(cleanup_handler(sig, frame))
signal.signal(signal.SIGINT, sigint_handler)
signal.signal(signal.SIGTERM, sigint_handler)
# Event to signal when user is done with the browser
user_done_event = asyncio.Event()
# Run keyboard input loop in a separate task
async def listen_for_quit_command():
import termios
import tty
import select
# First output the prompt
self.logger.info(f"{Fore.CYAN}Press '{Fore.WHITE}q{Fore.CYAN}' when you've finished using the browser...{Style.RESET_ALL}", tag="PROFILE")
# Save original terminal settings
fd = sys.stdin.fileno()
old_settings = termios.tcgetattr(fd)
try:
# Switch to non-canonical mode (no line buffering)
tty.setcbreak(fd)
while True:
# Check if input is available (non-blocking)
readable, _, _ = select.select([sys.stdin], [], [], 0.5)
if readable:
key = sys.stdin.read(1)
if key.lower() == 'q':
self.logger.info(f"{Fore.GREEN}Closing browser and saving profile...{Style.RESET_ALL}", tag="PROFILE")
user_done_event.set()
return
# Check if the browser process has already exited
if browser_strategy.browser_process and browser_strategy.browser_process.poll() is not None:
self.logger.info("Browser already closed. Ending input listener.", tag="PROFILE")
user_done_event.set()
return
await asyncio.sleep(0.1)
finally:
# Restore terminal settings
termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
try:
# Start the browser
await browser_strategy.start()
# Check if browser started successfully
if not browser_strategy.browser_process:
self.logger.error("Failed to start browser process.", tag="PROFILE")
return None
self.logger.info(f"Browser launched. {Fore.CYAN}Waiting for you to finish...{Style.RESET_ALL}", tag="PROFILE")
# Start listening for keyboard input
listener_task = asyncio.create_task(listen_for_quit_command())
# Wait for either the user to press 'q' or for the browser process to exit naturally
while not user_done_event.is_set() and browser_strategy.browser_process.poll() is None:
await asyncio.sleep(0.5)
# Cancel the listener task if it's still running
if not listener_task.done():
listener_task.cancel()
try:
await listener_task
except asyncio.CancelledError:
pass
# If the browser is still running and the user pressed 'q', terminate it
if browser_strategy.browser_process.poll() is None and user_done_event.is_set():
self.logger.info("Terminating browser process...", tag="PROFILE")
await browser_strategy.close()
self.logger.success(f"Browser closed. Profile saved at: {Fore.GREEN}{profile_path}{Style.RESET_ALL}", tag="PROFILE")
except Exception as e:
self.logger.error(f"Error creating profile: {str(e)}", tag="PROFILE")
await browser_strategy.close()
return None
finally:
# Restore original signal handlers
signal.signal(signal.SIGINT, original_sigint)
signal.signal(signal.SIGTERM, original_sigterm)
# Make sure browser is fully cleaned up
await browser_strategy.close()
# Return the profile path
return profile_path
def list_profiles(self) -> List[Dict[str, Any]]:
"""List all available browser profiles.
Returns:
List of dictionaries containing profile information
"""
if not os.path.exists(self.profiles_dir):
return []
profiles = []
for name in os.listdir(self.profiles_dir):
profile_path = os.path.join(self.profiles_dir, name)
# Skip if not a directory
if not os.path.isdir(profile_path):
continue
# Check if this looks like a valid browser profile
# For Chromium: Look for Preferences file
# For Firefox: Look for prefs.js file
is_valid = False
if os.path.exists(os.path.join(profile_path, "Preferences")) or \
os.path.exists(os.path.join(profile_path, "Default", "Preferences")):
is_valid = "chromium"
elif os.path.exists(os.path.join(profile_path, "prefs.js")):
is_valid = "firefox"
if is_valid:
# Get creation time
created = datetime.datetime.fromtimestamp(
os.path.getctime(profile_path)
)
profiles.append({
"name": name,
"path": profile_path,
"created": created,
"type": is_valid
})
# Sort by creation time, newest first
profiles.sort(key=lambda x: x["created"], reverse=True)
return profiles
def get_profile_path(self, profile_name: str) -> Optional[str]:
"""Get the full path to a profile by name.
Args:
profile_name: Name of the profile (not the full path)
Returns:
Full path to the profile directory, or None if not found
"""
profile_path = os.path.join(self.profiles_dir, profile_name)
# Check if path exists and is a valid profile
if not os.path.isdir(profile_path):
# Check if profile_name itself is full path
if os.path.isabs(profile_name):
profile_path = profile_name
else:
return None
# Look for profile indicators
is_profile = (
os.path.exists(os.path.join(profile_path, "Preferences")) or
os.path.exists(os.path.join(profile_path, "Default", "Preferences")) or
os.path.exists(os.path.join(profile_path, "prefs.js"))
)
if not is_profile:
return None # Not a valid browser profile
return profile_path
def delete_profile(self, profile_name_or_path: str) -> bool:
"""Delete a browser profile by name or path.
Args:
profile_name_or_path: Name of the profile or full path to profile directory
Returns:
True if the profile was deleted successfully, False otherwise
"""
# Determine if input is a name or a path
if os.path.isabs(profile_name_or_path):
# Full path provided
profile_path = profile_name_or_path
else:
# Just a name provided, construct path
profile_path = os.path.join(self.profiles_dir, profile_name_or_path)
# Check if path exists and is a valid profile
if not os.path.isdir(profile_path):
return False
# Look for profile indicators
is_profile = (
os.path.exists(os.path.join(profile_path, "Preferences")) or
os.path.exists(os.path.join(profile_path, "Default", "Preferences")) or
os.path.exists(os.path.join(profile_path, "prefs.js"))
)
if not is_profile:
return False # Not a valid browser profile
# Delete the profile directory
try:
shutil.rmtree(profile_path)
return True
except Exception:
return False
async def interactive_manager(self, crawl_callback=None):
"""Launch an interactive profile management console.
Args:
crawl_callback: Function to call when selecting option to use
a profile for crawling. It will be called with (profile_path, url).
"""
while True:
self.logger.info(f"\n{Fore.CYAN}Profile Management Options:{Style.RESET_ALL}", tag="MENU")
self.logger.info(f"1. {Fore.GREEN}Create a new profile{Style.RESET_ALL}", tag="MENU")
self.logger.info(f"2. {Fore.YELLOW}List available profiles{Style.RESET_ALL}", tag="MENU")
self.logger.info(f"3. {Fore.RED}Delete a profile{Style.RESET_ALL}", tag="MENU")
# Only show crawl option if callback provided
if crawl_callback:
self.logger.info(f"4. {Fore.CYAN}Use a profile to crawl a website{Style.RESET_ALL}", tag="MENU")
self.logger.info(f"5. {Fore.MAGENTA}Exit{Style.RESET_ALL}", tag="MENU")
exit_option = "5"
else:
self.logger.info(f"4. {Fore.MAGENTA}Exit{Style.RESET_ALL}", tag="MENU")
exit_option = "4"
choice = input(f"\n{Fore.CYAN}Enter your choice (1-{exit_option}): {Style.RESET_ALL}")
if choice == "1":
# Create new profile
name = input(f"{Fore.GREEN}Enter a name for the new profile (or press Enter for auto-generated name): {Style.RESET_ALL}")
await self.create_profile(name or None)
elif choice == "2":
# List profiles
profiles = self.list_profiles()
if not profiles:
self.logger.warning(" No profiles found. Create one first with option 1.", tag="PROFILES")
continue
# Print profile information with colorama formatting
self.logger.info("\nAvailable profiles:", tag="PROFILES")
for i, profile in enumerate(profiles):
self.logger.info(f"[{i+1}] {Fore.CYAN}{profile['name']}{Style.RESET_ALL}", tag="PROFILES")
self.logger.info(f" Path: {Fore.YELLOW}{profile['path']}{Style.RESET_ALL}", tag="PROFILES")
self.logger.info(f" Created: {profile['created'].strftime('%Y-%m-%d %H:%M:%S')}", tag="PROFILES")
self.logger.info(f" Browser type: {profile['type']}", tag="PROFILES")
self.logger.info("", tag="PROFILES") # Empty line for spacing
elif choice == "3":
# Delete profile
profiles = self.list_profiles()
if not profiles:
self.logger.warning("No profiles found to delete", tag="PROFILES")
continue
# Display numbered list
self.logger.info(f"\n{Fore.YELLOW}Available profiles:{Style.RESET_ALL}", tag="PROFILES")
for i, profile in enumerate(profiles):
self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES")
# Get profile to delete
profile_idx = input(f"{Fore.RED}Enter the number of the profile to delete (or 'c' to cancel): {Style.RESET_ALL}")
if profile_idx.lower() == 'c':
continue
try:
idx = int(profile_idx) - 1
if 0 <= idx < len(profiles):
profile_name = profiles[idx]["name"]
self.logger.info(f"Deleting profile: {Fore.YELLOW}{profile_name}{Style.RESET_ALL}", tag="PROFILES")
# Confirm deletion
confirm = input(f"{Fore.RED}Are you sure you want to delete this profile? (y/n): {Style.RESET_ALL}")
if confirm.lower() == 'y':
success = self.delete_profile(profiles[idx]["path"])
if success:
self.logger.success(f"Profile {Fore.GREEN}{profile_name}{Style.RESET_ALL} deleted successfully", tag="PROFILES")
else:
self.logger.error(f"Failed to delete profile {Fore.RED}{profile_name}{Style.RESET_ALL}", tag="PROFILES")
else:
self.logger.error("Invalid profile number", tag="PROFILES")
except ValueError:
self.logger.error("Please enter a valid number", tag="PROFILES")
elif choice == "4" and crawl_callback:
# Use profile to crawl a site
profiles = self.list_profiles()
if not profiles:
self.logger.warning("No profiles found. Create one first.", tag="PROFILES")
continue
# Display numbered list
self.logger.info(f"\n{Fore.YELLOW}Available profiles:{Style.RESET_ALL}", tag="PROFILES")
for i, profile in enumerate(profiles):
self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES")
# Get profile to use
profile_idx = input(f"{Fore.CYAN}Enter the number of the profile to use (or 'c' to cancel): {Style.RESET_ALL}")
if profile_idx.lower() == 'c':
continue
try:
idx = int(profile_idx) - 1
if 0 <= idx < len(profiles):
profile_path = profiles[idx]["path"]
url = input(f"{Fore.CYAN}Enter the URL to crawl: {Style.RESET_ALL}")
if url:
# Call the provided crawl callback
await crawl_callback(profile_path, url)
else:
self.logger.error("No URL provided", tag="CRAWL")
else:
self.logger.error("Invalid profile number", tag="PROFILES")
except ValueError:
self.logger.error("Please enter a valid number", tag="PROFILES")
elif (choice == "4" and not crawl_callback) or (choice == "5" and crawl_callback):
# Exit
self.logger.info("Exiting profile management", tag="MENU")
break
else:
self.logger.error(f"Invalid choice. Please enter a number between 1 and {exit_option}.", tag="MENU")

File diff suppressed because it is too large Load Diff

105
crawl4ai/browser/utils.py Normal file
View File

@@ -0,0 +1,105 @@
"""Browser utilities module for Crawl4AI.
This module provides utility functions for browser management,
including process management, CDP connection utilities,
and Playwright instance management.
"""
import asyncio
import os
import sys
import platform
import tempfile
from typing import Optional, Any
from playwright.async_api import async_playwright
from ..async_logger import AsyncLogger
from ..utils import get_chromium_path
_playwright_instance = None
async def get_playwright():
"""Get or create the Playwright instance (singleton pattern).
Returns:
Playwright: The Playwright instance
"""
global _playwright_instance
if _playwright_instance is None or True:
_playwright_instance = await async_playwright().start()
return _playwright_instance
def get_browser_executable(browser_type: str) -> str:
"""Get the path to browser executable, with platform-specific handling.
Args:
browser_type: Type of browser (chromium, firefox, webkit)
Returns:
Path to browser executable
"""
return get_chromium_path(browser_type)
def create_temp_directory(prefix="browser-profile-") -> str:
"""Create a temporary directory for browser data.
Args:
prefix: Prefix for the temporary directory name
Returns:
Path to the created temporary directory
"""
return tempfile.mkdtemp(prefix=prefix)
def is_windows() -> bool:
"""Check if the current platform is Windows.
Returns:
True if Windows, False otherwise
"""
return sys.platform == "win32"
def is_macos() -> bool:
"""Check if the current platform is macOS.
Returns:
True if macOS, False otherwise
"""
return sys.platform == "darwin"
def is_linux() -> bool:
"""Check if the current platform is Linux.
Returns:
True if Linux, False otherwise
"""
return not (is_windows() or is_macos())
def get_browser_disable_options() -> list:
"""Get standard list of browser disable options for performance.
Returns:
List of command-line options to disable various browser features
"""
return [
"--disable-background-networking",
"--disable-background-timer-throttling",
"--disable-backgrounding-occluded-windows",
"--disable-breakpad",
"--disable-client-side-phishing-detection",
"--disable-component-extensions-with-background-pages",
"--disable-default-apps",
"--disable-extensions",
"--disable-features=TranslateUI",
"--disable-hang-monitor",
"--disable-ipc-flooding-protection",
"--disable-popup-blocking",
"--disable-prompt-on-repost",
"--disable-sync",
"--force-color-profile=srgb",
"--metrics-recording-only",
"--no-first-run",
"--password-store=basic",
"--use-mock-keychain",
]

View File

@@ -163,6 +163,7 @@ class ManagedBrowser:
)
# We'll monitor for a short time to make sure it starts properly, but won't keep monitoring
await asyncio.sleep(0.5) # Give browser time to start
await self._initial_startup_check()
await asyncio.sleep(2) # Give browser time to start
return f"http://{self.host}:{self.debugging_port}"

View File

@@ -555,7 +555,6 @@ class BrowserProfiler:
else:
self.logger.error(f"Invalid choice. Please enter a number between 1 and {exit_option}.", tag="MENU")
async def launch_standalone_browser(self,
browser_type: str = "chromium",
user_data_dir: Optional[str] = None,

View File

@@ -9,6 +9,26 @@ from crawl4ai import (
CrawlResult
)
async def example_cdp():
browser_conf = BrowserConfig(
headless=False,
cdp_url="http://localhost:9223"
)
crawler_config = CrawlerRunConfig(
session_id="test",
js_code = """(() => { return {"result": "Hello World!"} })()""",
js_only=True
)
async with AsyncWebCrawler(
config=browser_conf,
verbose=True,
) as crawler:
result : CrawlResult = await crawler.arun(
url="https://www.helloworld.org",
config=crawler_config,
)
print(result.js_execution_result)
async def main():
browser_config = BrowserConfig(headless=True, verbose=True)
@@ -16,18 +36,15 @@ async def main():
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
markdown_generator=DefaultMarkdownGenerator(
# content_filter=PruningContentFilter(
# threshold=0.48, threshold_type="fixed", min_word_threshold=0
# )
content_filter=PruningContentFilter(
threshold=0.48, threshold_type="fixed", min_word_threshold=0
)
),
)
result : CrawlResult = await crawler.arun(
# url="https://www.helloworld.org", config=crawler_config
url="https://www.kidocode.com", config=crawler_config
url="https://www.helloworld.org", config=crawler_config
)
print(result.markdown.raw_markdown[:500])
# print(result.model_dump())
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,190 @@
"""Test examples for BrowserManager.
These examples demonstrate the functionality of BrowserManager
and serve as functional tests.
"""
import asyncio
import os
import sys
from typing import List
# Add the project root to Python path if running directly
if __name__ == "__main__":
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
from crawl4ai.browser import BrowserManager
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_logger import AsyncLogger
# Create a logger for clear terminal output
logger = AsyncLogger(verbose=True, log_file=None)
async def test_basic_browser_manager():
"""Test basic BrowserManager functionality with default configuration."""
logger.info("Starting test_basic_browser_manager", tag="TEST")
try:
# Create a browser manager with default config
manager = BrowserManager(logger=logger)
# Start the browser
await manager.start()
logger.info("Browser started successfully", tag="TEST")
# Get a page
crawler_config = CrawlerRunConfig(url="https://example.com")
page, context = await manager.get_page(crawler_config)
logger.info("Page created successfully", tag="TEST")
# Navigate to a website
await page.goto("https://example.com")
title = await page.title()
logger.info(f"Page title: {title}", tag="TEST")
# Clean up
await manager.close()
logger.success("test_basic_browser_manager completed successfully", tag="TEST")
return True
except Exception as e:
logger.error(f"test_basic_browser_manager failed: {str(e)}", tag="TEST")
return False
async def test_custom_browser_config():
"""Test BrowserManager with custom browser configuration."""
logger.info("Starting test_custom_browser_config", tag="TEST")
try:
# Create a custom browser config
browser_config = BrowserConfig(
browser_type="chromium",
headless=True,
viewport_width=1280,
viewport_height=800,
light_mode=True
)
# Create browser manager with the config
manager = BrowserManager(browser_config=browser_config, logger=logger)
# Start the browser
await manager.start()
logger.info("Browser started successfully with custom config", tag="TEST")
# Get a page
crawler_config = CrawlerRunConfig(url="https://example.com")
page, context = await manager.get_page(crawler_config)
# Navigate to a website
await page.goto("https://example.com")
title = await page.title()
logger.info(f"Page title: {title}", tag="TEST")
# Verify viewport size
viewport_size = await page.evaluate("() => ({ width: window.innerWidth, height: window.innerHeight })")
logger.info(f"Viewport size: {viewport_size}", tag="TEST")
# Clean up
await manager.close()
logger.success("test_custom_browser_config completed successfully", tag="TEST")
return True
except Exception as e:
logger.error(f"test_custom_browser_config failed: {str(e)}", tag="TEST")
return False
async def test_multiple_pages():
"""Test BrowserManager with multiple pages."""
logger.info("Starting test_multiple_pages", tag="TEST")
try:
# Create browser manager
manager = BrowserManager(logger=logger)
# Start the browser
await manager.start()
logger.info("Browser started successfully", tag="TEST")
# Create multiple pages
pages = []
urls = ["https://example.com", "https://example.org", "https://mozilla.org"]
for i, url in enumerate(urls):
crawler_config = CrawlerRunConfig(url=url)
page, context = await manager.get_page(crawler_config)
await page.goto(url)
pages.append((page, url))
logger.info(f"Created page {i+1} for {url}", tag="TEST")
# Verify all pages are loaded correctly
for i, (page, url) in enumerate(pages):
title = await page.title()
logger.info(f"Page {i+1} title: {title}", tag="TEST")
# Clean up
await manager.close()
logger.success("test_multiple_pages completed successfully", tag="TEST")
return True
except Exception as e:
logger.error(f"test_multiple_pages failed: {str(e)}", tag="TEST")
return False
async def test_session_management():
"""Test session management in BrowserManager."""
logger.info("Starting test_session_management", tag="TEST")
try:
# Create browser manager
manager = BrowserManager(logger=logger)
# Start the browser
await manager.start()
logger.info("Browser started successfully", tag="TEST")
# Create a session
session_id = "test_session_1"
crawler_config = CrawlerRunConfig(url="https://example.com", session_id=session_id)
page1, context1 = await manager.get_page(crawler_config)
await page1.goto("https://example.com")
logger.info(f"Created session with ID: {session_id}", tag="TEST")
# Get the same session again
page2, context2 = await manager.get_page(crawler_config)
# Verify it's the same page/context
is_same_page = page1 == page2
is_same_context = context1 == context2
logger.info(f"Same page: {is_same_page}, Same context: {is_same_context}", tag="TEST")
# Kill the session
await manager.kill_session(session_id)
logger.info(f"Killed session with ID: {session_id}", tag="TEST")
# Clean up
await manager.close()
logger.success("test_session_management completed successfully", tag="TEST")
return True
except Exception as e:
logger.error(f"test_session_management failed: {str(e)}", tag="TEST")
return False
async def run_tests():
"""Run all tests sequentially."""
results = []
# results.append(await test_basic_browser_manager())
# results.append(await test_custom_browser_config())
# results.append(await test_multiple_pages())
results.append(await test_session_management())
# Print summary
total = len(results)
passed = sum(results)
logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
if passed == total:
logger.success("All tests passed!", tag="SUMMARY")
else:
logger.error(f"{total - passed} tests failed", tag="SUMMARY")
if __name__ == "__main__":
asyncio.run(run_tests())

View File

@@ -0,0 +1,160 @@
"""Test examples for BuiltinBrowserStrategy.
These examples demonstrate the functionality of BuiltinBrowserStrategy
and serve as functional tests.
"""
import asyncio
import os
import sys
# Add the project root to Python path if running directly
if __name__ == "__main__":
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
from crawl4ai.browser import BrowserManager
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_logger import AsyncLogger
# Create a logger for clear terminal output
logger = AsyncLogger(verbose=True, log_file=None)
async def test_builtin_browser():
"""Test using a builtin browser that persists between sessions."""
logger.info("Testing builtin browser", tag="TEST")
browser_config = BrowserConfig(
browser_mode="builtin",
headless=True
)
manager = BrowserManager(browser_config=browser_config, logger=logger)
try:
# Start should connect to existing builtin browser or create one
await manager.start()
logger.info("Connected to builtin browser", tag="TEST")
# Test page creation
crawler_config = CrawlerRunConfig()
page, context = await manager.get_page(crawler_config)
# Test navigation
await page.goto("https://example.com")
title = await page.title()
logger.info(f"Page title: {title}", tag="TEST")
# Close manager (should not close the builtin browser)
await manager.close()
logger.info("First session closed", tag="TEST")
# Create a second manager to verify browser persistence
logger.info("Creating second session to verify persistence", tag="TEST")
manager2 = BrowserManager(browser_config=browser_config, logger=logger)
await manager2.start()
logger.info("Connected to existing builtin browser", tag="TEST")
page2, context2 = await manager2.get_page(crawler_config)
await page2.goto("https://example.org")
title2 = await page2.title()
logger.info(f"Second session page title: {title2}", tag="TEST")
await manager2.close()
logger.info("Second session closed successfully", tag="TEST")
return True
except Exception as e:
logger.error(f"Test failed: {str(e)}", tag="TEST")
try:
await manager.close()
except:
pass
return False
async def test_builtin_browser_status():
"""Test getting status of the builtin browser."""
logger.info("Testing builtin browser status", tag="TEST")
from crawl4ai.browser.strategies import BuiltinBrowserStrategy
browser_config = BrowserConfig(
browser_mode="builtin",
headless=True
)
# Create strategy directly to access its status methods
strategy = BuiltinBrowserStrategy(browser_config, logger)
try:
# Get status before starting (should be not running)
status_before = await strategy.get_builtin_browser_status()
logger.info(f"Initial status: {status_before}", tag="TEST")
# Start the browser
await strategy.start()
logger.info("Browser started successfully", tag="TEST")
# Get status after starting
status_after = await strategy.get_builtin_browser_status()
logger.info(f"Status after start: {status_after}", tag="TEST")
# Create a page to verify functionality
crawler_config = CrawlerRunConfig()
page, context = await strategy.get_page(crawler_config)
await page.goto("https://example.com")
title = await page.title()
logger.info(f"Page title: {title}", tag="TEST")
# Close strategy (should not kill the builtin browser)
await strategy.close()
logger.info("Strategy closed successfully", tag="TEST")
# Create a new strategy object
strategy2 = BuiltinBrowserStrategy(browser_config, logger)
# Get status again (should still be running)
status_final = await strategy2.get_builtin_browser_status()
logger.info(f"Final status: {status_final}", tag="TEST")
# Verify that the status shows the browser is running
is_running = status_final.get('running', False)
logger.info(f"Builtin browser persistence confirmed: {is_running}", tag="TEST")
# Kill the builtin browser to clean up
logger.info("Killing builtin browser", tag="TEST")
success = await strategy2.kill_builtin_browser()
logger.info(f"Killed builtin browser successfully: {success}", tag="TEST")
return is_running and success
except Exception as e:
logger.error(f"Test failed: {str(e)}", tag="TEST")
try:
await strategy.close()
# Try to kill the builtin browser to clean up
strategy2 = BuiltinBrowserStrategy(browser_config, logger)
await strategy2.kill_builtin_browser()
except:
pass
return False
async def run_tests():
"""Run all tests sequentially."""
results = []
results.append(await test_builtin_browser())
results.append(await test_builtin_browser_status())
# Print summary
total = len(results)
passed = sum(results)
logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
if passed == total:
logger.success("All tests passed!", tag="SUMMARY")
else:
logger.error(f"{total - passed} tests failed", tag="SUMMARY")
if __name__ == "__main__":
asyncio.run(run_tests())

View File

@@ -0,0 +1,227 @@
"""Test examples for CDPBrowserStrategy.
These examples demonstrate the functionality of CDPBrowserStrategy
and serve as functional tests.
"""
import asyncio
import os
import sys
# Add the project root to Python path if running directly
if __name__ == "__main__":
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
from crawl4ai.browser import BrowserManager
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_logger import AsyncLogger
# Create a logger for clear terminal output
logger = AsyncLogger(verbose=True, log_file=None)
async def test_cdp_launch_connect():
"""Test launching a browser and connecting via CDP."""
logger.info("Testing launch and connect via CDP", tag="TEST")
browser_config = BrowserConfig(
use_managed_browser=True,
headless=True
)
manager = BrowserManager(browser_config=browser_config, logger=logger)
try:
await manager.start()
logger.info("Browser launched and connected via CDP", tag="TEST")
# Test with multiple pages
pages = []
for i in range(3):
crawler_config = CrawlerRunConfig()
page, context = await manager.get_page(crawler_config)
await page.goto(f"https://example.com?test={i}")
pages.append(page)
logger.info(f"Created page {i+1}", tag="TEST")
# Verify all pages are working
for i, page in enumerate(pages):
title = await page.title()
logger.info(f"Page {i+1} title: {title}", tag="TEST")
await manager.close()
logger.info("Browser closed successfully", tag="TEST")
return True
except Exception as e:
logger.error(f"Test failed: {str(e)}", tag="TEST")
try:
await manager.close()
except:
pass
return False
async def test_cdp_with_user_data_dir():
"""Test CDP browser with a user data directory."""
logger.info("Testing CDP browser with user data directory", tag="TEST")
# Create a temporary user data directory
import tempfile
user_data_dir = tempfile.mkdtemp(prefix="crawl4ai-test-")
logger.info(f"Created temporary user data directory: {user_data_dir}", tag="TEST")
browser_config = BrowserConfig(
use_managed_browser=True,
headless=True,
user_data_dir=user_data_dir
)
manager = BrowserManager(browser_config=browser_config, logger=logger)
try:
await manager.start()
logger.info("Browser launched with user data directory", tag="TEST")
# Navigate to a page and store some data
crawler_config = CrawlerRunConfig()
page, context = await manager.get_page(crawler_config)
# Set a cookie
await context.add_cookies([{
"name": "test_cookie",
"value": "test_value",
"url": "https://example.com"
}])
# Visit the site
await page.goto("https://example.com")
# Verify cookie was set
cookies = await context.cookies(["https://example.com"])
has_test_cookie = any(cookie["name"] == "test_cookie" for cookie in cookies)
logger.info(f"Cookie set successfully: {has_test_cookie}", tag="TEST")
# Close the browser
await manager.close()
logger.info("First browser session closed", tag="TEST")
# Start a new browser with the same user data directory
logger.info("Starting second browser session with same user data directory", tag="TEST")
manager2 = BrowserManager(browser_config=browser_config, logger=logger)
await manager2.start()
# Get a new page and check if the cookie persists
page2, context2 = await manager2.get_page(crawler_config)
await page2.goto("https://example.com")
# Verify cookie persisted
cookies2 = await context2.cookies(["https://example.com"])
has_test_cookie2 = any(cookie["name"] == "test_cookie" for cookie in cookies2)
logger.info(f"Cookie persisted across sessions: {has_test_cookie2}", tag="TEST")
# Clean up
await manager2.close()
# Remove temporary directory
import shutil
shutil.rmtree(user_data_dir, ignore_errors=True)
logger.info(f"Removed temporary user data directory", tag="TEST")
return has_test_cookie and has_test_cookie2
except Exception as e:
logger.error(f"Test failed: {str(e)}", tag="TEST")
try:
await manager.close()
except:
pass
# Clean up temporary directory
try:
import shutil
shutil.rmtree(user_data_dir, ignore_errors=True)
except:
pass
return False
async def test_cdp_session_management():
"""Test session management with CDP browser."""
logger.info("Testing session management with CDP browser", tag="TEST")
browser_config = BrowserConfig(
use_managed_browser=True,
headless=True
)
manager = BrowserManager(browser_config=browser_config, logger=logger)
try:
await manager.start()
logger.info("Browser launched successfully", tag="TEST")
# Create two sessions
session1_id = "test_session_1"
session2_id = "test_session_2"
# Set up first session
crawler_config1 = CrawlerRunConfig(session_id=session1_id)
page1, context1 = await manager.get_page(crawler_config1)
await page1.goto("https://example.com")
await page1.evaluate("localStorage.setItem('session1_data', 'test_value')")
logger.info(f"Set up session 1 with ID: {session1_id}", tag="TEST")
# Set up second session
crawler_config2 = CrawlerRunConfig(session_id=session2_id)
page2, context2 = await manager.get_page(crawler_config2)
await page2.goto("https://example.org")
await page2.evaluate("localStorage.setItem('session2_data', 'test_value2')")
logger.info(f"Set up session 2 with ID: {session2_id}", tag="TEST")
# Get first session again
page1_again, _ = await manager.get_page(crawler_config1)
# Verify it's the same page and data persists
is_same_page = page1 == page1_again
data1 = await page1_again.evaluate("localStorage.getItem('session1_data')")
logger.info(f"Session 1 reuse successful: {is_same_page}, data: {data1}", tag="TEST")
# Kill first session
await manager.kill_session(session1_id)
logger.info(f"Killed session 1", tag="TEST")
# Verify second session still works
data2 = await page2.evaluate("localStorage.getItem('session2_data')")
logger.info(f"Session 2 still functional after killing session 1, data: {data2}", tag="TEST")
# Clean up
await manager.close()
logger.info("Browser closed successfully", tag="TEST")
return is_same_page and data1 == "test_value" and data2 == "test_value2"
except Exception as e:
logger.error(f"Test failed: {str(e)}", tag="TEST")
try:
await manager.close()
except:
pass
return False
async def run_tests():
"""Run all tests sequentially."""
results = []
results.append(await test_cdp_launch_connect())
results.append(await test_cdp_with_user_data_dir())
results.append(await test_cdp_session_management())
# Print summary
total = len(results)
passed = sum(results)
logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
if passed == total:
logger.success("All tests passed!", tag="SUMMARY")
else:
logger.error(f"{total - passed} tests failed", tag="SUMMARY")
if __name__ == "__main__":
asyncio.run(run_tests())

View File

@@ -0,0 +1,77 @@
"""Combined test runner for all browser module tests.
This script runs all the browser module tests in sequence and
provides a comprehensive summary.
"""
import asyncio
import os
import sys
import time
# Add the project root to Python path if running directly
if __name__ == "__main__":
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
from crawl4ai.async_logger import AsyncLogger
# Create a logger for clear terminal output
logger = AsyncLogger(verbose=True, log_file=None)
async def run_test_module(module_name, header):
"""Run all tests in a module and return results."""
logger.info(f"\n{'-'*30}", tag="TEST")
logger.info(f"RUNNING: {header}", tag="TEST")
logger.info(f"{'-'*30}", tag="TEST")
# Import the module dynamically
module = __import__(f"tests.browser.{module_name}", fromlist=["run_tests"])
# Track time for performance measurement
start_time = time.time()
# Run the tests
await module.run_tests()
# Calculate time taken
time_taken = time.time() - start_time
logger.info(f"Time taken: {time_taken:.2f} seconds", tag="TIMING")
return time_taken
async def main():
"""Run all test modules."""
logger.info("STARTING COMPREHENSIVE BROWSER MODULE TESTS", tag="MAIN")
# List of test modules to run
test_modules = [
("test_browser_manager", "Browser Manager Tests"),
("test_playwright_strategy", "Playwright Strategy Tests"),
("test_cdp_strategy", "CDP Strategy Tests"),
("test_builtin_strategy", "Builtin Browser Strategy Tests"),
("test_profiles", "Profile Management Tests")
]
# Run each test module
timings = {}
for module_name, header in test_modules:
try:
time_taken = await run_test_module(module_name, header)
timings[module_name] = time_taken
except Exception as e:
logger.error(f"Error running {module_name}: {str(e)}", tag="ERROR")
# Print summary
logger.info("\n\nTEST SUMMARY:", tag="SUMMARY")
logger.info(f"{'-'*50}", tag="SUMMARY")
for module_name, header in test_modules:
if module_name in timings:
logger.info(f"{header}: {timings[module_name]:.2f} seconds", tag="SUMMARY")
else:
logger.error(f"{header}: FAILED TO RUN", tag="SUMMARY")
logger.info(f"{'-'*50}", tag="SUMMARY")
total_time = sum(timings.values())
logger.info(f"Total time: {total_time:.2f} seconds", tag="SUMMARY")
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,275 @@
"""Test examples for PlaywrightBrowserStrategy.
These examples demonstrate the functionality of PlaywrightBrowserStrategy
and serve as functional tests.
"""
import asyncio
import os
import sys
# Add the project root to Python path if running directly
if __name__ == "__main__":
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
from crawl4ai.browser import BrowserManager
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_logger import AsyncLogger
# Create a logger for clear terminal output
logger = AsyncLogger(verbose=True, log_file=None)
async def test_playwright_basic():
"""Test basic Playwright browser functionality."""
logger.info("Testing standard Playwright browser", tag="TEST")
# Create browser config for standard Playwright
browser_config = BrowserConfig(
headless=True,
viewport_width=1280,
viewport_height=800
)
# Create browser manager with the config
manager = BrowserManager(browser_config=browser_config, logger=logger)
try:
# Start the browser
await manager.start()
logger.info("Browser started successfully", tag="TEST")
# Create crawler config
crawler_config = CrawlerRunConfig(url="https://example.com")
# Get a page
page, context = await manager.get_page(crawler_config)
logger.info("Got page successfully", tag="TEST")
# Navigate to a website
await page.goto("https://example.com")
logger.info("Navigated to example.com", tag="TEST")
# Get page title
title = await page.title()
logger.info(f"Page title: {title}", tag="TEST")
# Clean up
await manager.close()
logger.info("Browser closed successfully", tag="TEST")
return True
except Exception as e:
logger.error(f"Test failed: {str(e)}", tag="TEST")
# Ensure cleanup
try:
await manager.close()
except:
pass
return False
async def test_playwright_text_mode():
"""Test Playwright browser in text-only mode."""
logger.info("Testing Playwright text mode", tag="TEST")
# Create browser config with text mode enabled
browser_config = BrowserConfig(
headless=True,
text_mode=True # Enable text-only mode
)
# Create browser manager with the config
manager = BrowserManager(browser_config=browser_config, logger=logger)
try:
# Start the browser
await manager.start()
logger.info("Browser started successfully in text mode", tag="TEST")
# Get a page
crawler_config = CrawlerRunConfig(url="https://example.com")
page, context = await manager.get_page(crawler_config)
# Navigate to a website
await page.goto("https://example.com")
logger.info("Navigated to example.com", tag="TEST")
# Get page title
title = await page.title()
logger.info(f"Page title: {title}", tag="TEST")
# Check if images are blocked in text mode
# We'll check if any image requests were made
has_images = False
async with page.expect_request("**/*.{png,jpg,jpeg,gif,webp,svg}", timeout=1000) as request_info:
try:
# Try to load a page with images
await page.goto("https://picsum.photos/", wait_until="domcontentloaded")
request = await request_info.value
has_images = True
except:
# Timeout without image requests means text mode is working
has_images = False
logger.info(f"Text mode image blocking working: {not has_images}", tag="TEST")
# Clean up
await manager.close()
logger.info("Browser closed successfully", tag="TEST")
return True
except Exception as e:
logger.error(f"Test failed: {str(e)}", tag="TEST")
# Ensure cleanup
try:
await manager.close()
except:
pass
return False
async def test_playwright_context_reuse():
"""Test context caching and reuse with identical configurations."""
logger.info("Testing context reuse with identical configurations", tag="TEST")
# Create browser config
browser_config = BrowserConfig(headless=True)
# Create browser manager
manager = BrowserManager(browser_config=browser_config, logger=logger)
try:
# Start the browser
await manager.start()
logger.info("Browser started successfully", tag="TEST")
# Create identical crawler configs
crawler_config1 = CrawlerRunConfig(
url="https://example.com",
viewport_width=1280,
viewport_height=800
)
crawler_config2 = CrawlerRunConfig(
url="https://example.org", # Different URL but same browser parameters
viewport_width=1280,
viewport_height=800
)
# Get pages with these configs
page1, context1 = await manager.get_page(crawler_config1)
page2, context2 = await manager.get_page(crawler_config2)
# Check if contexts are reused
is_same_context = context1 == context2
logger.info(f"Contexts reused: {is_same_context}", tag="TEST")
# Now try with a different config
crawler_config3 = CrawlerRunConfig(
url="https://example.net",
viewport_width=800, # Different viewport size
viewport_height=600
)
page3, context3 = await manager.get_page(crawler_config3)
# This should be a different context
is_different_context = context1 != context3
logger.info(f"Different contexts for different configs: {is_different_context}", tag="TEST")
# Clean up
await manager.close()
logger.info("Browser closed successfully", tag="TEST")
# Both tests should pass for success
return is_same_context and is_different_context
except Exception as e:
logger.error(f"Test failed: {str(e)}", tag="TEST")
# Ensure cleanup
try:
await manager.close()
except:
pass
return False
async def test_playwright_session_management():
"""Test session management with Playwright browser."""
logger.info("Testing session management with Playwright browser", tag="TEST")
browser_config = BrowserConfig(
headless=True
)
manager = BrowserManager(browser_config=browser_config, logger=logger)
try:
await manager.start()
logger.info("Browser launched successfully", tag="TEST")
# Create two sessions
session1_id = "playwright_session_1"
session2_id = "playwright_session_2"
# Set up first session
crawler_config1 = CrawlerRunConfig(session_id=session1_id, url="https://example.com")
page1, context1 = await manager.get_page(crawler_config1)
await page1.goto("https://example.com")
await page1.evaluate("localStorage.setItem('playwright_session1_data', 'test_value1')")
logger.info(f"Set up session 1 with ID: {session1_id}", tag="TEST")
# Set up second session
crawler_config2 = CrawlerRunConfig(session_id=session2_id, url="https://example.org")
page2, context2 = await manager.get_page(crawler_config2)
await page2.goto("https://example.org")
await page2.evaluate("localStorage.setItem('playwright_session2_data', 'test_value2')")
logger.info(f"Set up session 2 with ID: {session2_id}", tag="TEST")
# Get first session again
page1_again, context1_again = await manager.get_page(crawler_config1)
# Verify it's the same page and data persists
is_same_page = page1 == page1_again
is_same_context = context1 == context1_again
data1 = await page1_again.evaluate("localStorage.getItem('playwright_session1_data')")
logger.info(f"Session 1 reuse successful: {is_same_page}, data: {data1}", tag="TEST")
# Kill first session
await manager.kill_session(session1_id)
logger.info(f"Killed session 1", tag="TEST")
# Verify second session still works
data2 = await page2.evaluate("localStorage.getItem('playwright_session2_data')")
logger.info(f"Session 2 still functional after killing session 1, data: {data2}", tag="TEST")
# Clean up
await manager.close()
logger.info("Browser closed successfully", tag="TEST")
return is_same_page and is_same_context and data1 == "test_value1" and data2 == "test_value2"
except Exception as e:
logger.error(f"Test failed: {str(e)}", tag="TEST")
try:
await manager.close()
except:
pass
return False
async def run_tests():
"""Run all tests sequentially."""
results = []
results.append(await test_playwright_basic())
results.append(await test_playwright_text_mode())
results.append(await test_playwright_context_reuse())
results.append(await test_playwright_session_management())
# Print summary
total = len(results)
passed = sum(results)
logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
if passed == total:
logger.success("All tests passed!", tag="SUMMARY")
else:
logger.error(f"{total - passed} tests failed", tag="SUMMARY")
if __name__ == "__main__":
asyncio.run(run_tests())

View File

@@ -0,0 +1,176 @@
"""Test examples for BrowserProfileManager.
These examples demonstrate the functionality of BrowserProfileManager
and serve as functional tests.
"""
import asyncio
import os
import sys
import uuid
import shutil
# Add the project root to Python path if running directly
if __name__ == "__main__":
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
from crawl4ai.browser import BrowserManager, BrowserProfileManager
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_logger import AsyncLogger
# Create a logger for clear terminal output
logger = AsyncLogger(verbose=True, log_file=None)
async def test_profile_creation():
"""Test creating and managing browser profiles."""
logger.info("Testing profile creation and management", tag="TEST")
profile_manager = BrowserProfileManager(logger=logger)
try:
# List existing profiles
profiles = profile_manager.list_profiles()
logger.info(f"Found {len(profiles)} existing profiles", tag="TEST")
# Generate a unique profile name for testing
test_profile_name = f"test-profile-{uuid.uuid4().hex[:8]}"
# Create a test profile directory
profile_path = os.path.join(profile_manager.profiles_dir, test_profile_name)
os.makedirs(os.path.join(profile_path, "Default"), exist_ok=True)
# Create a dummy Preferences file to simulate a Chrome profile
with open(os.path.join(profile_path, "Default", "Preferences"), "w") as f:
f.write("{\"test\": true}")
logger.info(f"Created test profile at: {profile_path}", tag="TEST")
# Verify the profile is now in the list
profiles = profile_manager.list_profiles()
profile_found = any(p["name"] == test_profile_name for p in profiles)
logger.info(f"Profile found in list: {profile_found}", tag="TEST")
# Try to get the profile path
retrieved_path = profile_manager.get_profile_path(test_profile_name)
path_match = retrieved_path == profile_path
logger.info(f"Retrieved correct profile path: {path_match}", tag="TEST")
# Delete the profile
success = profile_manager.delete_profile(test_profile_name)
logger.info(f"Profile deletion successful: {success}", tag="TEST")
# Verify it's gone
profiles_after = profile_manager.list_profiles()
profile_removed = not any(p["name"] == test_profile_name for p in profiles_after)
logger.info(f"Profile removed from list: {profile_removed}", tag="TEST")
# Clean up just in case
if os.path.exists(profile_path):
shutil.rmtree(profile_path, ignore_errors=True)
return profile_found and path_match and success and profile_removed
except Exception as e:
logger.error(f"Test failed: {str(e)}", tag="TEST")
# Clean up test directory
try:
if os.path.exists(profile_path):
shutil.rmtree(profile_path, ignore_errors=True)
except:
pass
return False
async def test_profile_with_browser():
"""Test using a profile with a browser."""
logger.info("Testing using a profile with a browser", tag="TEST")
profile_manager = BrowserProfileManager(logger=logger)
test_profile_name = f"test-browser-profile-{uuid.uuid4().hex[:8]}"
profile_path = None
try:
# Create a test profile directory
profile_path = os.path.join(profile_manager.profiles_dir, test_profile_name)
os.makedirs(os.path.join(profile_path, "Default"), exist_ok=True)
# Create a dummy Preferences file to simulate a Chrome profile
with open(os.path.join(profile_path, "Default", "Preferences"), "w") as f:
f.write("{\"test\": true}")
logger.info(f"Created test profile at: {profile_path}", tag="TEST")
# Now use this profile with a browser
browser_config = BrowserConfig(
user_data_dir=profile_path,
headless=True
)
manager = BrowserManager(browser_config=browser_config, logger=logger)
# Start the browser with the profile
await manager.start()
logger.info("Browser started with profile", tag="TEST")
# Create a page
crawler_config = CrawlerRunConfig()
page, context = await manager.get_page(crawler_config)
# Navigate and set some data to verify profile works
await page.goto("https://example.com")
await page.evaluate("localStorage.setItem('test_data', 'profile_value')")
# Close browser
await manager.close()
logger.info("First browser session closed", tag="TEST")
# Create a new browser with the same profile
manager2 = BrowserManager(browser_config=browser_config, logger=logger)
await manager2.start()
logger.info("Second browser session started with same profile", tag="TEST")
# Get a page and check if the data persists
page2, context2 = await manager2.get_page(crawler_config)
await page2.goto("https://example.com")
data = await page2.evaluate("localStorage.getItem('test_data')")
# Verify data persisted
data_persisted = data == "profile_value"
logger.info(f"Data persisted across sessions: {data_persisted}", tag="TEST")
# Clean up
await manager2.close()
logger.info("Second browser session closed", tag="TEST")
# Delete the test profile
success = profile_manager.delete_profile(test_profile_name)
logger.info(f"Test profile deleted: {success}", tag="TEST")
return data_persisted and success
except Exception as e:
logger.error(f"Test failed: {str(e)}", tag="TEST")
# Clean up
try:
if profile_path and os.path.exists(profile_path):
shutil.rmtree(profile_path, ignore_errors=True)
except:
pass
return False
async def run_tests():
"""Run all tests sequentially."""
results = []
results.append(await test_profile_creation())
results.append(await test_profile_with_browser())
# Print summary
total = len(results)
passed = sum(results)
logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
if passed == total:
logger.success("All tests passed!", tag="SUMMARY")
else:
logger.error(f"{total - passed} tests failed", tag="SUMMARY")
if __name__ == "__main__":
asyncio.run(run_tests())