feat(browser): implement modular browser management system
Adds a new browser management system with strategy pattern implementation: - Introduces BrowserManager class with strategy pattern support - Adds PlaywrightBrowserStrategy, CDPBrowserStrategy, and BuiltinBrowserStrategy - Implements BrowserProfileManager for profile management - Adds PagePoolConfig for browser page pooling - Includes comprehensive test suite for all browser strategies BREAKING CHANGE: Browser management has been moved to browser/ module. Direct usage of browser_manager.py and browser_profiler.py is deprecated.
This commit is contained in:
@@ -156,6 +156,41 @@ def is_empty_value(value: Any) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
class PagePoolConfig:
    """Settings for the pre-warmed browser page pool.

    The pool keeps a number of browser pages warmed up ahead of time so that
    processing a sequence of URLs does not pay the page-creation cost on each
    request.

    Attributes:
        mode (str): Pooling mode. "static" uses a fixed pool size taken from
            ``static_size``; "adaptive" derives the pool size from available
            system memory. Default: "static".
        static_size (int): Number of pages held in the pool in "static" mode.
            Default: 10.
        memory_per_page (int): Estimated memory footprint of one page in MB,
            used by the "adaptive" sizing calculation. Default: 200.
        memory_threshold (float): Fraction of system memory the "adaptive"
            mode is allowed to consume. Default: 0.7.
        timeout (float): Seconds to wait for a pooled page before giving up
            and creating a fresh one. Default: 5.0.
    """

    def __init__(
        self,
        mode="static",
        static_size=10,
        memory_per_page=200,
        memory_threshold=0.7,
        timeout=5.0,
    ):
        # Pure data holder: values are stored as given, with no validation.
        self.mode = mode
        self.static_size = static_size
        self.memory_per_page = memory_per_page
        self.memory_threshold = memory_threshold
        self.timeout = timeout
|
||||
|
||||
class BrowserConfig:
|
||||
"""
|
||||
Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy.
|
||||
@@ -220,6 +255,9 @@ class BrowserConfig:
|
||||
light_mode (bool): Disables certain background features for performance gains. Default: False.
|
||||
extra_args (list): Additional command-line arguments passed to the browser.
|
||||
Default: [].
|
||||
page_pool_config (PagePoolConfig or None): Configuration for page pooling mechanism.
|
||||
If None, page pooling is disabled.
|
||||
Default: None.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -260,6 +298,7 @@ class BrowserConfig:
|
||||
extra_args: list = None,
|
||||
debugging_port: int = 9222,
|
||||
host: str = "localhost",
|
||||
page_pool_config: Optional[PagePoolConfig] = None,
|
||||
):
|
||||
self.browser_type = browser_type
|
||||
self.headless = headless
|
||||
@@ -298,6 +337,7 @@ class BrowserConfig:
|
||||
self.verbose = verbose
|
||||
self.debugging_port = debugging_port
|
||||
self.host = host
|
||||
self.page_pool_config = page_pool_config
|
||||
|
||||
fa_user_agenr_generator = ValidUAGenerator()
|
||||
if self.user_agent_mode == "random":
|
||||
@@ -328,6 +368,12 @@ class BrowserConfig:
|
||||
|
||||
@staticmethod
|
||||
def from_kwargs(kwargs: dict) -> "BrowserConfig":
|
||||
# Handle page_pool_config
|
||||
page_pool_config = kwargs.get("page_pool_config")
|
||||
if isinstance(page_pool_config, dict):
|
||||
# If it's a dict, convert to PagePoolConfig
|
||||
page_pool_config = PagePoolConfig(**page_pool_config)
|
||||
|
||||
return BrowserConfig(
|
||||
browser_type=kwargs.get("browser_type", "chromium"),
|
||||
headless=kwargs.get("headless", True),
|
||||
@@ -361,6 +407,7 @@ class BrowserConfig:
|
||||
extra_args=kwargs.get("extra_args", []),
|
||||
debugging_port=kwargs.get("debugging_port", 9222),
|
||||
host=kwargs.get("host", "localhost"),
|
||||
page_pool_config=page_pool_config,
|
||||
)
|
||||
|
||||
def to_dict(self):
|
||||
@@ -395,6 +442,7 @@ class BrowserConfig:
|
||||
"verbose": self.verbose,
|
||||
"debugging_port": self.debugging_port,
|
||||
"host": self.host,
|
||||
"page_pool_config": self.page_pool_config,
|
||||
}
|
||||
|
||||
def clone(self, **kwargs):
|
||||
|
||||
10
crawl4ai/browser/__init__.py
Normal file
10
crawl4ai/browser/__init__.py
Normal file
@@ -0,0 +1,10 @@
|
||||
"""Browser management module for Crawl4AI.
|
||||
|
||||
This module provides browser management capabilities using different strategies
|
||||
for browser creation and interaction.
|
||||
"""
|
||||
|
||||
from .manager import BrowserManager
|
||||
from .profiles import BrowserProfileManager
|
||||
|
||||
__all__ = ['BrowserManager', 'BrowserProfileManager']
|
||||
165
crawl4ai/browser/manager.py
Normal file
165
crawl4ai/browser/manager.py
Normal file
@@ -0,0 +1,165 @@
|
||||
"""Browser manager module for Crawl4AI.
|
||||
|
||||
This module provides a central browser management class that uses the
|
||||
strategy pattern internally while maintaining the existing API.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
from typing import Optional, Tuple, Dict, Any
|
||||
|
||||
from playwright.async_api import Page, BrowserContext
|
||||
|
||||
from ..async_logger import AsyncLogger
|
||||
from ..async_configs import BrowserConfig, CrawlerRunConfig
|
||||
|
||||
from .strategies import (
|
||||
BaseBrowserStrategy,
|
||||
PlaywrightBrowserStrategy,
|
||||
CDPBrowserStrategy,
|
||||
BuiltinBrowserStrategy
|
||||
)
|
||||
|
||||
class BrowserManager:
    """Main interface for browser management in Crawl4AI.

    This class maintains backward compatibility with the existing implementation
    while using the strategy pattern internally for different browser types.
    All real work is delegated to a strategy chosen in :meth:`_create_strategy`;
    the attributes below mirror the strategy's state for legacy callers.

    Attributes:
        config (BrowserConfig): Configuration object containing all browser settings
        logger: Logger instance for recording events and errors
        browser: The browser instance (mirrored from the strategy after start())
        default_context: The default browser context (mirrored from the strategy)
        managed_browser: The managed browser instance (set to the strategy itself
            when the strategy exposes a ``browser_process`` attribute)
        playwright: The Playwright instance (mirrored from the strategy)
        sessions: Dictionary to store session information
        session_ttl: Session timeout in seconds
    """

    def __init__(self, browser_config: Optional[BrowserConfig] = None, logger: Optional[AsyncLogger] = None):
        """Initialize the BrowserManager with a browser configuration.

        Args:
            browser_config: Configuration object containing all browser settings.
                A default BrowserConfig is created when None.
            logger: Logger instance for recording events and errors. May be None.
        """
        self.config = browser_config or BrowserConfig()
        self.logger = logger

        # Create strategy based on configuration
        self._strategy = self._create_strategy()

        # Initialize state variables for compatibility with existing code;
        # they are populated from the strategy in start().
        self.browser = None
        self.default_context = None
        self.managed_browser = None
        self.playwright = None

        # For session management (from existing implementation)
        self.sessions = {}
        self.session_ttl = 1800  # 30 minutes

    def _create_strategy(self) -> BaseBrowserStrategy:
        """Create appropriate browser strategy based on configuration.

        Selection order: "builtin" mode wins, then CDP (either an explicit
        cdp_url or use_managed_browser), otherwise plain Playwright.

        Returns:
            BaseBrowserStrategy: The selected browser strategy
        """
        if self.config.browser_mode == "builtin":
            return BuiltinBrowserStrategy(self.config, self.logger)
        elif self.config.cdp_url or self.config.use_managed_browser:
            return CDPBrowserStrategy(self.config, self.logger)
        else:
            return PlaywrightBrowserStrategy(self.config, self.logger)

    async def start(self):
        """Start the browser instance and set up the default context.

        Returns:
            self: For method chaining
        """
        # Start the strategy
        await self._strategy.start()

        # Update legacy references so pre-refactor callers keep working
        self.browser = self._strategy.browser
        self.default_context = self._strategy.default_context

        # Set browser process reference (for CDP strategy).
        # NOTE(review): managed_browser is assigned the strategy object itself,
        # not the process — confirm legacy callers expect that.
        if hasattr(self._strategy, 'browser_process'):
            self.managed_browser = self._strategy

        # Set Playwright reference
        self.playwright = self._strategy.playwright

        # Sync sessions if needed (shares the strategy's dict by reference)
        if hasattr(self._strategy, 'sessions'):
            self.sessions = self._strategy.sessions
            self.session_ttl = self._strategy.session_ttl

        return self

    async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]:
        """Get a page for the given configuration.

        Args:
            crawlerRunConfig: Configuration object for the crawler run

        Returns:
            Tuple of (Page, BrowserContext)
        """
        # Delegate to strategy
        page, context = await self._strategy.get_page(crawlerRunConfig)

        # Sync sessions if needed
        if hasattr(self._strategy, 'sessions'):
            self.sessions = self._strategy.sessions

        return page, context

    async def kill_session(self, session_id: str):
        """Kill a browser session and clean up resources.

        Args:
            session_id: The session ID to kill
        """
        # Handle kill_session via our strategy if it supports it
        if hasattr(self._strategy, '_kill_session'):
            await self._strategy._kill_session(session_id)
        elif session_id in self.sessions:
            context, page, _ = self.sessions[session_id]
            await page.close()
            # Only close context if not using CDP: managed/CDP/builtin modes
            # share a single browser-owned context across sessions.
            if not self.config.use_managed_browser and not self.config.cdp_url and not self.config.browser_mode == "builtin":
                await context.close()
            del self.sessions[session_id]

    def _cleanup_expired_sessions(self):
        """Clean up expired sessions based on TTL."""
        # Use strategy's implementation if available
        if hasattr(self._strategy, '_cleanup_expired_sessions'):
            self._strategy._cleanup_expired_sessions()
            return

        # Otherwise use our own implementation
        current_time = time.time()
        expired_sessions = [
            sid
            for sid, (_, _, last_used) in self.sessions.items()
            if current_time - last_used > self.session_ttl
        ]
        # Fire-and-forget: kill_session removes the entry from self.sessions
        # when the task eventually runs; requires a running event loop.
        for sid in expired_sessions:
            asyncio.create_task(self.kill_session(sid))

    async def close(self):
        """Close the browser and clean up resources."""
        # Delegate to strategy
        await self._strategy.close()

        # Reset legacy references
        self.browser = None
        self.default_context = None
        self.managed_browser = None
        self.playwright = None
        self.sessions = {}
|
||||
0
crawl4ai/browser/models.py
Normal file
0
crawl4ai/browser/models.py
Normal file
458
crawl4ai/browser/profiles.py
Normal file
458
crawl4ai/browser/profiles.py
Normal file
@@ -0,0 +1,458 @@
|
||||
"""Browser profile management module for Crawl4AI.
|
||||
|
||||
This module provides functionality for creating and managing browser profiles
|
||||
that can be used for authenticated browsing.
|
||||
"""
|
||||
|
||||
import os
|
||||
import asyncio
|
||||
import signal
|
||||
import sys
|
||||
import datetime
|
||||
import uuid
|
||||
import shutil
|
||||
from typing import List, Dict, Optional, Any
|
||||
from colorama import Fore, Style, init
|
||||
|
||||
from ..async_configs import BrowserConfig
|
||||
from ..async_logger import AsyncLogger, AsyncLoggerBase
|
||||
from ..utils import get_home_folder
|
||||
from .strategies import is_windows
|
||||
|
||||
class BrowserProfileManager:
    """Manages browser profiles for Crawl4AI.

    This class provides functionality to create and manage browser profiles
    that can be used for authenticated browsing with Crawl4AI.

    Profiles are stored by default in ~/.crawl4ai/profiles/
    """

    def __init__(self, logger: Optional[AsyncLoggerBase] = None):
        """Initialize the BrowserProfileManager.

        Args:
            logger: Logger for outputting messages. If None (or not an
                AsyncLoggerBase instance), a default AsyncLogger is created.
        """
        # Initialize colorama for colorful terminal output
        init()

        # Create a logger if not provided (or replace one of the wrong type)
        if logger is None:
            self.logger = AsyncLogger(verbose=True)
        elif not isinstance(logger, AsyncLoggerBase):
            self.logger = AsyncLogger(verbose=True)
        else:
            self.logger = logger

        # Ensure profiles directory exists
        self.profiles_dir = os.path.join(get_home_folder(), "profiles")
        os.makedirs(self.profiles_dir, exist_ok=True)

    async def create_profile(self,
                             profile_name: Optional[str] = None,
                             browser_config: Optional[BrowserConfig] = None) -> Optional[str]:
        """Create a browser profile interactively.

        Opens a visible browser, lets the user log in / configure it, and
        saves the resulting user-data directory as a named profile. The user
        ends the session by pressing 'q' in the terminal.

        Args:
            profile_name: Name for the profile. If None, a name is generated.
            browser_config: Configuration for the browser. If None, a default
                configuration is used. headless is forced to False either way.

        Returns:
            Path to the created profile directory, or None if creation failed
        """
        # Create default browser config if none provided
        if browser_config is None:
            browser_config = BrowserConfig(
                browser_type="chromium",
                headless=False,  # Must be visible for user interaction
                verbose=True
            )
        else:
            # Ensure headless is False for user interaction
            browser_config.headless = False

        # Generate profile name if not provided
        if not profile_name:
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            profile_name = f"profile_{timestamp}_{uuid.uuid4().hex[:6]}"

        # Sanitize profile name (replace spaces and special chars)
        profile_name = "".join(c if c.isalnum() or c in "-_" else "_" for c in profile_name)

        # Set user data directory
        profile_path = os.path.join(self.profiles_dir, profile_name)
        os.makedirs(profile_path, exist_ok=True)

        # Print instructions for the user with colorama formatting
        border = f"{Fore.CYAN}{'='*80}{Style.RESET_ALL}"
        self.logger.info(f"\n{border}", tag="PROFILE")
        self.logger.info(f"Creating browser profile: {Fore.GREEN}{profile_name}{Style.RESET_ALL}", tag="PROFILE")
        self.logger.info(f"Profile directory: {Fore.YELLOW}{profile_path}{Style.RESET_ALL}", tag="PROFILE")

        self.logger.info("\nInstructions:", tag="PROFILE")
        self.logger.info("1. A browser window will open for you to set up your profile.", tag="PROFILE")
        self.logger.info(f"2. {Fore.CYAN}Log in to websites{Style.RESET_ALL}, configure settings, etc. as needed.", tag="PROFILE")
        self.logger.info(f"3. When you're done, {Fore.YELLOW}press 'q' in this terminal{Style.RESET_ALL} to close the browser.", tag="PROFILE")
        self.logger.info("4. The profile will be saved and ready to use with Crawl4AI.", tag="PROFILE")
        self.logger.info(f"{border}\n", tag="PROFILE")

        # Import the necessary classes with local imports to avoid circular references
        from .strategies import CDPBrowserStrategy

        # Set browser config to use the profile path
        browser_config.user_data_dir = profile_path

        # Create a CDP browser strategy for the profile creation
        browser_strategy = CDPBrowserStrategy(browser_config, self.logger)

        # Set up signal handlers to ensure cleanup on interrupt
        original_sigint = signal.getsignal(signal.SIGINT)
        original_sigterm = signal.getsignal(signal.SIGTERM)

        # Define cleanup handler for signals
        async def cleanup_handler(sig, frame):
            self.logger.warning("\nCleaning up browser process...", tag="PROFILE")
            await browser_strategy.close()
            # Restore original signal handlers
            signal.signal(signal.SIGINT, original_sigint)
            signal.signal(signal.SIGTERM, original_sigterm)
            if sig == signal.SIGINT:
                self.logger.error("Profile creation interrupted. Profile may be incomplete.", tag="PROFILE")
                sys.exit(1)

        # Set signal handlers.
        # NOTE(review): create_task from a synchronous signal handler assumes a
        # running event loop on this thread — confirm this holds for all entry points.
        def sigint_handler(sig, frame):
            asyncio.create_task(cleanup_handler(sig, frame))

        signal.signal(signal.SIGINT, sigint_handler)
        signal.signal(signal.SIGTERM, sigint_handler)

        # Event to signal when user is done with the browser
        user_done_event = asyncio.Event()

        # Run keyboard input loop in a separate task.
        # NOTE(review): termios/tty are POSIX-only; this listener cannot run on
        # Windows — confirm the CLI guards that platform.
        async def listen_for_quit_command():
            import termios
            import tty
            import select

            # First output the prompt
            self.logger.info(f"{Fore.CYAN}Press '{Fore.WHITE}q{Fore.CYAN}' when you've finished using the browser...{Style.RESET_ALL}", tag="PROFILE")

            # Save original terminal settings
            fd = sys.stdin.fileno()
            old_settings = termios.tcgetattr(fd)

            try:
                # Switch to non-canonical mode (no line buffering)
                tty.setcbreak(fd)

                while True:
                    # Check if input is available (non-blocking, 0.5s poll)
                    readable, _, _ = select.select([sys.stdin], [], [], 0.5)
                    if readable:
                        key = sys.stdin.read(1)
                        if key.lower() == 'q':
                            self.logger.info(f"{Fore.GREEN}Closing browser and saving profile...{Style.RESET_ALL}", tag="PROFILE")
                            user_done_event.set()
                            return

                    # Check if the browser process has already exited
                    if browser_strategy.browser_process and browser_strategy.browser_process.poll() is not None:
                        self.logger.info("Browser already closed. Ending input listener.", tag="PROFILE")
                        user_done_event.set()
                        return

                    await asyncio.sleep(0.1)

            finally:
                # Restore terminal settings
                termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)

        try:
            # Start the browser
            await browser_strategy.start()

            # Check if browser started successfully
            if not browser_strategy.browser_process:
                self.logger.error("Failed to start browser process.", tag="PROFILE")
                return None

            self.logger.info(f"Browser launched. {Fore.CYAN}Waiting for you to finish...{Style.RESET_ALL}", tag="PROFILE")

            # Start listening for keyboard input
            listener_task = asyncio.create_task(listen_for_quit_command())

            # Wait for either the user to press 'q' or for the browser process to exit naturally
            while not user_done_event.is_set() and browser_strategy.browser_process.poll() is None:
                await asyncio.sleep(0.5)

            # Cancel the listener task if it's still running
            if not listener_task.done():
                listener_task.cancel()
                try:
                    await listener_task
                except asyncio.CancelledError:
                    pass

            # If the browser is still running and the user pressed 'q', terminate it
            if browser_strategy.browser_process.poll() is None and user_done_event.is_set():
                self.logger.info("Terminating browser process...", tag="PROFILE")
                await browser_strategy.close()

            self.logger.success(f"Browser closed. Profile saved at: {Fore.GREEN}{profile_path}{Style.RESET_ALL}", tag="PROFILE")

        except Exception as e:
            self.logger.error(f"Error creating profile: {str(e)}", tag="PROFILE")
            await browser_strategy.close()
            return None
        finally:
            # Restore original signal handlers
            signal.signal(signal.SIGINT, original_sigint)
            signal.signal(signal.SIGTERM, original_sigterm)

            # Make sure browser is fully cleaned up (close() is expected to be
            # idempotent; it may already have run above)
            await browser_strategy.close()

        # Return the profile path
        return profile_path

    def list_profiles(self) -> List[Dict[str, Any]]:
        """List all available browser profiles.

        Returns:
            List of dictionaries with keys "name", "path", "created"
            (datetime) and "type" ("chromium" or "firefox"), sorted by
            creation time, newest first.
        """
        if not os.path.exists(self.profiles_dir):
            return []

        profiles = []

        for name in os.listdir(self.profiles_dir):
            profile_path = os.path.join(self.profiles_dir, name)

            # Skip if not a directory
            if not os.path.isdir(profile_path):
                continue

            # Check if this looks like a valid browser profile
            # For Chromium: Look for Preferences file
            # For Firefox: Look for prefs.js file
            is_valid = False

            if os.path.exists(os.path.join(profile_path, "Preferences")) or \
               os.path.exists(os.path.join(profile_path, "Default", "Preferences")):
                is_valid = "chromium"
            elif os.path.exists(os.path.join(profile_path, "prefs.js")):
                is_valid = "firefox"

            if is_valid:
                # Get creation time
                created = datetime.datetime.fromtimestamp(
                    os.path.getctime(profile_path)
                )

                profiles.append({
                    "name": name,
                    "path": profile_path,
                    "created": created,
                    "type": is_valid
                })

        # Sort by creation time, newest first
        profiles.sort(key=lambda x: x["created"], reverse=True)

        return profiles

    def get_profile_path(self, profile_name: str) -> Optional[str]:
        """Get the full path to a profile by name.

        Args:
            profile_name: Name of the profile (not the full path); an absolute
                path is also accepted and used as-is.

        Returns:
            Full path to the profile directory, or None if not found or not
            a recognizable browser profile
        """
        profile_path = os.path.join(self.profiles_dir, profile_name)

        # Check if path exists and is a valid profile
        if not os.path.isdir(profile_path):
            # Check if profile_name itself is full path
            if os.path.isabs(profile_name):
                profile_path = profile_name
            else:
                return None

        # Look for profile indicators (Chromium Preferences or Firefox prefs.js)
        is_profile = (
            os.path.exists(os.path.join(profile_path, "Preferences")) or
            os.path.exists(os.path.join(profile_path, "Default", "Preferences")) or
            os.path.exists(os.path.join(profile_path, "prefs.js"))
        )

        if not is_profile:
            return None  # Not a valid browser profile

        return profile_path

    def delete_profile(self, profile_name_or_path: str) -> bool:
        """Delete a browser profile by name or path.

        Args:
            profile_name_or_path: Name of the profile or full path to profile directory

        Returns:
            True if the profile was deleted successfully, False otherwise
            (missing directory, not a recognizable profile, or removal error)
        """
        # Determine if input is a name or a path
        if os.path.isabs(profile_name_or_path):
            # Full path provided
            profile_path = profile_name_or_path
        else:
            # Just a name provided, construct path
            profile_path = os.path.join(self.profiles_dir, profile_name_or_path)

        # Check if path exists and is a valid profile
        if not os.path.isdir(profile_path):
            return False

        # Look for profile indicators (same check as get_profile_path)
        is_profile = (
            os.path.exists(os.path.join(profile_path, "Preferences")) or
            os.path.exists(os.path.join(profile_path, "Default", "Preferences")) or
            os.path.exists(os.path.join(profile_path, "prefs.js"))
        )

        if not is_profile:
            return False  # Not a valid browser profile

        # Delete the profile directory; deliberately best-effort — any
        # filesystem error is reported as a False return, not raised
        try:
            shutil.rmtree(profile_path)
            return True
        except Exception:
            return False

    async def interactive_manager(self, crawl_callback=None):
        """Launch an interactive profile management console.

        Loops on a numbered menu (create / list / delete / optionally crawl /
        exit) until the user picks the exit option.

        Args:
            crawl_callback: Function to call when selecting option to use
                a profile for crawling. It will be called with (profile_path, url).
        """
        while True:
            self.logger.info(f"\n{Fore.CYAN}Profile Management Options:{Style.RESET_ALL}", tag="MENU")
            self.logger.info(f"1. {Fore.GREEN}Create a new profile{Style.RESET_ALL}", tag="MENU")
            self.logger.info(f"2. {Fore.YELLOW}List available profiles{Style.RESET_ALL}", tag="MENU")
            self.logger.info(f"3. {Fore.RED}Delete a profile{Style.RESET_ALL}", tag="MENU")

            # Only show crawl option if callback provided; the exit option's
            # number shifts accordingly
            if crawl_callback:
                self.logger.info(f"4. {Fore.CYAN}Use a profile to crawl a website{Style.RESET_ALL}", tag="MENU")
                self.logger.info(f"5. {Fore.MAGENTA}Exit{Style.RESET_ALL}", tag="MENU")
                exit_option = "5"
            else:
                self.logger.info(f"4. {Fore.MAGENTA}Exit{Style.RESET_ALL}", tag="MENU")
                exit_option = "4"

            choice = input(f"\n{Fore.CYAN}Enter your choice (1-{exit_option}): {Style.RESET_ALL}")

            if choice == "1":
                # Create new profile
                name = input(f"{Fore.GREEN}Enter a name for the new profile (or press Enter for auto-generated name): {Style.RESET_ALL}")
                await self.create_profile(name or None)

            elif choice == "2":
                # List profiles
                profiles = self.list_profiles()

                if not profiles:
                    self.logger.warning(" No profiles found. Create one first with option 1.", tag="PROFILES")
                    continue

                # Print profile information with colorama formatting
                self.logger.info("\nAvailable profiles:", tag="PROFILES")
                for i, profile in enumerate(profiles):
                    self.logger.info(f"[{i+1}] {Fore.CYAN}{profile['name']}{Style.RESET_ALL}", tag="PROFILES")
                    self.logger.info(f" Path: {Fore.YELLOW}{profile['path']}{Style.RESET_ALL}", tag="PROFILES")
                    self.logger.info(f" Created: {profile['created'].strftime('%Y-%m-%d %H:%M:%S')}", tag="PROFILES")
                    self.logger.info(f" Browser type: {profile['type']}", tag="PROFILES")
                    self.logger.info("", tag="PROFILES")  # Empty line for spacing

            elif choice == "3":
                # Delete profile
                profiles = self.list_profiles()
                if not profiles:
                    self.logger.warning("No profiles found to delete", tag="PROFILES")
                    continue

                # Display numbered list
                self.logger.info(f"\n{Fore.YELLOW}Available profiles:{Style.RESET_ALL}", tag="PROFILES")
                for i, profile in enumerate(profiles):
                    self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES")

                # Get profile to delete
                profile_idx = input(f"{Fore.RED}Enter the number of the profile to delete (or 'c' to cancel): {Style.RESET_ALL}")
                if profile_idx.lower() == 'c':
                    continue

                try:
                    idx = int(profile_idx) - 1
                    if 0 <= idx < len(profiles):
                        profile_name = profiles[idx]["name"]
                        self.logger.info(f"Deleting profile: {Fore.YELLOW}{profile_name}{Style.RESET_ALL}", tag="PROFILES")

                        # Confirm deletion
                        confirm = input(f"{Fore.RED}Are you sure you want to delete this profile? (y/n): {Style.RESET_ALL}")
                        if confirm.lower() == 'y':
                            success = self.delete_profile(profiles[idx]["path"])

                            if success:
                                self.logger.success(f"Profile {Fore.GREEN}{profile_name}{Style.RESET_ALL} deleted successfully", tag="PROFILES")
                            else:
                                self.logger.error(f"Failed to delete profile {Fore.RED}{profile_name}{Style.RESET_ALL}", tag="PROFILES")
                    else:
                        self.logger.error("Invalid profile number", tag="PROFILES")
                except ValueError:
                    self.logger.error("Please enter a valid number", tag="PROFILES")

            elif choice == "4" and crawl_callback:
                # Use profile to crawl a site
                profiles = self.list_profiles()
                if not profiles:
                    self.logger.warning("No profiles found. Create one first.", tag="PROFILES")
                    continue

                # Display numbered list
                self.logger.info(f"\n{Fore.YELLOW}Available profiles:{Style.RESET_ALL}", tag="PROFILES")
                for i, profile in enumerate(profiles):
                    self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES")

                # Get profile to use
                profile_idx = input(f"{Fore.CYAN}Enter the number of the profile to use (or 'c' to cancel): {Style.RESET_ALL}")
                if profile_idx.lower() == 'c':
                    continue

                try:
                    idx = int(profile_idx) - 1
                    if 0 <= idx < len(profiles):
                        profile_path = profiles[idx]["path"]
                        url = input(f"{Fore.CYAN}Enter the URL to crawl: {Style.RESET_ALL}")
                        if url:
                            # Call the provided crawl callback
                            await crawl_callback(profile_path, url)
                        else:
                            self.logger.error("No URL provided", tag="CRAWL")
                    else:
                        self.logger.error("Invalid profile number", tag="PROFILES")
                except ValueError:
                    self.logger.error("Please enter a valid number", tag="PROFILES")

            elif (choice == "4" and not crawl_callback) or (choice == "5" and crawl_callback):
                # Exit
                self.logger.info("Exiting profile management", tag="MENU")
                break

            else:
                self.logger.error(f"Invalid choice. Please enter a number between 1 and {exit_option}.", tag="MENU")
|
||||
1048
crawl4ai/browser/strategies.py
Normal file
1048
crawl4ai/browser/strategies.py
Normal file
File diff suppressed because it is too large
Load Diff
105
crawl4ai/browser/utils.py
Normal file
105
crawl4ai/browser/utils.py
Normal file
@@ -0,0 +1,105 @@
|
||||
"""Browser utilities module for Crawl4AI.
|
||||
|
||||
This module provides utility functions for browser management,
|
||||
including process management, CDP connection utilities,
|
||||
and Playwright instance management.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
import platform
|
||||
import tempfile
|
||||
from typing import Optional, Any
|
||||
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
from ..async_logger import AsyncLogger
|
||||
from ..utils import get_chromium_path
|
||||
|
||||
_playwright_instance = None
|
||||
|
||||
async def get_playwright():
    """Get or create the Playwright instance (singleton pattern).

    The instance is created lazily on first call and cached in the
    module-level ``_playwright_instance`` for reuse.

    Returns:
        Playwright: The Playwright instance
    """
    global _playwright_instance
    # Fix: the previous condition was `_playwright_instance is None or True`,
    # which is always true, so every call started a brand-new Playwright
    # driver and leaked the previously cached one — contradicting the
    # documented singleton behavior.
    # NOTE(review): if a fresh instance per call was intentional (e.g. to work
    # around a stale event loop), that should be made explicit — confirm.
    if _playwright_instance is None:
        _playwright_instance = await async_playwright().start()
    return _playwright_instance
|
||||
|
||||
def get_browser_executable(browser_type: str) -> str:
    """Resolve the executable path for the requested browser.

    Thin wrapper kept for API symmetry; despite the generic name, resolution
    is delegated entirely to the shared crawl4ai helper.

    Args:
        browser_type: Type of browser (chromium, firefox, webkit)

    Returns:
        Path to browser executable
    """
    return get_chromium_path(browser_type)
|
||||
|
||||
def create_temp_directory(prefix="browser-profile-") -> str:
    """Create a fresh temporary directory for browser data.

    Args:
        prefix: Leading text for the generated directory name

    Returns:
        Path of the newly created temporary directory
    """
    # tempfile.mkdtemp creates the directory with 0o700 permissions and
    # returns its absolute path; the caller owns cleanup.
    return tempfile.mkdtemp(prefix=prefix)
|
||||
|
||||
def is_windows() -> bool:
    """Report whether the interpreter is running on Windows.

    Returns:
        True on Windows, False on every other platform
    """
    # CPython reports exactly "win32" on Windows (for both 32- and 64-bit).
    return sys.platform == "win32"
|
||||
|
||||
def is_macos() -> bool:
    """Report whether the interpreter is running on macOS.

    Returns:
        True on macOS, False on every other platform
    """
    # CPython reports "darwin" on macOS.
    return sys.platform == "darwin"
|
||||
|
||||
def is_linux() -> bool:
    """Report whether the interpreter is running on Linux.

    Defined by elimination: any platform that is neither Windows nor macOS
    is treated as Linux.

    Returns:
        True if Linux (i.e. not Windows and not macOS), False otherwise
    """
    return not is_windows() and not is_macos()
|
||||
|
||||
def get_browser_disable_options() -> list:
    """Return the standard Chromium CLI switches used to strip overhead.

    These flags disable background services, telemetry, prompts, and
    first-run UI that are unnecessary for automated crawling.

    Returns:
        List of command-line options to disable various browser features.
    """
    disable_flags = [
        "--disable-background-networking",
        "--disable-background-timer-throttling",
        "--disable-backgrounding-occluded-windows",
        "--disable-breakpad",
        "--disable-client-side-phishing-detection",
        "--disable-component-extensions-with-background-pages",
        "--disable-default-apps",
        "--disable-extensions",
        "--disable-features=TranslateUI",
        "--disable-hang-monitor",
        "--disable-ipc-flooding-protection",
        "--disable-popup-blocking",
        "--disable-prompt-on-repost",
        "--disable-sync",
        "--force-color-profile=srgb",
        "--metrics-recording-only",
        "--no-first-run",
        "--password-store=basic",
        "--use-mock-keychain",
    ]
    return disable_flags
|
||||
@@ -163,6 +163,7 @@ class ManagedBrowser:
|
||||
)
|
||||
|
||||
# We'll monitor for a short time to make sure it starts properly, but won't keep monitoring
|
||||
await asyncio.sleep(0.5) # Give browser time to start
|
||||
await self._initial_startup_check()
|
||||
await asyncio.sleep(2) # Give browser time to start
|
||||
return f"http://{self.host}:{self.debugging_port}"
|
||||
|
||||
@@ -555,7 +555,6 @@ class BrowserProfiler:
|
||||
else:
|
||||
self.logger.error(f"Invalid choice. Please enter a number between 1 and {exit_option}.", tag="MENU")
|
||||
|
||||
|
||||
async def launch_standalone_browser(self,
|
||||
browser_type: str = "chromium",
|
||||
user_data_dir: Optional[str] = None,
|
||||
|
||||
@@ -9,6 +9,26 @@ from crawl4ai import (
|
||||
CrawlResult
|
||||
)
|
||||
|
||||
async def example_cdp():
    """Demonstrate attaching to a running browser over CDP.

    Connects to a Chrome DevTools Protocol endpoint on localhost:9223,
    runs a small JS snippet inside a named session, and prints the
    script's return value.
    """
    connect_config = BrowserConfig(
        headless=False,
        cdp_url="http://localhost:9223"
    )
    run_config = CrawlerRunConfig(
        session_id="test",
        js_code="""(() => { return {"result": "Hello World!"} })()""",
        js_only=True
    )
    async with AsyncWebCrawler(
        config=connect_config,
        verbose=True,
    ) as crawler:
        outcome: CrawlResult = await crawler.arun(
            url="https://www.helloworld.org",
            config=run_config,
        )
        print(outcome.js_execution_result)
|
||||
|
||||
|
||||
async def main():
|
||||
browser_config = BrowserConfig(headless=True, verbose=True)
|
||||
@@ -16,18 +36,15 @@ async def main():
|
||||
crawler_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
# content_filter=PruningContentFilter(
|
||||
# threshold=0.48, threshold_type="fixed", min_word_threshold=0
|
||||
# )
|
||||
content_filter=PruningContentFilter(
|
||||
threshold=0.48, threshold_type="fixed", min_word_threshold=0
|
||||
)
|
||||
),
|
||||
)
|
||||
result : CrawlResult = await crawler.arun(
|
||||
# url="https://www.helloworld.org", config=crawler_config
|
||||
url="https://www.kidocode.com", config=crawler_config
|
||||
url="https://www.helloworld.org", config=crawler_config
|
||||
)
|
||||
print(result.markdown.raw_markdown[:500])
|
||||
# print(result.model_dump())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
|
||||
190
tests/browser/test_browser_manager.py
Normal file
190
tests/browser/test_browser_manager.py
Normal file
@@ -0,0 +1,190 @@
|
||||
"""Test examples for BrowserManager.
|
||||
|
||||
These examples demonstrate the functionality of BrowserManager
|
||||
and serve as functional tests.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
from typing import List
|
||||
|
||||
# Add the project root to Python path if running directly
|
||||
if __name__ == "__main__":
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
from crawl4ai.browser import BrowserManager
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
# Create a logger for clear terminal output
|
||||
logger = AsyncLogger(verbose=True, log_file=None)
|
||||
|
||||
async def test_basic_browser_manager() -> bool:
    """Test basic BrowserManager functionality with default configuration.

    Starts a manager with defaults, opens a page, navigates to
    example.com, reads the title, and shuts the browser down.

    Returns:
        True when every step succeeds, False on any exception.
    """
    logger.info("Starting test_basic_browser_manager", tag="TEST")

    try:
        # Create a browser manager with default config
        manager = BrowserManager(logger=logger)

        # Start the browser
        await manager.start()
        logger.info("Browser started successfully", tag="TEST")

        # Get a page
        crawler_config = CrawlerRunConfig(url="https://example.com")
        page, context = await manager.get_page(crawler_config)
        logger.info("Page created successfully", tag="TEST")

        # Navigate to a website
        await page.goto("https://example.com")
        title = await page.title()
        logger.info(f"Page title: {title}", tag="TEST")

        # Clean up
        await manager.close()
        logger.success("test_basic_browser_manager completed successfully", tag="TEST")
        return True
    except Exception as e:
        # Any failure (startup, navigation, teardown) marks the test failed.
        logger.error(f"test_basic_browser_manager failed: {str(e)}", tag="TEST")
        return False
|
||||
|
||||
async def test_custom_browser_config() -> bool:
    """Test BrowserManager with custom browser configuration.

    Builds an explicit BrowserConfig (chromium, headless, 1280x800,
    light mode), opens a page, and reads the viewport size back from
    the page to confirm the configuration was applied.

    Returns:
        True when every step succeeds, False on any exception.
    """
    logger.info("Starting test_custom_browser_config", tag="TEST")

    try:
        # Create a custom browser config
        browser_config = BrowserConfig(
            browser_type="chromium",
            headless=True,
            viewport_width=1280,
            viewport_height=800,
            light_mode=True
        )

        # Create browser manager with the config
        manager = BrowserManager(browser_config=browser_config, logger=logger)

        # Start the browser
        await manager.start()
        logger.info("Browser started successfully with custom config", tag="TEST")

        # Get a page
        crawler_config = CrawlerRunConfig(url="https://example.com")
        page, context = await manager.get_page(crawler_config)

        # Navigate to a website
        await page.goto("https://example.com")
        title = await page.title()
        logger.info(f"Page title: {title}", tag="TEST")

        # Verify viewport size (logged only; not asserted against the config)
        viewport_size = await page.evaluate("() => ({ width: window.innerWidth, height: window.innerHeight })")
        logger.info(f"Viewport size: {viewport_size}", tag="TEST")

        # Clean up
        await manager.close()
        logger.success("test_custom_browser_config completed successfully", tag="TEST")
        return True
    except Exception as e:
        logger.error(f"test_custom_browser_config failed: {str(e)}", tag="TEST")
        return False
|
||||
|
||||
async def test_multiple_pages() -> bool:
    """Test BrowserManager with multiple pages.

    Opens three pages against different URLs through one manager, then
    reads each title to confirm all pages are alive simultaneously.

    Returns:
        True when every step succeeds, False on any exception.
    """
    logger.info("Starting test_multiple_pages", tag="TEST")

    try:
        # Create browser manager
        manager = BrowserManager(logger=logger)

        # Start the browser
        await manager.start()
        logger.info("Browser started successfully", tag="TEST")

        # Create multiple pages
        pages = []
        urls = ["https://example.com", "https://example.org", "https://mozilla.org"]

        for i, url in enumerate(urls):
            crawler_config = CrawlerRunConfig(url=url)
            page, context = await manager.get_page(crawler_config)
            await page.goto(url)
            pages.append((page, url))
            logger.info(f"Created page {i+1} for {url}", tag="TEST")

        # Verify all pages are loaded correctly
        for i, (page, url) in enumerate(pages):
            title = await page.title()
            logger.info(f"Page {i+1} title: {title}", tag="TEST")

        # Clean up
        await manager.close()
        logger.success("test_multiple_pages completed successfully", tag="TEST")
        return True
    except Exception as e:
        logger.error(f"test_multiple_pages failed: {str(e)}", tag="TEST")
        return False
|
||||
|
||||
async def test_session_management() -> bool:
    """Test session management in BrowserManager.

    Requests a page twice under the same session_id and logs whether the
    manager returned the identical page/context pair, then kills the
    session and closes the browser.

    Returns:
        True when every step succeeds, False on any exception.
    """
    logger.info("Starting test_session_management", tag="TEST")

    try:
        # Create browser manager
        manager = BrowserManager(logger=logger)

        # Start the browser
        await manager.start()
        logger.info("Browser started successfully", tag="TEST")

        # Create a session
        session_id = "test_session_1"
        crawler_config = CrawlerRunConfig(url="https://example.com", session_id=session_id)
        page1, context1 = await manager.get_page(crawler_config)
        await page1.goto("https://example.com")
        logger.info(f"Created session with ID: {session_id}", tag="TEST")

        # Get the same session again
        page2, context2 = await manager.get_page(crawler_config)

        # Verify it's the same page/context (logged only; not asserted)
        is_same_page = page1 == page2
        is_same_context = context1 == context2
        logger.info(f"Same page: {is_same_page}, Same context: {is_same_context}", tag="TEST")

        # Kill the session
        await manager.kill_session(session_id)
        logger.info(f"Killed session with ID: {session_id}", tag="TEST")

        # Clean up
        await manager.close()
        logger.success("test_session_management completed successfully", tag="TEST")
        return True
    except Exception as e:
        logger.error(f"test_session_management failed: {str(e)}", tag="TEST")
        return False
|
||||
|
||||
async def run_tests():
    """Run all tests sequentially and print a pass/fail summary.

    Each test coroutine returns a bool; the summary counts True results.
    """
    results = []

    # NOTE(review): only the session-management test is enabled; the
    # others are commented out — presumably to speed up local runs.
    # Confirm before relying on this file for full coverage.
    # results.append(await test_basic_browser_manager())
    # results.append(await test_custom_browser_config())
    # results.append(await test_multiple_pages())
    results.append(await test_session_management())

    # Print summary
    total = len(results)
    passed = sum(results)
    logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")

    if passed == total:
        logger.success("All tests passed!", tag="SUMMARY")
    else:
        logger.error(f"{total - passed} tests failed", tag="SUMMARY")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(run_tests())
|
||||
160
tests/browser/test_builtin_strategy.py
Normal file
160
tests/browser/test_builtin_strategy.py
Normal file
@@ -0,0 +1,160 @@
|
||||
"""Test examples for BuiltinBrowserStrategy.
|
||||
|
||||
These examples demonstrate the functionality of BuiltinBrowserStrategy
|
||||
and serve as functional tests.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Add the project root to Python path if running directly
|
||||
if __name__ == "__main__":
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
from crawl4ai.browser import BrowserManager
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
# Create a logger for clear terminal output
|
||||
logger = AsyncLogger(verbose=True, log_file=None)
|
||||
|
||||
async def test_builtin_browser() -> bool:
    """Test using a builtin browser that persists between sessions.

    Starts a manager in "builtin" mode, drives one page, closes the
    manager, then opens a second manager against the same builtin
    browser to verify it survived the first close.

    Returns:
        True when both sessions work, False on any exception.
    """
    logger.info("Testing builtin browser", tag="TEST")

    browser_config = BrowserConfig(
        browser_mode="builtin",
        headless=True
    )

    manager = BrowserManager(browser_config=browser_config, logger=logger)

    try:
        # Start should connect to existing builtin browser or create one
        await manager.start()
        logger.info("Connected to builtin browser", tag="TEST")

        # Test page creation
        crawler_config = CrawlerRunConfig()
        page, context = await manager.get_page(crawler_config)

        # Test navigation
        await page.goto("https://example.com")
        title = await page.title()
        logger.info(f"Page title: {title}", tag="TEST")

        # Close manager (should not close the builtin browser)
        await manager.close()
        logger.info("First session closed", tag="TEST")

        # Create a second manager to verify browser persistence
        logger.info("Creating second session to verify persistence", tag="TEST")
        manager2 = BrowserManager(browser_config=browser_config, logger=logger)

        await manager2.start()
        logger.info("Connected to existing builtin browser", tag="TEST")

        page2, context2 = await manager2.get_page(crawler_config)
        await page2.goto("https://example.org")
        title2 = await page2.title()
        logger.info(f"Second session page title: {title2}", tag="TEST")

        await manager2.close()
        logger.info("Second session closed successfully", tag="TEST")

        return True
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        try:
            await manager.close()
        # FIX: was a bare `except:` which also swallows KeyboardInterrupt/
        # SystemExit; best-effort cleanup should only ignore Exception.
        except Exception:
            pass
        return False
|
||||
|
||||
async def test_builtin_browser_status() -> bool:
    """Test getting status of the builtin browser.

    Exercises BuiltinBrowserStrategy directly: checks status before and
    after start, verifies the browser keeps running after the strategy
    is closed, then kills the builtin browser to clean up.

    Returns:
        True when the browser persisted and was killed successfully.
    """
    logger.info("Testing builtin browser status", tag="TEST")

    from crawl4ai.browser.strategies import BuiltinBrowserStrategy

    browser_config = BrowserConfig(
        browser_mode="builtin",
        headless=True
    )

    # Create strategy directly to access its status methods
    strategy = BuiltinBrowserStrategy(browser_config, logger)

    try:
        # Get status before starting (should be not running)
        status_before = await strategy.get_builtin_browser_status()
        logger.info(f"Initial status: {status_before}", tag="TEST")

        # Start the browser
        await strategy.start()
        logger.info("Browser started successfully", tag="TEST")

        # Get status after starting
        status_after = await strategy.get_builtin_browser_status()
        logger.info(f"Status after start: {status_after}", tag="TEST")

        # Create a page to verify functionality
        crawler_config = CrawlerRunConfig()
        page, context = await strategy.get_page(crawler_config)
        await page.goto("https://example.com")
        title = await page.title()
        logger.info(f"Page title: {title}", tag="TEST")

        # Close strategy (should not kill the builtin browser)
        await strategy.close()
        logger.info("Strategy closed successfully", tag="TEST")

        # Create a new strategy object
        strategy2 = BuiltinBrowserStrategy(browser_config, logger)

        # Get status again (should still be running)
        status_final = await strategy2.get_builtin_browser_status()
        logger.info(f"Final status: {status_final}", tag="TEST")

        # Verify that the status shows the browser is running
        is_running = status_final.get('running', False)
        logger.info(f"Builtin browser persistence confirmed: {is_running}", tag="TEST")

        # Kill the builtin browser to clean up
        logger.info("Killing builtin browser", tag="TEST")
        success = await strategy2.kill_builtin_browser()
        logger.info(f"Killed builtin browser successfully: {success}", tag="TEST")

        return is_running and success
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        try:
            await strategy.close()

            # Try to kill the builtin browser to clean up
            strategy2 = BuiltinBrowserStrategy(browser_config, logger)
            await strategy2.kill_builtin_browser()
        # FIX: was a bare `except:` which also swallows KeyboardInterrupt/
        # SystemExit; best-effort cleanup should only ignore Exception.
        except Exception:
            pass
        return False
|
||||
|
||||
async def run_tests():
    """Run all builtin-browser tests sequentially and print a summary.

    Each test coroutine returns a bool; the summary counts True results.
    """
    results = []

    results.append(await test_builtin_browser())
    results.append(await test_builtin_browser_status())

    # Print summary
    total = len(results)
    passed = sum(results)
    logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")

    if passed == total:
        logger.success("All tests passed!", tag="SUMMARY")
    else:
        logger.error(f"{total - passed} tests failed", tag="SUMMARY")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(run_tests())
|
||||
227
tests/browser/test_cdp_strategy.py
Normal file
227
tests/browser/test_cdp_strategy.py
Normal file
@@ -0,0 +1,227 @@
|
||||
"""Test examples for CDPBrowserStrategy.
|
||||
|
||||
These examples demonstrate the functionality of CDPBrowserStrategy
|
||||
and serve as functional tests.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Add the project root to Python path if running directly
|
||||
if __name__ == "__main__":
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
from crawl4ai.browser import BrowserManager
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
# Create a logger for clear terminal output
|
||||
logger = AsyncLogger(verbose=True, log_file=None)
|
||||
|
||||
async def test_cdp_launch_connect() -> bool:
    """Test launching a browser and connecting via CDP.

    Uses the managed-browser (CDP) path, opens three pages, and verifies
    each page responds to a title query.

    Returns:
        True when every step succeeds, False on any exception.
    """
    logger.info("Testing launch and connect via CDP", tag="TEST")

    browser_config = BrowserConfig(
        use_managed_browser=True,
        headless=True
    )

    manager = BrowserManager(browser_config=browser_config, logger=logger)

    try:
        await manager.start()
        logger.info("Browser launched and connected via CDP", tag="TEST")

        # Test with multiple pages
        pages = []
        for i in range(3):
            crawler_config = CrawlerRunConfig()
            page, context = await manager.get_page(crawler_config)
            await page.goto(f"https://example.com?test={i}")
            pages.append(page)
            logger.info(f"Created page {i+1}", tag="TEST")

        # Verify all pages are working
        for i, page in enumerate(pages):
            title = await page.title()
            logger.info(f"Page {i+1} title: {title}", tag="TEST")

        await manager.close()
        logger.info("Browser closed successfully", tag="TEST")

        return True
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        try:
            await manager.close()
        # FIX: was a bare `except:` which also swallows KeyboardInterrupt/
        # SystemExit; best-effort cleanup should only ignore Exception.
        except Exception:
            pass
        return False
|
||||
|
||||
async def test_cdp_with_user_data_dir() -> bool:
    """Test CDP browser with a user data directory.

    Sets a cookie in one browser session backed by a temporary
    user-data directory, restarts the browser against the same
    directory, and verifies the cookie persisted.

    Returns:
        True when the cookie is visible in both sessions.
    """
    logger.info("Testing CDP browser with user data directory", tag="TEST")

    # Create a temporary user data directory
    import tempfile
    user_data_dir = tempfile.mkdtemp(prefix="crawl4ai-test-")
    logger.info(f"Created temporary user data directory: {user_data_dir}", tag="TEST")

    browser_config = BrowserConfig(
        use_managed_browser=True,
        headless=True,
        user_data_dir=user_data_dir
    )

    manager = BrowserManager(browser_config=browser_config, logger=logger)

    try:
        await manager.start()
        logger.info("Browser launched with user data directory", tag="TEST")

        # Navigate to a page and store some data
        crawler_config = CrawlerRunConfig()
        page, context = await manager.get_page(crawler_config)

        # Set a cookie
        await context.add_cookies([{
            "name": "test_cookie",
            "value": "test_value",
            "url": "https://example.com"
        }])

        # Visit the site
        await page.goto("https://example.com")

        # Verify cookie was set
        cookies = await context.cookies(["https://example.com"])
        has_test_cookie = any(cookie["name"] == "test_cookie" for cookie in cookies)
        logger.info(f"Cookie set successfully: {has_test_cookie}", tag="TEST")

        # Close the browser
        await manager.close()
        logger.info("First browser session closed", tag="TEST")

        # Start a new browser with the same user data directory
        logger.info("Starting second browser session with same user data directory", tag="TEST")
        manager2 = BrowserManager(browser_config=browser_config, logger=logger)
        await manager2.start()

        # Get a new page and check if the cookie persists
        page2, context2 = await manager2.get_page(crawler_config)
        await page2.goto("https://example.com")

        # Verify cookie persisted
        cookies2 = await context2.cookies(["https://example.com"])
        has_test_cookie2 = any(cookie["name"] == "test_cookie" for cookie in cookies2)
        logger.info(f"Cookie persisted across sessions: {has_test_cookie2}", tag="TEST")

        # Clean up
        await manager2.close()

        # Remove temporary directory
        import shutil
        shutil.rmtree(user_data_dir, ignore_errors=True)
        # FIX: dropped the superfluous f-prefix (no placeholders in string)
        logger.info("Removed temporary user data directory", tag="TEST")

        return has_test_cookie and has_test_cookie2
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        try:
            await manager.close()
        # FIX: was a bare `except:` which also swallows KeyboardInterrupt/
        # SystemExit; best-effort cleanup should only ignore Exception.
        except Exception:
            pass

        # Clean up temporary directory
        try:
            import shutil
            shutil.rmtree(user_data_dir, ignore_errors=True)
        except Exception:
            pass

        return False
|
||||
|
||||
async def test_cdp_session_management() -> bool:
    """Test session management with CDP browser.

    Creates two named sessions with distinct localStorage data, re-fetches
    the first session to verify page reuse and data persistence, kills it,
    and confirms the second session is unaffected.

    Returns:
        True when reuse and per-session data both check out.
    """
    logger.info("Testing session management with CDP browser", tag="TEST")

    browser_config = BrowserConfig(
        use_managed_browser=True,
        headless=True
    )

    manager = BrowserManager(browser_config=browser_config, logger=logger)

    try:
        await manager.start()
        logger.info("Browser launched successfully", tag="TEST")

        # Create two sessions
        session1_id = "test_session_1"
        session2_id = "test_session_2"

        # Set up first session
        crawler_config1 = CrawlerRunConfig(session_id=session1_id)
        page1, context1 = await manager.get_page(crawler_config1)
        await page1.goto("https://example.com")
        await page1.evaluate("localStorage.setItem('session1_data', 'test_value')")
        logger.info(f"Set up session 1 with ID: {session1_id}", tag="TEST")

        # Set up second session
        crawler_config2 = CrawlerRunConfig(session_id=session2_id)
        page2, context2 = await manager.get_page(crawler_config2)
        await page2.goto("https://example.org")
        await page2.evaluate("localStorage.setItem('session2_data', 'test_value2')")
        logger.info(f"Set up session 2 with ID: {session2_id}", tag="TEST")

        # Get first session again
        page1_again, _ = await manager.get_page(crawler_config1)

        # Verify it's the same page and data persists
        is_same_page = page1 == page1_again
        data1 = await page1_again.evaluate("localStorage.getItem('session1_data')")
        logger.info(f"Session 1 reuse successful: {is_same_page}, data: {data1}", tag="TEST")

        # Kill first session
        await manager.kill_session(session1_id)
        logger.info(f"Killed session 1", tag="TEST")

        # Verify second session still works
        data2 = await page2.evaluate("localStorage.getItem('session2_data')")
        logger.info(f"Session 2 still functional after killing session 1, data: {data2}", tag="TEST")

        # Clean up
        await manager.close()
        logger.info("Browser closed successfully", tag="TEST")

        return is_same_page and data1 == "test_value" and data2 == "test_value2"
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        try:
            await manager.close()
        # FIX: was a bare `except:` which also swallows KeyboardInterrupt/
        # SystemExit; best-effort cleanup should only ignore Exception.
        except Exception:
            pass
        return False
|
||||
|
||||
async def run_tests():
    """Run all CDP-strategy tests sequentially and print a summary.

    Each test coroutine returns a bool; the summary counts True results.
    """
    results = []

    results.append(await test_cdp_launch_connect())
    results.append(await test_cdp_with_user_data_dir())
    results.append(await test_cdp_session_management())

    # Print summary
    total = len(results)
    passed = sum(results)
    logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")

    if passed == total:
        logger.success("All tests passed!", tag="SUMMARY")
    else:
        logger.error(f"{total - passed} tests failed", tag="SUMMARY")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(run_tests())
|
||||
77
tests/browser/test_combined.py
Normal file
77
tests/browser/test_combined.py
Normal file
@@ -0,0 +1,77 @@
|
||||
"""Combined test runner for all browser module tests.
|
||||
|
||||
This script runs all the browser module tests in sequence and
|
||||
provides a comprehensive summary.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
# Add the project root to Python path if running directly
|
||||
if __name__ == "__main__":
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
# Create a logger for clear terminal output
|
||||
logger = AsyncLogger(verbose=True, log_file=None)
|
||||
|
||||
async def run_test_module(module_name, header):
    """Import ``tests.browser.<module_name>`` and execute its run_tests().

    Args:
        module_name: Test module file name (without extension).
        header: Human-readable banner printed before the run.

    Returns:
        Wall-clock seconds the module's tests took.
    """
    banner = "-" * 30
    logger.info(f"\n{banner}", tag="TEST")
    logger.info(f"RUNNING: {header}", tag="TEST")
    logger.info(f"{banner}", tag="TEST")

    # Resolve the test module dynamically by its dotted path.
    test_module = __import__(f"tests.browser.{module_name}", fromlist=["run_tests"])

    # Time the whole module run for the summary report.
    started = time.time()
    await test_module.run_tests()
    elapsed = time.time() - started
    logger.info(f"Time taken: {elapsed:.2f} seconds", tag="TIMING")

    return elapsed
|
||||
|
||||
async def main():
    """Run every browser test module in order and report per-module timings."""
    logger.info("STARTING COMPREHENSIVE BROWSER MODULE TESTS", tag="MAIN")

    # Modules to exercise, paired with their display banners.
    modules_to_run = [
        ("test_browser_manager", "Browser Manager Tests"),
        ("test_playwright_strategy", "Playwright Strategy Tests"),
        ("test_cdp_strategy", "CDP Strategy Tests"),
        ("test_builtin_strategy", "Builtin Browser Strategy Tests"),
        ("test_profiles", "Profile Management Tests")
    ]

    # Run each module; a module that blows up is logged and skipped in timings.
    durations = {}
    for mod_name, banner in modules_to_run:
        try:
            durations[mod_name] = await run_test_module(mod_name, banner)
        except Exception as exc:
            logger.error(f"Error running {mod_name}: {str(exc)}", tag="ERROR")

    # Print summary
    rule = "-" * 50
    logger.info("\n\nTEST SUMMARY:", tag="SUMMARY")
    logger.info(f"{rule}", tag="SUMMARY")
    for mod_name, banner in modules_to_run:
        if mod_name in durations:
            logger.info(f"{banner}: {durations[mod_name]:.2f} seconds", tag="SUMMARY")
        else:
            logger.error(f"{banner}: FAILED TO RUN", tag="SUMMARY")
    logger.info(f"{rule}", tag="SUMMARY")
    total_elapsed = sum(durations.values())
    logger.info(f"Total time: {total_elapsed:.2f} seconds", tag="SUMMARY")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
275
tests/browser/test_playwright_strategy.py
Normal file
275
tests/browser/test_playwright_strategy.py
Normal file
@@ -0,0 +1,275 @@
|
||||
"""Test examples for PlaywrightBrowserStrategy.
|
||||
|
||||
These examples demonstrate the functionality of PlaywrightBrowserStrategy
|
||||
and serve as functional tests.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Add the project root to Python path if running directly
|
||||
if __name__ == "__main__":
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
from crawl4ai.browser import BrowserManager
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
# Create a logger for clear terminal output
|
||||
logger = AsyncLogger(verbose=True, log_file=None)
|
||||
|
||||
async def test_playwright_basic() -> bool:
    """Test basic Playwright browser functionality.

    Starts a standard headless Playwright browser at 1280x800, opens a
    page, navigates to example.com, and reads the title.

    Returns:
        True when every step succeeds, False on any exception.
    """
    logger.info("Testing standard Playwright browser", tag="TEST")

    # Create browser config for standard Playwright
    browser_config = BrowserConfig(
        headless=True,
        viewport_width=1280,
        viewport_height=800
    )

    # Create browser manager with the config
    manager = BrowserManager(browser_config=browser_config, logger=logger)

    try:
        # Start the browser
        await manager.start()
        logger.info("Browser started successfully", tag="TEST")

        # Create crawler config
        crawler_config = CrawlerRunConfig(url="https://example.com")

        # Get a page
        page, context = await manager.get_page(crawler_config)
        logger.info("Got page successfully", tag="TEST")

        # Navigate to a website
        await page.goto("https://example.com")
        logger.info("Navigated to example.com", tag="TEST")

        # Get page title
        title = await page.title()
        logger.info(f"Page title: {title}", tag="TEST")

        # Clean up
        await manager.close()
        logger.info("Browser closed successfully", tag="TEST")

        return True
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Ensure cleanup
        try:
            await manager.close()
        # FIX: was a bare `except:` which also swallows KeyboardInterrupt/
        # SystemExit; best-effort cleanup should only ignore Exception.
        except Exception:
            pass
        return False
|
||||
|
||||
async def test_playwright_text_mode() -> bool:
    """Test Playwright browser in text-only mode.

    Navigates an image-heavy site while listening for image requests;
    text mode is considered working when no image request is observed
    within the timeout.

    Returns:
        True when every step succeeds, False on any exception.
    """
    logger.info("Testing Playwright text mode", tag="TEST")

    # Create browser config with text mode enabled
    browser_config = BrowserConfig(
        headless=True,
        text_mode=True  # Enable text-only mode
    )

    # Create browser manager with the config
    manager = BrowserManager(browser_config=browser_config, logger=logger)

    try:
        # Start the browser
        await manager.start()
        logger.info("Browser started successfully in text mode", tag="TEST")

        # Get a page
        crawler_config = CrawlerRunConfig(url="https://example.com")
        page, context = await manager.get_page(crawler_config)

        # Navigate to a website
        await page.goto("https://example.com")
        logger.info("Navigated to example.com", tag="TEST")

        # Get page title
        title = await page.title()
        logger.info(f"Page title: {title}", tag="TEST")

        # Check if images are blocked in text mode by watching for image requests.
        # NOTE(review): expect_request may raise again at context-manager exit
        # after the inner timeout; if so, the outer handler reports failure —
        # confirm against Playwright's expect_request semantics.
        has_images = False
        async with page.expect_request("**/*.{png,jpg,jpeg,gif,webp,svg}", timeout=1000) as request_info:
            try:
                # Try to load a page with images
                await page.goto("https://picsum.photos/", wait_until="domcontentloaded")
                # FIX: dropped the unused `request` local; awaiting the value
                # is what matters here.
                await request_info.value
                has_images = True
            # FIX: was a bare `except:`; only swallow ordinary exceptions
            # (the expected TimeoutError), not KeyboardInterrupt/SystemExit.
            except Exception:
                # Timeout without image requests means text mode is working
                has_images = False

        logger.info(f"Text mode image blocking working: {not has_images}", tag="TEST")

        # Clean up
        await manager.close()
        logger.info("Browser closed successfully", tag="TEST")

        return True
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Ensure cleanup
        try:
            await manager.close()
        # FIX: was a bare `except:` which also swallows KeyboardInterrupt/
        # SystemExit; best-effort cleanup should only ignore Exception.
        except Exception:
            pass
        return False
|
||||
|
||||
async def test_playwright_context_reuse():
    """Test context caching and reuse with identical configurations.

    Requests pages for two configs that differ only in URL (contexts
    should be shared) and then for a config with a different viewport
    (a distinct context should be created).

    Returns:
        bool: True if both the reuse and the separation checks pass.
    """
    logger.info("Testing context reuse with identical configurations", tag="TEST")

    # Create browser config
    browser_config = BrowserConfig(headless=True)

    # Create browser manager
    manager = BrowserManager(browser_config=browser_config, logger=logger)

    try:
        # Start the browser
        await manager.start()
        logger.info("Browser started successfully", tag="TEST")

        # Create identical crawler configs
        crawler_config1 = CrawlerRunConfig(
            url="https://example.com",
            viewport_width=1280,
            viewport_height=800
        )

        crawler_config2 = CrawlerRunConfig(
            url="https://example.org",  # Different URL but same browser parameters
            viewport_width=1280,
            viewport_height=800
        )

        # Get pages with these configs
        page1, context1 = await manager.get_page(crawler_config1)
        page2, context2 = await manager.get_page(crawler_config2)

        # Check if contexts are reused
        is_same_context = context1 == context2
        logger.info(f"Contexts reused: {is_same_context}", tag="TEST")

        # Now try with a different config
        crawler_config3 = CrawlerRunConfig(
            url="https://example.net",
            viewport_width=800,  # Different viewport size
            viewport_height=600
        )

        page3, context3 = await manager.get_page(crawler_config3)

        # This should be a different context
        is_different_context = context1 != context3
        logger.info(f"Different contexts for different configs: {is_different_context}", tag="TEST")

        # Clean up
        await manager.close()
        logger.info("Browser closed successfully", tag="TEST")

        # Both tests should pass for success
        return is_same_context and is_different_context
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Ensure cleanup.
        # BUG FIX: the cleanup used a bare `except:` which also swallows
        # KeyboardInterrupt/SystemExit; catch Exception only.
        try:
            await manager.close()
        except Exception:
            pass
        return False
async def test_playwright_session_management():
    """Test session management with Playwright browser.

    Creates two named sessions, verifies that requesting the first
    session id again returns the same page/context with persisted
    localStorage, kills the first session, and checks that the second
    still works.

    Returns:
        bool: True if all session checks pass, False otherwise.
    """
    logger.info("Testing session management with Playwright browser", tag="TEST")

    browser_config = BrowserConfig(
        headless=True
    )

    manager = BrowserManager(browser_config=browser_config, logger=logger)

    try:
        await manager.start()
        logger.info("Browser launched successfully", tag="TEST")

        # Create two sessions
        session1_id = "playwright_session_1"
        session2_id = "playwright_session_2"

        # Set up first session
        crawler_config1 = CrawlerRunConfig(session_id=session1_id, url="https://example.com")
        page1, context1 = await manager.get_page(crawler_config1)
        await page1.goto("https://example.com")
        await page1.evaluate("localStorage.setItem('playwright_session1_data', 'test_value1')")
        logger.info(f"Set up session 1 with ID: {session1_id}", tag="TEST")

        # Set up second session
        crawler_config2 = CrawlerRunConfig(session_id=session2_id, url="https://example.org")
        page2, context2 = await manager.get_page(crawler_config2)
        await page2.goto("https://example.org")
        await page2.evaluate("localStorage.setItem('playwright_session2_data', 'test_value2')")
        logger.info(f"Set up session 2 with ID: {session2_id}", tag="TEST")

        # Get first session again
        page1_again, context1_again = await manager.get_page(crawler_config1)

        # Verify it's the same page and data persists
        is_same_page = page1 == page1_again
        is_same_context = context1 == context1_again
        data1 = await page1_again.evaluate("localStorage.getItem('playwright_session1_data')")
        logger.info(f"Session 1 reuse successful: {is_same_page}, data: {data1}", tag="TEST")

        # Kill first session
        await manager.kill_session(session1_id)
        logger.info(f"Killed session 1", tag="TEST")

        # Verify second session still works
        data2 = await page2.evaluate("localStorage.getItem('playwright_session2_data')")
        logger.info(f"Session 2 still functional after killing session 1, data: {data2}", tag="TEST")

        # Clean up
        await manager.close()
        logger.info("Browser closed successfully", tag="TEST")

        return is_same_page and is_same_context and data1 == "test_value1" and data2 == "test_value2"
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # BUG FIX: cleanup used a bare `except:`; catch Exception only so
        # cancellation and system-exit still propagate.
        try:
            await manager.close()
        except Exception:
            pass
        return False
async def run_tests():
    """Run all tests sequentially."""
    # Run each test coroutine in order; the list records pass/fail flags.
    test_funcs = (
        test_playwright_basic,
        test_playwright_text_mode,
        test_playwright_context_reuse,
        test_playwright_session_management,
    )
    results = [await test_func() for test_func in test_funcs]

    # Print summary
    total = len(results)
    passed = sum(results)
    logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")

    if passed == total:
        logger.success("All tests passed!", tag="SUMMARY")
    else:
        logger.error(f"{total - passed} tests failed", tag="SUMMARY")

if __name__ == "__main__":
    asyncio.run(run_tests())
176
tests/browser/test_profiles.py
Normal file
176
tests/browser/test_profiles.py
Normal file
@@ -0,0 +1,176 @@
|
||||
"""Test examples for BrowserProfileManager.
|
||||
|
||||
These examples demonstrate the functionality of BrowserProfileManager
|
||||
and serve as functional tests.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
import uuid
|
||||
import shutil
|
||||
|
||||
# Add the project root to Python path if running directly
|
||||
if __name__ == "__main__":
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
from crawl4ai.browser import BrowserManager, BrowserProfileManager
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
# Create a logger for clear terminal output
|
||||
logger = AsyncLogger(verbose=True, log_file=None)
|
||||
|
||||
async def test_profile_creation():
    """Test creating and managing browser profiles.

    Creates a throwaway Chrome-style profile directory, verifies it shows
    up in ``list_profiles()``, resolves its path, deletes it, and confirms
    it is gone from the listing.

    Returns:
        bool: True if every step succeeded, False otherwise.
    """
    logger.info("Testing profile creation and management", tag="TEST")

    profile_manager = BrowserProfileManager(logger=logger)
    # BUG FIX: initialize before the try block so the cleanup handler can't
    # hit a NameError if an exception fires before the path is assigned
    # (matches the pattern used in test_profile_with_browser).
    profile_path = None

    try:
        # List existing profiles
        profiles = profile_manager.list_profiles()
        logger.info(f"Found {len(profiles)} existing profiles", tag="TEST")

        # Generate a unique profile name for testing
        test_profile_name = f"test-profile-{uuid.uuid4().hex[:8]}"

        # Create a test profile directory
        profile_path = os.path.join(profile_manager.profiles_dir, test_profile_name)
        os.makedirs(os.path.join(profile_path, "Default"), exist_ok=True)

        # Create a dummy Preferences file to simulate a Chrome profile
        with open(os.path.join(profile_path, "Default", "Preferences"), "w") as f:
            f.write("{\"test\": true}")

        logger.info(f"Created test profile at: {profile_path}", tag="TEST")

        # Verify the profile is now in the list
        profiles = profile_manager.list_profiles()
        profile_found = any(p["name"] == test_profile_name for p in profiles)
        logger.info(f"Profile found in list: {profile_found}", tag="TEST")

        # Try to get the profile path
        retrieved_path = profile_manager.get_profile_path(test_profile_name)
        path_match = retrieved_path == profile_path
        logger.info(f"Retrieved correct profile path: {path_match}", tag="TEST")

        # Delete the profile
        success = profile_manager.delete_profile(test_profile_name)
        logger.info(f"Profile deletion successful: {success}", tag="TEST")

        # Verify it's gone
        profiles_after = profile_manager.list_profiles()
        profile_removed = not any(p["name"] == test_profile_name for p in profiles_after)
        logger.info(f"Profile removed from list: {profile_removed}", tag="TEST")

        # Clean up just in case
        if os.path.exists(profile_path):
            shutil.rmtree(profile_path, ignore_errors=True)

        return profile_found and path_match and success and profile_removed
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Clean up test directory (best effort; Exception only, not bare except)
        try:
            if profile_path and os.path.exists(profile_path):
                shutil.rmtree(profile_path, ignore_errors=True)
        except Exception:
            pass
        return False
async def test_profile_with_browser():
    """Test using a profile with a browser.

    Creates a Chrome-style profile directory, starts a browser with it,
    writes localStorage data, restarts a second browser on the same
    profile, and verifies the data persisted. The profile is deleted at
    the end.

    Returns:
        bool: True if the data persisted and the profile was deleted.
    """
    logger.info("Testing using a profile with a browser", tag="TEST")

    profile_manager = BrowserProfileManager(logger=logger)
    test_profile_name = f"test-browser-profile-{uuid.uuid4().hex[:8]}"
    profile_path = None
    # BUG FIX: track live managers so the error path can close any browser
    # that was started before the failure; previously a failure between
    # start() and close() leaked the running browser process.
    manager = None
    manager2 = None

    try:
        # Create a test profile directory
        profile_path = os.path.join(profile_manager.profiles_dir, test_profile_name)
        os.makedirs(os.path.join(profile_path, "Default"), exist_ok=True)

        # Create a dummy Preferences file to simulate a Chrome profile
        with open(os.path.join(profile_path, "Default", "Preferences"), "w") as f:
            f.write("{\"test\": true}")

        logger.info(f"Created test profile at: {profile_path}", tag="TEST")

        # Now use this profile with a browser
        browser_config = BrowserConfig(
            user_data_dir=profile_path,
            headless=True
        )

        manager = BrowserManager(browser_config=browser_config, logger=logger)

        # Start the browser with the profile
        await manager.start()
        logger.info("Browser started with profile", tag="TEST")

        # Create a page
        crawler_config = CrawlerRunConfig()
        page, context = await manager.get_page(crawler_config)

        # Navigate and set some data to verify profile works
        await page.goto("https://example.com")
        await page.evaluate("localStorage.setItem('test_data', 'profile_value')")

        # Close browser
        await manager.close()
        manager = None  # closed cleanly; nothing for the error path to do
        logger.info("First browser session closed", tag="TEST")

        # Create a new browser with the same profile
        manager2 = BrowserManager(browser_config=browser_config, logger=logger)
        await manager2.start()
        logger.info("Second browser session started with same profile", tag="TEST")

        # Get a page and check if the data persists
        page2, context2 = await manager2.get_page(crawler_config)
        await page2.goto("https://example.com")
        data = await page2.evaluate("localStorage.getItem('test_data')")

        # Verify data persisted
        data_persisted = data == "profile_value"
        logger.info(f"Data persisted across sessions: {data_persisted}", tag="TEST")

        # Clean up
        await manager2.close()
        manager2 = None
        logger.info("Second browser session closed", tag="TEST")

        # Delete the test profile
        success = profile_manager.delete_profile(test_profile_name)
        logger.info(f"Test profile deleted: {success}", tag="TEST")

        return data_persisted and success
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Close any browser still running (best effort)
        for live_manager in (manager, manager2):
            if live_manager is not None:
                try:
                    await live_manager.close()
                except Exception:
                    pass
        # Clean up the profile directory
        try:
            if profile_path and os.path.exists(profile_path):
                shutil.rmtree(profile_path, ignore_errors=True)
        except Exception:
            pass
        return False
async def run_tests():
    """Run all tests sequentially."""
    # Execute each profile test in order, collecting pass/fail flags.
    test_funcs = (
        test_profile_creation,
        test_profile_with_browser,
    )
    results = [await test_func() for test_func in test_funcs]

    # Print summary
    total = len(results)
    passed = sum(results)
    logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")

    if passed == total:
        logger.success("All tests passed!", tag="SUMMARY")
    else:
        logger.error(f"{total - passed} tests failed", tag="SUMMARY")

if __name__ == "__main__":
    asyncio.run(run_tests())
Reference in New Issue
Block a user