feat(browser): implement modular browser management system
Adds a new browser management system with strategy pattern implementation: - Introduces BrowserManager class with strategy pattern support - Adds PlaywrightBrowserStrategy, CDPBrowserStrategy, and BuiltinBrowserStrategy - Implements BrowserProfileManager for profile management - Adds PagePoolConfig for browser page pooling - Includes comprehensive test suite for all browser strategies BREAKING CHANGE: Browser management has been moved to browser/ module. Direct usage of browser_manager.py and browser_profiler.py is deprecated.
This commit is contained in:
@@ -156,6 +156,41 @@ def is_empty_value(value: Any) -> bool:
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
class PagePoolConfig:
    """Configuration for the browser page pool.

    The pool keeps a number of pre-warmed browser pages ready for immediate
    use, which improves throughput when many URLs are processed in sequence.

    Attributes:
        mode (str): Pooling mode, either "static" (fixed size taken from
            ``static_size``) or "adaptive" (size derived from available
            system memory). Default: "static".
        static_size (int): Number of pages kept in the pool when ``mode``
            is "static". Default: 10.
        memory_per_page (int): Estimated memory footprint of one page in MB;
            used by the "adaptive" mode calculation. Default: 200.
        memory_threshold (float): Maximum fraction of system memory the pool
            may use in "adaptive" mode. Default: 0.7 (70%).
        timeout (float): Seconds to wait for a pooled page before falling
            back to creating a fresh one. Default: 5.0.
    """

    def __init__(
        self,
        mode="static",
        static_size=10,
        memory_per_page=200,
        memory_threshold=0.7,
        timeout=5.0,
    ):
        # Plain value object: the settings are stored as-is; the pool
        # implementation interprets them. No validation is performed here.
        self.mode = mode
        self.static_size = static_size
        self.memory_per_page = memory_per_page
        self.memory_threshold = memory_threshold
        self.timeout = timeout
|
||||||
|
|
||||||
class BrowserConfig:
|
class BrowserConfig:
|
||||||
"""
|
"""
|
||||||
Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy.
|
Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy.
|
||||||
@@ -220,6 +255,9 @@ class BrowserConfig:
|
|||||||
light_mode (bool): Disables certain background features for performance gains. Default: False.
|
light_mode (bool): Disables certain background features for performance gains. Default: False.
|
||||||
extra_args (list): Additional command-line arguments passed to the browser.
|
extra_args (list): Additional command-line arguments passed to the browser.
|
||||||
Default: [].
|
Default: [].
|
||||||
|
page_pool_config (PagePoolConfig or None): Configuration for page pooling mechanism.
|
||||||
|
If None, page pooling is disabled.
|
||||||
|
Default: None.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -260,6 +298,7 @@ class BrowserConfig:
|
|||||||
extra_args: list = None,
|
extra_args: list = None,
|
||||||
debugging_port: int = 9222,
|
debugging_port: int = 9222,
|
||||||
host: str = "localhost",
|
host: str = "localhost",
|
||||||
|
page_pool_config: Optional[PagePoolConfig] = None,
|
||||||
):
|
):
|
||||||
self.browser_type = browser_type
|
self.browser_type = browser_type
|
||||||
self.headless = headless
|
self.headless = headless
|
||||||
@@ -298,6 +337,7 @@ class BrowserConfig:
|
|||||||
self.verbose = verbose
|
self.verbose = verbose
|
||||||
self.debugging_port = debugging_port
|
self.debugging_port = debugging_port
|
||||||
self.host = host
|
self.host = host
|
||||||
|
self.page_pool_config = page_pool_config
|
||||||
|
|
||||||
fa_user_agenr_generator = ValidUAGenerator()
|
fa_user_agenr_generator = ValidUAGenerator()
|
||||||
if self.user_agent_mode == "random":
|
if self.user_agent_mode == "random":
|
||||||
@@ -328,6 +368,12 @@ class BrowserConfig:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def from_kwargs(kwargs: dict) -> "BrowserConfig":
|
def from_kwargs(kwargs: dict) -> "BrowserConfig":
|
||||||
|
# Handle page_pool_config
|
||||||
|
page_pool_config = kwargs.get("page_pool_config")
|
||||||
|
if isinstance(page_pool_config, dict):
|
||||||
|
# If it's a dict, convert to PagePoolConfig
|
||||||
|
page_pool_config = PagePoolConfig(**page_pool_config)
|
||||||
|
|
||||||
return BrowserConfig(
|
return BrowserConfig(
|
||||||
browser_type=kwargs.get("browser_type", "chromium"),
|
browser_type=kwargs.get("browser_type", "chromium"),
|
||||||
headless=kwargs.get("headless", True),
|
headless=kwargs.get("headless", True),
|
||||||
@@ -361,6 +407,7 @@ class BrowserConfig:
|
|||||||
extra_args=kwargs.get("extra_args", []),
|
extra_args=kwargs.get("extra_args", []),
|
||||||
debugging_port=kwargs.get("debugging_port", 9222),
|
debugging_port=kwargs.get("debugging_port", 9222),
|
||||||
host=kwargs.get("host", "localhost"),
|
host=kwargs.get("host", "localhost"),
|
||||||
|
page_pool_config=page_pool_config,
|
||||||
)
|
)
|
||||||
|
|
||||||
def to_dict(self):
|
def to_dict(self):
|
||||||
@@ -395,6 +442,7 @@ class BrowserConfig:
|
|||||||
"verbose": self.verbose,
|
"verbose": self.verbose,
|
||||||
"debugging_port": self.debugging_port,
|
"debugging_port": self.debugging_port,
|
||||||
"host": self.host,
|
"host": self.host,
|
||||||
|
"page_pool_config": self.page_pool_config,
|
||||||
}
|
}
|
||||||
|
|
||||||
def clone(self, **kwargs):
|
def clone(self, **kwargs):
|
||||||
|
|||||||
10
crawl4ai/browser/__init__.py
Normal file
10
crawl4ai/browser/__init__.py
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
"""Browser management module for Crawl4AI.
|
||||||
|
|
||||||
|
This module provides browser management capabilities using different strategies
|
||||||
|
for browser creation and interaction.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .manager import BrowserManager
|
||||||
|
from .profiles import BrowserProfileManager
|
||||||
|
|
||||||
|
__all__ = ['BrowserManager', 'BrowserProfileManager']
|
||||||
165
crawl4ai/browser/manager.py
Normal file
165
crawl4ai/browser/manager.py
Normal file
@@ -0,0 +1,165 @@
|
|||||||
|
"""Browser manager module for Crawl4AI.
|
||||||
|
|
||||||
|
This module provides a central browser management class that uses the
|
||||||
|
strategy pattern internally while maintaining the existing API.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import time
|
||||||
|
from typing import Optional, Tuple, Dict, Any
|
||||||
|
|
||||||
|
from playwright.async_api import Page, BrowserContext
|
||||||
|
|
||||||
|
from ..async_logger import AsyncLogger
|
||||||
|
from ..async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
|
|
||||||
|
from .strategies import (
|
||||||
|
BaseBrowserStrategy,
|
||||||
|
PlaywrightBrowserStrategy,
|
||||||
|
CDPBrowserStrategy,
|
||||||
|
BuiltinBrowserStrategy
|
||||||
|
)
|
||||||
|
|
||||||
|
class BrowserManager:
    """Main interface for browser management in Crawl4AI.

    This class maintains backward compatibility with the existing implementation
    while using the strategy pattern internally for different browser types.

    Attributes:
        config (BrowserConfig): Configuration object containing all browser settings
        logger: Logger instance for recording events and errors
        browser: The browser instance
        default_context: The default browser context
        managed_browser: The managed browser instance
        playwright: The Playwright instance
        sessions: Dictionary to store session information
        session_ttl: Session timeout in seconds
    """

    def __init__(self, browser_config: Optional[BrowserConfig] = None, logger: Optional[AsyncLogger] = None):
        """Initialize the BrowserManager with a browser configuration.

        Args:
            browser_config: Configuration object containing all browser settings
            logger: Logger instance for recording events and errors
        """
        self.config = browser_config or BrowserConfig()
        self.logger = logger

        # Create strategy based on configuration
        self._strategy = self._create_strategy()

        # Initialize state variables for compatibility with existing code.
        # These mirror strategy state after start() and are reset in close().
        self.browser = None
        self.default_context = None
        self.managed_browser = None
        self.playwright = None

        # For session management (from existing implementation)
        self.sessions = {}
        self.session_ttl = 1800  # 30 minutes

    def _create_strategy(self) -> BaseBrowserStrategy:
        """Create appropriate browser strategy based on configuration.

        Selection order matters: "builtin" mode wins over CDP settings,
        and any CDP URL / managed-browser flag wins over plain Playwright.

        Returns:
            BaseBrowserStrategy: The selected browser strategy
        """
        if self.config.browser_mode == "builtin":
            return BuiltinBrowserStrategy(self.config, self.logger)
        elif self.config.cdp_url or self.config.use_managed_browser:
            return CDPBrowserStrategy(self.config, self.logger)
        else:
            return PlaywrightBrowserStrategy(self.config, self.logger)

    async def start(self):
        """Start the browser instance and set up the default context.

        Returns:
            self: For method chaining
        """
        # Start the strategy
        await self._strategy.start()

        # Update legacy references so pre-strategy callers keep working
        self.browser = self._strategy.browser
        self.default_context = self._strategy.default_context

        # Set browser process reference (for CDP strategy)
        if hasattr(self._strategy, 'browser_process'):
            self.managed_browser = self._strategy

        # Set Playwright reference
        self.playwright = self._strategy.playwright

        # Sync sessions if needed — the strategy's session dict becomes the
        # shared one so both views stay consistent
        if hasattr(self._strategy, 'sessions'):
            self.sessions = self._strategy.sessions
            self.session_ttl = self._strategy.session_ttl

        return self

    async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]:
        """Get a page for the given configuration.

        Args:
            crawlerRunConfig: Configuration object for the crawler run

        Returns:
            Tuple of (Page, BrowserContext)
        """
        # Delegate to strategy
        page, context = await self._strategy.get_page(crawlerRunConfig)

        # Sync sessions if needed
        if hasattr(self._strategy, 'sessions'):
            self.sessions = self._strategy.sessions

        return page, context

    async def kill_session(self, session_id: str):
        """Kill a browser session and clean up resources.

        Args:
            session_id: The session ID to kill
        """
        # Handle kill_session via our strategy if it supports it
        if hasattr(self._strategy, '_kill_session'):
            await self._strategy._kill_session(session_id)
        elif session_id in self.sessions:
            # Fallback: sessions store (context, page, last_used) tuples
            context, page, _ = self.sessions[session_id]
            await page.close()
            # Only close context if not using CDP — CDP/builtin/managed
            # browsers share a context that must outlive individual sessions
            if not self.config.use_managed_browser and not self.config.cdp_url and not self.config.browser_mode == "builtin":
                await context.close()
            del self.sessions[session_id]

    def _cleanup_expired_sessions(self):
        """Clean up expired sessions based on TTL."""
        # Use strategy's implementation if available
        if hasattr(self._strategy, '_cleanup_expired_sessions'):
            self._strategy._cleanup_expired_sessions()
            return

        # Otherwise use our own implementation
        current_time = time.time()
        expired_sessions = [
            sid
            for sid, (_, _, last_used) in self.sessions.items()
            if current_time - last_used > self.session_ttl
        ]
        # NOTE(review): create_task requires a running event loop; this method
        # appears to be called only from async contexts — confirm.
        for sid in expired_sessions:
            asyncio.create_task(self.kill_session(sid))

    async def close(self):
        """Close the browser and clean up resources."""
        # Delegate to strategy
        await self._strategy.close()

        # Reset legacy references
        self.browser = None
        self.default_context = None
        self.managed_browser = None
        self.playwright = None
        self.sessions = {}
|
||||||
0
crawl4ai/browser/models.py
Normal file
0
crawl4ai/browser/models.py
Normal file
458
crawl4ai/browser/profiles.py
Normal file
458
crawl4ai/browser/profiles.py
Normal file
@@ -0,0 +1,458 @@
|
|||||||
|
"""Browser profile management module for Crawl4AI.
|
||||||
|
|
||||||
|
This module provides functionality for creating and managing browser profiles
|
||||||
|
that can be used for authenticated browsing.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import asyncio
|
||||||
|
import signal
|
||||||
|
import sys
|
||||||
|
import datetime
|
||||||
|
import uuid
|
||||||
|
import shutil
|
||||||
|
from typing import List, Dict, Optional, Any
|
||||||
|
from colorama import Fore, Style, init
|
||||||
|
|
||||||
|
from ..async_configs import BrowserConfig
|
||||||
|
from ..async_logger import AsyncLogger, AsyncLoggerBase
|
||||||
|
from ..utils import get_home_folder
|
||||||
|
from .strategies import is_windows
|
||||||
|
|
||||||
|
class BrowserProfileManager:
    """Manages browser profiles for Crawl4AI.

    This class provides functionality to create and manage browser profiles
    that can be used for authenticated browsing with Crawl4AI.

    Profiles are stored by default in ~/.crawl4ai/profiles/
    """

    def __init__(self, logger: Optional[AsyncLoggerBase] = None):
        """Initialize the BrowserProfileManager.

        Args:
            logger: Logger for outputting messages. If None, a default AsyncLogger is created.
        """
        # Initialize colorama for colorful terminal output
        init()

        # Create a logger if not provided (or if an unusable object was passed)
        if logger is None:
            self.logger = AsyncLogger(verbose=True)
        elif not isinstance(logger, AsyncLoggerBase):
            self.logger = AsyncLogger(verbose=True)
        else:
            self.logger = logger

        # Ensure profiles directory exists
        self.profiles_dir = os.path.join(get_home_folder(), "profiles")
        os.makedirs(self.profiles_dir, exist_ok=True)

    async def create_profile(self,
                             profile_name: Optional[str] = None,
                             browser_config: Optional[BrowserConfig] = None) -> Optional[str]:
        """Create a browser profile interactively.

        Opens a visible browser for the user to log in / configure, waits for
        the user to press 'q' in the terminal (or for the browser to exit),
        then saves the profile directory.

        Args:
            profile_name: Name for the profile. If None, a name is generated.
            browser_config: Configuration for the browser. If None, a default configuration is used.

        Returns:
            Path to the created profile directory, or None if creation failed
        """
        # Create default browser config if none provided
        if browser_config is None:
            browser_config = BrowserConfig(
                browser_type="chromium",
                headless=False,  # Must be visible for user interaction
                verbose=True
            )
        else:
            # Ensure headless is False for user interaction
            browser_config.headless = False

        # Generate profile name if not provided
        if not profile_name:
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            profile_name = f"profile_{timestamp}_{uuid.uuid4().hex[:6]}"

        # Sanitize profile name (replace spaces and special chars)
        profile_name = "".join(c if c.isalnum() or c in "-_" else "_" for c in profile_name)

        # Set user data directory
        profile_path = os.path.join(self.profiles_dir, profile_name)
        os.makedirs(profile_path, exist_ok=True)

        # Print instructions for the user with colorama formatting
        border = f"{Fore.CYAN}{'='*80}{Style.RESET_ALL}"
        self.logger.info(f"\n{border}", tag="PROFILE")
        self.logger.info(f"Creating browser profile: {Fore.GREEN}{profile_name}{Style.RESET_ALL}", tag="PROFILE")
        self.logger.info(f"Profile directory: {Fore.YELLOW}{profile_path}{Style.RESET_ALL}", tag="PROFILE")

        self.logger.info("\nInstructions:", tag="PROFILE")
        self.logger.info("1. A browser window will open for you to set up your profile.", tag="PROFILE")
        self.logger.info(f"2. {Fore.CYAN}Log in to websites{Style.RESET_ALL}, configure settings, etc. as needed.", tag="PROFILE")
        self.logger.info(f"3. When you're done, {Fore.YELLOW}press 'q' in this terminal{Style.RESET_ALL} to close the browser.", tag="PROFILE")
        self.logger.info("4. The profile will be saved and ready to use with Crawl4AI.", tag="PROFILE")
        self.logger.info(f"{border}\n", tag="PROFILE")

        # Import the necessary classes with local imports to avoid circular references
        from .strategies import CDPBrowserStrategy

        # Set browser config to use the profile path
        browser_config.user_data_dir = profile_path

        # Create a CDP browser strategy for the profile creation
        browser_strategy = CDPBrowserStrategy(browser_config, self.logger)

        # Set up signal handlers to ensure cleanup on interrupt
        original_sigint = signal.getsignal(signal.SIGINT)
        original_sigterm = signal.getsignal(signal.SIGTERM)

        # Define cleanup handler for signals
        async def cleanup_handler(sig, frame):
            self.logger.warning("\nCleaning up browser process...", tag="PROFILE")
            await browser_strategy.close()
            # Restore original signal handlers
            signal.signal(signal.SIGINT, original_sigint)
            signal.signal(signal.SIGTERM, original_sigterm)
            if sig == signal.SIGINT:
                self.logger.error("Profile creation interrupted. Profile may be incomplete.", tag="PROFILE")
                sys.exit(1)

        # Set signal handlers (sync shim that schedules the async cleanup)
        def sigint_handler(sig, frame):
            asyncio.create_task(cleanup_handler(sig, frame))

        signal.signal(signal.SIGINT, sigint_handler)
        signal.signal(signal.SIGTERM, sigint_handler)

        # Event to signal when user is done with the browser
        user_done_event = asyncio.Event()

        # Run keyboard input loop in a separate task
        async def listen_for_quit_command():
            # NOTE(review): termios/tty are POSIX-only; this listener will
            # raise on Windows — confirm intended platforms.
            import termios
            import tty
            import select

            # First output the prompt
            self.logger.info(f"{Fore.CYAN}Press '{Fore.WHITE}q{Fore.CYAN}' when you've finished using the browser...{Style.RESET_ALL}", tag="PROFILE")

            # Save original terminal settings
            fd = sys.stdin.fileno()
            old_settings = termios.tcgetattr(fd)

            try:
                # Switch to non-canonical mode (no line buffering)
                tty.setcbreak(fd)

                while True:
                    # Check if input is available (non-blocking, 0.5s poll)
                    readable, _, _ = select.select([sys.stdin], [], [], 0.5)
                    if readable:
                        key = sys.stdin.read(1)
                        if key.lower() == 'q':
                            self.logger.info(f"{Fore.GREEN}Closing browser and saving profile...{Style.RESET_ALL}", tag="PROFILE")
                            user_done_event.set()
                            return

                    # Check if the browser process has already exited
                    if browser_strategy.browser_process and browser_strategy.browser_process.poll() is not None:
                        self.logger.info("Browser already closed. Ending input listener.", tag="PROFILE")
                        user_done_event.set()
                        return

                    await asyncio.sleep(0.1)

            finally:
                # Restore terminal settings
                termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)

        try:
            # Start the browser
            await browser_strategy.start()

            # Check if browser started successfully
            if not browser_strategy.browser_process:
                self.logger.error("Failed to start browser process.", tag="PROFILE")
                return None

            self.logger.info(f"Browser launched. {Fore.CYAN}Waiting for you to finish...{Style.RESET_ALL}", tag="PROFILE")

            # Start listening for keyboard input
            listener_task = asyncio.create_task(listen_for_quit_command())

            # Wait for either the user to press 'q' or for the browser process to exit naturally
            while not user_done_event.is_set() and browser_strategy.browser_process.poll() is None:
                await asyncio.sleep(0.5)

            # Cancel the listener task if it's still running
            if not listener_task.done():
                listener_task.cancel()
                try:
                    await listener_task
                except asyncio.CancelledError:
                    pass

            # If the browser is still running and the user pressed 'q', terminate it
            if browser_strategy.browser_process.poll() is None and user_done_event.is_set():
                self.logger.info("Terminating browser process...", tag="PROFILE")
                await browser_strategy.close()

            self.logger.success(f"Browser closed. Profile saved at: {Fore.GREEN}{profile_path}{Style.RESET_ALL}", tag="PROFILE")

        except Exception as e:
            self.logger.error(f"Error creating profile: {str(e)}", tag="PROFILE")
            await browser_strategy.close()
            return None
        finally:
            # Restore original signal handlers
            signal.signal(signal.SIGINT, original_sigint)
            signal.signal(signal.SIGTERM, original_sigterm)

            # Make sure browser is fully cleaned up (close() is presumably
            # idempotent since it may run a second time here — TODO confirm)
            await browser_strategy.close()

        # Return the profile path
        return profile_path

    def list_profiles(self) -> List[Dict[str, Any]]:
        """List all available browser profiles.

        Returns:
            List of dictionaries containing profile information
            (keys: "name", "path", "created", "type"), newest first.
        """
        if not os.path.exists(self.profiles_dir):
            return []

        profiles = []

        for name in os.listdir(self.profiles_dir):
            profile_path = os.path.join(self.profiles_dir, name)

            # Skip if not a directory
            if not os.path.isdir(profile_path):
                continue

            # Check if this looks like a valid browser profile
            # For Chromium: Look for Preferences file
            # For Firefox: Look for prefs.js file
            is_valid = False

            if os.path.exists(os.path.join(profile_path, "Preferences")) or \
               os.path.exists(os.path.join(profile_path, "Default", "Preferences")):
                is_valid = "chromium"
            elif os.path.exists(os.path.join(profile_path, "prefs.js")):
                is_valid = "firefox"

            if is_valid:
                # Get creation time
                created = datetime.datetime.fromtimestamp(
                    os.path.getctime(profile_path)
                )

                profiles.append({
                    "name": name,
                    "path": profile_path,
                    "created": created,
                    "type": is_valid
                })

        # Sort by creation time, newest first
        profiles.sort(key=lambda x: x["created"], reverse=True)

        return profiles

    def get_profile_path(self, profile_name: str) -> Optional[str]:
        """Get the full path to a profile by name.

        Args:
            profile_name: Name of the profile (not the full path)

        Returns:
            Full path to the profile directory, or None if not found
        """
        profile_path = os.path.join(self.profiles_dir, profile_name)

        # Check if path exists and is a valid profile
        if not os.path.isdir(profile_path):
            # Check if profile_name itself is full path
            if os.path.isabs(profile_name):
                profile_path = profile_name
            else:
                return None

        # Look for profile indicators (Chromium "Preferences" or Firefox "prefs.js")
        is_profile = (
            os.path.exists(os.path.join(profile_path, "Preferences")) or
            os.path.exists(os.path.join(profile_path, "Default", "Preferences")) or
            os.path.exists(os.path.join(profile_path, "prefs.js"))
        )

        if not is_profile:
            return None  # Not a valid browser profile

        return profile_path

    def delete_profile(self, profile_name_or_path: str) -> bool:
        """Delete a browser profile by name or path.

        Args:
            profile_name_or_path: Name of the profile or full path to profile directory

        Returns:
            True if the profile was deleted successfully, False otherwise
        """
        # Determine if input is a name or a path
        if os.path.isabs(profile_name_or_path):
            # Full path provided
            profile_path = profile_name_or_path
        else:
            # Just a name provided, construct path
            profile_path = os.path.join(self.profiles_dir, profile_name_or_path)

        # Check if path exists and is a valid profile
        if not os.path.isdir(profile_path):
            return False

        # Look for profile indicators — refuse to delete directories that do
        # not look like browser profiles (safety guard against wrong paths)
        is_profile = (
            os.path.exists(os.path.join(profile_path, "Preferences")) or
            os.path.exists(os.path.join(profile_path, "Default", "Preferences")) or
            os.path.exists(os.path.join(profile_path, "prefs.js"))
        )

        if not is_profile:
            return False  # Not a valid browser profile

        # Delete the profile directory
        try:
            shutil.rmtree(profile_path)
            return True
        except Exception:
            return False

    async def interactive_manager(self, crawl_callback=None):
        """Launch an interactive profile management console.

        Args:
            crawl_callback: Function to call when selecting option to use
                a profile for crawling. It will be called with (profile_path, url).
        """
        while True:
            self.logger.info(f"\n{Fore.CYAN}Profile Management Options:{Style.RESET_ALL}", tag="MENU")
            self.logger.info(f"1. {Fore.GREEN}Create a new profile{Style.RESET_ALL}", tag="MENU")
            self.logger.info(f"2. {Fore.YELLOW}List available profiles{Style.RESET_ALL}", tag="MENU")
            self.logger.info(f"3. {Fore.RED}Delete a profile{Style.RESET_ALL}", tag="MENU")

            # Only show crawl option if callback provided; the exit option
            # number shifts accordingly
            if crawl_callback:
                self.logger.info(f"4. {Fore.CYAN}Use a profile to crawl a website{Style.RESET_ALL}", tag="MENU")
                self.logger.info(f"5. {Fore.MAGENTA}Exit{Style.RESET_ALL}", tag="MENU")
                exit_option = "5"
            else:
                self.logger.info(f"4. {Fore.MAGENTA}Exit{Style.RESET_ALL}", tag="MENU")
                exit_option = "4"

            choice = input(f"\n{Fore.CYAN}Enter your choice (1-{exit_option}): {Style.RESET_ALL}")

            if choice == "1":
                # Create new profile
                name = input(f"{Fore.GREEN}Enter a name for the new profile (or press Enter for auto-generated name): {Style.RESET_ALL}")
                await self.create_profile(name or None)

            elif choice == "2":
                # List profiles
                profiles = self.list_profiles()

                if not profiles:
                    self.logger.warning(" No profiles found. Create one first with option 1.", tag="PROFILES")
                    continue

                # Print profile information with colorama formatting
                self.logger.info("\nAvailable profiles:", tag="PROFILES")
                for i, profile in enumerate(profiles):
                    self.logger.info(f"[{i+1}] {Fore.CYAN}{profile['name']}{Style.RESET_ALL}", tag="PROFILES")
                    self.logger.info(f"    Path: {Fore.YELLOW}{profile['path']}{Style.RESET_ALL}", tag="PROFILES")
                    self.logger.info(f"    Created: {profile['created'].strftime('%Y-%m-%d %H:%M:%S')}", tag="PROFILES")
                    self.logger.info(f"    Browser type: {profile['type']}", tag="PROFILES")
                    self.logger.info("", tag="PROFILES")  # Empty line for spacing

            elif choice == "3":
                # Delete profile
                profiles = self.list_profiles()
                if not profiles:
                    self.logger.warning("No profiles found to delete", tag="PROFILES")
                    continue

                # Display numbered list
                self.logger.info(f"\n{Fore.YELLOW}Available profiles:{Style.RESET_ALL}", tag="PROFILES")
                for i, profile in enumerate(profiles):
                    self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES")

                # Get profile to delete
                profile_idx = input(f"{Fore.RED}Enter the number of the profile to delete (or 'c' to cancel): {Style.RESET_ALL}")
                if profile_idx.lower() == 'c':
                    continue

                try:
                    idx = int(profile_idx) - 1
                    if 0 <= idx < len(profiles):
                        profile_name = profiles[idx]["name"]
                        self.logger.info(f"Deleting profile: {Fore.YELLOW}{profile_name}{Style.RESET_ALL}", tag="PROFILES")

                        # Confirm deletion
                        confirm = input(f"{Fore.RED}Are you sure you want to delete this profile? (y/n): {Style.RESET_ALL}")
                        if confirm.lower() == 'y':
                            success = self.delete_profile(profiles[idx]["path"])

                            if success:
                                self.logger.success(f"Profile {Fore.GREEN}{profile_name}{Style.RESET_ALL} deleted successfully", tag="PROFILES")
                            else:
                                self.logger.error(f"Failed to delete profile {Fore.RED}{profile_name}{Style.RESET_ALL}", tag="PROFILES")
                    else:
                        self.logger.error("Invalid profile number", tag="PROFILES")
                except ValueError:
                    self.logger.error("Please enter a valid number", tag="PROFILES")

            elif choice == "4" and crawl_callback:
                # Use profile to crawl a site
                profiles = self.list_profiles()
                if not profiles:
                    self.logger.warning("No profiles found. Create one first.", tag="PROFILES")
                    continue

                # Display numbered list
                self.logger.info(f"\n{Fore.YELLOW}Available profiles:{Style.RESET_ALL}", tag="PROFILES")
                for i, profile in enumerate(profiles):
                    self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES")

                # Get profile to use
                profile_idx = input(f"{Fore.CYAN}Enter the number of the profile to use (or 'c' to cancel): {Style.RESET_ALL}")
                if profile_idx.lower() == 'c':
                    continue

                try:
                    idx = int(profile_idx) - 1
                    if 0 <= idx < len(profiles):
                        profile_path = profiles[idx]["path"]
                        url = input(f"{Fore.CYAN}Enter the URL to crawl: {Style.RESET_ALL}")
                        if url:
                            # Call the provided crawl callback
                            await crawl_callback(profile_path, url)
                        else:
                            self.logger.error("No URL provided", tag="CRAWL")
                    else:
                        self.logger.error("Invalid profile number", tag="PROFILES")
                except ValueError:
                    self.logger.error("Please enter a valid number", tag="PROFILES")

            elif (choice == "4" and not crawl_callback) or (choice == "5" and crawl_callback):
                # Exit
                self.logger.info("Exiting profile management", tag="MENU")
                break

            else:
                self.logger.error(f"Invalid choice. Please enter a number between 1 and {exit_option}.", tag="MENU")
|
||||||
1048
crawl4ai/browser/strategies.py
Normal file
1048
crawl4ai/browser/strategies.py
Normal file
File diff suppressed because it is too large
Load Diff
105
crawl4ai/browser/utils.py
Normal file
105
crawl4ai/browser/utils.py
Normal file
@@ -0,0 +1,105 @@
|
|||||||
|
"""Browser utilities module for Crawl4AI.
|
||||||
|
|
||||||
|
This module provides utility functions for browser management,
|
||||||
|
including process management, CDP connection utilities,
|
||||||
|
and Playwright instance management.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import platform
|
||||||
|
import tempfile
|
||||||
|
from typing import Optional, Any
|
||||||
|
|
||||||
|
from playwright.async_api import async_playwright
|
||||||
|
|
||||||
|
from ..async_logger import AsyncLogger
|
||||||
|
from ..utils import get_chromium_path
|
||||||
|
|
||||||
|
_playwright_instance = None
|
||||||
|
|
||||||
|
async def get_playwright():
|
||||||
|
"""Get or create the Playwright instance (singleton pattern).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Playwright: The Playwright instance
|
||||||
|
"""
|
||||||
|
global _playwright_instance
|
||||||
|
if _playwright_instance is None or True:
|
||||||
|
_playwright_instance = await async_playwright().start()
|
||||||
|
return _playwright_instance
|
||||||
|
|
||||||
|
def get_browser_executable(browser_type: str) -> str:
|
||||||
|
"""Get the path to browser executable, with platform-specific handling.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
browser_type: Type of browser (chromium, firefox, webkit)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Path to browser executable
|
||||||
|
"""
|
||||||
|
return get_chromium_path(browser_type)
|
||||||
|
|
||||||
|
def create_temp_directory(prefix="browser-profile-") -> str:
|
||||||
|
"""Create a temporary directory for browser data.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
prefix: Prefix for the temporary directory name
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Path to the created temporary directory
|
||||||
|
"""
|
||||||
|
return tempfile.mkdtemp(prefix=prefix)
|
||||||
|
|
||||||
|
def is_windows() -> bool:
|
||||||
|
"""Check if the current platform is Windows.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if Windows, False otherwise
|
||||||
|
"""
|
||||||
|
return sys.platform == "win32"
|
||||||
|
|
||||||
|
def is_macos() -> bool:
|
||||||
|
"""Check if the current platform is macOS.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if macOS, False otherwise
|
||||||
|
"""
|
||||||
|
return sys.platform == "darwin"
|
||||||
|
|
||||||
|
def is_linux() -> bool:
|
||||||
|
"""Check if the current platform is Linux.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if Linux, False otherwise
|
||||||
|
"""
|
||||||
|
return not (is_windows() or is_macos())
|
||||||
|
|
||||||
|
def get_browser_disable_options() -> list:
|
||||||
|
"""Get standard list of browser disable options for performance.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of command-line options to disable various browser features
|
||||||
|
"""
|
||||||
|
return [
|
||||||
|
"--disable-background-networking",
|
||||||
|
"--disable-background-timer-throttling",
|
||||||
|
"--disable-backgrounding-occluded-windows",
|
||||||
|
"--disable-breakpad",
|
||||||
|
"--disable-client-side-phishing-detection",
|
||||||
|
"--disable-component-extensions-with-background-pages",
|
||||||
|
"--disable-default-apps",
|
||||||
|
"--disable-extensions",
|
||||||
|
"--disable-features=TranslateUI",
|
||||||
|
"--disable-hang-monitor",
|
||||||
|
"--disable-ipc-flooding-protection",
|
||||||
|
"--disable-popup-blocking",
|
||||||
|
"--disable-prompt-on-repost",
|
||||||
|
"--disable-sync",
|
||||||
|
"--force-color-profile=srgb",
|
||||||
|
"--metrics-recording-only",
|
||||||
|
"--no-first-run",
|
||||||
|
"--password-store=basic",
|
||||||
|
"--use-mock-keychain",
|
||||||
|
]
|
||||||
@@ -163,6 +163,7 @@ class ManagedBrowser:
|
|||||||
)
|
)
|
||||||
|
|
||||||
# We'll monitor for a short time to make sure it starts properly, but won't keep monitoring
|
# We'll monitor for a short time to make sure it starts properly, but won't keep monitoring
|
||||||
|
await asyncio.sleep(0.5) # Give browser time to start
|
||||||
await self._initial_startup_check()
|
await self._initial_startup_check()
|
||||||
await asyncio.sleep(2) # Give browser time to start
|
await asyncio.sleep(2) # Give browser time to start
|
||||||
return f"http://{self.host}:{self.debugging_port}"
|
return f"http://{self.host}:{self.debugging_port}"
|
||||||
|
|||||||
@@ -555,7 +555,6 @@ class BrowserProfiler:
|
|||||||
else:
|
else:
|
||||||
self.logger.error(f"Invalid choice. Please enter a number between 1 and {exit_option}.", tag="MENU")
|
self.logger.error(f"Invalid choice. Please enter a number between 1 and {exit_option}.", tag="MENU")
|
||||||
|
|
||||||
|
|
||||||
async def launch_standalone_browser(self,
|
async def launch_standalone_browser(self,
|
||||||
browser_type: str = "chromium",
|
browser_type: str = "chromium",
|
||||||
user_data_dir: Optional[str] = None,
|
user_data_dir: Optional[str] = None,
|
||||||
|
|||||||
@@ -9,6 +9,26 @@ from crawl4ai import (
|
|||||||
CrawlResult
|
CrawlResult
|
||||||
)
|
)
|
||||||
|
|
||||||
|
async def example_cdp():
|
||||||
|
browser_conf = BrowserConfig(
|
||||||
|
headless=False,
|
||||||
|
cdp_url="http://localhost:9223"
|
||||||
|
)
|
||||||
|
crawler_config = CrawlerRunConfig(
|
||||||
|
session_id="test",
|
||||||
|
js_code = """(() => { return {"result": "Hello World!"} })()""",
|
||||||
|
js_only=True
|
||||||
|
)
|
||||||
|
async with AsyncWebCrawler(
|
||||||
|
config=browser_conf,
|
||||||
|
verbose=True,
|
||||||
|
) as crawler:
|
||||||
|
result : CrawlResult = await crawler.arun(
|
||||||
|
url="https://www.helloworld.org",
|
||||||
|
config=crawler_config,
|
||||||
|
)
|
||||||
|
print(result.js_execution_result)
|
||||||
|
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
browser_config = BrowserConfig(headless=True, verbose=True)
|
browser_config = BrowserConfig(headless=True, verbose=True)
|
||||||
@@ -16,18 +36,15 @@ async def main():
|
|||||||
crawler_config = CrawlerRunConfig(
|
crawler_config = CrawlerRunConfig(
|
||||||
cache_mode=CacheMode.BYPASS,
|
cache_mode=CacheMode.BYPASS,
|
||||||
markdown_generator=DefaultMarkdownGenerator(
|
markdown_generator=DefaultMarkdownGenerator(
|
||||||
# content_filter=PruningContentFilter(
|
content_filter=PruningContentFilter(
|
||||||
# threshold=0.48, threshold_type="fixed", min_word_threshold=0
|
threshold=0.48, threshold_type="fixed", min_word_threshold=0
|
||||||
# )
|
)
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
result : CrawlResult = await crawler.arun(
|
result : CrawlResult = await crawler.arun(
|
||||||
# url="https://www.helloworld.org", config=crawler_config
|
url="https://www.helloworld.org", config=crawler_config
|
||||||
url="https://www.kidocode.com", config=crawler_config
|
|
||||||
)
|
)
|
||||||
print(result.markdown.raw_markdown[:500])
|
print(result.markdown.raw_markdown[:500])
|
||||||
# print(result.model_dump())
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
asyncio.run(main())
|
asyncio.run(main())
|
||||||
|
|||||||
190
tests/browser/test_browser_manager.py
Normal file
190
tests/browser/test_browser_manager.py
Normal file
@@ -0,0 +1,190 @@
|
|||||||
|
"""Test examples for BrowserManager.
|
||||||
|
|
||||||
|
These examples demonstrate the functionality of BrowserManager
|
||||||
|
and serve as functional tests.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
# Add the project root to Python path if running directly
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||||
|
|
||||||
|
from crawl4ai.browser import BrowserManager
|
||||||
|
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
|
from crawl4ai.async_logger import AsyncLogger
|
||||||
|
|
||||||
|
# Create a logger for clear terminal output
|
||||||
|
logger = AsyncLogger(verbose=True, log_file=None)
|
||||||
|
|
||||||
|
async def test_basic_browser_manager():
|
||||||
|
"""Test basic BrowserManager functionality with default configuration."""
|
||||||
|
logger.info("Starting test_basic_browser_manager", tag="TEST")
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Create a browser manager with default config
|
||||||
|
manager = BrowserManager(logger=logger)
|
||||||
|
|
||||||
|
# Start the browser
|
||||||
|
await manager.start()
|
||||||
|
logger.info("Browser started successfully", tag="TEST")
|
||||||
|
|
||||||
|
# Get a page
|
||||||
|
crawler_config = CrawlerRunConfig(url="https://example.com")
|
||||||
|
page, context = await manager.get_page(crawler_config)
|
||||||
|
logger.info("Page created successfully", tag="TEST")
|
||||||
|
|
||||||
|
# Navigate to a website
|
||||||
|
await page.goto("https://example.com")
|
||||||
|
title = await page.title()
|
||||||
|
logger.info(f"Page title: {title}", tag="TEST")
|
||||||
|
|
||||||
|
# Clean up
|
||||||
|
await manager.close()
|
||||||
|
logger.success("test_basic_browser_manager completed successfully", tag="TEST")
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"test_basic_browser_manager failed: {str(e)}", tag="TEST")
|
||||||
|
return False
|
||||||
|
|
||||||
|
async def test_custom_browser_config():
|
||||||
|
"""Test BrowserManager with custom browser configuration."""
|
||||||
|
logger.info("Starting test_custom_browser_config", tag="TEST")
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Create a custom browser config
|
||||||
|
browser_config = BrowserConfig(
|
||||||
|
browser_type="chromium",
|
||||||
|
headless=True,
|
||||||
|
viewport_width=1280,
|
||||||
|
viewport_height=800,
|
||||||
|
light_mode=True
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create browser manager with the config
|
||||||
|
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||||
|
|
||||||
|
# Start the browser
|
||||||
|
await manager.start()
|
||||||
|
logger.info("Browser started successfully with custom config", tag="TEST")
|
||||||
|
|
||||||
|
# Get a page
|
||||||
|
crawler_config = CrawlerRunConfig(url="https://example.com")
|
||||||
|
page, context = await manager.get_page(crawler_config)
|
||||||
|
|
||||||
|
# Navigate to a website
|
||||||
|
await page.goto("https://example.com")
|
||||||
|
title = await page.title()
|
||||||
|
logger.info(f"Page title: {title}", tag="TEST")
|
||||||
|
|
||||||
|
# Verify viewport size
|
||||||
|
viewport_size = await page.evaluate("() => ({ width: window.innerWidth, height: window.innerHeight })")
|
||||||
|
logger.info(f"Viewport size: {viewport_size}", tag="TEST")
|
||||||
|
|
||||||
|
# Clean up
|
||||||
|
await manager.close()
|
||||||
|
logger.success("test_custom_browser_config completed successfully", tag="TEST")
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"test_custom_browser_config failed: {str(e)}", tag="TEST")
|
||||||
|
return False
|
||||||
|
|
||||||
|
async def test_multiple_pages():
|
||||||
|
"""Test BrowserManager with multiple pages."""
|
||||||
|
logger.info("Starting test_multiple_pages", tag="TEST")
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Create browser manager
|
||||||
|
manager = BrowserManager(logger=logger)
|
||||||
|
|
||||||
|
# Start the browser
|
||||||
|
await manager.start()
|
||||||
|
logger.info("Browser started successfully", tag="TEST")
|
||||||
|
|
||||||
|
# Create multiple pages
|
||||||
|
pages = []
|
||||||
|
urls = ["https://example.com", "https://example.org", "https://mozilla.org"]
|
||||||
|
|
||||||
|
for i, url in enumerate(urls):
|
||||||
|
crawler_config = CrawlerRunConfig(url=url)
|
||||||
|
page, context = await manager.get_page(crawler_config)
|
||||||
|
await page.goto(url)
|
||||||
|
pages.append((page, url))
|
||||||
|
logger.info(f"Created page {i+1} for {url}", tag="TEST")
|
||||||
|
|
||||||
|
# Verify all pages are loaded correctly
|
||||||
|
for i, (page, url) in enumerate(pages):
|
||||||
|
title = await page.title()
|
||||||
|
logger.info(f"Page {i+1} title: {title}", tag="TEST")
|
||||||
|
|
||||||
|
# Clean up
|
||||||
|
await manager.close()
|
||||||
|
logger.success("test_multiple_pages completed successfully", tag="TEST")
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"test_multiple_pages failed: {str(e)}", tag="TEST")
|
||||||
|
return False
|
||||||
|
|
||||||
|
async def test_session_management():
|
||||||
|
"""Test session management in BrowserManager."""
|
||||||
|
logger.info("Starting test_session_management", tag="TEST")
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Create browser manager
|
||||||
|
manager = BrowserManager(logger=logger)
|
||||||
|
|
||||||
|
# Start the browser
|
||||||
|
await manager.start()
|
||||||
|
logger.info("Browser started successfully", tag="TEST")
|
||||||
|
|
||||||
|
# Create a session
|
||||||
|
session_id = "test_session_1"
|
||||||
|
crawler_config = CrawlerRunConfig(url="https://example.com", session_id=session_id)
|
||||||
|
page1, context1 = await manager.get_page(crawler_config)
|
||||||
|
await page1.goto("https://example.com")
|
||||||
|
logger.info(f"Created session with ID: {session_id}", tag="TEST")
|
||||||
|
|
||||||
|
# Get the same session again
|
||||||
|
page2, context2 = await manager.get_page(crawler_config)
|
||||||
|
|
||||||
|
# Verify it's the same page/context
|
||||||
|
is_same_page = page1 == page2
|
||||||
|
is_same_context = context1 == context2
|
||||||
|
logger.info(f"Same page: {is_same_page}, Same context: {is_same_context}", tag="TEST")
|
||||||
|
|
||||||
|
# Kill the session
|
||||||
|
await manager.kill_session(session_id)
|
||||||
|
logger.info(f"Killed session with ID: {session_id}", tag="TEST")
|
||||||
|
|
||||||
|
# Clean up
|
||||||
|
await manager.close()
|
||||||
|
logger.success("test_session_management completed successfully", tag="TEST")
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"test_session_management failed: {str(e)}", tag="TEST")
|
||||||
|
return False
|
||||||
|
|
||||||
|
async def run_tests():
|
||||||
|
"""Run all tests sequentially."""
|
||||||
|
results = []
|
||||||
|
|
||||||
|
# results.append(await test_basic_browser_manager())
|
||||||
|
# results.append(await test_custom_browser_config())
|
||||||
|
# results.append(await test_multiple_pages())
|
||||||
|
results.append(await test_session_management())
|
||||||
|
|
||||||
|
# Print summary
|
||||||
|
total = len(results)
|
||||||
|
passed = sum(results)
|
||||||
|
logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
|
||||||
|
|
||||||
|
if passed == total:
|
||||||
|
logger.success("All tests passed!", tag="SUMMARY")
|
||||||
|
else:
|
||||||
|
logger.error(f"{total - passed} tests failed", tag="SUMMARY")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(run_tests())
|
||||||
160
tests/browser/test_builtin_strategy.py
Normal file
160
tests/browser/test_builtin_strategy.py
Normal file
@@ -0,0 +1,160 @@
|
|||||||
|
"""Test examples for BuiltinBrowserStrategy.
|
||||||
|
|
||||||
|
These examples demonstrate the functionality of BuiltinBrowserStrategy
|
||||||
|
and serve as functional tests.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Add the project root to Python path if running directly
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||||
|
|
||||||
|
from crawl4ai.browser import BrowserManager
|
||||||
|
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
|
from crawl4ai.async_logger import AsyncLogger
|
||||||
|
|
||||||
|
# Create a logger for clear terminal output
|
||||||
|
logger = AsyncLogger(verbose=True, log_file=None)
|
||||||
|
|
||||||
|
async def test_builtin_browser():
|
||||||
|
"""Test using a builtin browser that persists between sessions."""
|
||||||
|
logger.info("Testing builtin browser", tag="TEST")
|
||||||
|
|
||||||
|
browser_config = BrowserConfig(
|
||||||
|
browser_mode="builtin",
|
||||||
|
headless=True
|
||||||
|
)
|
||||||
|
|
||||||
|
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Start should connect to existing builtin browser or create one
|
||||||
|
await manager.start()
|
||||||
|
logger.info("Connected to builtin browser", tag="TEST")
|
||||||
|
|
||||||
|
# Test page creation
|
||||||
|
crawler_config = CrawlerRunConfig()
|
||||||
|
page, context = await manager.get_page(crawler_config)
|
||||||
|
|
||||||
|
# Test navigation
|
||||||
|
await page.goto("https://example.com")
|
||||||
|
title = await page.title()
|
||||||
|
logger.info(f"Page title: {title}", tag="TEST")
|
||||||
|
|
||||||
|
# Close manager (should not close the builtin browser)
|
||||||
|
await manager.close()
|
||||||
|
logger.info("First session closed", tag="TEST")
|
||||||
|
|
||||||
|
# Create a second manager to verify browser persistence
|
||||||
|
logger.info("Creating second session to verify persistence", tag="TEST")
|
||||||
|
manager2 = BrowserManager(browser_config=browser_config, logger=logger)
|
||||||
|
|
||||||
|
await manager2.start()
|
||||||
|
logger.info("Connected to existing builtin browser", tag="TEST")
|
||||||
|
|
||||||
|
page2, context2 = await manager2.get_page(crawler_config)
|
||||||
|
await page2.goto("https://example.org")
|
||||||
|
title2 = await page2.title()
|
||||||
|
logger.info(f"Second session page title: {title2}", tag="TEST")
|
||||||
|
|
||||||
|
await manager2.close()
|
||||||
|
logger.info("Second session closed successfully", tag="TEST")
|
||||||
|
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||||
|
try:
|
||||||
|
await manager.close()
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
return False
|
||||||
|
|
||||||
|
async def test_builtin_browser_status():
|
||||||
|
"""Test getting status of the builtin browser."""
|
||||||
|
logger.info("Testing builtin browser status", tag="TEST")
|
||||||
|
|
||||||
|
from crawl4ai.browser.strategies import BuiltinBrowserStrategy
|
||||||
|
|
||||||
|
browser_config = BrowserConfig(
|
||||||
|
browser_mode="builtin",
|
||||||
|
headless=True
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create strategy directly to access its status methods
|
||||||
|
strategy = BuiltinBrowserStrategy(browser_config, logger)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Get status before starting (should be not running)
|
||||||
|
status_before = await strategy.get_builtin_browser_status()
|
||||||
|
logger.info(f"Initial status: {status_before}", tag="TEST")
|
||||||
|
|
||||||
|
# Start the browser
|
||||||
|
await strategy.start()
|
||||||
|
logger.info("Browser started successfully", tag="TEST")
|
||||||
|
|
||||||
|
# Get status after starting
|
||||||
|
status_after = await strategy.get_builtin_browser_status()
|
||||||
|
logger.info(f"Status after start: {status_after}", tag="TEST")
|
||||||
|
|
||||||
|
# Create a page to verify functionality
|
||||||
|
crawler_config = CrawlerRunConfig()
|
||||||
|
page, context = await strategy.get_page(crawler_config)
|
||||||
|
await page.goto("https://example.com")
|
||||||
|
title = await page.title()
|
||||||
|
logger.info(f"Page title: {title}", tag="TEST")
|
||||||
|
|
||||||
|
# Close strategy (should not kill the builtin browser)
|
||||||
|
await strategy.close()
|
||||||
|
logger.info("Strategy closed successfully", tag="TEST")
|
||||||
|
|
||||||
|
# Create a new strategy object
|
||||||
|
strategy2 = BuiltinBrowserStrategy(browser_config, logger)
|
||||||
|
|
||||||
|
# Get status again (should still be running)
|
||||||
|
status_final = await strategy2.get_builtin_browser_status()
|
||||||
|
logger.info(f"Final status: {status_final}", tag="TEST")
|
||||||
|
|
||||||
|
# Verify that the status shows the browser is running
|
||||||
|
is_running = status_final.get('running', False)
|
||||||
|
logger.info(f"Builtin browser persistence confirmed: {is_running}", tag="TEST")
|
||||||
|
|
||||||
|
# Kill the builtin browser to clean up
|
||||||
|
logger.info("Killing builtin browser", tag="TEST")
|
||||||
|
success = await strategy2.kill_builtin_browser()
|
||||||
|
logger.info(f"Killed builtin browser successfully: {success}", tag="TEST")
|
||||||
|
|
||||||
|
return is_running and success
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||||
|
try:
|
||||||
|
await strategy.close()
|
||||||
|
|
||||||
|
# Try to kill the builtin browser to clean up
|
||||||
|
strategy2 = BuiltinBrowserStrategy(browser_config, logger)
|
||||||
|
await strategy2.kill_builtin_browser()
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
return False
|
||||||
|
|
||||||
|
async def run_tests():
|
||||||
|
"""Run all tests sequentially."""
|
||||||
|
results = []
|
||||||
|
|
||||||
|
results.append(await test_builtin_browser())
|
||||||
|
results.append(await test_builtin_browser_status())
|
||||||
|
|
||||||
|
# Print summary
|
||||||
|
total = len(results)
|
||||||
|
passed = sum(results)
|
||||||
|
logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
|
||||||
|
|
||||||
|
if passed == total:
|
||||||
|
logger.success("All tests passed!", tag="SUMMARY")
|
||||||
|
else:
|
||||||
|
logger.error(f"{total - passed} tests failed", tag="SUMMARY")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(run_tests())
|
||||||
227
tests/browser/test_cdp_strategy.py
Normal file
227
tests/browser/test_cdp_strategy.py
Normal file
@@ -0,0 +1,227 @@
|
|||||||
|
"""Test examples for CDPBrowserStrategy.
|
||||||
|
|
||||||
|
These examples demonstrate the functionality of CDPBrowserStrategy
|
||||||
|
and serve as functional tests.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Add the project root to Python path if running directly
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||||
|
|
||||||
|
from crawl4ai.browser import BrowserManager
|
||||||
|
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
|
from crawl4ai.async_logger import AsyncLogger
|
||||||
|
|
||||||
|
# Create a logger for clear terminal output
|
||||||
|
logger = AsyncLogger(verbose=True, log_file=None)
|
||||||
|
|
||||||
|
async def test_cdp_launch_connect():
|
||||||
|
"""Test launching a browser and connecting via CDP."""
|
||||||
|
logger.info("Testing launch and connect via CDP", tag="TEST")
|
||||||
|
|
||||||
|
browser_config = BrowserConfig(
|
||||||
|
use_managed_browser=True,
|
||||||
|
headless=True
|
||||||
|
)
|
||||||
|
|
||||||
|
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||||
|
|
||||||
|
try:
|
||||||
|
await manager.start()
|
||||||
|
logger.info("Browser launched and connected via CDP", tag="TEST")
|
||||||
|
|
||||||
|
# Test with multiple pages
|
||||||
|
pages = []
|
||||||
|
for i in range(3):
|
||||||
|
crawler_config = CrawlerRunConfig()
|
||||||
|
page, context = await manager.get_page(crawler_config)
|
||||||
|
await page.goto(f"https://example.com?test={i}")
|
||||||
|
pages.append(page)
|
||||||
|
logger.info(f"Created page {i+1}", tag="TEST")
|
||||||
|
|
||||||
|
# Verify all pages are working
|
||||||
|
for i, page in enumerate(pages):
|
||||||
|
title = await page.title()
|
||||||
|
logger.info(f"Page {i+1} title: {title}", tag="TEST")
|
||||||
|
|
||||||
|
await manager.close()
|
||||||
|
logger.info("Browser closed successfully", tag="TEST")
|
||||||
|
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||||
|
try:
|
||||||
|
await manager.close()
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
return False
|
||||||
|
|
||||||
|
async def test_cdp_with_user_data_dir():
|
||||||
|
"""Test CDP browser with a user data directory."""
|
||||||
|
logger.info("Testing CDP browser with user data directory", tag="TEST")
|
||||||
|
|
||||||
|
# Create a temporary user data directory
|
||||||
|
import tempfile
|
||||||
|
user_data_dir = tempfile.mkdtemp(prefix="crawl4ai-test-")
|
||||||
|
logger.info(f"Created temporary user data directory: {user_data_dir}", tag="TEST")
|
||||||
|
|
||||||
|
browser_config = BrowserConfig(
|
||||||
|
use_managed_browser=True,
|
||||||
|
headless=True,
|
||||||
|
user_data_dir=user_data_dir
|
||||||
|
)
|
||||||
|
|
||||||
|
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||||
|
|
||||||
|
try:
|
||||||
|
await manager.start()
|
||||||
|
logger.info("Browser launched with user data directory", tag="TEST")
|
||||||
|
|
||||||
|
# Navigate to a page and store some data
|
||||||
|
crawler_config = CrawlerRunConfig()
|
||||||
|
page, context = await manager.get_page(crawler_config)
|
||||||
|
|
||||||
|
# Set a cookie
|
||||||
|
await context.add_cookies([{
|
||||||
|
"name": "test_cookie",
|
||||||
|
"value": "test_value",
|
||||||
|
"url": "https://example.com"
|
||||||
|
}])
|
||||||
|
|
||||||
|
# Visit the site
|
||||||
|
await page.goto("https://example.com")
|
||||||
|
|
||||||
|
# Verify cookie was set
|
||||||
|
cookies = await context.cookies(["https://example.com"])
|
||||||
|
has_test_cookie = any(cookie["name"] == "test_cookie" for cookie in cookies)
|
||||||
|
logger.info(f"Cookie set successfully: {has_test_cookie}", tag="TEST")
|
||||||
|
|
||||||
|
# Close the browser
|
||||||
|
await manager.close()
|
||||||
|
logger.info("First browser session closed", tag="TEST")
|
||||||
|
|
||||||
|
# Start a new browser with the same user data directory
|
||||||
|
logger.info("Starting second browser session with same user data directory", tag="TEST")
|
||||||
|
manager2 = BrowserManager(browser_config=browser_config, logger=logger)
|
||||||
|
await manager2.start()
|
||||||
|
|
||||||
|
# Get a new page and check if the cookie persists
|
||||||
|
page2, context2 = await manager2.get_page(crawler_config)
|
||||||
|
await page2.goto("https://example.com")
|
||||||
|
|
||||||
|
# Verify cookie persisted
|
||||||
|
cookies2 = await context2.cookies(["https://example.com"])
|
||||||
|
has_test_cookie2 = any(cookie["name"] == "test_cookie" for cookie in cookies2)
|
||||||
|
logger.info(f"Cookie persisted across sessions: {has_test_cookie2}", tag="TEST")
|
||||||
|
|
||||||
|
# Clean up
|
||||||
|
await manager2.close()
|
||||||
|
|
||||||
|
# Remove temporary directory
|
||||||
|
import shutil
|
||||||
|
shutil.rmtree(user_data_dir, ignore_errors=True)
|
||||||
|
logger.info(f"Removed temporary user data directory", tag="TEST")
|
||||||
|
|
||||||
|
return has_test_cookie and has_test_cookie2
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||||
|
try:
|
||||||
|
await manager.close()
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Clean up temporary directory
|
||||||
|
try:
|
||||||
|
import shutil
|
||||||
|
shutil.rmtree(user_data_dir, ignore_errors=True)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
async def test_cdp_session_management():
|
||||||
|
"""Test session management with CDP browser."""
|
||||||
|
logger.info("Testing session management with CDP browser", tag="TEST")
|
||||||
|
|
||||||
|
browser_config = BrowserConfig(
|
||||||
|
use_managed_browser=True,
|
||||||
|
headless=True
|
||||||
|
)
|
||||||
|
|
||||||
|
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||||
|
|
||||||
|
try:
|
||||||
|
await manager.start()
|
||||||
|
logger.info("Browser launched successfully", tag="TEST")
|
||||||
|
|
||||||
|
# Create two sessions
|
||||||
|
session1_id = "test_session_1"
|
||||||
|
session2_id = "test_session_2"
|
||||||
|
|
||||||
|
# Set up first session
|
||||||
|
crawler_config1 = CrawlerRunConfig(session_id=session1_id)
|
||||||
|
page1, context1 = await manager.get_page(crawler_config1)
|
||||||
|
await page1.goto("https://example.com")
|
||||||
|
await page1.evaluate("localStorage.setItem('session1_data', 'test_value')")
|
||||||
|
logger.info(f"Set up session 1 with ID: {session1_id}", tag="TEST")
|
||||||
|
|
||||||
|
# Set up second session
|
||||||
|
crawler_config2 = CrawlerRunConfig(session_id=session2_id)
|
||||||
|
page2, context2 = await manager.get_page(crawler_config2)
|
||||||
|
await page2.goto("https://example.org")
|
||||||
|
await page2.evaluate("localStorage.setItem('session2_data', 'test_value2')")
|
||||||
|
logger.info(f"Set up session 2 with ID: {session2_id}", tag="TEST")
|
||||||
|
|
||||||
|
# Get first session again
|
||||||
|
page1_again, _ = await manager.get_page(crawler_config1)
|
||||||
|
|
||||||
|
# Verify it's the same page and data persists
|
||||||
|
is_same_page = page1 == page1_again
|
||||||
|
data1 = await page1_again.evaluate("localStorage.getItem('session1_data')")
|
||||||
|
logger.info(f"Session 1 reuse successful: {is_same_page}, data: {data1}", tag="TEST")
|
||||||
|
|
||||||
|
# Kill first session
|
||||||
|
await manager.kill_session(session1_id)
|
||||||
|
logger.info(f"Killed session 1", tag="TEST")
|
||||||
|
|
||||||
|
# Verify second session still works
|
||||||
|
data2 = await page2.evaluate("localStorage.getItem('session2_data')")
|
||||||
|
logger.info(f"Session 2 still functional after killing session 1, data: {data2}", tag="TEST")
|
||||||
|
|
||||||
|
# Clean up
|
||||||
|
await manager.close()
|
||||||
|
logger.info("Browser closed successfully", tag="TEST")
|
||||||
|
|
||||||
|
return is_same_page and data1 == "test_value" and data2 == "test_value2"
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||||
|
try:
|
||||||
|
await manager.close()
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
return False
|
||||||
|
|
||||||
|
async def run_tests():
|
||||||
|
"""Run all tests sequentially."""
|
||||||
|
results = []
|
||||||
|
|
||||||
|
results.append(await test_cdp_launch_connect())
|
||||||
|
results.append(await test_cdp_with_user_data_dir())
|
||||||
|
results.append(await test_cdp_session_management())
|
||||||
|
|
||||||
|
# Print summary
|
||||||
|
total = len(results)
|
||||||
|
passed = sum(results)
|
||||||
|
logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
|
||||||
|
|
||||||
|
if passed == total:
|
||||||
|
logger.success("All tests passed!", tag="SUMMARY")
|
||||||
|
else:
|
||||||
|
logger.error(f"{total - passed} tests failed", tag="SUMMARY")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(run_tests())
|
||||||
77
tests/browser/test_combined.py
Normal file
77
tests/browser/test_combined.py
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
"""Combined test runner for all browser module tests.
|
||||||
|
|
||||||
|
This script runs all the browser module tests in sequence and
|
||||||
|
provides a comprehensive summary.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
|
||||||
|
# Add the project root to Python path if running directly
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||||
|
|
||||||
|
from crawl4ai.async_logger import AsyncLogger
|
||||||
|
|
||||||
|
# Create a logger for clear terminal output
|
||||||
|
logger = AsyncLogger(verbose=True, log_file=None)
|
||||||
|
|
||||||
|
async def run_test_module(module_name, header):
    """Dynamically import one test module, run its suite, and return elapsed seconds.

    Args:
        module_name: Module name inside ``tests.browser`` (e.g. ``"test_profiles"``).
        header: Human-readable heading printed before the run.

    Returns:
        float: Wall-clock seconds the module's ``run_tests()`` took.
    """
    separator = '-' * 30
    logger.info(f"\n{separator}", tag="TEST")
    logger.info(f"RUNNING: {header}", tag="TEST")
    logger.info(f"{separator}", tag="TEST")

    # Resolve the module at runtime so new test files need no hard-coded import.
    test_module = __import__(f"tests.browser.{module_name}", fromlist=["run_tests"])

    # Time the whole suite for the summary report.
    started = time.time()
    await test_module.run_tests()
    elapsed = time.time() - started

    logger.info(f"Time taken: {elapsed:.2f} seconds", tag="TIMING")

    return elapsed
|
||||||
|
|
||||||
|
async def main():
    """Run all browser test modules and print a timing summary."""
    logger.info("STARTING COMPREHENSIVE BROWSER MODULE TESTS", tag="MAIN")

    # Modules to execute, paired with a human-readable heading.
    test_modules = [
        ("test_browser_manager", "Browser Manager Tests"),
        ("test_playwright_strategy", "Playwright Strategy Tests"),
        ("test_cdp_strategy", "CDP Strategy Tests"),
        ("test_builtin_strategy", "Builtin Browser Strategy Tests"),
        ("test_profiles", "Profile Management Tests")
    ]

    # Run each module, recording how long it took; a crashed module is
    # logged and left out of `timings` so the summary flags it below.
    timings = {}
    for name, title in test_modules:
        try:
            timings[name] = await run_test_module(name, title)
        except Exception as e:
            logger.error(f"Error running {name}: {str(e)}", tag="ERROR")

    # Print summary
    logger.info("\n\nTEST SUMMARY:", tag="SUMMARY")
    logger.info(f"{'-'*50}", tag="SUMMARY")
    for name, title in test_modules:
        if name in timings:
            logger.info(f"{title}: {timings[name]:.2f} seconds", tag="SUMMARY")
        else:
            logger.error(f"{title}: FAILED TO RUN", tag="SUMMARY")
    logger.info(f"{'-'*50}", tag="SUMMARY")
    logger.info(f"Total time: {sum(timings.values()):.2f} seconds", tag="SUMMARY")
|
||||||
|
|
||||||
|
# Allow running the combined test driver directly as a script.
if __name__ == "__main__":
    asyncio.run(main())
|
||||||
275
tests/browser/test_playwright_strategy.py
Normal file
275
tests/browser/test_playwright_strategy.py
Normal file
@@ -0,0 +1,275 @@
|
|||||||
|
"""Test examples for PlaywrightBrowserStrategy.
|
||||||
|
|
||||||
|
These examples demonstrate the functionality of PlaywrightBrowserStrategy
|
||||||
|
and serve as functional tests.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Add the project root to Python path if running directly
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||||
|
|
||||||
|
from crawl4ai.browser import BrowserManager
|
||||||
|
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
|
from crawl4ai.async_logger import AsyncLogger
|
||||||
|
|
||||||
|
# Create a logger for clear terminal output
|
||||||
|
logger = AsyncLogger(verbose=True, log_file=None)
|
||||||
|
|
||||||
|
async def test_playwright_basic():
    """Test basic Playwright browser functionality.

    Launches a headless browser through BrowserManager, obtains a page,
    navigates to example.com, reads the title, and closes the browser.

    Returns:
        bool: True if every step succeeded, False otherwise.
    """
    logger.info("Testing standard Playwright browser", tag="TEST")

    # Create browser config for standard Playwright
    browser_config = BrowserConfig(
        headless=True,
        viewport_width=1280,
        viewport_height=800
    )

    # Create browser manager with the config
    manager = BrowserManager(browser_config=browser_config, logger=logger)

    try:
        # Start the browser
        await manager.start()
        logger.info("Browser started successfully", tag="TEST")

        # Create crawler config
        crawler_config = CrawlerRunConfig(url="https://example.com")

        # Get a page
        page, context = await manager.get_page(crawler_config)
        logger.info("Got page successfully", tag="TEST")

        # Navigate to a website
        await page.goto("https://example.com")
        logger.info("Navigated to example.com", tag="TEST")

        # Get page title
        title = await page.title()
        logger.info(f"Page title: {title}", tag="TEST")

        # Clean up
        await manager.close()
        logger.info("Browser closed successfully", tag="TEST")

        return True
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Best-effort cleanup; catch Exception (not bare except) so that
        # KeyboardInterrupt/SystemExit still propagate.
        try:
            await manager.close()
        except Exception:
            pass
        return False
|
||||||
|
|
||||||
|
async def test_playwright_text_mode():
    """Test Playwright browser in text-only mode.

    Starts a browser with ``text_mode=True`` and verifies that image
    requests are blocked by waiting (with a short timeout) for any
    image-like request while loading an image-heavy page.

    Returns:
        bool: True if the test ran to completion, False on error.
    """
    logger.info("Testing Playwright text mode", tag="TEST")

    # Create browser config with text mode enabled
    browser_config = BrowserConfig(
        headless=True,
        text_mode=True  # Enable text-only mode
    )

    # Create browser manager with the config
    manager = BrowserManager(browser_config=browser_config, logger=logger)

    try:
        # Start the browser
        await manager.start()
        logger.info("Browser started successfully in text mode", tag="TEST")

        # Get a page
        crawler_config = CrawlerRunConfig(url="https://example.com")
        page, context = await manager.get_page(crawler_config)

        # Navigate to a website
        await page.goto("https://example.com")
        logger.info("Navigated to example.com", tag="TEST")

        # Get page title
        title = await page.title()
        logger.info(f"Page title: {title}", tag="TEST")

        # Check if images are blocked in text mode by watching for any
        # image request. The try wraps the whole `async with` because
        # expect_request's timeout can also be raised on context exit.
        has_images = False
        try:
            async with page.expect_request("**/*.{png,jpg,jpeg,gif,webp,svg}", timeout=1000) as request_info:
                # Try to load a page with images
                await page.goto("https://picsum.photos/", wait_until="domcontentloaded")
                await request_info.value
            has_images = True
        except Exception:
            # Timeout without image requests means text mode is working
            has_images = False

        logger.info(f"Text mode image blocking working: {not has_images}", tag="TEST")

        # Clean up
        await manager.close()
        logger.info("Browser closed successfully", tag="TEST")

        return True
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Best-effort cleanup; narrow except so interrupts propagate.
        try:
            await manager.close()
        except Exception:
            pass
        return False
|
||||||
|
|
||||||
|
async def test_playwright_context_reuse():
    """Test context caching and reuse with identical configurations.

    Two crawler configs with identical browser-relevant parameters should
    share one browser context; a config with a different viewport should
    get a distinct context.

    Returns:
        bool: True if both the reuse and the separation checks pass.
    """
    logger.info("Testing context reuse with identical configurations", tag="TEST")

    # Create browser config
    browser_config = BrowserConfig(headless=True)

    # Create browser manager
    manager = BrowserManager(browser_config=browser_config, logger=logger)

    try:
        # Start the browser
        await manager.start()
        logger.info("Browser started successfully", tag="TEST")

        # Create identical crawler configs
        crawler_config1 = CrawlerRunConfig(
            url="https://example.com",
            viewport_width=1280,
            viewport_height=800
        )

        crawler_config2 = CrawlerRunConfig(
            url="https://example.org",  # Different URL but same browser parameters
            viewport_width=1280,
            viewport_height=800
        )

        # Get pages with these configs
        page1, context1 = await manager.get_page(crawler_config1)
        page2, context2 = await manager.get_page(crawler_config2)

        # Check if contexts are reused
        is_same_context = context1 == context2
        logger.info(f"Contexts reused: {is_same_context}", tag="TEST")

        # Now try with a different config
        crawler_config3 = CrawlerRunConfig(
            url="https://example.net",
            viewport_width=800,  # Different viewport size
            viewport_height=600
        )

        page3, context3 = await manager.get_page(crawler_config3)

        # This should be a different context
        is_different_context = context1 != context3
        logger.info(f"Different contexts for different configs: {is_different_context}", tag="TEST")

        # Clean up
        await manager.close()
        logger.info("Browser closed successfully", tag="TEST")

        # Both tests should pass for success
        return is_same_context and is_different_context
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Best-effort cleanup; narrow except so interrupts propagate.
        try:
            await manager.close()
        except Exception:
            pass
        return False
|
||||||
|
|
||||||
|
async def test_playwright_session_management():
    """Test session management with Playwright browser.

    Creates two named sessions, stores data in each via localStorage,
    verifies that requesting a session again returns the same page/context
    with its data intact, then kills one session and confirms the other
    still works.

    Returns:
        bool: True if session reuse, data persistence, and isolation all hold.
    """
    logger.info("Testing session management with Playwright browser", tag="TEST")

    browser_config = BrowserConfig(
        headless=True
    )

    manager = BrowserManager(browser_config=browser_config, logger=logger)

    try:
        await manager.start()
        logger.info("Browser launched successfully", tag="TEST")

        # Create two sessions
        session1_id = "playwright_session_1"
        session2_id = "playwright_session_2"

        # Set up first session
        crawler_config1 = CrawlerRunConfig(session_id=session1_id, url="https://example.com")
        page1, context1 = await manager.get_page(crawler_config1)
        await page1.goto("https://example.com")
        await page1.evaluate("localStorage.setItem('playwright_session1_data', 'test_value1')")
        logger.info(f"Set up session 1 with ID: {session1_id}", tag="TEST")

        # Set up second session
        crawler_config2 = CrawlerRunConfig(session_id=session2_id, url="https://example.org")
        page2, context2 = await manager.get_page(crawler_config2)
        await page2.goto("https://example.org")
        await page2.evaluate("localStorage.setItem('playwright_session2_data', 'test_value2')")
        logger.info(f"Set up session 2 with ID: {session2_id}", tag="TEST")

        # Get first session again
        page1_again, context1_again = await manager.get_page(crawler_config1)

        # Verify it's the same page and data persists
        is_same_page = page1 == page1_again
        is_same_context = context1 == context1_again
        data1 = await page1_again.evaluate("localStorage.getItem('playwright_session1_data')")
        logger.info(f"Session 1 reuse successful: {is_same_page}, data: {data1}", tag="TEST")

        # Kill first session
        await manager.kill_session(session1_id)
        logger.info(f"Killed session 1", tag="TEST")

        # Verify second session still works
        data2 = await page2.evaluate("localStorage.getItem('playwright_session2_data')")
        logger.info(f"Session 2 still functional after killing session 1, data: {data2}", tag="TEST")

        # Clean up
        await manager.close()
        logger.info("Browser closed successfully", tag="TEST")

        return is_same_page and is_same_context and data1 == "test_value1" and data2 == "test_value2"
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Best-effort cleanup; narrow except so interrupts propagate.
        try:
            await manager.close()
        except Exception:
            pass
        return False
|
||||||
|
|
||||||
|
async def run_tests():
    """Execute every Playwright strategy test and log a pass/fail summary."""
    suite = (
        test_playwright_basic,
        test_playwright_text_mode,
        test_playwright_context_reuse,
        test_playwright_session_management,
    )
    results = []
    for test_fn in suite:
        results.append(await test_fn())

    # Summarize outcomes
    total = len(results)
    passed = sum(results)
    logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")

    if passed != total:
        logger.error(f"{total - passed} tests failed", tag="SUMMARY")
    else:
        logger.success("All tests passed!", tag="SUMMARY")
|
||||||
|
|
||||||
|
# Allow running this test file directly as a script.
if __name__ == "__main__":
    asyncio.run(run_tests())
|
||||||
176
tests/browser/test_profiles.py
Normal file
176
tests/browser/test_profiles.py
Normal file
@@ -0,0 +1,176 @@
|
|||||||
|
"""Test examples for BrowserProfileManager.
|
||||||
|
|
||||||
|
These examples demonstrate the functionality of BrowserProfileManager
|
||||||
|
and serve as functional tests.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import uuid
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
# Add the project root to Python path if running directly
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||||
|
|
||||||
|
from crawl4ai.browser import BrowserManager, BrowserProfileManager
|
||||||
|
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
|
from crawl4ai.async_logger import AsyncLogger
|
||||||
|
|
||||||
|
# Create a logger for clear terminal output
|
||||||
|
logger = AsyncLogger(verbose=True, log_file=None)
|
||||||
|
|
||||||
|
async def test_profile_creation():
    """Test creating and managing browser profiles.

    Creates a dummy Chrome-style profile directory on disk, then verifies
    that BrowserProfileManager can list it, resolve its path, and delete it.

    Returns:
        bool: True if all four checks (found, path match, deleted, removed) pass.
    """
    logger.info("Testing profile creation and management", tag="TEST")

    profile_manager = BrowserProfileManager(logger=logger)
    # Defined up-front so the except-branch cleanup cannot raise NameError
    # if a failure happens before the path is built.
    profile_path = None

    try:
        # List existing profiles
        profiles = profile_manager.list_profiles()
        logger.info(f"Found {len(profiles)} existing profiles", tag="TEST")

        # Generate a unique profile name for testing
        test_profile_name = f"test-profile-{uuid.uuid4().hex[:8]}"

        # Create a test profile directory
        profile_path = os.path.join(profile_manager.profiles_dir, test_profile_name)
        os.makedirs(os.path.join(profile_path, "Default"), exist_ok=True)

        # Create a dummy Preferences file to simulate a Chrome profile
        with open(os.path.join(profile_path, "Default", "Preferences"), "w") as f:
            f.write("{\"test\": true}")

        logger.info(f"Created test profile at: {profile_path}", tag="TEST")

        # Verify the profile is now in the list
        profiles = profile_manager.list_profiles()
        profile_found = any(p["name"] == test_profile_name for p in profiles)
        logger.info(f"Profile found in list: {profile_found}", tag="TEST")

        # Try to get the profile path
        retrieved_path = profile_manager.get_profile_path(test_profile_name)
        path_match = retrieved_path == profile_path
        logger.info(f"Retrieved correct profile path: {path_match}", tag="TEST")

        # Delete the profile
        success = profile_manager.delete_profile(test_profile_name)
        logger.info(f"Profile deletion successful: {success}", tag="TEST")

        # Verify it's gone
        profiles_after = profile_manager.list_profiles()
        profile_removed = not any(p["name"] == test_profile_name for p in profiles_after)
        logger.info(f"Profile removed from list: {profile_removed}", tag="TEST")

        # Clean up just in case
        if os.path.exists(profile_path):
            shutil.rmtree(profile_path, ignore_errors=True)

        return profile_found and path_match and success and profile_removed
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Clean up test directory; guard handles a failure before the path existed.
        if profile_path and os.path.exists(profile_path):
            shutil.rmtree(profile_path, ignore_errors=True)
        return False
|
||||||
|
|
||||||
|
async def test_profile_with_browser():
    """Test using a profile with a browser.

    Creates a dummy profile directory, starts a browser bound to it, writes
    localStorage data, then starts a second browser on the same profile and
    verifies that the data persisted across sessions.

    Returns:
        bool: True if the data persisted and the test profile was deleted.
    """
    logger.info("Testing using a profile with a browser", tag="TEST")

    profile_manager = BrowserProfileManager(logger=logger)
    test_profile_name = f"test-browser-profile-{uuid.uuid4().hex[:8]}"
    profile_path = None
    # Initialized to None so the except branch can close only what was started,
    # preventing leaked browser processes on failure.
    manager = None
    manager2 = None

    try:
        # Create a test profile directory
        profile_path = os.path.join(profile_manager.profiles_dir, test_profile_name)
        os.makedirs(os.path.join(profile_path, "Default"), exist_ok=True)

        # Create a dummy Preferences file to simulate a Chrome profile
        with open(os.path.join(profile_path, "Default", "Preferences"), "w") as f:
            f.write("{\"test\": true}")

        logger.info(f"Created test profile at: {profile_path}", tag="TEST")

        # Now use this profile with a browser
        browser_config = BrowserConfig(
            user_data_dir=profile_path,
            headless=True
        )

        manager = BrowserManager(browser_config=browser_config, logger=logger)

        # Start the browser with the profile
        await manager.start()
        logger.info("Browser started with profile", tag="TEST")

        # Create a page
        crawler_config = CrawlerRunConfig()
        page, context = await manager.get_page(crawler_config)

        # Navigate and set some data to verify profile works
        await page.goto("https://example.com")
        await page.evaluate("localStorage.setItem('test_data', 'profile_value')")

        # Close browser
        await manager.close()
        logger.info("First browser session closed", tag="TEST")

        # Create a new browser with the same profile
        manager2 = BrowserManager(browser_config=browser_config, logger=logger)
        await manager2.start()
        logger.info("Second browser session started with same profile", tag="TEST")

        # Get a page and check if the data persists
        page2, context2 = await manager2.get_page(crawler_config)
        await page2.goto("https://example.com")
        data = await page2.evaluate("localStorage.getItem('test_data')")

        # Verify data persisted
        data_persisted = data == "profile_value"
        logger.info(f"Data persisted across sessions: {data_persisted}", tag="TEST")

        # Clean up
        await manager2.close()
        logger.info("Second browser session closed", tag="TEST")

        # Delete the test profile
        success = profile_manager.delete_profile(test_profile_name)
        logger.info(f"Test profile deleted: {success}", tag="TEST")

        return data_persisted and success
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Best-effort cleanup: close any browsers that were started,
        # then remove the test profile directory.
        for mgr in (manager, manager2):
            if mgr is not None:
                try:
                    await mgr.close()
                except Exception:
                    pass
        if profile_path and os.path.exists(profile_path):
            shutil.rmtree(profile_path, ignore_errors=True)
        return False
|
||||||
|
|
||||||
|
async def run_tests():
    """Execute all profile-management tests and report a summary."""
    results = [
        await test_profile_creation(),
        await test_profile_with_browser(),
    ]

    # Summarize outcomes
    total = len(results)
    passed = sum(results)
    logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")

    if passed == total:
        logger.success("All tests passed!", tag="SUMMARY")
    else:
        logger.error(f"{total - passed} tests failed", tag="SUMMARY")
|
||||||
|
|
||||||
|
# Allow running this test file directly as a script.
if __name__ == "__main__":
    asyncio.run(run_tests())
|
||||||
Reference in New Issue
Block a user