refactor(browser): improve parallel crawling and browser management

Remove PagePoolConfig in favor of direct page management in browser strategies.
Add get_pages() method for efficient parallel page creation.
Improve storage state handling and persistence.
Add comprehensive parallel crawling tests and performance analysis.

BREAKING CHANGE: Removed PagePoolConfig class and related functionality.
This commit is contained in:
UncleCode
2025-03-23 18:53:24 +08:00
parent 4ab0893ffb
commit 0094cac675
7 changed files with 2115 additions and 351 deletions

View File

@@ -156,41 +156,6 @@ def is_empty_value(value: Any) -> bool:
return False return False
class PagePoolConfig:
    """Configuration for the browser page-pooling mechanism.

    A page pool keeps pre-warmed browser pages ready for immediate use,
    which improves throughput when many URLs are processed in sequence.

    Attributes:
        mode (str): Pooling mode, "static" or "adaptive". "static" uses the
            fixed size in ``static_size``; "adaptive" derives the size from
            available system memory. Default: "static".
        static_size (int): Pool size used when ``mode`` is "static".
            Default: 10.
        memory_per_page (int): Estimated memory footprint of one page in MB,
            used by the "adaptive" calculation. Default: 200.
        memory_threshold (float): Maximum fraction of system memory the
            "adaptive" mode may consume. Default: 0.7 (70%).
        timeout (float): Seconds to wait for a pooled page before creating a
            fresh one instead. Default: 5.0.
    """

    def __init__(self,
                 mode="static",
                 static_size=10,
                 memory_per_page=200,
                 memory_threshold=0.7,
                 timeout=5.0):
        # Plain value object: every argument is stored as given, unvalidated.
        self.timeout = timeout
        self.memory_threshold = memory_threshold
        self.memory_per_page = memory_per_page
        self.static_size = static_size
        self.mode = mode
class BrowserConfig: class BrowserConfig:
""" """
Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy. Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy.
@@ -235,7 +200,7 @@ class BrowserConfig:
Default: False. Default: False.
downloads_path (str or None): Directory to store downloaded files. If None and accept_downloads is True, downloads_path (str or None): Directory to store downloaded files. If None and accept_downloads is True,
a default path will be created. Default: None. a default path will be created. Default: None.
storage_state (str or dict or None): Path or object describing storage state (cookies, localStorage). storage_state (str or dict or None): An in-memory storage state (cookies, localStorage).
Default: None. Default: None.
ignore_https_errors (bool): Ignore HTTPS certificate errors. Default: True. ignore_https_errors (bool): Ignore HTTPS certificate errors. Default: True.
java_script_enabled (bool): Enable JavaScript execution in pages. Default: True. java_script_enabled (bool): Enable JavaScript execution in pages. Default: True.
@@ -255,9 +220,6 @@ class BrowserConfig:
light_mode (bool): Disables certain background features for performance gains. Default: False. light_mode (bool): Disables certain background features for performance gains. Default: False.
extra_args (list): Additional command-line arguments passed to the browser. extra_args (list): Additional command-line arguments passed to the browser.
Default: []. Default: [].
page_pool_config (PagePoolConfig or None): Configuration for page pooling mechanism.
If None, page pooling is disabled.
Default: None.
""" """
def __init__( def __init__(
@@ -298,7 +260,6 @@ class BrowserConfig:
extra_args: list = None, extra_args: list = None,
debugging_port: int = 9222, debugging_port: int = 9222,
host: str = "localhost", host: str = "localhost",
page_pool_config: Optional[PagePoolConfig] = None,
): ):
self.browser_type = browser_type self.browser_type = browser_type
self.headless = headless self.headless = headless
@@ -337,7 +298,6 @@ class BrowserConfig:
self.verbose = verbose self.verbose = verbose
self.debugging_port = debugging_port self.debugging_port = debugging_port
self.host = host self.host = host
self.page_pool_config = page_pool_config
fa_user_agenr_generator = ValidUAGenerator() fa_user_agenr_generator = ValidUAGenerator()
if self.user_agent_mode == "random": if self.user_agent_mode == "random":
@@ -368,12 +328,6 @@ class BrowserConfig:
@staticmethod @staticmethod
def from_kwargs(kwargs: dict) -> "BrowserConfig": def from_kwargs(kwargs: dict) -> "BrowserConfig":
# Handle page_pool_config
page_pool_config = kwargs.get("page_pool_config")
if isinstance(page_pool_config, dict):
# If it's a dict, convert to PagePoolConfig
page_pool_config = PagePoolConfig(**page_pool_config)
return BrowserConfig( return BrowserConfig(
browser_type=kwargs.get("browser_type", "chromium"), browser_type=kwargs.get("browser_type", "chromium"),
headless=kwargs.get("headless", True), headless=kwargs.get("headless", True),
@@ -407,7 +361,6 @@ class BrowserConfig:
extra_args=kwargs.get("extra_args", []), extra_args=kwargs.get("extra_args", []),
debugging_port=kwargs.get("debugging_port", 9222), debugging_port=kwargs.get("debugging_port", 9222),
host=kwargs.get("host", "localhost"), host=kwargs.get("host", "localhost"),
page_pool_config=page_pool_config,
) )
def to_dict(self): def to_dict(self):
@@ -442,7 +395,6 @@ class BrowserConfig:
"verbose": self.verbose, "verbose": self.verbose,
"debugging_port": self.debugging_port, "debugging_port": self.debugging_port,
"host": self.host, "host": self.host,
"page_pool_config": self.page_pool_config,
} }
def clone(self, **kwargs): def clone(self, **kwargs):

View File

@@ -2,11 +2,14 @@
This module provides a central browser management class that uses the This module provides a central browser management class that uses the
strategy pattern internally while maintaining the existing API. strategy pattern internally while maintaining the existing API.
It also implements a page pooling mechanism for improved performance.
""" """
import asyncio import asyncio
import time import time
from typing import Optional, Tuple, Dict, Any import os
import psutil
from typing import Optional, Tuple, Dict, Any, List, Set
from playwright.async_api import Page, BrowserContext from playwright.async_api import Page, BrowserContext
@@ -118,6 +121,28 @@ class BrowserManager:
return page, context return page, context
async def get_pages(self, crawlerRunConfig: CrawlerRunConfig, count: int = 1) -> List[Tuple[Page, BrowserContext]]:
    """Create several pages that share one configuration.

    Useful for parallel crawling of multiple URLs: page creation is
    delegated to the active browser strategy, and any session bookkeeping
    the strategy maintains is mirrored back onto this manager.

    Args:
        crawlerRunConfig: Configuration applied to every page.
        count: Number of pages to create (default 1).

    Returns:
        List of (Page, Context) tuples, one entry per created page.
    """
    page_context_pairs = await self._strategy.get_pages(crawlerRunConfig, count)
    # Keep the manager's session map in step with the strategy's, when the
    # strategy exposes one.
    if hasattr(self._strategy, 'sessions'):
        self.sessions = self._strategy.sessions
    return page_context_pairs
async def kill_session(self, session_id: str): async def kill_session(self, session_id: str):
"""Kill a browser session and clean up resources. """Kill a browser session and clean up resources.

View File

@@ -23,7 +23,7 @@ from ..async_configs import BrowserConfig, CrawlerRunConfig
from ..config import DOWNLOAD_PAGE_TIMEOUT from ..config import DOWNLOAD_PAGE_TIMEOUT
from ..js_snippet import load_js_script from ..js_snippet import load_js_script
from ..utils import get_home_folder from ..utils import get_home_folder
from .utils import get_playwright, get_browser_executable, get_browser_disable_options, create_temp_directory, is_windows from .utils import get_playwright, get_browser_executable, get_browser_disable_options, create_temp_directory, is_windows, is_browser_running
from playwright_stealth import StealthConfig from playwright_stealth import StealthConfig
@@ -85,6 +85,22 @@ class BaseBrowserStrategy(ABC):
""" """
pass pass
async def get_pages(self, crawlerRunConfig: CrawlerRunConfig, count: int = 1) -> List[Tuple[Page, BrowserContext]]:
    """Create ``count`` pages, each built from the same run configuration.

    Default implementation: one :meth:`get_page` call per requested page,
    awaited sequentially. Strategies with a cheaper bulk path may override.

    Args:
        crawlerRunConfig: Configuration applied to every page.
        count: Number of pages to create (default 1).

    Returns:
        List of (Page, Context) tuples.
    """
    # Pages are created one at a time, matching the per-page behavior of
    # get_page exactly.
    return [await self.get_page(crawlerRunConfig) for _ in range(count)]
@abstractmethod @abstractmethod
async def close(self): async def close(self):
"""Close the browser and clean up resources.""" """Close the browser and clean up resources."""
@@ -136,9 +152,6 @@ class BaseBrowserStrategy(ABC):
if self.config.cookies: if self.config.cookies:
await context.add_cookies(self.config.cookies) await context.add_cookies(self.config.cookies)
if self.config.storage_state:
await context.storage_state(path=None)
if self.config.accept_downloads: if self.config.accept_downloads:
context.set_default_timeout(DOWNLOAD_PAGE_TIMEOUT) context.set_default_timeout(DOWNLOAD_PAGE_TIMEOUT)
context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT) context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT)
@@ -161,7 +174,7 @@ class BaseBrowserStrategy(ABC):
{ {
"name": "cookiesEnabled", "name": "cookiesEnabled",
"value": "true", "value": "true",
"url": crawlerRunConfig.url if crawlerRunConfig else "https://crawl4ai.com/", "url": crawlerRunConfig and crawlerRunConfig.url or "https://crawl4ai.com/",
} }
] ]
) )
@@ -324,12 +337,31 @@ class PlaywrightBrowserStrategy(BaseBrowserStrategy):
"viewport": viewport_settings, "viewport": viewport_settings,
"proxy": proxy_settings, "proxy": proxy_settings,
"accept_downloads": self.config.accept_downloads, "accept_downloads": self.config.accept_downloads,
"storage_state": self.config.storage_state,
"ignore_https_errors": self.config.ignore_https_errors, "ignore_https_errors": self.config.ignore_https_errors,
"device_scale_factor": 1.0, "device_scale_factor": 1.0,
"java_script_enabled": self.config.java_script_enabled, "java_script_enabled": self.config.java_script_enabled,
} }
# Handle storage state properly - this is key for persistence
if self.config.storage_state:
context_settings["storage_state"] = self.config.storage_state
if self.logger:
if isinstance(self.config.storage_state, str):
self.logger.debug(f"Using storage state from file: {self.config.storage_state}", tag="BROWSER")
else:
self.logger.debug("Using storage state from config object", tag="BROWSER")
if self.config.user_data_dir:
context_settings["storage_state"] = os.path.join(
self.config.user_data_dir, "Default", "storage_state.json"
)
# Create the file if it doesn't exist
if not os.path.exists(context_settings["storage_state"]):
os.makedirs(os.path.dirname(context_settings["storage_state"]), exist_ok=True)
with open(context_settings["storage_state"], "w") as f:
json.dump({}, f)
if crawlerRunConfig: if crawlerRunConfig:
# Check if there is value for crawlerRunConfig.proxy_config set add that to context # Check if there is value for crawlerRunConfig.proxy_config set add that to context
if crawlerRunConfig.proxy_config: if crawlerRunConfig.proxy_config:
@@ -428,6 +460,21 @@ class PlaywrightBrowserStrategy(BaseBrowserStrategy):
if self.config.sleep_on_close: if self.config.sleep_on_close:
await asyncio.sleep(0.5) await asyncio.sleep(0.5)
# If we have a user_data_dir configured, ensure persistence of storage state
if self.config.user_data_dir and self.browser and self.default_context:
for context in self.browser.contexts:
try:
await context.storage_state(path=os.path.join(self.config.user_data_dir, "Default", "storage_state.json"))
if self.logger:
self.logger.debug("Ensuring storage state is persisted before closing browser", tag="BROWSER")
except Exception as e:
if self.logger:
self.logger.warning(
message="Failed to ensure storage persistence: {error}",
tag="BROWSER",
params={"error": str(e)}
)
# Close all sessions # Close all sessions
session_ids = list(self.sessions.keys()) session_ids = list(self.sessions.keys())
for session_id in session_ids: for session_id in session_ids:
@@ -582,7 +629,7 @@ class CDPBrowserStrategy(BaseBrowserStrategy):
Returns: Returns:
List of command-line arguments for the browser List of command-line arguments for the browser
""" """
browser_path = get_browser_executable(self.config.browser_type) browser_path = await get_browser_executable(self.config.browser_type)
base_args = [browser_path] base_args = [browser_path]
if self.config.browser_type == "chromium": if self.config.browser_type == "chromium":
@@ -727,6 +774,22 @@ class CDPBrowserStrategy(BaseBrowserStrategy):
if self.config.sleep_on_close: if self.config.sleep_on_close:
await asyncio.sleep(0.5) await asyncio.sleep(0.5)
# If we have a user_data_dir configured, ensure persistence of storage state
if self.config.user_data_dir and self.browser:
try:
# Create a brief sleep to allow the browser to flush any pending operations
# This helps ensure all storage state (localStorage, cookies, etc.) gets saved
await asyncio.sleep(0.3)
if self.logger:
self.logger.debug("Ensuring storage state is persisted before closing CDP browser", tag="BROWSER")
except Exception as e:
if self.logger:
self.logger.warning(
message="Failed to ensure storage persistence: {error}",
tag="BROWSER",
params={"error": str(e)}
)
# Close all sessions # Close all sessions
session_ids = list(self.sessions.keys()) session_ids = list(self.sessions.keys())
for session_id in session_ids: for session_id in session_ids:
@@ -775,19 +838,46 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
logger: Logger for recording events and errors logger: Logger for recording events and errors
""" """
super().__init__(config, logger) super().__init__(config, logger)
self.builtin_browser_dir = os.path.join(get_home_folder(), "builtin-browser") self.builtin_browser_dir = os.path.join(get_home_folder(), "builtin-browser") if not self.config.user_data_dir else self.config.user_data_dir
self.builtin_config_file = os.path.join(self.builtin_browser_dir, "browser_config.json") self.builtin_config_file = os.path.join(self.builtin_browser_dir, "browser_config.json")
# Raise error if user data dir is already engaged
if self._check_user_dir_is_engaged(self.builtin_browser_dir):
raise Exception(f"User data directory {self.builtin_browser_dir} is already engaged by another browser instance.")
os.makedirs(self.builtin_browser_dir, exist_ok=True) os.makedirs(self.builtin_browser_dir, exist_ok=True)
def _check_user_dir_is_engaged(self, user_data_dir: str) -> bool:
    """Check if the user data directory is already in use.

    Reads the built-in browser config file and scans every entry of its
    "port_map" for a browser record whose "user_data_dir" matches.

    Args:
        user_data_dir: Directory to test for an existing claim.

    Returns:
        bool: True if the directory is engaged, False otherwise — including
        when the config file is absent or unreadable (best-effort check).
    """
    if not os.path.exists(self.builtin_config_file):
        return False
    try:
        with open(self.builtin_config_file, 'r') as f:
            browser_info_dict = json.load(f)
        # Only the recorded directories matter here, so iterate values()
        # (the original bound an unused port key as well).
        return any(
            browser_info.get("user_data_dir") == user_data_dir
            for browser_info in browser_info_dict.get("port_map", {}).values()
        )
    except Exception as e:
        if self.logger:
            self.logger.error(f"Error reading built-in browser config: {str(e)}", tag="BUILTIN")
    return False
async def start(self): async def start(self):
"""Start or connect to the built-in browser. """Start or connect to the built-in browser.
Returns: Returns:
self: For method chaining self: For method chaining
""" """
# Check for existing built-in browser # Check for existing built-in browser (get_browser_info already checks if running)
browser_info = self.get_builtin_browser_info() browser_info = self.get_browser_info()
if browser_info and self._is_browser_running(browser_info.get('pid')): if browser_info:
if self.logger: if self.logger:
self.logger.info(f"Using existing built-in browser at {browser_info.get('cdp_url')}", tag="BROWSER") self.logger.info(f"Using existing built-in browser at {browser_info.get('cdp_url')}", tag="BROWSER")
self.config.cdp_url = browser_info.get('cdp_url') self.config.cdp_url = browser_info.get('cdp_url')
@@ -797,7 +887,7 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
cdp_url = await self.launch_builtin_browser( cdp_url = await self.launch_builtin_browser(
browser_type=self.config.browser_type, browser_type=self.config.browser_type,
debugging_port=self.config.debugging_port, debugging_port=self.config.debugging_port,
headless=self.config.headless headless=self.config.headless,
) )
if not cdp_url: if not cdp_url:
if self.logger: if self.logger:
@@ -808,55 +898,62 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
# Call parent class implementation with updated CDP URL # Call parent class implementation with updated CDP URL
return await super().start() return await super().start()
def get_builtin_browser_info(self) -> Optional[Dict[str, Any]]: @classmethod
"""Get information about the built-in browser. def get_builtin_browser_info(cls, debugging_port: int, config_file: str, logger: Optional[AsyncLogger] = None) -> Optional[Dict[str, Any]]:
"""Get information about the built-in browser for a specific debugging port.
Returns:
dict: Browser information or None if no built-in browser is configured
"""
if not os.path.exists(self.builtin_config_file):
return None
try:
with open(self.builtin_config_file, 'r') as f:
browser_info = json.load(f)
# Check if the browser is still running
if not self._is_browser_running(browser_info.get('pid')):
if self.logger:
self.logger.warning("Built-in browser is not running", tag="BUILTIN")
return None
return browser_info
except Exception as e:
if self.logger:
self.logger.error(f"Error reading built-in browser config: {str(e)}", tag="BUILTIN")
return None
def _is_browser_running(self, pid: Optional[int]) -> bool:
"""Check if a process with the given PID is running.
Args: Args:
pid: Process ID to check debugging_port: The debugging port to look for
config_file: Path to the config file
logger: Optional logger for recording events
Returns: Returns:
bool: True if the process is running, False otherwise dict: Browser information or None if no running browser is configured for this port
""" """
if not pid: if not os.path.exists(config_file):
return False return None
try: try:
# Check if the process exists with open(config_file, 'r') as f:
if is_windows(): browser_info_dict = json.load(f)
process = subprocess.run(["tasklist", "/FI", f"PID eq {pid}"],
capture_output=True, text=True) # Get browser info from port map
return str(pid) in process.stdout if isinstance(browser_info_dict, dict) and "port_map" in browser_info_dict:
else: port_str = str(debugging_port)
# Unix-like systems if port_str in browser_info_dict["port_map"]:
os.kill(pid, 0) # This doesn't actually kill the process, just checks if it exists browser_info = browser_info_dict["port_map"][port_str]
return True
except (ProcessLookupError, PermissionError, OSError): # Check if the browser is still running
return False if not is_browser_running(browser_info.get('pid')):
if logger:
logger.warning(f"Built-in browser on port {debugging_port} is not running", tag="BUILTIN")
# Remove this port from the dictionary
del browser_info_dict["port_map"][port_str]
with open(config_file, 'w') as f:
json.dump(browser_info_dict, f, indent=2)
return None
return browser_info
return None
except Exception as e:
if logger:
logger.error(f"Error reading built-in browser config: {str(e)}", tag="BUILTIN")
return None
def get_browser_info(self) -> Optional[Dict[str, Any]]:
    """Get information about the current built-in browser instance.

    Convenience wrapper: forwards this instance's debugging port, config
    file path, and logger to the classmethod lookup.

    Returns:
        dict: Browser information or None if no running browser is configured
    """
    return self.get_builtin_browser_info(
        debugging_port=self.config.debugging_port,
        config_file=self.builtin_config_file,
        logger=self.logger
    )
async def launch_builtin_browser(self, async def launch_builtin_browser(self,
browser_type: str = "chromium", browser_type: str = "chromium",
@@ -873,18 +970,27 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
str: CDP URL for the browser, or None if launch failed str: CDP URL for the browser, or None if launch failed
""" """
# Check if there's an existing browser still running # Check if there's an existing browser still running
browser_info = self.get_builtin_browser_info() browser_info = self.get_builtin_browser_info(
if browser_info and self._is_browser_running(browser_info.get('pid')): debugging_port=debugging_port,
config_file=self.builtin_config_file,
logger=self.logger
)
if browser_info:
if self.logger: if self.logger:
self.logger.info("Built-in browser is already running", tag="BUILTIN") self.logger.info(f"Built-in browser is already running on port {debugging_port}", tag="BUILTIN")
return browser_info.get('cdp_url') return browser_info.get('cdp_url')
# Create a user data directory for the built-in browser # Create a user data directory for the built-in browser
user_data_dir = os.path.join(self.builtin_browser_dir, "user_data") user_data_dir = os.path.join(self.builtin_browser_dir, "user_data")
# Raise error if user data dir is already engaged
if self._check_user_dir_is_engaged(user_data_dir):
raise Exception(f"User data directory {user_data_dir} is already engaged by another browser instance.")
# Create the user data directory if it doesn't exist
os.makedirs(user_data_dir, exist_ok=True) os.makedirs(user_data_dir, exist_ok=True)
# Prepare browser launch arguments # Prepare browser launch arguments
browser_path = get_browser_executable(browser_type) browser_path = await get_browser_executable(browser_type)
if browser_type == "chromium": if browser_type == "chromium":
args = [ args = [
browser_path, browser_path,
@@ -957,7 +1063,7 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
if self.logger: if self.logger:
self.logger.warning(f"Could not verify browser: {str(e)}", tag="BUILTIN") self.logger.warning(f"Could not verify browser: {str(e)}", tag="BUILTIN")
# Save browser info # Create browser info
browser_info = { browser_info = {
'pid': process.pid, 'pid': process.pid,
'cdp_url': cdp_url, 'cdp_url': cdp_url,
@@ -968,8 +1074,31 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
'config': config_json 'config': config_json
} }
# Read existing config file if it exists
port_map = {}
if os.path.exists(self.builtin_config_file):
try:
with open(self.builtin_config_file, 'r') as f:
existing_data = json.load(f)
# Check if it already uses port mapping
if isinstance(existing_data, dict) and "port_map" in existing_data:
port_map = existing_data["port_map"]
# Convert legacy format to port mapping
elif isinstance(existing_data, dict) and "debugging_port" in existing_data:
old_port = str(existing_data.get("debugging_port"))
if self._is_browser_running(existing_data.get("pid")):
port_map[old_port] = existing_data
except Exception as e:
if self.logger:
self.logger.warning(f"Could not read existing config: {str(e)}", tag="BUILTIN")
# Add/update this browser in the port map
port_map[str(debugging_port)] = browser_info
# Write updated config
with open(self.builtin_config_file, 'w') as f: with open(self.builtin_config_file, 'w') as f:
json.dump(browser_info, f, indent=2) json.dump({"port_map": port_map}, f, indent=2)
# Detach from the browser process - don't keep any references # Detach from the browser process - don't keep any references
# This is important to allow the Python script to exit while the browser continues running # This is important to allow the Python script to exit while the browser continues running
@@ -990,10 +1119,10 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
Returns: Returns:
bool: True if the browser was killed, False otherwise bool: True if the browser was killed, False otherwise
""" """
browser_info = self.get_builtin_browser_info() browser_info = self.get_browser_info()
if not browser_info: if not browser_info:
if self.logger: if self.logger:
self.logger.warning("No built-in browser found", tag="BUILTIN") self.logger.warning(f"No built-in browser found on port {self.config.debugging_port}", tag="BUILTIN")
return False return False
pid = browser_info.get('pid') pid = browser_info.get('pid')
@@ -1007,16 +1136,29 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
os.kill(pid, signal.SIGTERM) os.kill(pid, signal.SIGTERM)
# Wait for termination # Wait for termination
for _ in range(5): for _ in range(5):
if not self._is_browser_running(pid): if not is_browser_running(pid):
break break
await asyncio.sleep(0.5) await asyncio.sleep(0.5)
else: else:
# Force kill if still running # Force kill if still running
os.kill(pid, signal.SIGKILL) os.kill(pid, signal.SIGKILL)
# Remove config file # Update config file to remove this browser
if os.path.exists(self.builtin_config_file): with open(self.builtin_config_file, 'r') as f:
os.unlink(self.builtin_config_file) browser_info_dict = json.load(f)
# Remove this port from the dictionary
port_str = str(self.config.debugging_port)
if port_str in browser_info_dict.get("port_map", {}):
del browser_info_dict["port_map"][port_str]
with open(self.builtin_config_file, 'w') as f:
json.dump(browser_info_dict, f, indent=2)
# Remove user data directory if it exists
if os.path.exists(self.builtin_browser_dir):
shutil.rmtree(self.builtin_browser_dir)
# Clear the browser info cache
self.browser = None
self.temp_dir = None
self.shutting_down = True
if self.logger: if self.logger:
self.logger.success("Built-in browser terminated", tag="BUILTIN") self.logger.success("Built-in browser terminated", tag="BUILTIN")
@@ -1032,17 +1174,29 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
Returns: Returns:
dict: Status information with running, cdp_url, and info fields dict: Status information with running, cdp_url, and info fields
""" """
browser_info = self.get_builtin_browser_info() browser_info = self.get_browser_info()
if not browser_info: if not browser_info:
return { return {
'running': False, 'running': False,
'cdp_url': None, 'cdp_url': None,
'info': None 'info': None,
'port': self.config.debugging_port
} }
return { return {
'running': True, 'running': True,
'cdp_url': browser_info.get('cdp_url'), 'cdp_url': browser_info.get('cdp_url'),
'info': browser_info 'info': browser_info,
'port': self.config.debugging_port
} }
# Override the close method to handle built-in browser cleanup
async def close(self):
"""Close the built-in browser and clean up resources."""
# Call parent class close method
await super().close()
# Clean up built-in browser if we created it
if self.shutting_down:
await self.kill_builtin_browser()

View File

@@ -8,14 +8,18 @@ and Playwright instance management.
import asyncio import asyncio
import os import os
import sys import sys
import platform import time
import tempfile import tempfile
from typing import Optional, Any import subprocess
from typing import Optional
from playwright.async_api import async_playwright from playwright.async_api import async_playwright
from ..async_logger import AsyncLogger
from ..utils import get_chromium_path from ..utils import get_chromium_path
from ..async_configs import BrowserConfig, CrawlerRunConfig
from ..async_logger import AsyncLogger
_playwright_instance = None _playwright_instance = None
@@ -30,7 +34,7 @@ async def get_playwright():
_playwright_instance = await async_playwright().start() _playwright_instance = await async_playwright().start()
return _playwright_instance return _playwright_instance
def get_browser_executable(browser_type: str) -> str: async def get_browser_executable(browser_type: str) -> str:
"""Get the path to browser executable, with platform-specific handling. """Get the path to browser executable, with platform-specific handling.
Args: Args:
@@ -39,7 +43,7 @@ def get_browser_executable(browser_type: str) -> str:
Returns: Returns:
Path to browser executable Path to browser executable
""" """
return get_chromium_path(browser_type) return await get_chromium_path(browser_type)
def create_temp_directory(prefix="browser-profile-") -> str: def create_temp_directory(prefix="browser-profile-") -> str:
"""Create a temporary directory for browser data. """Create a temporary directory for browser data.
@@ -76,6 +80,31 @@ def is_linux() -> bool:
""" """
return not (is_windows() or is_macos()) return not (is_windows() or is_macos())
def is_browser_running(pid: Optional[int]) -> bool:
    """Check if a process with the given PID is running.

    Args:
        pid: Process ID to check. Falsy values (None, 0) are treated as
            "not running".

    Returns:
        bool: True if the process is running, False otherwise.
    """
    if not pid:
        return False
    try:
        if os.name == "nt":
            # Windows: ask tasklist for this exact PID, in headerless CSV
            # form so the PID column can be matched as a whole field — a
            # bare substring test would wrongly match PID 123 against 1234.
            result = subprocess.run(
                ["tasklist", "/FI", f"PID eq {pid}", "/FO", "CSV", "/NH"],
                capture_output=True, text=True)
            return any(f'"{pid}"' in line for line in result.stdout.splitlines())
        # Unix-like systems: signal 0 performs existence/permission checks
        # without actually delivering a signal.
        os.kill(pid, 0)
        return True
    except (ProcessLookupError, PermissionError, OSError):
        # NOTE(review): PermissionError on Unix means the process exists but
        # belongs to another user; it is treated as "not running" here to
        # preserve the original semantics ("running and controllable by us").
        return False
def get_browser_disable_options() -> list: def get_browser_disable_options() -> list:
"""Get standard list of browser disable options for performance. """Get standard list of browser disable options for performance.
@@ -103,3 +132,197 @@ def get_browser_disable_options() -> list:
"--password-store=basic", "--password-store=basic",
"--use-mock-keychain", "--use-mock-keychain",
] ]
async def find_optimal_browser_config(total_urls=50, verbose=True, rate_limit_delay=0.2):
"""Find optimal browser configuration for crawling a specific number of URLs.
Args:
total_urls: Number of URLs to crawl
verbose: Whether to print progress
rate_limit_delay: Delay between page loads to avoid rate limiting
Returns:
dict: Contains fastest, lowest_memory, and optimal configurations
"""
from .manager import BrowserManager
if verbose:
print(f"\n=== Finding optimal configuration for crawling {total_urls} URLs ===\n")
# Generate test URLs with timestamp to avoid caching
timestamp = int(time.time())
urls = [f"https://example.com/page_{i}?t={timestamp}" for i in range(total_urls)]
# Limit browser configurations to test (1 browser to max 10)
max_browsers = min(10, total_urls)
configs_to_test = []
# Generate configurations (browser count, pages distribution)
for num_browsers in range(1, max_browsers + 1):
base_pages = total_urls // num_browsers
remainder = total_urls % num_browsers
# Create distribution array like [3, 3, 2, 2] (some browsers get one more page)
if remainder > 0:
distribution = [base_pages + 1] * remainder + [base_pages] * (num_browsers - remainder)
else:
distribution = [base_pages] * num_browsers
configs_to_test.append((num_browsers, distribution))
results = []
# Test each configuration
for browser_count, page_distribution in configs_to_test:
if verbose:
print(f"Testing {browser_count} browsers with distribution {tuple(page_distribution)}")
try:
# Track memory if possible
try:
import psutil
process = psutil.Process()
start_memory = process.memory_info().rss / (1024 * 1024) # MB
except ImportError:
if verbose:
print("Memory tracking not available (psutil not installed)")
start_memory = 0
# Start browsers in parallel
managers = []
start_tasks = []
start_time = time.time()
logger = AsyncLogger(verbose=True, log_file=None)
for i in range(browser_count):
config = BrowserConfig(headless=True)
manager = BrowserManager(browser_config=config, logger=logger)
start_tasks.append(manager.start())
managers.append(manager)
await asyncio.gather(*start_tasks)
# Distribute URLs among browsers
urls_per_manager = {}
url_index = 0
for i, manager in enumerate(managers):
pages_for_this_browser = page_distribution[i]
end_index = url_index + pages_for_this_browser
urls_per_manager[manager] = urls[url_index:end_index]
url_index = end_index
# Create pages for each browser
all_pages = []
for manager, manager_urls in urls_per_manager.items():
if not manager_urls:
continue
pages = await manager.get_pages(CrawlerRunConfig(), count=len(manager_urls))
all_pages.extend(zip(pages, manager_urls))
# Crawl pages with delay to avoid rate limiting
async def crawl_page(page_ctx, url):
page, _ = page_ctx
try:
await page.goto(url)
if rate_limit_delay > 0:
await asyncio.sleep(rate_limit_delay)
title = await page.title()
return title
finally:
await page.close()
crawl_start = time.time()
crawl_tasks = [crawl_page(page_ctx, url) for page_ctx, url in all_pages]
await asyncio.gather(*crawl_tasks)
crawl_time = time.time() - crawl_start
total_time = time.time() - start_time
# Measure final memory usage
if start_memory > 0:
end_memory = process.memory_info().rss / (1024 * 1024)
memory_used = end_memory - start_memory
else:
memory_used = 0
# Close all browsers
for manager in managers:
await manager.close()
# Calculate metrics
pages_per_second = total_urls / crawl_time
# Calculate efficiency score (higher is better)
# This balances speed vs memory
if memory_used > 0:
efficiency = pages_per_second / (memory_used + 1)
else:
efficiency = pages_per_second
# Store result
result = {
"browser_count": browser_count,
"distribution": tuple(page_distribution),
"crawl_time": crawl_time,
"total_time": total_time,
"memory_used": memory_used,
"pages_per_second": pages_per_second,
"efficiency": efficiency
}
results.append(result)
if verbose:
print(f" ✓ Crawled {total_urls} pages in {crawl_time:.2f}s ({pages_per_second:.1f} pages/sec)")
if memory_used > 0:
print(f" ✓ Memory used: {memory_used:.1f}MB ({memory_used/total_urls:.1f}MB per page)")
print(f" ✓ Efficiency score: {efficiency:.4f}")
except Exception as e:
if verbose:
print(f" ✗ Error: {str(e)}")
# Clean up
for manager in managers:
try:
await manager.close()
except:
pass
# If no successful results, return None
if not results:
return None
# Find best configurations
fastest = sorted(results, key=lambda x: x["crawl_time"])[0]
# Only consider memory if available
memory_results = [r for r in results if r["memory_used"] > 0]
if memory_results:
lowest_memory = sorted(memory_results, key=lambda x: x["memory_used"])[0]
else:
lowest_memory = fastest
# Find most efficient (balanced speed vs memory)
optimal = sorted(results, key=lambda x: x["efficiency"], reverse=True)[0]
# Print summary
if verbose:
print("\n=== OPTIMAL CONFIGURATIONS ===")
print(f"⚡ Fastest: {fastest['browser_count']} browsers {fastest['distribution']}")
print(f" {fastest['crawl_time']:.2f}s, {fastest['pages_per_second']:.1f} pages/sec")
print(f"💾 Memory-efficient: {lowest_memory['browser_count']} browsers {lowest_memory['distribution']}")
if lowest_memory["memory_used"] > 0:
print(f" {lowest_memory['memory_used']:.1f}MB, {lowest_memory['memory_used']/total_urls:.2f}MB per page")
print(f"🌟 Balanced optimal: {optimal['browser_count']} browsers {optimal['distribution']}")
print(f" {optimal['crawl_time']:.2f}s, {optimal['pages_per_second']:.1f} pages/sec, score: {optimal['efficiency']:.4f}")
return {
"fastest": fastest,
"lowest_memory": lowest_memory,
"optimal": optimal,
"all_configs": results
}

View File

@@ -171,9 +171,9 @@ async def run_tests():
"""Run all tests sequentially.""" """Run all tests sequentially."""
results = [] results = []
# results.append(await test_basic_browser_manager()) results.append(await test_basic_browser_manager())
# results.append(await test_custom_browser_config()) results.append(await test_custom_browser_config())
# results.append(await test_multiple_pages()) results.append(await test_multiple_pages())
results.append(await test_session_management()) results.append(await test_session_management())
# Print summary # Print summary

View File

@@ -1,12 +1,12 @@
""" """
Test script for browser_profiler and builtin browser functionality. Test script for builtin browser functionality in the browser module.
This script tests: This script tests:
1. Creating a builtin browser 1. Creating a builtin browser
2. Getting browser information 2. Getting browser information
3. Killing the browser 3. Killing the browser
4. Restarting the browser 4. Restarting the browser
5. Testing crawling with different browser modes 5. Testing operations with different browser strategies
6. Testing edge cases 6. Testing edge cases
""" """
@@ -14,13 +14,20 @@ import asyncio
import os import os
import sys import sys
import time import time
from colorama import Fore, init from typing import List, Dict, Any
from colorama import Fore, Style, init
# Add the project root to the path for imports # Add the project root to the path for imports
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
from crawl4ai.browser_profiler import BrowserProfiler from rich.console import Console
from crawl4ai.async_webcrawler import AsyncWebCrawler from rich.table import Table
from rich.panel import Panel
from rich.text import Text
from rich.box import Box, SIMPLE
from crawl4ai.browser import BrowserManager
from crawl4ai.browser.strategies import BuiltinBrowserStrategy
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_logger import AsyncLogger from crawl4ai.async_logger import AsyncLogger
@@ -37,250 +44,751 @@ RESET = Fore.RESET
# Create logger # Create logger
logger = AsyncLogger(verbose=True) logger = AsyncLogger(verbose=True)
async def test_browser_profiler():
"""Test the BrowserProfiler class functionality"""
print(f"\n{INFO}========== Testing BrowserProfiler =========={RESET}")
# Initialize browser profiler async def test_builtin_browser_creation():
profiler = BrowserProfiler(logger=logger) """Test creating a builtin browser using the BrowserManager with BuiltinBrowserStrategy"""
print(f"\n{INFO}========== Testing Builtin Browser Creation =========={RESET}")
# Step 1: Check if builtin browser exists and kill it if it does # Step 1: Create a BrowserManager with builtin mode
print(f"\n{INFO}1. Checking if builtin browser exists{RESET}") print(f"\n{INFO}1. Creating BrowserManager with builtin mode{RESET}")
browser_info = profiler.get_builtin_browser_info() browser_config = BrowserConfig(browser_mode="builtin", headless=True, verbose=True)
if browser_info: manager = BrowserManager(browser_config=browser_config, logger=logger)
print(f"{SUCCESS}Builtin browser found: {browser_info['cdp_url']}{RESET}")
# Kill it to start with a clean state # Step 2: Check if we have a BuiltinBrowserStrategy
print(f"{INFO}Killing existing browser...{RESET}") print(f"\n{INFO}2. Checking if we have a BuiltinBrowserStrategy{RESET}")
await profiler.kill_builtin_browser() if isinstance(manager._strategy, BuiltinBrowserStrategy):
browser_info = profiler.get_builtin_browser_info() print(
if not browser_info: f"{SUCCESS}Correct strategy type: {manager._strategy.__class__.__name__}{RESET}"
print(f"{SUCCESS}Browser successfully killed{RESET}") )
else:
print(f"{ERROR}Failed to kill browser{RESET}")
else: else:
print(f"{WARNING}No builtin browser found{RESET}") print(
f"{ERROR}Wrong strategy type: {manager._strategy.__class__.__name__}{RESET}"
)
return None
# Step 2: Launch a new builtin browser # Step 3: Start the manager to launch or connect to builtin browser
print(f"\n{INFO}2. Launching new builtin browser{RESET}") print(f"\n{INFO}3. Starting the browser manager{RESET}")
cdp_url = await profiler.launch_builtin_browser(headless=True) try:
if cdp_url: await manager.start()
print(f"{SUCCESS}Builtin browser launched at: {cdp_url}{RESET}") print(f"{SUCCESS}Browser manager started successfully{RESET}")
else: except Exception as e:
print(f"{ERROR}Failed to launch builtin browser{RESET}") print(f"{ERROR}Failed to start browser manager: {str(e)}{RESET}")
return return None
# Step 3: Get and display browser information # Step 4: Get browser info from the strategy
print(f"\n{INFO}3. Getting browser information{RESET}") print(f"\n{INFO}4. Getting browser information{RESET}")
browser_info = profiler.get_builtin_browser_info() browser_info = manager._strategy.get_builtin_browser_info()
if browser_info: if browser_info:
print(f"{SUCCESS}Browser info retrieved:{RESET}") print(f"{SUCCESS}Browser info retrieved:{RESET}")
for key, value in browser_info.items(): for key, value in browser_info.items():
if key != 'config': # Skip the verbose config section if key != "config": # Skip the verbose config section
print(f" {key}: {value}") print(f" {key}: {value}")
cdp_url = browser_info.get("cdp_url")
print(f"{SUCCESS}CDP URL: {cdp_url}{RESET}")
else: else:
print(f"{ERROR}Failed to get browser information{RESET}") print(f"{ERROR}Failed to get browser information{RESET}")
cdp_url = None
# Step 4: Get browser status # Save manager for later tests
print(f"\n{INFO}4. Getting browser status{RESET}") return manager, cdp_url
status = await profiler.get_builtin_browser_status()
print(f"Running: {status['running']}")
print(f"CDP URL: {status['cdp_url']}")
# Pause to let the browser run for a moment
print(f"\n{INFO}Waiting for 2 seconds...{RESET}")
await asyncio.sleep(2)
return cdp_url # Return the CDP URL for the crawling tests async def test_page_operations(manager: BrowserManager):
"""Test page operations with the builtin browser"""
async def test_crawling_with_builtin_browser(cdp_url): print(
"""Test crawling with the builtin browser""" f"\n{INFO}========== Testing Page Operations with Builtin Browser =========={RESET}"
print(f"\n{INFO}========== Testing Crawling with Builtin Browser =========={RESET}")
# Step 1: Create a crawler with 'builtin' browser mode
print(f"\n{INFO}1. Creating crawler with 'builtin' browser mode{RESET}")
browser_config = BrowserConfig(
browser_mode="builtin",
headless=True
) )
crawler = AsyncWebCrawler(config=browser_config)
# Step 2: Test crawling without explicitly starting (should auto-start) # Step 1: Get a single page
print(f"\n{INFO}2. Testing auto-start with arun{RESET}") print(f"\n{INFO}1. Getting a single page{RESET}")
try: try:
result = await crawler.arun("https://crawl4ai.com") crawler_config = CrawlerRunConfig()
print(f"{SUCCESS}Auto-start crawling successful!{RESET}") page, context = await manager.get_page(crawler_config)
print(f" Got {len(result.markdown.raw_markdown)} chars of markdown content") print(f"{SUCCESS}Got page successfully{RESET}")
# Navigate to a test URL
await page.goto("https://example.com")
title = await page.title()
print(f"{SUCCESS}Page title: {title}{RESET}")
# Close the page
await page.close()
print(f"{SUCCESS}Page closed successfully{RESET}")
except Exception as e: except Exception as e:
print(f"{ERROR}Auto-start crawling failed: {str(e)}{RESET}") print(f"{ERROR}Page operation failed: {str(e)}{RESET}")
return False
# Close the crawler # Step 2: Get multiple pages
await crawler.close() print(f"\n{INFO}2. Getting multiple pages with get_pages(){RESET}")
# Step 3: Test with explicit start
print(f"\n{INFO}3. Testing with explicit start{RESET}")
crawler = AsyncWebCrawler(config=browser_config)
try: try:
await crawler.start() # Request 3 pages
print(f"{SUCCESS}Explicit start successful!{RESET}") crawler_config = CrawlerRunConfig()
result = await crawler.arun("https://example.com") pages = await manager.get_pages(crawler_config, count=3)
print(f" Got {len(result.markdown.raw_markdown)} chars of markdown content") print(f"{SUCCESS}Got {len(pages)} pages{RESET}")
# Try second time, no start needed
print(f"{INFO}Testing second arun call without start{RESET}")
result = await crawler.arun("https://example.com")
print(f" Got {len(result.markdown.raw_markdown)} chars of markdown content")
except Exception as e:
print(f"{ERROR}Explicit start crawling failed: {str(e)}{RESET}")
# Close the crawler # Test each page
await crawler.close() for i, (page, context) in enumerate(pages):
await page.goto(f"https://example.com?test={i}")
title = await page.title()
print(f"{SUCCESS}Page {i + 1} title: {title}{RESET}")
await page.close()
# Step 4: Test with context manager print(f"{SUCCESS}All pages tested and closed successfully{RESET}")
print(f"\n{INFO}4. Testing with context manager{RESET}")
try:
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun("https://httpbin.org/html")
print(f"{SUCCESS}Context manager crawling successful!{RESET}")
print(f" Got {len(result.markdown.raw_markdown)} chars of markdown content")
except Exception as e: except Exception as e:
print(f"{ERROR}Context manager crawling failed: {str(e)}{RESET}") print(f"{ERROR}Multiple page operation failed: {str(e)}{RESET}")
return False
return True return True
async def test_crawling_without_builtin_browser():
"""Test crawling after killing the builtin browser"""
print(f"\n{INFO}========== Testing Crawling Without Builtin Browser =========={RESET}")
# Step 1: Kill the builtin browser async def test_browser_status_management(manager: BrowserManager):
print(f"\n{INFO}1. Killing the builtin browser{RESET}") """Test browser status and management operations"""
profiler = BrowserProfiler(logger=logger) print(f"\n{INFO}========== Testing Browser Status and Management =========={RESET}")
await profiler.kill_builtin_browser()
# Step 2: Create a crawler with 'builtin' mode (should fall back to dedicated)
print(f"\n{INFO}2. Creating crawler with 'builtin' mode (should fall back){RESET}")
browser_config = BrowserConfig(
browser_mode="builtin",
headless=True
)
# Step 1: Get browser status
print(f"\n{INFO}1. Getting browser status{RESET}")
try: try:
async with AsyncWebCrawler(config=browser_config) as crawler: status = await manager._strategy.get_builtin_browser_status()
result = await crawler.arun("https://httpbin.org/get") print(f"{SUCCESS}Browser status:{RESET}")
print(f"{SUCCESS}Fallback to dedicated browser successful!{RESET}") print(f" Running: {status['running']}")
print(f" Got {len(result.markdown.raw_markdown)} chars of markdown content") print(f" CDP URL: {status['cdp_url']}")
except Exception as e: except Exception as e:
print(f"{ERROR}Fallback crawler failed: {str(e)}{RESET}") print(f"{ERROR}Failed to get browser status: {str(e)}{RESET}")
return False
# Step 3: Test with direct CDP URL
print(f"\n{INFO}3. Testing with direct CDP URL connection{RESET}")
# Launch a standalone browser to get a CDP URL
print(f"{INFO}Launching standalone browser...{RESET}")
cdp_url = await profiler.launch_standalone_browser(headless=True)
if not cdp_url:
print(f"{ERROR}Failed to launch standalone browser{RESET}")
return
print(f"{SUCCESS}Got CDP URL: {cdp_url}{RESET}")
# Create a crawler with the CDP URL
browser_config = BrowserConfig(
browser_mode="dedicated",
cdp_url=cdp_url,
use_managed_browser=True,
headless=True
)
# Step 2: Test killing the browser
print(f"\n{INFO}2. Testing killing the browser{RESET}")
try: try:
async with AsyncWebCrawler(config=browser_config) as crawler: result = await manager._strategy.kill_builtin_browser()
result = await crawler.arun("https://httpbin.org/ip") if result:
print(f"{SUCCESS}Direct CDP URL crawling successful!{RESET}") print(f"{SUCCESS}Browser killed successfully{RESET}")
print(f" Got {len(result.markdown.raw_markdown)} chars of markdown content") else:
print(f"{ERROR}Failed to kill browser{RESET}")
except Exception as e: except Exception as e:
print(f"{ERROR}Direct CDP URL crawling failed: {str(e)}{RESET}") print(f"{ERROR}Browser kill operation failed: {str(e)}{RESET}")
return False
# Step 3: Check status after kill
print(f"\n{INFO}3. Checking status after kill{RESET}")
try:
status = await manager._strategy.get_builtin_browser_status()
if not status["running"]:
print(f"{SUCCESS}Browser is correctly reported as not running{RESET}")
else:
print(f"{ERROR}Browser is incorrectly reported as still running{RESET}")
except Exception as e:
print(f"{ERROR}Failed to get browser status: {str(e)}{RESET}")
return False
# Step 4: Launch a new browser
print(f"\n{INFO}4. Launching a new browser{RESET}")
try:
cdp_url = await manager._strategy.launch_builtin_browser(
browser_type="chromium", headless=True
)
if cdp_url:
print(f"{SUCCESS}New browser launched at: {cdp_url}{RESET}")
else:
print(f"{ERROR}Failed to launch new browser{RESET}")
return False
except Exception as e:
print(f"{ERROR}Browser launch failed: {str(e)}{RESET}")
return False
return True return True
async def test_multiple_managers():
    """Test that multiple BrowserManagers can share the same builtin browser.

    Creates two managers in "builtin" mode, starts both, checks whether they
    resolved to the same CDP URL, drives one page through each manager, then
    closes both.

    Returns:
        bool: True if every step succeeded, False on the first failure.
    """
    print(f"\n{INFO}========== Testing Multiple Browser Managers =========={RESET}")

    # Step 1: Create first manager
    print(f"\n{INFO}1. Creating first browser manager{RESET}")
    # FIX: the original line ended with a trailing comma, which made this a
    # one-element tuple, so BrowserManager received a tuple instead of a
    # BrowserConfig instance.
    browser_config1 = BrowserConfig(browser_mode="builtin", headless=True)
    manager1 = BrowserManager(browser_config=browser_config1, logger=logger)

    # Step 2: Create second manager
    print(f"\n{INFO}2. Creating second browser manager{RESET}")
    browser_config2 = BrowserConfig(browser_mode="builtin", headless=True)
    manager2 = BrowserManager(browser_config=browser_config2, logger=logger)

    # Step 3: Start both managers (should connect to the same builtin browser)
    print(f"\n{INFO}3. Starting both managers{RESET}")
    try:
        await manager1.start()
        print(f"{SUCCESS}First manager started{RESET}")
        await manager2.start()
        print(f"{SUCCESS}Second manager started{RESET}")

        # If both strategies resolved the same CDP endpoint, they share one
        # underlying browser process; different URLs is only a warning because
        # the test can still proceed with two separate browsers.
        cdp_url1 = manager1._strategy.config.cdp_url
        cdp_url2 = manager2._strategy.config.cdp_url
        if cdp_url1 == cdp_url2:
            print(
                f"{SUCCESS}Both managers connected to the same browser: {cdp_url1}{RESET}"
            )
        else:
            print(
                f"{WARNING}Managers connected to different browsers: {cdp_url1} and {cdp_url2}{RESET}"
            )
    except Exception as e:
        print(f"{ERROR}Failed to start managers: {str(e)}{RESET}")
        return False

    # Step 4: Test using both managers
    print(f"\n{INFO}4. Testing operations with both managers{RESET}")
    try:
        # First manager creates a page
        page1, ctx1 = await manager1.get_page(CrawlerRunConfig())
        await page1.goto("https://example.com")
        title1 = await page1.title()
        print(f"{SUCCESS}Manager 1 page title: {title1}{RESET}")

        # Second manager creates a page
        page2, ctx2 = await manager2.get_page(CrawlerRunConfig())
        await page2.goto("https://example.org")
        title2 = await page2.title()
        print(f"{SUCCESS}Manager 2 page title: {title2}{RESET}")

        # Clean up the pages before closing the managers
        await page1.close()
        await page2.close()
    except Exception as e:
        print(f"{ERROR}Failed to use both managers: {str(e)}{RESET}")
        return False

    # Step 5: Close both managers
    print(f"\n{INFO}5. Closing both managers{RESET}")
    try:
        await manager1.close()
        print(f"{SUCCESS}First manager closed{RESET}")
        await manager2.close()
        print(f"{SUCCESS}Second manager closed{RESET}")
    except Exception as e:
        print(f"{ERROR}Failed to close managers: {str(e)}{RESET}")
        return False

    return True
async def test_edge_cases(): async def test_edge_cases():
"""Test edge cases like multiple starts, killing browser during crawl, etc.""" """Test edge cases like multiple starts, killing browser during operations, etc."""
print(f"\n{INFO}========== Testing Edge Cases =========={RESET}") print(f"\n{INFO}========== Testing Edge Cases =========={RESET}")
# Step 1: Launch the builtin browser if it doesn't exist # Step 1: Test multiple starts with the same manager
print(f"\n{INFO}1. Ensuring builtin browser exists{RESET}") print(f"\n{INFO}1. Testing multiple starts with the same manager{RESET}")
profiler = BrowserProfiler(logger=logger)
browser_info = profiler.get_builtin_browser_info()
if not browser_info:
cdp_url = await profiler.launch_builtin_browser(headless=True)
if cdp_url:
print(f"{SUCCESS}Builtin browser launched at: {cdp_url}{RESET}")
else:
print(f"{ERROR}Failed to launch builtin browser{RESET}")
return
else:
print(f"{SUCCESS}Using existing builtin browser: {browser_info['cdp_url']}{RESET}")
# Step 2: Test multiple starts with the same crawler
print(f"\n{INFO}2. Testing multiple starts with the same crawler{RESET}")
browser_config = BrowserConfig(browser_mode="builtin", headless=True) browser_config = BrowserConfig(browser_mode="builtin", headless=True)
crawler = AsyncWebCrawler(config=browser_config) manager = BrowserManager(browser_config=browser_config, logger=logger)
await crawler.start()
print(f"{SUCCESS}First start successful!{RESET}")
try: try:
await crawler.start() await manager.start()
print(f"{SUCCESS}Second start didn't cause errors!{RESET}") print(f"{SUCCESS}First start successful{RESET}")
except Exception as e:
print(f"{ERROR}Second start failed: {str(e)}{RESET}") # Try to start again
await manager.start()
print(f"{SUCCESS}Second start completed without errors{RESET}")
# Test if it's still functional
page, context = await manager.get_page(CrawlerRunConfig())
await page.goto("https://example.com")
title = await page.title()
print(
f"{SUCCESS}Page operations work after multiple starts. Title: {title}{RESET}"
)
await page.close()
except Exception as e:
print(f"{ERROR}Multiple starts test failed: {str(e)}{RESET}")
return False
finally:
await manager.close()
# Step 2: Test killing the browser while manager is active
print(f"\n{INFO}2. Testing killing the browser while manager is active{RESET}")
manager = BrowserManager(browser_config=browser_config, logger=logger)
# Run a crawl to verify functionality
try: try:
result = await crawler.arun("https://httpbin.org/user-agent") await manager.start()
print(f"{SUCCESS}Crawling after multiple starts successful!{RESET}") print(f"{SUCCESS}Manager started{RESET}")
print(f" Got {len(result.markdown.raw_markdown)} chars of markdown content")
# Kill the browser directly
print(f"{INFO}Killing the browser...{RESET}")
await manager._strategy.kill_builtin_browser()
print(f"{SUCCESS}Browser killed{RESET}")
# Try to get a page (should fail or launch a new browser)
try:
page, context = await manager.get_page(CrawlerRunConfig())
print(
f"{WARNING}Page request succeeded despite killed browser (might have auto-restarted){RESET}"
)
title = await page.title()
print(f"{SUCCESS}Got page title: {title}{RESET}")
await page.close()
except Exception as e:
print(
f"{SUCCESS}Page request failed as expected after browser was killed: {str(e)}{RESET}"
)
except Exception as e: except Exception as e:
print(f"{ERROR}Crawling after multiple starts failed: {str(e)}{RESET}") print(f"{ERROR}Kill during operation test failed: {str(e)}{RESET}")
return False
await crawler.close() finally:
await manager.close()
# Step 3: Test killing browser while crawler is active
print(f"\n{INFO}3. Testing killing browser while crawler is active{RESET}")
# Create and start a crawler
browser_config = BrowserConfig(browser_mode="builtin", headless=True)
crawler = AsyncWebCrawler(config=browser_config)
await crawler.start()
# Kill the browser
print(f"{INFO}Killing the browser...{RESET}")
await profiler.kill_builtin_browser()
# Try to crawl (should fail)
try:
result = await crawler.arun("https://httpbin.org/get")
print(f"{WARNING}Crawling succeeded despite killed browser!{RESET}")
except Exception as e:
print(f"{SUCCESS}Crawling failed as expected: {str(e)}{RESET}")
await crawler.close()
return True return True
async def cleanup_browsers():
    """Clean up any remaining builtin browsers.

    Builds a throwaway BrowserManager in "builtin" mode purely to reach its
    strategy object and ask it to kill any running builtin browser. The
    manager is never started; closing it at the end is best-effort only.
    """
    print(f"\n{INFO}========== Cleaning Up Builtin Browsers =========={RESET}")
    browser_config = BrowserConfig(browser_mode="builtin", headless=True)
    manager = BrowserManager(browser_config=browser_config, logger=logger)
    try:
        # No need to start the manager; the strategy is available immediately
        # and kill_builtin_browser() acts on any already-running browser.
        strategy = manager._strategy
        if isinstance(strategy, BuiltinBrowserStrategy):
            result = await strategy.kill_builtin_browser()
            if result:
                print(f"{SUCCESS}Successfully killed all builtin browsers{RESET}")
            else:
                print(f"{WARNING}No builtin browsers found to kill{RESET}")
        else:
            print(f"{ERROR}Wrong strategy type: {strategy.__class__.__name__}{RESET}")
    except Exception as e:
        print(f"{ERROR}Cleanup failed: {str(e)}{RESET}")
    finally:
        # Best-effort close of the never-started manager.
        # FIX: narrowed the bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed during cleanup.
        try:
            await manager.close()
        except Exception:
            pass
async def test_performance_scaling():
    """Test performance with multiple browsers and pages.

    This test creates multiple browsers on different ports,
    spawns multiple pages per browser, and measures performance metrics
    (crawl throughput and process memory). It is interactive: it pauses
    twice with input() prompts, so it is meant to be run manually, not in CI.

    Returns:
        bool: True on a completed run, False if startup or the user aborts it.
    """
    print(f"\n{INFO}========== Testing Performance Scaling =========={RESET}")
    # Configuration parameters
    num_browsers = 10
    pages_per_browser = 10
    total_pages = num_browsers * pages_per_browser
    base_port = 9222  # first CDP debugging port; browser i uses base_port + i
    # Set up a measuring mechanism for memory
    import psutil
    import gc
    # Force garbage collection before starting so the baseline reading is not
    # inflated by garbage left over from earlier tests.
    gc.collect()
    process = psutil.Process()
    initial_memory = process.memory_info().rss / 1024 / 1024  # RSS in MB
    peak_memory = initial_memory
    # Report initial configuration
    print(
        f"{INFO}Test configuration: {num_browsers} browsers × {pages_per_browser} pages = {total_pages} total crawls{RESET}"
    )
    # List to track managers
    managers: List[BrowserManager] = []
    all_pages = []  # (page, context) tuples collected from every manager
    # Get crawl4ai home directory; each browser gets its own profile dir here
    crawl4ai_home = os.path.expanduser("~/.crawl4ai")
    temp_dir = os.path.join(crawl4ai_home, "temp")
    os.makedirs(temp_dir, exist_ok=True)
    # Create all managers but don't start them yet
    manager_configs = []
    for i in range(num_browsers):
        port = base_port + i
        browser_config = BrowserConfig(
            browser_mode="builtin",
            headless=True,
            debugging_port=port,
            user_data_dir=os.path.join(temp_dir, f"browser_profile_{i}"),
        )
        manager = BrowserManager(browser_config=browser_config, logger=logger)
        # NOTE(review): presumably marks the strategy as ephemeral so these
        # throwaway browsers are not persisted/registered as the builtin
        # browser — confirm against BuiltinBrowserStrategy.
        manager._strategy.shutting_down = True
        manager_configs.append((manager, i, port))

    # Define async function to start a single manager.
    # Returns the manager on success, or None so failures can be filtered out.
    async def start_manager(manager, index, port):
        try:
            await manager.start()
            return manager
        except Exception as e:
            print(
                f"{ERROR}Failed to start browser {index + 1} on port {port}: {str(e)}{RESET}"
            )
            return None

    # Start all managers in parallel
    start_tasks = [
        start_manager(manager, i, port) for manager, i, port in manager_configs
    ]
    started_managers = await asyncio.gather(*start_tasks)
    # Filter out None values (failed starts) and add to managers list
    managers = [m for m in started_managers if m is not None]
    if len(managers) == 0:
        print(f"{ERROR}All browser managers failed to start. Aborting test.{RESET}")
        return False
    if len(managers) < num_browsers:
        # Partial start is tolerated; metrics will just cover fewer browsers.
        print(
            f"{WARNING}Only {len(managers)} out of {num_browsers} browser managers started successfully{RESET}"
        )
    # Create pages for each browser (batched via get_pages for efficiency)
    for i, manager in enumerate(managers):
        try:
            pages = await manager.get_pages(CrawlerRunConfig(), count=pages_per_browser)
            all_pages.extend(pages)
        except Exception as e:
            print(f"{ERROR}Failed to create pages for browser {i + 1}: {str(e)}{RESET}")
    # Check memory after page creation
    gc.collect()
    current_memory = process.memory_info().rss / 1024 / 1024
    peak_memory = max(peak_memory, current_memory)
    # Ask for confirmation before loading (manual checkpoint: lets the
    # operator inspect the idle browsers before the load phase begins)
    confirmation = input(
        f"{WARNING}Do you want to proceed with loading pages? (y/n): {RESET}"
    )
    # Step 1: Create and start multiple browser managers in parallel
    start_time = time.time()
    if confirmation.lower() == "y":
        load_start_time = time.time()

        # Function to load a single page; returns the title on success or an
        # "Error: ..." string on failure so gather() never propagates.
        async def load_page(page_ctx, index):
            page, _ = page_ctx
            try:
                await page.goto(f"https://example.com/page{index}", timeout=30000)
                title = await page.title()
                return title
            except Exception as e:
                return f"Error: {str(e)}"

        # Load all pages concurrently
        load_tasks = [load_page(page_ctx, i) for i, page_ctx in enumerate(all_pages)]
        load_results = await asyncio.gather(*load_tasks, return_exceptions=True)
        # Count successes and failures (a success is any non-"Error" string)
        successes = sum(
            1 for r in load_results if isinstance(r, str) and not r.startswith("Error")
        )
        failures = len(load_results) - successes
        load_time = time.time() - load_start_time
        total_test_time = time.time() - start_time
        # Check memory after loading (peak memory)
        gc.collect()
        current_memory = process.memory_info().rss / 1024 / 1024
        peak_memory = max(peak_memory, current_memory)
        # Calculate key metrics (guarded against division by zero)
        memory_per_page = peak_memory / successes if successes > 0 else 0
        time_per_crawl = total_test_time / successes if successes > 0 else 0
        crawls_per_second = successes / total_test_time if total_test_time > 0 else 0
        crawls_per_minute = crawls_per_second * 60
        crawls_per_hour = crawls_per_minute * 60
        # Print simplified performance summary
        from rich.console import Console
        from rich.table import Table

        console = Console()
        # Create a simple summary table
        table = Table(title="CRAWL4AI PERFORMANCE SUMMARY")
        table.add_column("Metric", style="cyan")
        table.add_column("Value", style="green")
        table.add_row("Total Crawls Completed", f"{successes}")
        table.add_row("Total Time", f"{total_test_time:.2f} seconds")
        table.add_row("Time Per Crawl", f"{time_per_crawl:.2f} seconds")
        table.add_row("Crawling Speed", f"{crawls_per_second:.2f} crawls/second")
        table.add_row("Projected Rate (1 minute)", f"{crawls_per_minute:.0f} crawls")
        table.add_row("Projected Rate (1 hour)", f"{crawls_per_hour:.0f} crawls")
        table.add_row("Peak Memory Usage", f"{peak_memory:.2f} MB")
        table.add_row("Memory Per Crawl", f"{memory_per_page:.2f} MB")
        # Display the table
        console.print(table)
    # Ask confirmation before cleanup (second manual checkpoint: allows
    # post-mortem inspection of the still-open browsers)
    confirmation = input(
        f"{WARNING}Do you want to proceed with cleanup? (y/n): {RESET}"
    )
    if confirmation.lower() != "y":
        # NOTE(review): aborting here leaves pages, managers and temp_dir
        # alive on disk — intentional for manual inspection, but they must
        # then be cleaned up by hand (e.g. via cleanup_browsers()).
        print(f"{WARNING}Cleanup aborted by user{RESET}")
        return False
    # Close all pages (best-effort; some may already be dead)
    for page, _ in all_pages:
        try:
            await page.close()
        except:
            pass
    # Close all managers (best-effort)
    for manager in managers:
        try:
            await manager.close()
        except:
            pass
    # Remove the temp directory with all per-browser profiles
    import shutil

    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)
    return True
async def test_performance_scaling_lab( num_browsers: int = 10, pages_per_browser: int = 10):
"""Test performance with multiple browsers and pages.
This test creates multiple browsers on different ports,
spawns multiple pages per browser, and measures performance metrics.
"""
print(f"\n{INFO}========== Testing Performance Scaling =========={RESET}")
# Configuration parameters
num_browsers = num_browsers
pages_per_browser = pages_per_browser
total_pages = num_browsers * pages_per_browser
base_port = 9222
# Set up a measuring mechanism for memory
import psutil
import gc
# Force garbage collection before starting
gc.collect()
process = psutil.Process()
initial_memory = process.memory_info().rss / 1024 / 1024 # in MB
peak_memory = initial_memory
# Report initial configuration
print(
f"{INFO}Test configuration: {num_browsers} browsers × {pages_per_browser} pages = {total_pages} total crawls{RESET}"
)
# List to track managers
managers: List[BrowserManager] = []
all_pages = []
# Get crawl4ai home directory
crawl4ai_home = os.path.expanduser("~/.crawl4ai")
temp_dir = os.path.join(crawl4ai_home, "temp")
os.makedirs(temp_dir, exist_ok=True)
# Create all managers but don't start them yet
manager_configs = []
for i in range(num_browsers):
port = base_port + i
browser_config = BrowserConfig(
browser_mode="builtin",
headless=True,
debugging_port=port,
user_data_dir=os.path.join(temp_dir, f"browser_profile_{i}"),
)
manager = BrowserManager(browser_config=browser_config, logger=logger)
manager._strategy.shutting_down = True
manager_configs.append((manager, i, port))
# Define async function to start a single manager
async def start_manager(manager, index, port):
try:
await manager.start()
return manager
except Exception as e:
print(
f"{ERROR}Failed to start browser {index + 1} on port {port}: {str(e)}{RESET}"
)
return None
# Start all managers in parallel
start_tasks = [
start_manager(manager, i, port) for manager, i, port in manager_configs
]
started_managers = await asyncio.gather(*start_tasks)
# Filter out None values (failed starts) and add to managers list
managers = [m for m in started_managers if m is not None]
if len(managers) == 0:
print(f"{ERROR}All browser managers failed to start. Aborting test.{RESET}")
return False
if len(managers) < num_browsers:
print(
f"{WARNING}Only {len(managers)} out of {num_browsers} browser managers started successfully{RESET}"
)
# Create pages for each browser
for i, manager in enumerate(managers):
try:
pages = await manager.get_pages(CrawlerRunConfig(), count=pages_per_browser)
all_pages.extend(pages)
except Exception as e:
print(f"{ERROR}Failed to create pages for browser {i + 1}: {str(e)}{RESET}")
# Check memory after page creation
gc.collect()
current_memory = process.memory_info().rss / 1024 / 1024
peak_memory = max(peak_memory, current_memory)
# Ask for confirmation before loading
confirmation = input(
f"{WARNING}Do you want to proceed with loading pages? (y/n): {RESET}"
)
# Step 1: Create and start multiple browser managers in parallel
start_time = time.time()
if confirmation.lower() == "y":
load_start_time = time.time()
# Function to load a single page
async def load_page(page_ctx, index):
page, _ = page_ctx
try:
await page.goto(f"https://example.com/page{index}", timeout=30000)
title = await page.title()
return title
except Exception as e:
return f"Error: {str(e)}"
# Load all pages concurrently
load_tasks = [load_page(page_ctx, i) for i, page_ctx in enumerate(all_pages)]
load_results = await asyncio.gather(*load_tasks, return_exceptions=True)
# Count successes and failures
successes = sum(
1 for r in load_results if isinstance(r, str) and not r.startswith("Error")
)
failures = len(load_results) - successes
load_time = time.time() - load_start_time
total_test_time = time.time() - start_time
# Check memory after loading (peak memory)
gc.collect()
current_memory = process.memory_info().rss / 1024 / 1024
peak_memory = max(peak_memory, current_memory)
# Calculate key metrics
memory_per_page = peak_memory / successes if successes > 0 else 0
time_per_crawl = total_test_time / successes if successes > 0 else 0
crawls_per_second = successes / total_test_time if total_test_time > 0 else 0
crawls_per_minute = crawls_per_second * 60
crawls_per_hour = crawls_per_minute * 60
# Print simplified performance summary
from rich.console import Console
from rich.table import Table
console = Console()
# Create a simple summary table
table = Table(title="CRAWL4AI PERFORMANCE SUMMARY")
table.add_column("Metric", style="cyan")
table.add_column("Value", style="green")
table.add_row("Total Crawls Completed", f"{successes}")
table.add_row("Total Time", f"{total_test_time:.2f} seconds")
table.add_row("Time Per Crawl", f"{time_per_crawl:.2f} seconds")
table.add_row("Crawling Speed", f"{crawls_per_second:.2f} crawls/second")
table.add_row("Projected Rate (1 minute)", f"{crawls_per_minute:.0f} crawls")
table.add_row("Projected Rate (1 hour)", f"{crawls_per_hour:.0f} crawls")
table.add_row("Peak Memory Usage", f"{peak_memory:.2f} MB")
table.add_row("Memory Per Crawl", f"{memory_per_page:.2f} MB")
# Display the table
console.print(table)
# Ask confirmation before cleanup
confirmation = input(
f"{WARNING}Do you want to proceed with cleanup? (y/n): {RESET}"
)
if confirmation.lower() != "y":
print(f"{WARNING}Cleanup aborted by user{RESET}")
return False
# Close all pages
for page, _ in all_pages:
try:
await page.close()
except:
pass
# Close all managers
for manager in managers:
try:
await manager.close()
except:
pass
# Remove the temp directory
import shutil
if os.path.exists(temp_dir):
shutil.rmtree(temp_dir)
return True
async def main(): async def main():
"""Run all tests""" """Run all tests"""
try: try:
print(f"{INFO}Starting browser_profiler and builtin browser tests{RESET}") print(f"{INFO}Starting builtin browser tests with browser module{RESET}")
# Run browser profiler tests # # Run browser creation test
cdp_url = await test_browser_profiler() # manager, cdp_url = await test_builtin_browser_creation()
# if not manager:
# print(f"{ERROR}Browser creation failed, cannot continue tests{RESET}")
# return
# Run crawling tests with builtin browser # # Run page operations test
if cdp_url: # await test_page_operations(manager)
await test_crawling_with_builtin_browser(cdp_url)
# Run tests without builtin browser # # Run browser status and management test
# await test_crawling_without_builtin_browser() # await test_browser_status_management(manager)
# Run edge case tests # # Close manager before multiple manager test
# await manager.close()
# Run multiple managers test
# await test_multiple_managers()
# Run performance scaling test
await test_performance_scaling()
# Run cleanup test
# await cleanup_browsers()
# Run edge cases test
# await test_edge_cases() # await test_edge_cases()
print(f"\n{SUCCESS}All tests completed!{RESET}") print(f"\n{SUCCESS}All tests completed!{RESET}")
@@ -288,13 +796,13 @@ async def main():
except Exception as e: except Exception as e:
print(f"\n{ERROR}Test failed with error: {str(e)}{RESET}") print(f"\n{ERROR}Test failed with error: {str(e)}{RESET}")
import traceback import traceback
traceback.print_exc() traceback.print_exc()
finally: finally:
# Clean up: kill any remaining builtin browser # Clean up: kill any remaining builtin browsers
print(f"\n{INFO}Cleaning up: killing any remaining builtin browser{RESET}") await cleanup_browsers()
profiler = BrowserProfiler(logger=logger)
await profiler.kill_builtin_browser()
print(f"{SUCCESS}Test cleanup complete{RESET}") print(f"{SUCCESS}Test cleanup complete{RESET}")
if __name__ == "__main__": if __name__ == "__main__":
asyncio.run(main()) asyncio.run(main())

View File

@@ -0,0 +1,902 @@
"""
Test examples for parallel crawling with the browser module.
These examples demonstrate the functionality of parallel page creation
and serve as functional tests for multi-page crawling performance.
"""
import asyncio
import os
import sys
import time
from typing import List
# Add the project root to Python path if running directly
if __name__ == "__main__":
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
from crawl4ai.browser import BrowserManager
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_logger import AsyncLogger
# Create a logger for clear terminal output
logger = AsyncLogger(verbose=True, log_file=None)
async def test_get_pages_basic():
    """Test basic functionality of the BrowserManager.get_pages method.

    Starts a headless browser, requests three pages in a single batched
    call, verifies each page can navigate and report a title, then shuts
    the manager down.

    Returns:
        bool: True if all checks passed, False otherwise.
    """
    logger.info("Testing basic get_pages functionality", tag="TEST")
    browser_config = BrowserConfig(headless=True)
    manager = BrowserManager(browser_config=browser_config, logger=logger)
    try:
        await manager.start()
        # Request 3 pages in one call
        crawler_config = CrawlerRunConfig()
        pages = await manager.get_pages(crawler_config, count=3)
        # Verify we got the correct number of pages
        assert len(pages) == 3, f"Expected 3 pages, got {len(pages)}"
        # Verify each page is valid by navigating and reading its title
        for i, (page, context) in enumerate(pages):
            await page.goto("https://example.com")
            title = await page.title()
            logger.info(f"Page {i+1} title: {title}", tag="TEST")
            assert title, f"Page {i+1} has no title"
        logger.success("Basic get_pages test completed successfully", tag="TEST")
        return True
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        return False
    finally:
        # Single cleanup path: close once regardless of success/failure.
        # Narrowed from a bare `except:` so KeyboardInterrupt is not swallowed.
        try:
            await manager.close()
        except Exception:
            pass
async def test_parallel_approaches_comparison():
    """Compare two parallel crawling approaches:
    1. Create a page for each URL on-demand (get_page + gather)
    2. Get all pages upfront with get_pages, then use them (get_pages + gather)

    Returns:
        bool: True if both approaches ran and produced matching results.
    """
    logger.info("Comparing different parallel crawling approaches", tag="TEST")
    urls = [
        "https://example.com/page1",
        "https://crawl4ai.com",
        "https://kidocode.com",
        "https://bbc.com",
    ]
    browser_config = BrowserConfig(headless=False)
    manager = BrowserManager(browser_config=browser_config, logger=logger)
    try:
        await manager.start()

        # Approach 1: Create a page for each URL on-demand and run in parallel
        logger.info("Testing approach 1: get_page for each URL + gather", tag="TEST")
        start_time = time.time()

        async def fetch_title_approach1(url):
            """Create a new page for each URL, go to the URL, and get title"""
            crawler_config = CrawlerRunConfig(url=url)
            page, context = await manager.get_page(crawler_config)
            try:
                await page.goto(url)
                return await page.title()
            finally:
                await page.close()

        approach1_results = await asyncio.gather(
            *(fetch_title_approach1(url) for url in urls)
        )
        approach1_time = time.time() - start_time
        logger.info(f"Approach 1 time (get_page + gather): {approach1_time:.2f}s", tag="TEST")

        # Approach 2: Get all pages upfront with get_pages, then use them in parallel
        logger.info("Testing approach 2: get_pages upfront + gather", tag="TEST")
        start_time = time.time()

        # Get all pages upfront
        crawler_config = CrawlerRunConfig()
        pages = await manager.get_pages(crawler_config, count=len(urls))

        async def fetch_title_approach2(page_ctx, url):
            """Use a pre-created page to go to URL and get title"""
            page, _ = page_ctx
            try:
                await page.goto(url)
                return await page.title()
            finally:
                await page.close()

        approach2_results = await asyncio.gather(
            *(fetch_title_approach2(page_ctx, url) for page_ctx, url in zip(pages, urls))
        )
        approach2_time = time.time() - start_time
        logger.info(f"Approach 2 time (get_pages + gather): {approach2_time:.2f}s", tag="TEST")

        # Compare performance. Both timings are guarded: the original code set
        # speedup to 0 when approach2_time was 0 and then computed 1/speedup,
        # which raised ZeroDivisionError (bug fix).
        if approach1_time > 0 and approach2_time > 0:
            speedup = approach1_time / approach2_time
            if speedup > 1:
                logger.success(f"Approach 2 (get_pages upfront) was {speedup:.2f}x faster", tag="TEST")
            else:
                logger.info(f"Approach 1 (get_page + gather) was {1/speedup:.2f}x faster", tag="TEST")

        # Verify same content was retrieved in both approaches
        assert len(approach1_results) == len(approach2_results), "Result count mismatch"
        # Sort results for comparison since parallel execution might complete in different order
        assert sorted(approach1_results) == sorted(approach2_results), "Results content mismatch"

        await manager.close()
        return True
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        try:
            await manager.close()
        except Exception:
            pass
        return False
async def test_multi_browser_scaling(num_browsers=3, pages_per_browser=5):
    """Test performance with multiple browsers and pages per browser.

    Compares two approaches:
    1. On-demand page creation (get_page + gather)
    2. Pre-created pages (get_pages + gather)

    Args:
        num_browsers: Number of independent browser instances to launch.
        pages_per_browser: Number of URLs each browser is assigned.

    Returns:
        bool: True if both approaches completed without error.
    """
    logger.info(f"Testing multi-browser scaling with {num_browsers} browsers × {pages_per_browser} pages", tag="TEST")

    # Generate test URLs
    total_pages = num_browsers * pages_per_browser
    urls = [f"https://example.com/page_{i}" for i in range(total_pages)]

    # Create browser managers (note: unused `base_port` local removed)
    managers = []
    try:
        # Start all browsers in parallel
        start_tasks = []
        for i in range(num_browsers):
            browser_config = BrowserConfig(
                headless=True  # Using default browser mode like in test_parallel_approaches_comparison
            )
            manager = BrowserManager(browser_config=browser_config, logger=logger)
            start_tasks.append(manager.start())
            managers.append(manager)
        await asyncio.gather(*start_tasks)

        # Distribute URLs among managers as contiguous slices
        urls_per_manager = {}
        for i, manager in enumerate(managers):
            start_idx = i * pages_per_browser
            end_idx = min(start_idx + pages_per_browser, len(urls))
            urls_per_manager[manager] = urls[start_idx:end_idx]

        # Approach 1: Create a page for each URL on-demand and run in parallel
        logger.info("Testing approach 1: get_page for each URL + gather", tag="TEST")
        start_time = time.time()

        async def fetch_title_approach1(manager, url):
            """Create a new page for the URL, go to the URL, and get title"""
            crawler_config = CrawlerRunConfig(url=url)
            page, context = await manager.get_page(crawler_config)
            try:
                await page.goto(url)
                return await page.title()
            finally:
                await page.close()

        tasks = [
            fetch_title_approach1(manager, url)
            for manager, manager_urls in urls_per_manager.items()
            for url in manager_urls
        ]
        approach1_results = await asyncio.gather(*tasks)
        approach1_time = time.time() - start_time
        logger.info(f"Approach 1 time (get_page + gather): {approach1_time:.2f}s", tag="TEST")

        # Approach 2: Get all pages upfront with get_pages, then use them in parallel
        logger.info("Testing approach 2: get_pages upfront + gather", tag="TEST")
        start_time = time.time()

        # Get all pages upfront for each manager
        all_pages = []
        for manager, manager_urls in urls_per_manager.items():
            crawler_config = CrawlerRunConfig()
            pages = await manager.get_pages(crawler_config, count=len(manager_urls))
            all_pages.extend(zip(pages, manager_urls))

        async def fetch_title_approach2(page_ctx, url):
            """Use a pre-created page to go to URL and get title"""
            page, _ = page_ctx
            try:
                await page.goto(url)
                return await page.title()
            finally:
                await page.close()

        tasks = [fetch_title_approach2(page_ctx, url) for page_ctx, url in all_pages]
        approach2_results = await asyncio.gather(*tasks)
        approach2_time = time.time() - start_time
        logger.info(f"Approach 2 time (get_pages + gather): {approach2_time:.2f}s", tag="TEST")

        # Summarize performance. All divisions are guarded: the original set
        # speedup to 0 when approach2_time was 0 and then computed 1/speedup,
        # raising ZeroDivisionError (bug fix).
        pages_per_second = total_pages / approach2_time if approach2_time > 0 else 0
        logger.info(f"📊 Summary: {num_browsers} browsers × {pages_per_browser} pages = {total_pages} total crawls", tag="TEST")
        logger.info(f"⚡ Performance: {pages_per_second:.1f} pages/second ({pages_per_second*60:.0f} pages/minute)", tag="TEST")
        logger.info(f"🚀 Total crawl time: {approach2_time:.2f} seconds", tag="TEST")

        if approach1_time > 0 and approach2_time > 0:
            speedup = approach1_time / approach2_time
            if speedup > 1:
                logger.success(f"✅ Approach 2 (get_pages upfront) was {speedup:.2f}x faster", tag="TEST")
            else:
                logger.info(f"✅ Approach 1 (get_page + gather) was {1/speedup:.2f}x faster", tag="TEST")

        # Close all managers
        for manager in managers:
            await manager.close()
        return True
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Clean up (narrowed from bare `except:`)
        for manager in managers:
            try:
                await manager.close()
            except Exception:
                pass
        return False
async def grid_search_optimal_configuration(total_urls=50):
    """Perform a grid search to find the optimal balance between number of browsers and pages per browser.

    This function tests different combinations of browser count and pages per browser,
    while keeping the total number of URLs constant. It measures performance metrics
    for each configuration to find the "sweet spot" that provides the best speed
    with reasonable memory usage.

    Memory metrics are only collected when ``psutil`` is importable; a
    visualization PNG is only produced when ``matplotlib`` is importable.
    Both are optional dependencies of this test.

    Args:
        total_urls: Total number of URLs to crawl (default: 50)

    Returns:
        tuple[int, int]: (num_browsers, pages_per_browser) of the most
        efficient configuration found.

    Raises:
        IndexError: if every tested configuration fails (``results`` stays
        empty and the ``sorted(...)[0]`` ranking lookups fail).
    """
    logger.info(f"=== GRID SEARCH FOR OPTIMAL CRAWLING CONFIGURATION ({total_urls} URLs) ===", tag="TEST")
    # Generate test URLs once
    urls = [f"https://example.com/page_{i}" for i in range(total_urls)]
    # Define grid search configurations
    # We'll use more flexible approach: test all browser counts from 1 to min(20, total_urls)
    # and distribute pages evenly (some browsers may have 1 more page than others)
    configurations = []
    # Maximum number of browsers to test
    max_browsers_to_test = min(20, total_urls)
    # Try configurations with 1 to max_browsers_to_test browsers
    for num_browsers in range(1, max_browsers_to_test + 1):
        base_pages_per_browser = total_urls // num_browsers
        remainder = total_urls % num_browsers
        # Generate exact page distribution array
        if remainder > 0:
            # First 'remainder' browsers get one more page
            page_distribution = [base_pages_per_browser + 1] * remainder + [base_pages_per_browser] * (num_browsers - remainder)
            pages_distribution = f"{base_pages_per_browser+1} pages × {remainder} browsers, {base_pages_per_browser} pages × {num_browsers - remainder} browsers"
        else:
            # All browsers get the same number of pages
            page_distribution = [base_pages_per_browser] * num_browsers
            pages_distribution = f"{base_pages_per_browser} pages × {num_browsers} browsers"
        # Format the distribution as a tuple string like (4, 4, 3, 3)
        distribution_str = str(tuple(page_distribution))
        configurations.append((num_browsers, base_pages_per_browser, pages_distribution, page_distribution, distribution_str))
    # Track results
    results = []
    # Test each configuration
    for num_browsers, pages_per_browser, pages_distribution, page_distribution, distribution_str in configurations:
        logger.info("-" * 80, tag="TEST")
        logger.info(f"Testing configuration: {num_browsers} browsers with distribution: {distribution_str}", tag="TEST")
        logger.info(f"Details: {pages_distribution}", tag="TEST")
        # Sleep a bit for randomness
        await asyncio.sleep(0.5)
        try:
            # Import psutil for memory tracking
            try:
                import psutil
                process = psutil.Process()
                initial_memory = process.memory_info().rss / (1024 * 1024)  # MB
            except ImportError:
                logger.warning("psutil not available, memory metrics will not be tracked", tag="TEST")
                initial_memory = 0
            # Create and start browser managers
            # NOTE(review): if an exception is raised before this assignment
            # (e.g. inside psutil's memory_info), the cleanup loop in the
            # except handler below would hit a NameError on `managers` — TODO confirm.
            managers = []
            start_time = time.time()
            # Start all browsers in parallel
            start_tasks = []
            for i in range(num_browsers):
                browser_config = BrowserConfig(
                    headless=True
                )
                manager = BrowserManager(browser_config=browser_config, logger=logger)
                start_tasks.append(manager.start())
                managers.append(manager)
            await asyncio.gather(*start_tasks)
            browser_startup_time = time.time() - start_time
            # Measure memory after browser startup
            if initial_memory > 0:
                browser_memory = process.memory_info().rss / (1024 * 1024) - initial_memory
            else:
                browser_memory = 0
            # Distribute URLs among managers using the exact page distribution
            urls_per_manager = {}
            total_assigned = 0
            for i, manager in enumerate(managers):
                if i < len(page_distribution):
                    # Get the exact number of pages for this browser from our distribution
                    manager_pages = page_distribution[i]
                    # Get the URL slice for this manager
                    start_idx = total_assigned
                    end_idx = start_idx + manager_pages
                    urls_per_manager[manager] = urls[start_idx:end_idx]
                    total_assigned += manager_pages
                else:
                    # If we have more managers than our distribution (should never happen)
                    urls_per_manager[manager] = []
            # Use the more efficient approach (pre-created pages)
            logger.info("Running page crawling test...", tag="TEST")
            crawl_start_time = time.time()
            # Get all pages upfront for each manager
            all_pages = []
            for manager, manager_urls in urls_per_manager.items():
                if not manager_urls:  # Skip managers with no URLs
                    continue
                crawler_config = CrawlerRunConfig()
                pages = await manager.get_pages(crawler_config, count=len(manager_urls))
                all_pages.extend(zip(pages, manager_urls))
            # Measure memory after page creation (delta beyond browsers + baseline)
            if initial_memory > 0:
                pages_memory = process.memory_info().rss / (1024 * 1024) - browser_memory - initial_memory
            else:
                pages_memory = 0
            # Function to crawl a URL with a pre-created page
            async def fetch_title(page_ctx, url):
                page, _ = page_ctx
                try:
                    await page.goto(url)
                    title = await page.title()
                    return title
                finally:
                    await page.close()
            # Use the pre-created pages to fetch titles in parallel
            tasks = [fetch_title(page_ctx, url) for page_ctx, url in all_pages]
            crawl_results = await asyncio.gather(*tasks)
            crawl_time = time.time() - crawl_start_time
            total_time = time.time() - start_time
            # Final memory measurement
            if initial_memory > 0:
                peak_memory = max(browser_memory + pages_memory, process.memory_info().rss / (1024 * 1024) - initial_memory)
            else:
                peak_memory = 0
            # Close all managers
            for manager in managers:
                await manager.close()
            # Calculate metrics
            pages_per_second = total_urls / crawl_time
            # Store result metrics
            result = {
                "num_browsers": num_browsers,
                "pages_per_browser": pages_per_browser,
                "page_distribution": page_distribution,
                "distribution_str": distribution_str,
                "total_urls": total_urls,
                "browser_startup_time": browser_startup_time,
                "crawl_time": crawl_time,
                "total_time": total_time,
                "browser_memory": browser_memory,
                "pages_memory": pages_memory,
                "peak_memory": peak_memory,
                "pages_per_second": pages_per_second,
                # Calculate efficiency score (higher is better)
                # This balances speed vs memory usage
                "efficiency_score": pages_per_second / (peak_memory + 1) if peak_memory > 0 else pages_per_second,
            }
            results.append(result)
            # Log the results
            logger.info(f"Browser startup: {browser_startup_time:.2f}s", tag="TEST")
            logger.info(f"Crawl time: {crawl_time:.2f}s", tag="TEST")
            logger.info(f"Total time: {total_time:.2f}s", tag="TEST")
            logger.info(f"Performance: {pages_per_second:.1f} pages/second", tag="TEST")
            if peak_memory > 0:
                logger.info(f"Browser memory: {browser_memory:.1f}MB", tag="TEST")
                logger.info(f"Pages memory: {pages_memory:.1f}MB", tag="TEST")
                logger.info(f"Peak memory: {peak_memory:.1f}MB", tag="TEST")
                logger.info(f"Efficiency score: {result['efficiency_score']:.6f}", tag="TEST")
        except Exception as e:
            logger.error(f"Error testing configuration: {str(e)}", tag="TEST")
            import traceback
            traceback.print_exc()
            # Clean up
            for manager in managers:
                try:
                    await manager.close()
                except:
                    pass
    # Print summary of all configurations
    logger.info("=" * 100, tag="TEST")
    logger.info("GRID SEARCH RESULTS SUMMARY", tag="TEST")
    logger.info("=" * 100, tag="TEST")
    # Rank configurations by efficiency score
    ranked_results = sorted(results, key=lambda x: x["efficiency_score"], reverse=True)
    # Also determine rankings by different metrics
    fastest = sorted(results, key=lambda x: x["crawl_time"])[0]
    lowest_memory = sorted(results, key=lambda x: x["peak_memory"] if x["peak_memory"] > 0 else float('inf'))[0]
    most_efficient = ranked_results[0]
    # Print top performers by category
    logger.info("🏆 TOP PERFORMERS BY CATEGORY:", tag="TEST")
    logger.info(f"⚡ Fastest: {fastest['num_browsers']} browsers × ~{fastest['pages_per_browser']} pages " +
                f"({fastest['crawl_time']:.2f}s, {fastest['pages_per_second']:.1f} pages/s)", tag="TEST")
    if lowest_memory["peak_memory"] > 0:
        logger.info(f"💾 Lowest memory: {lowest_memory['num_browsers']} browsers × ~{lowest_memory['pages_per_browser']} pages " +
                    f"({lowest_memory['peak_memory']:.1f}MB)", tag="TEST")
    logger.info(f"🌟 Most efficient: {most_efficient['num_browsers']} browsers × ~{most_efficient['pages_per_browser']} pages " +
                f"(score: {most_efficient['efficiency_score']:.6f})", tag="TEST")
    # Print result table header
    logger.info("\n📊 COMPLETE RANKING TABLE (SORTED BY EFFICIENCY SCORE):", tag="TEST")
    logger.info("-" * 120, tag="TEST")
    # Define table header
    header = f"{'Rank':<5} | {'Browsers':<8} | {'Distribution':<55} | {'Total Time(s)':<12} | {'Speed(p/s)':<12} | {'Memory(MB)':<12} | {'Efficiency':<10} | {'Notes'}"
    logger.info(header, tag="TEST")
    logger.info("-" * 120, tag="TEST")
    # Print each configuration in ranked order
    for rank, result in enumerate(ranked_results, 1):
        # Add special notes for top performers (dict equality compares all metrics)
        notes = []
        if result == fastest:
            notes.append("⚡ Fastest")
        if result == lowest_memory:
            notes.append("💾 Lowest Memory")
        if result == most_efficient:
            notes.append("🌟 Most Efficient")
        notes_str = " | ".join(notes) if notes else ""
        # Format memory if available
        memory_str = f"{result['peak_memory']:.1f}" if result['peak_memory'] > 0 else "N/A"
        # Get the distribution string
        dist_str = result.get('distribution_str', str(tuple([result['pages_per_browser']] * result['num_browsers'])))
        # Build the row
        row = f"{rank:<5} | {result['num_browsers']:<8} | {dist_str:<55} | {result['total_time']:.2f}s{' ':<7} | "
        row += f"{result['pages_per_second']:.2f}{' ':<6} | {memory_str}{' ':<6} | {result['efficiency_score']:.4f}{' ':<4} | {notes_str}"
        logger.info(row, tag="TEST")
    logger.info("-" * 120, tag="TEST")
    # Generate visualization if matplotlib is available
    try:
        import matplotlib.pyplot as plt
        import numpy as np
        # Extract data for plotting from ranked results
        browser_counts = [r["num_browsers"] for r in ranked_results]
        efficiency_scores = [r["efficiency_score"] for r in ranked_results]
        crawl_times = [r["crawl_time"] for r in ranked_results]
        total_times = [r["total_time"] for r in ranked_results]
        # Filter results with memory data
        memory_results = [r for r in ranked_results if r["peak_memory"] > 0]
        memory_browser_counts = [r["num_browsers"] for r in memory_results]
        peak_memories = [r["peak_memory"] for r in memory_results]
        # Create figure with clean design
        plt.figure(figsize=(14, 12), facecolor='white')
        plt.style.use('ggplot')
        # Create grid for subplots
        gs = plt.GridSpec(3, 1, height_ratios=[1, 1, 1], hspace=0.3)
        # Plot 1: Efficiency Score (higher is better)
        ax1 = plt.subplot(gs[0])
        bar_colors = ['#3498db'] * len(browser_counts)
        # Highlight the most efficient
        most_efficient_idx = browser_counts.index(most_efficient["num_browsers"])
        bar_colors[most_efficient_idx] = '#e74c3c'  # Red for most efficient
        bars = ax1.bar(range(len(browser_counts)), efficiency_scores, color=bar_colors)
        ax1.set_xticks(range(len(browser_counts)))
        ax1.set_xticklabels([f"{bc}" for bc in browser_counts], rotation=45)
        ax1.set_xlabel('Number of Browsers')
        ax1.set_ylabel('Efficiency Score (higher is better)')
        ax1.set_title('Browser Configuration Efficiency (higher is better)')
        # Add value labels on top of bars
        for bar, score in zip(bars, efficiency_scores):
            height = bar.get_height()
            ax1.text(bar.get_x() + bar.get_width()/2., height + 0.02*max(efficiency_scores),
                     f'{score:.3f}', ha='center', va='bottom', rotation=90, fontsize=8)
        # Highlight best configuration
        ax1.text(0.02, 0.90, f"🌟 Most Efficient: {most_efficient['num_browsers']} browsers with ~{most_efficient['pages_per_browser']} pages",
                 transform=ax1.transAxes, fontsize=12, verticalalignment='top',
                 bbox=dict(boxstyle='round,pad=0.5', facecolor='yellow', alpha=0.3))
        # Plot 2: Time Performance
        ax2 = plt.subplot(gs[1])
        # Plot both total time and crawl time
        ax2.plot(browser_counts, crawl_times, 'bo-', label='Crawl Time (s)', linewidth=2)
        ax2.plot(browser_counts, total_times, 'go--', label='Total Time (s)', linewidth=2, alpha=0.6)
        # Mark the fastest configuration
        fastest_idx = browser_counts.index(fastest["num_browsers"])
        ax2.plot(browser_counts[fastest_idx], crawl_times[fastest_idx], 'ro', ms=10,
                 label=f'Fastest: {fastest["num_browsers"]} browsers')
        ax2.set_xlabel('Number of Browsers')
        ax2.set_ylabel('Time (seconds)')
        ax2.set_title(f'Time Performance for {total_urls} URLs by Browser Count')
        ax2.grid(True, linestyle='--', alpha=0.7)
        ax2.legend(loc='upper right')
        # Plot pages per second on second y-axis
        pages_per_second = [total_urls/t for t in crawl_times]
        ax2_twin = ax2.twinx()
        ax2_twin.plot(browser_counts, pages_per_second, 'r^--', label='Pages/second', alpha=0.5)
        ax2_twin.set_ylabel('Pages per second')
        # Add note about the fastest configuration
        ax2.text(0.02, 0.90, f"⚡ Fastest: {fastest['num_browsers']} browsers with ~{fastest['pages_per_browser']} pages" +
                 f"\n   {fastest['crawl_time']:.2f}s ({fastest['pages_per_second']:.1f} pages/s)",
                 transform=ax2.transAxes, fontsize=12, verticalalignment='top',
                 bbox=dict(boxstyle='round,pad=0.5', facecolor='lightblue', alpha=0.3))
        # Plot 3: Memory Usage (if available)
        if memory_results:
            ax3 = plt.subplot(gs[2])
            # Prepare data for grouped bar chart
            memory_per_browser = [m/n for m, n in zip(peak_memories, memory_browser_counts)]
            memory_per_page = [m/(n*p) for m, n, p in zip(
                [r["peak_memory"] for r in memory_results],
                [r["num_browsers"] for r in memory_results],
                [r["pages_per_browser"] for r in memory_results])]
            x = np.arange(len(memory_browser_counts))
            width = 0.35
            # Create grouped bars
            ax3.bar(x - width/2, peak_memories, width, label='Total Memory (MB)', color='#9b59b6')
            ax3.bar(x + width/2, memory_per_browser, width, label='Memory per Browser (MB)', color='#3498db')
            # Configure axis
            ax3.set_xticks(x)
            ax3.set_xticklabels([f"{bc}" for bc in memory_browser_counts], rotation=45)
            ax3.set_xlabel('Number of Browsers')
            ax3.set_ylabel('Memory (MB)')
            ax3.set_title('Memory Usage by Browser Configuration')
            ax3.legend(loc='upper left')
            ax3.grid(True, linestyle='--', alpha=0.7)
            # Add second y-axis for memory per page
            ax3_twin = ax3.twinx()
            ax3_twin.plot(x, memory_per_page, 'ro-', label='Memory per Page (MB)')
            ax3_twin.set_ylabel('Memory per Page (MB)')
            # Get lowest memory configuration
            lowest_memory_idx = memory_browser_counts.index(lowest_memory["num_browsers"])
            # Add note about lowest memory configuration
            ax3.text(0.02, 0.90, f"💾 Lowest Memory: {lowest_memory['num_browsers']} browsers with ~{lowest_memory['pages_per_browser']} pages" +
                     f"\n   {lowest_memory['peak_memory']:.1f}MB ({lowest_memory['peak_memory']/total_urls:.2f}MB per page)",
                     transform=ax3.transAxes, fontsize=12, verticalalignment='top',
                     bbox=dict(boxstyle='round,pad=0.5', facecolor='lightgreen', alpha=0.3))
        # Add overall title
        plt.suptitle(f'Browser Scaling Grid Search Results for {total_urls} URLs', fontsize=16, y=0.98)
        # Add timestamp and info at the bottom
        plt.figtext(0.5, 0.01, f"Generated by Crawl4AI at {time.strftime('%Y-%m-%d %H:%M:%S')}",
                    ha="center", fontsize=10, style='italic')
        # Get current directory and save the figure there
        import os
        __current_file = os.path.abspath(__file__)
        current_dir = os.path.dirname(__current_file)
        output_file = os.path.join(current_dir, 'browser_scaling_grid_search.png')
        # Adjust layout and save figure with high DPI
        plt.tight_layout(rect=[0, 0.03, 1, 0.97])
        plt.savefig(output_file, dpi=200, bbox_inches='tight')
        logger.success(f"Visualization saved to {output_file}", tag="TEST")
    except ImportError:
        logger.warning("matplotlib not available, skipping visualization", tag="TEST")
    return most_efficient["num_browsers"], most_efficient["pages_per_browser"]
async def find_optimal_browser_config(total_urls=50, verbose=True, rate_limit_delay=0.2):
"""Find optimal browser configuration for crawling a specific number of URLs.
Args:
total_urls: Number of URLs to crawl
verbose: Whether to print progress
rate_limit_delay: Delay between page loads to avoid rate limiting
Returns:
dict: Contains fastest, lowest_memory, and optimal configurations
"""
if verbose:
print(f"\n=== Finding optimal configuration for crawling {total_urls} URLs ===\n")
# Generate test URLs with timestamp to avoid caching
timestamp = int(time.time())
# --- Body of find_optimal_browser_config (def/header is above this chunk) ---
# Benchmarks every (browser count, page distribution) split of `total_urls`
# across 1..10 browsers, crawls synthetic example.com URLs with each split,
# and returns the fastest / most memory-efficient / balanced configurations.
# Uses the parameters `total_urls`, `verbose`, `rate_limit_delay` and the
# earlier local `timestamp` from the enclosing function.

# One synthetic URL per page; the timestamp query param defeats caching
# between benchmark runs (assumes `timestamp` was assigned earlier — TODO confirm).
urls = [f"https://example.com/page_{i}?t={timestamp}" for i in range(total_urls)]
# Limit browser configurations to test (1 browser to max 10)
max_browsers = min(10, total_urls)
configs_to_test = []
# Generate configurations (browser count, pages distribution)
for num_browsers in range(1, max_browsers + 1):
    base_pages = total_urls // num_browsers
    remainder = total_urls % num_browsers
    # Create distribution array like [3, 3, 2, 2] (some browsers get one more page)
    if remainder > 0:
        distribution = [base_pages + 1] * remainder + [base_pages] * (num_browsers - remainder)
    else:
        distribution = [base_pages] * num_browsers
    configs_to_test.append((num_browsers, distribution))
results = []
# Test each configuration
for browser_count, page_distribution in configs_to_test:
    if verbose:
        print(f"Testing {browser_count} browsers with distribution {tuple(page_distribution)}")
    try:
        # Track memory if possible (psutil is optional; fall back to 0 = "unknown")
        try:
            import psutil
            process = psutil.Process()
            start_memory = process.memory_info().rss / (1024 * 1024)  # MB
        except ImportError:
            if verbose:
                print("Memory tracking not available (psutil not installed)")
            start_memory = 0
        # Start browsers in parallel
        # NOTE(review): if anything above raises a non-ImportError before this
        # line, the cleanup loop in the outer `except` would hit a NameError on
        # `managers` (first iteration only) — consider initializing it earlier.
        managers = []
        start_tasks = []
        start_time = time.time()
        for i in range(browser_count):
            config = BrowserConfig(headless=True)
            manager = BrowserManager(browser_config=config, logger=logger)
            start_tasks.append(manager.start())
            managers.append(manager)
        await asyncio.gather(*start_tasks)
        # Distribute URLs among browsers per the precomputed distribution
        urls_per_manager = {}
        url_index = 0
        for i, manager in enumerate(managers):
            pages_for_this_browser = page_distribution[i]
            end_index = url_index + pages_for_this_browser
            urls_per_manager[manager] = urls[url_index:end_index]
            url_index = end_index
        # Create pages for each browser
        # Presumably get_pages() returns (page, context)-like tuples, since
        # crawl_page below unpacks `page, _ = page_ctx` — verify against
        # BrowserManager.get_pages.
        all_pages = []
        for manager, manager_urls in urls_per_manager.items():
            if not manager_urls:
                continue
            pages = await manager.get_pages(CrawlerRunConfig(), count=len(manager_urls))
            all_pages.extend(zip(pages, manager_urls))
        # Crawl pages with delay to avoid rate limiting

        async def crawl_page(page_ctx, url):
            # Navigate one page to its URL, return the title; always close the page.
            page, _ = page_ctx
            try:
                await page.goto(url)
                if rate_limit_delay > 0:
                    await asyncio.sleep(rate_limit_delay)
                title = await page.title()
                return title
            finally:
                await page.close()

        crawl_start = time.time()
        crawl_tasks = [crawl_page(page_ctx, url) for page_ctx, url in all_pages]
        await asyncio.gather(*crawl_tasks)
        crawl_time = time.time() - crawl_start
        total_time = time.time() - start_time  # includes browser startup
        # Measure final memory usage (only meaningful when psutil was available)
        if start_memory > 0:
            end_memory = process.memory_info().rss / (1024 * 1024)
            memory_used = end_memory - start_memory
        else:
            memory_used = 0
        # Close all browsers
        for manager in managers:
            await manager.close()
        # Calculate metrics
        pages_per_second = total_urls / crawl_time
        # Calculate efficiency score (higher is better)
        # This balances speed vs memory; +1 avoids division by ~0 MB.
        if memory_used > 0:
            efficiency = pages_per_second / (memory_used + 1)
        else:
            efficiency = pages_per_second
        # Store result
        result = {
            "browser_count": browser_count,
            "distribution": tuple(page_distribution),
            "crawl_time": crawl_time,
            "total_time": total_time,
            "memory_used": memory_used,
            "pages_per_second": pages_per_second,
            "efficiency": efficiency
        }
        results.append(result)
        if verbose:
            print(f"  ✓ Crawled {total_urls} pages in {crawl_time:.2f}s ({pages_per_second:.1f} pages/sec)")
            if memory_used > 0:
                print(f"  ✓ Memory used: {memory_used:.1f}MB ({memory_used/total_urls:.1f}MB per page)")
            print(f"  ✓ Efficiency score: {efficiency:.4f}")
    except Exception as e:
        # A failed configuration is skipped; the benchmark continues with the next.
        if verbose:
            print(f"  ✗ Error: {str(e)}")
        # Clean up (best effort — browsers may already be closed or half-started)
        for manager in managers:
            try:
                await manager.close()
            except:
                pass
# If no successful results, return None
if not results:
    return None
# Find best configurations
fastest = sorted(results, key=lambda x: x["crawl_time"])[0]
# Only consider memory if available
memory_results = [r for r in results if r["memory_used"] > 0]
if memory_results:
    lowest_memory = sorted(memory_results, key=lambda x: x["memory_used"])[0]
else:
    lowest_memory = fastest
# Find most efficient (balanced speed vs memory)
optimal = sorted(results, key=lambda x: x["efficiency"], reverse=True)[0]
# Print summary
if verbose:
    print("\n=== OPTIMAL CONFIGURATIONS ===")
    print(f"⚡ Fastest: {fastest['browser_count']} browsers {fastest['distribution']}")
    print(f"   {fastest['crawl_time']:.2f}s, {fastest['pages_per_second']:.1f} pages/sec")
    print(f"💾 Memory-efficient: {lowest_memory['browser_count']} browsers {lowest_memory['distribution']}")
    if lowest_memory["memory_used"] > 0:
        print(f"   {lowest_memory['memory_used']:.1f}MB, {lowest_memory['memory_used']/total_urls:.2f}MB per page")
    print(f"🌟 Balanced optimal: {optimal['browser_count']} browsers {optimal['distribution']}")
    print(f"   {optimal['crawl_time']:.2f}s, {optimal['pages_per_second']:.1f} pages/sec, score: {optimal['efficiency']:.4f}")
return {
    "fastest": fastest,
    "lowest_memory": lowest_memory,
    "optimal": optimal,
    "all_configs": results
}
async def run_tests():
    """Run all tests sequentially."""
    # One boolean per test case; True means the case passed.
    outcomes = []

    # Single case: search for the best browser/page split on a small workload.
    configs = await find_optimal_browser_config(
        total_urls=20,  # Use a small number for faster testing
        verbose=True,
        rate_limit_delay=0.2  # 200ms delay between page loads to avoid rate limiting
    )

    succeeded = bool(configs)
    if succeeded:
        # Show the optimal configuration
        optimal = configs["optimal"]
        print(f"\n🎯 Recommended configuration for production use:")
        print(f"   {optimal['browser_count']} browsers with distribution {optimal['distribution']}")
        print(f"   Estimated performance: {optimal['pages_per_second']:.1f} pages/second")
    else:
        print("\n❌ Failed to find optimal configuration")
    outcomes.append(succeeded)

    # Print summary
    passed, total = sum(outcomes), len(outcomes)
    print(f"\nTests complete: {passed}/{total} passed")
    if passed == total:
        print("All tests passed!")
    else:
        print(f"{total - passed} tests failed")
# Script entry point: drive the async benchmark/test suite on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(run_tests())