refactor(browser): improve parallel crawling and browser management

Remove PagePoolConfig in favor of direct page management in browser strategies.
Add get_pages() method for efficient parallel page creation.
Improve storage state handling and persistence.
Add comprehensive parallel crawling tests and performance analysis.

BREAKING CHANGE: Removed PagePoolConfig class and related functionality.
This commit is contained in:
UncleCode
2025-03-23 18:53:24 +08:00
parent 4ab0893ffb
commit 0094cac675
7 changed files with 2115 additions and 351 deletions

View File

@@ -156,41 +156,6 @@ def is_empty_value(value: Any) -> bool:
return False
class PagePoolConfig:
    """Settings for a pool of pre-warmed browser pages.

    The object is a plain value holder: every constructor argument is stored
    unchanged on the instance under the same name.

    Attributes:
        mode (str): Pooling mode, either "static" or "adaptive". "static"
            takes the pool size from ``static_size``; "adaptive" is meant to
            size the pool from available system memory. Default: "static".
        static_size (int): Number of pages kept in the pool in "static" mode.
            Default: 10.
        memory_per_page (int): Estimated memory for one page, in MB, used for
            "adaptive" sizing. Default: 200.
        memory_threshold (float): Largest fraction of system memory that
            "adaptive" mode may use. Default: 0.7 (70%).
        timeout (float): Seconds to wait for a pooled page before a new one is
            created. Default: 5.0.
    """

    def __init__(self, mode="static", static_size=10, memory_per_page=200,
                 memory_threshold=0.7, timeout=5.0):
        # Pool sizing policy and its fixed size.
        self.mode, self.static_size = mode, static_size
        # Inputs for the adaptive (memory based) sizing.
        self.memory_per_page, self.memory_threshold = memory_per_page, memory_threshold
        # How long a caller waits on the pool.
        self.timeout = timeout
class BrowserConfig:
"""
Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy.
@@ -235,7 +200,7 @@ class BrowserConfig:
Default: False.
downloads_path (str or None): Directory to store downloaded files. If None and accept_downloads is True,
a default path will be created. Default: None.
storage_state (str or dict or None): Path or object describing storage state (cookies, localStorage).
storage_state (str or dict or None): An in-memory storage state (cookies, localStorage).
Default: None.
ignore_https_errors (bool): Ignore HTTPS certificate errors. Default: True.
java_script_enabled (bool): Enable JavaScript execution in pages. Default: True.
@@ -255,9 +220,6 @@ class BrowserConfig:
light_mode (bool): Disables certain background features for performance gains. Default: False.
extra_args (list): Additional command-line arguments passed to the browser.
Default: [].
page_pool_config (PagePoolConfig or None): Configuration for page pooling mechanism.
If None, page pooling is disabled.
Default: None.
"""
def __init__(
@@ -298,7 +260,6 @@ class BrowserConfig:
extra_args: list = None,
debugging_port: int = 9222,
host: str = "localhost",
page_pool_config: Optional[PagePoolConfig] = None,
):
self.browser_type = browser_type
self.headless = headless
@@ -337,7 +298,6 @@ class BrowserConfig:
self.verbose = verbose
self.debugging_port = debugging_port
self.host = host
self.page_pool_config = page_pool_config
fa_user_agenr_generator = ValidUAGenerator()
if self.user_agent_mode == "random":
@@ -368,12 +328,6 @@ class BrowserConfig:
@staticmethod
def from_kwargs(kwargs: dict) -> "BrowserConfig":
# Handle page_pool_config
page_pool_config = kwargs.get("page_pool_config")
if isinstance(page_pool_config, dict):
# If it's a dict, convert to PagePoolConfig
page_pool_config = PagePoolConfig(**page_pool_config)
return BrowserConfig(
browser_type=kwargs.get("browser_type", "chromium"),
headless=kwargs.get("headless", True),
@@ -407,7 +361,6 @@ class BrowserConfig:
extra_args=kwargs.get("extra_args", []),
debugging_port=kwargs.get("debugging_port", 9222),
host=kwargs.get("host", "localhost"),
page_pool_config=page_pool_config,
)
def to_dict(self):
@@ -442,7 +395,6 @@ class BrowserConfig:
"verbose": self.verbose,
"debugging_port": self.debugging_port,
"host": self.host,
"page_pool_config": self.page_pool_config,
}
def clone(self, **kwargs):

View File

@@ -2,11 +2,14 @@
This module provides a central browser management class that uses the
strategy pattern internally while maintaining the existing API.
It also implements a page pooling mechanism for improved performance.
"""
import asyncio
import time
from typing import Optional, Tuple, Dict, Any
import os
import psutil
from typing import Optional, Tuple, Dict, Any, List, Set
from playwright.async_api import Page, BrowserContext
@@ -117,6 +120,28 @@ class BrowserManager:
self.sessions = self._strategy.sessions
return page, context
async def get_pages(self, crawlerRunConfig: CrawlerRunConfig, count: int = 1) -> List[Tuple[Page, BrowserContext]]:
    """Create several pages that all share one run configuration.

    The work is handed to the active browser strategy; afterwards the
    manager's ``sessions`` attribute is re-pointed at the strategy's sessions
    when the strategy exposes them.

    Args:
        crawlerRunConfig: Configuration applied to every page
        count: How many pages to request

    Returns:
        List of (Page, Context) tuples as produced by the strategy
    """
    page_pairs = await self._strategy.get_pages(crawlerRunConfig, count)

    # Keep the manager's view of sessions aligned with the strategy's.
    strategy = self._strategy
    if hasattr(strategy, 'sessions'):
        self.sessions = strategy.sessions

    return page_pairs
async def kill_session(self, session_id: str):
"""Kill a browser session and clean up resources.

View File

@@ -23,7 +23,7 @@ from ..async_configs import BrowserConfig, CrawlerRunConfig
from ..config import DOWNLOAD_PAGE_TIMEOUT
from ..js_snippet import load_js_script
from ..utils import get_home_folder
from .utils import get_playwright, get_browser_executable, get_browser_disable_options, create_temp_directory, is_windows
from .utils import get_playwright, get_browser_executable, get_browser_disable_options, create_temp_directory, is_windows, is_browser_running
from playwright_stealth import StealthConfig
@@ -85,6 +85,22 @@ class BaseBrowserStrategy(ABC):
"""
pass
async def get_pages(self, crawlerRunConfig: CrawlerRunConfig, count: int = 1) -> List[Tuple[Page, BrowserContext]]:
    """Default multi-page implementation built on top of ``get_page``.

    Pages are requested one after another (each ``get_page`` call is awaited
    before the next one starts), so the result order matches creation order.

    Args:
        crawlerRunConfig: Configuration applied to every page
        count: How many pages to create

    Returns:
        List of (Page, Context) tuples
    """
    created: list = []
    remaining = range(count)
    for _ in remaining:
        new_page, new_context = await self.get_page(crawlerRunConfig)
        created += [(new_page, new_context)]
    return created
@abstractmethod
async def close(self):
"""Close the browser and clean up resources."""
@@ -136,9 +152,6 @@ class BaseBrowserStrategy(ABC):
if self.config.cookies:
await context.add_cookies(self.config.cookies)
if self.config.storage_state:
await context.storage_state(path=None)
if self.config.accept_downloads:
context.set_default_timeout(DOWNLOAD_PAGE_TIMEOUT)
context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT)
@@ -161,7 +174,7 @@ class BaseBrowserStrategy(ABC):
{
"name": "cookiesEnabled",
"value": "true",
"url": crawlerRunConfig.url if crawlerRunConfig else "https://crawl4ai.com/",
"url": crawlerRunConfig and crawlerRunConfig.url or "https://crawl4ai.com/",
}
]
)
@@ -324,12 +337,31 @@ class PlaywrightBrowserStrategy(BaseBrowserStrategy):
"viewport": viewport_settings,
"proxy": proxy_settings,
"accept_downloads": self.config.accept_downloads,
"storage_state": self.config.storage_state,
"ignore_https_errors": self.config.ignore_https_errors,
"device_scale_factor": 1.0,
"java_script_enabled": self.config.java_script_enabled,
}
# Handle storage state properly - this is key for persistence
if self.config.storage_state:
context_settings["storage_state"] = self.config.storage_state
if self.logger:
if isinstance(self.config.storage_state, str):
self.logger.debug(f"Using storage state from file: {self.config.storage_state}", tag="BROWSER")
else:
self.logger.debug("Using storage state from config object", tag="BROWSER")
if self.config.user_data_dir:
context_settings["storage_state"] = os.path.join(
self.config.user_data_dir, "Default", "storage_state.json"
)
# Create the file if it doesn't exist
if not os.path.exists(context_settings["storage_state"]):
os.makedirs(os.path.dirname(context_settings["storage_state"]), exist_ok=True)
with open(context_settings["storage_state"], "w") as f:
json.dump({}, f)
if crawlerRunConfig:
# Check if there is value for crawlerRunConfig.proxy_config set add that to context
if crawlerRunConfig.proxy_config:
@@ -428,6 +460,21 @@ class PlaywrightBrowserStrategy(BaseBrowserStrategy):
if self.config.sleep_on_close:
await asyncio.sleep(0.5)
# If we have a user_data_dir configured, ensure persistence of storage state
if self.config.user_data_dir and self.browser and self.default_context:
for context in self.browser.contexts:
try:
await context.storage_state(path=os.path.join(self.config.user_data_dir, "Default", "storage_state.json"))
if self.logger:
self.logger.debug("Ensuring storage state is persisted before closing browser", tag="BROWSER")
except Exception as e:
if self.logger:
self.logger.warning(
message="Failed to ensure storage persistence: {error}",
tag="BROWSER",
params={"error": str(e)}
)
# Close all sessions
session_ids = list(self.sessions.keys())
for session_id in session_ids:
@@ -582,7 +629,7 @@ class CDPBrowserStrategy(BaseBrowserStrategy):
Returns:
List of command-line arguments for the browser
"""
browser_path = get_browser_executable(self.config.browser_type)
browser_path = await get_browser_executable(self.config.browser_type)
base_args = [browser_path]
if self.config.browser_type == "chromium":
@@ -727,6 +774,22 @@ class CDPBrowserStrategy(BaseBrowserStrategy):
if self.config.sleep_on_close:
await asyncio.sleep(0.5)
# If we have a user_data_dir configured, ensure persistence of storage state
if self.config.user_data_dir and self.browser:
try:
# Create a brief sleep to allow the browser to flush any pending operations
# This helps ensure all storage state (localStorage, cookies, etc.) gets saved
await asyncio.sleep(0.3)
if self.logger:
self.logger.debug("Ensuring storage state is persisted before closing CDP browser", tag="BROWSER")
except Exception as e:
if self.logger:
self.logger.warning(
message="Failed to ensure storage persistence: {error}",
tag="BROWSER",
params={"error": str(e)}
)
# Close all sessions
session_ids = list(self.sessions.keys())
for session_id in session_ids:
@@ -775,19 +838,46 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
logger: Logger for recording events and errors
"""
super().__init__(config, logger)
self.builtin_browser_dir = os.path.join(get_home_folder(), "builtin-browser")
self.builtin_browser_dir = os.path.join(get_home_folder(), "builtin-browser") if not self.config.user_data_dir else self.config.user_data_dir
self.builtin_config_file = os.path.join(self.builtin_browser_dir, "browser_config.json")
# Raise error if user data dir is already engaged
if self._check_user_dir_is_engaged(self.builtin_browser_dir):
raise Exception(f"User data directory {self.builtin_browser_dir} is already engaged by another browser instance.")
os.makedirs(self.builtin_browser_dir, exist_ok=True)
def _check_user_dir_is_engaged(self, user_data_dir: str) -> bool:
"""Check if the user data directory is already in use.
Returns:
bool: True if the directory is engaged, False otherwise
"""
# Load browser config file, then iterate in port_map values, check "user_data_dir" key if it matches
# the current user data directory
if os.path.exists(self.builtin_config_file):
try:
with open(self.builtin_config_file, 'r') as f:
browser_info_dict = json.load(f)
# Check if user data dir is already engaged
for port_str, browser_info in browser_info_dict.get("port_map", {}).items():
if browser_info.get("user_data_dir") == user_data_dir:
return True
except Exception as e:
if self.logger:
self.logger.error(f"Error reading built-in browser config: {str(e)}", tag="BUILTIN")
return False
async def start(self):
"""Start or connect to the built-in browser.
Returns:
self: For method chaining
"""
# Check for existing built-in browser
browser_info = self.get_builtin_browser_info()
if browser_info and self._is_browser_running(browser_info.get('pid')):
# Check for existing built-in browser (get_browser_info already checks if running)
browser_info = self.get_browser_info()
if browser_info:
if self.logger:
self.logger.info(f"Using existing built-in browser at {browser_info.get('cdp_url')}", tag="BROWSER")
self.config.cdp_url = browser_info.get('cdp_url')
@@ -797,7 +887,7 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
cdp_url = await self.launch_builtin_browser(
browser_type=self.config.browser_type,
debugging_port=self.config.debugging_port,
headless=self.config.headless
headless=self.config.headless,
)
if not cdp_url:
if self.logger:
@@ -808,55 +898,62 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
# Call parent class implementation with updated CDP URL
return await super().start()
def get_builtin_browser_info(self) -> Optional[Dict[str, Any]]:
    """Load the stored built-in browser record if its process is still alive.

    Returns:
        dict: The JSON content of ``self.builtin_config_file`` when the file
        exists and the recorded 'pid' passes ``self._is_browser_running``;
        None when the file is missing, the process is gone, or reading fails
    """
    config_path = self.builtin_config_file
    if not os.path.exists(config_path):
        return None

    try:
        with open(config_path, 'r') as handle:
            stored_info = json.load(handle)

        # A record is only useful while the browser process still exists.
        if self._is_browser_running(stored_info.get('pid')):
            return stored_info

        if self.logger:
            self.logger.warning("Built-in browser is not running", tag="BUILTIN")
    except Exception as e:
        if self.logger:
            self.logger.error(f"Error reading built-in browser config: {str(e)}", tag="BUILTIN")

    return None
def _is_browser_running(self, pid: Optional[int]) -> bool:
"""Check if a process with the given PID is running.
@classmethod
def get_builtin_browser_info(cls, debugging_port: int, config_file: str, logger: Optional[AsyncLogger] = None) -> Optional[Dict[str, Any]]:
"""Get information about the built-in browser for a specific debugging port.
Args:
pid: Process ID to check
debugging_port: The debugging port to look for
config_file: Path to the config file
logger: Optional logger for recording events
Returns:
bool: True if the process is running, False otherwise
dict: Browser information or None if no running browser is configured for this port
"""
if not pid:
return False
if not os.path.exists(config_file):
return None
try:
# Check if the process exists
if is_windows():
process = subprocess.run(["tasklist", "/FI", f"PID eq {pid}"],
capture_output=True, text=True)
return str(pid) in process.stdout
else:
# Unix-like systems
os.kill(pid, 0) # This doesn't actually kill the process, just checks if it exists
return True
except (ProcessLookupError, PermissionError, OSError):
return False
with open(config_file, 'r') as f:
browser_info_dict = json.load(f)
# Get browser info from port map
if isinstance(browser_info_dict, dict) and "port_map" in browser_info_dict:
port_str = str(debugging_port)
if port_str in browser_info_dict["port_map"]:
browser_info = browser_info_dict["port_map"][port_str]
# Check if the browser is still running
if not is_browser_running(browser_info.get('pid')):
if logger:
logger.warning(f"Built-in browser on port {debugging_port} is not running", tag="BUILTIN")
# Remove this port from the dictionary
del browser_info_dict["port_map"][port_str]
with open(config_file, 'w') as f:
json.dump(browser_info_dict, f, indent=2)
return None
return browser_info
return None
except Exception as e:
if logger:
logger.error(f"Error reading built-in browser config: {str(e)}", tag="BUILTIN")
return None
def get_browser_info(self) -> Optional[Dict[str, Any]]:
    """Look up the built-in browser record for this instance's debugging port.

    Convenience wrapper that forwards this instance's port, config file path
    and logger to ``get_builtin_browser_info``.

    Returns:
        dict: Whatever ``get_builtin_browser_info`` returns for this port
        (browser information, or None)
    """
    lookup_args = {
        "debugging_port": self.config.debugging_port,
        "config_file": self.builtin_config_file,
        "logger": self.logger,
    }
    return self.get_builtin_browser_info(**lookup_args)
async def launch_builtin_browser(self,
browser_type: str = "chromium",
@@ -873,18 +970,27 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
str: CDP URL for the browser, or None if launch failed
"""
# Check if there's an existing browser still running
browser_info = self.get_builtin_browser_info()
if browser_info and self._is_browser_running(browser_info.get('pid')):
browser_info = self.get_builtin_browser_info(
debugging_port=debugging_port,
config_file=self.builtin_config_file,
logger=self.logger
)
if browser_info:
if self.logger:
self.logger.info("Built-in browser is already running", tag="BUILTIN")
self.logger.info(f"Built-in browser is already running on port {debugging_port}", tag="BUILTIN")
return browser_info.get('cdp_url')
# Create a user data directory for the built-in browser
user_data_dir = os.path.join(self.builtin_browser_dir, "user_data")
# Raise error if user data dir is already engaged
if self._check_user_dir_is_engaged(user_data_dir):
raise Exception(f"User data directory {user_data_dir} is already engaged by another browser instance.")
# Create the user data directory if it doesn't exist
os.makedirs(user_data_dir, exist_ok=True)
# Prepare browser launch arguments
browser_path = get_browser_executable(browser_type)
browser_path = await get_browser_executable(browser_type)
if browser_type == "chromium":
args = [
browser_path,
@@ -957,7 +1063,7 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
if self.logger:
self.logger.warning(f"Could not verify browser: {str(e)}", tag="BUILTIN")
# Save browser info
# Create browser info
browser_info = {
'pid': process.pid,
'cdp_url': cdp_url,
@@ -968,8 +1074,31 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
'config': config_json
}
# Read existing config file if it exists
port_map = {}
if os.path.exists(self.builtin_config_file):
try:
with open(self.builtin_config_file, 'r') as f:
existing_data = json.load(f)
# Check if it already uses port mapping
if isinstance(existing_data, dict) and "port_map" in existing_data:
port_map = existing_data["port_map"]
# Convert legacy format to port mapping
elif isinstance(existing_data, dict) and "debugging_port" in existing_data:
old_port = str(existing_data.get("debugging_port"))
if self._is_browser_running(existing_data.get("pid")):
port_map[old_port] = existing_data
except Exception as e:
if self.logger:
self.logger.warning(f"Could not read existing config: {str(e)}", tag="BUILTIN")
# Add/update this browser in the port map
port_map[str(debugging_port)] = browser_info
# Write updated config
with open(self.builtin_config_file, 'w') as f:
json.dump(browser_info, f, indent=2)
json.dump({"port_map": port_map}, f, indent=2)
# Detach from the browser process - don't keep any references
# This is important to allow the Python script to exit while the browser continues running
@@ -990,10 +1119,10 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
Returns:
bool: True if the browser was killed, False otherwise
"""
browser_info = self.get_builtin_browser_info()
browser_info = self.get_browser_info()
if not browser_info:
if self.logger:
self.logger.warning("No built-in browser found", tag="BUILTIN")
self.logger.warning(f"No built-in browser found on port {self.config.debugging_port}", tag="BUILTIN")
return False
pid = browser_info.get('pid')
@@ -1007,16 +1136,29 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
os.kill(pid, signal.SIGTERM)
# Wait for termination
for _ in range(5):
if not self._is_browser_running(pid):
if not is_browser_running(pid):
break
await asyncio.sleep(0.5)
else:
# Force kill if still running
os.kill(pid, signal.SIGKILL)
# Remove config file
if os.path.exists(self.builtin_config_file):
os.unlink(self.builtin_config_file)
# Update config file to remove this browser
with open(self.builtin_config_file, 'r') as f:
browser_info_dict = json.load(f)
# Remove this port from the dictionary
port_str = str(self.config.debugging_port)
if port_str in browser_info_dict.get("port_map", {}):
del browser_info_dict["port_map"][port_str]
with open(self.builtin_config_file, 'w') as f:
json.dump(browser_info_dict, f, indent=2)
# Remove user data directory if it exists
if os.path.exists(self.builtin_browser_dir):
shutil.rmtree(self.builtin_browser_dir)
# Clear the browser info cache
self.browser = None
self.temp_dir = None
self.shutting_down = True
if self.logger:
self.logger.success("Built-in browser terminated", tag="BUILTIN")
@@ -1032,17 +1174,29 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
Returns:
dict: Status information with running, cdp_url, and info fields
"""
browser_info = self.get_builtin_browser_info()
browser_info = self.get_browser_info()
if not browser_info:
return {
'running': False,
'cdp_url': None,
'info': None
'info': None,
'port': self.config.debugging_port
}
return {
'running': True,
'cdp_url': browser_info.get('cdp_url'),
'info': browser_info
'info': browser_info,
'port': self.config.debugging_port
}
# Override the close method to handle built-in browser cleanup
async def close(self):
    """Run the parent strategy's close, then terminate the built-in browser when flagged.

    The built-in browser process is only killed when ``self.shutting_down``
    is truthy after the parent close has finished.
    """
    # Parent cleanup always runs first.
    await super().close()

    if not self.shutting_down:
        return

    # Flag is set: also take down the built-in browser process itself.
    await self.kill_builtin_browser()

View File

@@ -8,14 +8,18 @@ and Playwright instance management.
import asyncio
import os
import sys
import platform
import time
import tempfile
from typing import Optional, Any
import subprocess
from typing import Optional
from playwright.async_api import async_playwright
from ..async_logger import AsyncLogger
from ..utils import get_chromium_path
from ..async_configs import BrowserConfig, CrawlerRunConfig
from ..async_logger import AsyncLogger
_playwright_instance = None
@@ -30,7 +34,7 @@ async def get_playwright():
_playwright_instance = await async_playwright().start()
return _playwright_instance
def get_browser_executable(browser_type: str) -> str:
async def get_browser_executable(browser_type: str) -> str:
"""Get the path to browser executable, with platform-specific handling.
Args:
@@ -39,7 +43,7 @@ def get_browser_executable(browser_type: str) -> str:
Returns:
Path to browser executable
"""
return get_chromium_path(browser_type)
return await get_chromium_path(browser_type)
def create_temp_directory(prefix="browser-profile-") -> str:
"""Create a temporary directory for browser data.
@@ -75,6 +79,31 @@ def is_linux() -> bool:
True if Linux, False otherwise
"""
return not (is_windows() or is_macos())
def is_browser_running(pid: Optional[int]) -> bool:
    """Tell whether a process with the given PID currently exists.

    Args:
        pid: Process ID to probe; None or 0 is reported as not running

    Returns:
        bool: True if the process was found, False otherwise. Any OSError
        raised while probing (including a permission error) counts as
        "not running".
    """
    if not pid:
        return False

    try:
        if not is_windows():
            # Unix-like systems: signal 0 only checks for existence,
            # nothing is delivered to the process.
            os.kill(pid, 0)
            return True

        # Windows: ask tasklist for that PID and look for it in the output.
        listing = subprocess.run(
            ["tasklist", "/FI", f"PID eq {pid}"],
            capture_output=True,
            text=True,
        )
        return str(pid) in listing.stdout
    except OSError:
        # ProcessLookupError and PermissionError are both OSError subclasses.
        return False
def get_browser_disable_options() -> list:
"""Get standard list of browser disable options for performance.
@@ -103,3 +132,197 @@ def get_browser_disable_options() -> list:
"--password-store=basic",
"--use-mock-keychain",
]
async def find_optimal_browser_config(total_urls=50, verbose=True, rate_limit_delay=0.2):
    """Find optimal browser configuration for crawling a specific number of URLs.

    Every layout from 1 browser up to ``min(10, total_urls)`` browsers is
    benchmarked: the URLs are split as evenly as possible between the
    browsers, one page per URL is opened, and all pages are crawled
    concurrently. A layout that raises is reported (when verbose) and left
    out of the results. Browsers started for a layout are always closed
    before the next layout runs, including when the run fails or the task
    is cancelled.

    Args:
        total_urls: Number of URLs to crawl
        verbose: Whether to print progress
        rate_limit_delay: Delay after each page load to avoid rate limiting

    Returns:
        dict: "fastest", "lowest_memory" and "optimal" result entries plus
        "all_configs" (every successful result), or None when no layout
        completed successfully
    """
    from .manager import BrowserManager

    if verbose:
        print(f"\n=== Finding optimal configuration for crawling {total_urls} URLs ===\n")

    # Generate test URLs with timestamp to avoid caching
    timestamp = int(time.time())
    urls = [f"https://example.com/page_{i}?t={timestamp}" for i in range(total_urls)]

    # Limit browser configurations to test (1 browser to max 10)
    max_browsers = min(10, total_urls)

    # Generate configurations (browser count, pages distribution), e.g.
    # [3, 3, 2, 2]: the first `remainder` browsers get one extra page.
    configs_to_test = []
    for num_browsers in range(1, max_browsers + 1):
        base_pages, remainder = divmod(total_urls, num_browsers)
        distribution = [base_pages + 1] * remainder + [base_pages] * (num_browsers - remainder)
        configs_to_test.append((num_browsers, distribution))

    async def crawl_page(page_ctx, url):
        """Load one URL, wait for the rate-limit delay, return the title; always close the page."""
        page, _ = page_ctx
        try:
            await page.goto(url)
            if rate_limit_delay > 0:
                await asyncio.sleep(rate_limit_delay)
            return await page.title()
        finally:
            await page.close()

    results = []

    # Test each configuration
    for browser_count, page_distribution in configs_to_test:
        if verbose:
            print(f"Testing {browser_count} browsers with distribution {tuple(page_distribution)}")

        # Bound before the try so cleanup never sees an undefined or stale list.
        managers = []
        try:
            # Track memory if possible
            process = None
            try:
                import psutil
                process = psutil.Process()
                start_memory = process.memory_info().rss / (1024 * 1024)  # MB
            except ImportError:
                if verbose:
                    print("Memory tracking not available (psutil not installed)")
                start_memory = 0

            # Start browsers in parallel
            start_tasks = []
            start_time = time.time()
            logger = AsyncLogger(verbose=True, log_file=None)
            for _ in range(browser_count):
                config = BrowserConfig(headless=True)
                manager = BrowserManager(browser_config=config, logger=logger)
                start_tasks.append(manager.start())
                managers.append(manager)
            await asyncio.gather(*start_tasks)

            # Hand each browser its consecutive slice of URLs and open its pages
            all_pages = []
            url_index = 0
            for manager, page_count in zip(managers, page_distribution):
                manager_urls = urls[url_index:url_index + page_count]
                url_index += page_count
                if not manager_urls:
                    continue
                pages = await manager.get_pages(CrawlerRunConfig(), count=len(manager_urls))
                all_pages.extend(zip(pages, manager_urls))

            # Crawl pages with delay to avoid rate limiting
            crawl_start = time.time()
            await asyncio.gather(*(crawl_page(page_ctx, url) for page_ctx, url in all_pages))
            crawl_time = time.time() - crawl_start
            total_time = time.time() - start_time

            # Measure final memory usage (taken before the browsers are closed)
            if start_memory > 0:
                end_memory = process.memory_info().rss / (1024 * 1024)
                memory_used = end_memory - start_memory
            else:
                memory_used = 0

            # Calculate metrics
            pages_per_second = total_urls / crawl_time

            # Calculate efficiency score (higher is better)
            # This balances speed vs memory
            if memory_used > 0:
                efficiency = pages_per_second / (memory_used + 1)
            else:
                efficiency = pages_per_second

            # Store result
            results.append({
                "browser_count": browser_count,
                "distribution": tuple(page_distribution),
                "crawl_time": crawl_time,
                "total_time": total_time,
                "memory_used": memory_used,
                "pages_per_second": pages_per_second,
                "efficiency": efficiency
            })

            if verbose:
                print(f" ✓ Crawled {total_urls} pages in {crawl_time:.2f}s ({pages_per_second:.1f} pages/sec)")
                if memory_used > 0:
                    print(f" ✓ Memory used: {memory_used:.1f}MB ({memory_used/total_urls:.1f}MB per page)")
                print(f" ✓ Efficiency score: {efficiency:.4f}")
        except Exception as e:
            if verbose:
                print(f" ✗ Error: {str(e)}")
        finally:
            # Close every browser exactly once, on success, failure or
            # cancellation. Cleanup stays best-effort, but only ordinary
            # errors are suppressed so cancellation/interrupts still propagate.
            for manager in managers:
                try:
                    await manager.close()
                except Exception as close_error:
                    if verbose:
                        print(f" ✗ Cleanup error: {str(close_error)}")

    # If no successful results, return None
    if not results:
        return None

    # Find best configurations (ties resolve to the earliest tested layout)
    fastest = min(results, key=lambda x: x["crawl_time"])

    # Only consider memory if available
    memory_results = [r for r in results if r["memory_used"] > 0]
    if memory_results:
        lowest_memory = min(memory_results, key=lambda x: x["memory_used"])
    else:
        lowest_memory = fastest

    # Find most efficient (balanced speed vs memory)
    optimal = max(results, key=lambda x: x["efficiency"])

    # Print summary
    if verbose:
        print("\n=== OPTIMAL CONFIGURATIONS ===")
        print(f"⚡ Fastest: {fastest['browser_count']} browsers {fastest['distribution']}")
        print(f" {fastest['crawl_time']:.2f}s, {fastest['pages_per_second']:.1f} pages/sec")
        print(f"💾 Memory-efficient: {lowest_memory['browser_count']} browsers {lowest_memory['distribution']}")
        if lowest_memory["memory_used"] > 0:
            print(f" {lowest_memory['memory_used']:.1f}MB, {lowest_memory['memory_used']/total_urls:.2f}MB per page")
        print(f"🌟 Balanced optimal: {optimal['browser_count']} browsers {optimal['distribution']}")
        print(f" {optimal['crawl_time']:.2f}s, {optimal['pages_per_second']:.1f} pages/sec, score: {optimal['efficiency']:.4f}")

    return {
        "fastest": fastest,
        "lowest_memory": lowest_memory,
        "optimal": optimal,
        "all_configs": results
    }