refactor(browser): improve parallel crawling and browser management
Remove PagePoolConfig in favor of direct page management in browser strategies. Add get_pages() method for efficient parallel page creation. Improve storage state handling and persistence. Add comprehensive parallel crawling tests and performance analysis.

BREAKING CHANGE: Removed PagePoolConfig class and related functionality.
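A rough usage sketch of the new `get_pages()` flow (not part of the commit itself; module paths follow the test imports in this diff, and the URL list is illustrative):

```python
import asyncio

from crawl4ai.browser import BrowserManager
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_logger import AsyncLogger


async def main():
    # One browser instance; several pages created up front with one config.
    manager = BrowserManager(
        browser_config=BrowserConfig(headless=True),
        logger=AsyncLogger(verbose=True),
    )
    await manager.start()
    try:
        urls = [f"https://example.com/page_{i}" for i in range(4)]  # illustrative
        pages = await manager.get_pages(CrawlerRunConfig(), count=len(urls))

        async def fetch(page_ctx, url):
            page, _context = page_ctx
            try:
                await page.goto(url)
                return await page.title()
            finally:
                await page.close()

        # Crawl every URL in parallel on the pre-created pages.
        print(await asyncio.gather(*(fetch(p, u) for p, u in zip(pages, urls))))
    finally:
        await manager.close()


asyncio.run(main())
```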
@@ -156,41 +156,6 @@ def is_empty_value(value: Any) -> bool:
         return False
-
-
-class PagePoolConfig:
-    """Configuration for browser page pooling.
-
-    This class configures the page pooling mechanism that maintains pre-warmed
-    browser pages ready for immediate use, improving performance for scenarios
-    where multiple URLs need to be processed in sequence.
-
-    Attributes:
-        mode (str): Pooling mode - "static" or "adaptive".
-            "static" uses a fixed pool size defined by static_size.
-            "adaptive" calculates optimal size based on available system memory.
-            Default: "static".
-        static_size (int): Number of pages to maintain in the pool when mode is "static".
-            Default: 10.
-        memory_per_page (int): Estimated memory used by a single page in MB.
-            Used for "adaptive" mode calculations.
-            Default: 200.
-        memory_threshold (float): Maximum percentage of system memory to use in "adaptive" mode.
-            Default: 0.7 (70% of available memory).
-        timeout (float): Seconds to wait for a page from the pool before creating a new one.
-            Default: 5.0.
-    """
-
-    def __init__(self,
-                 mode="static",
-                 static_size=10,
-                 memory_per_page=200,
-                 memory_threshold=0.7,
-                 timeout=5.0):
-        self.mode = mode
-        self.static_size = static_size
-        self.memory_per_page = memory_per_page
-        self.memory_threshold = memory_threshold
-        self.timeout = timeout
-
-
 class BrowserConfig:
     """
     Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy.
@@ -235,7 +200,7 @@ class BrowserConfig:
             Default: False.
         downloads_path (str or None): Directory to store downloaded files. If None and accept_downloads is True,
             a default path will be created. Default: None.
-        storage_state (str or dict or None): Path or object describing storage state (cookies, localStorage).
+        storage_state (str or dict or None): An in-memory storage state (cookies, localStorage).
             Default: None.
         ignore_https_errors (bool): Ignore HTTPS certificate errors. Default: True.
         java_script_enabled (bool): Enable JavaScript execution in pages. Default: True.
@@ -255,9 +220,6 @@ class BrowserConfig:
         light_mode (bool): Disables certain background features for performance gains. Default: False.
         extra_args (list): Additional command-line arguments passed to the browser.
             Default: [].
-        page_pool_config (PagePoolConfig or None): Configuration for page pooling mechanism.
-            If None, page pooling is disabled.
-            Default: None.
     """

     def __init__(
@@ -298,7 +260,6 @@ class BrowserConfig:
         extra_args: list = None,
         debugging_port: int = 9222,
         host: str = "localhost",
-        page_pool_config: Optional[PagePoolConfig] = None,
     ):
         self.browser_type = browser_type
         self.headless = headless
@@ -337,7 +298,6 @@ class BrowserConfig:
         self.verbose = verbose
         self.debugging_port = debugging_port
         self.host = host
-        self.page_pool_config = page_pool_config

         fa_user_agenr_generator = ValidUAGenerator()
         if self.user_agent_mode == "random":
@@ -368,12 +328,6 @@ class BrowserConfig:

     @staticmethod
     def from_kwargs(kwargs: dict) -> "BrowserConfig":
-        # Handle page_pool_config
-        page_pool_config = kwargs.get("page_pool_config")
-        if isinstance(page_pool_config, dict):
-            # If it's a dict, convert to PagePoolConfig
-            page_pool_config = PagePoolConfig(**page_pool_config)
-
         return BrowserConfig(
             browser_type=kwargs.get("browser_type", "chromium"),
             headless=kwargs.get("headless", True),
@@ -407,7 +361,6 @@ class BrowserConfig:
             extra_args=kwargs.get("extra_args", []),
             debugging_port=kwargs.get("debugging_port", 9222),
             host=kwargs.get("host", "localhost"),
-            page_pool_config=page_pool_config,
         )

     def to_dict(self):
@@ -442,7 +395,6 @@ class BrowserConfig:
             "verbose": self.verbose,
             "debugging_port": self.debugging_port,
             "host": self.host,
-            "page_pool_config": self.page_pool_config,
         }

     def clone(self, **kwargs):
@@ -2,11 +2,14 @@

 This module provides a central browser management class that uses the
 strategy pattern internally while maintaining the existing API.
+It also implements a page pooling mechanism for improved performance.
 """

 import asyncio
 import time
-from typing import Optional, Tuple, Dict, Any
+import os
+import psutil
+from typing import Optional, Tuple, Dict, Any, List, Set

 from playwright.async_api import Page, BrowserContext
@@ -118,6 +121,28 @@ class BrowserManager:

         return page, context

+    async def get_pages(self, crawlerRunConfig: CrawlerRunConfig, count: int = 1) -> List[Tuple[Page, BrowserContext]]:
+        """Get multiple pages with the same configuration.
+
+        This method efficiently creates multiple browser pages using the same configuration,
+        which is useful for parallel crawling of multiple URLs.
+
+        Args:
+            crawlerRunConfig: Configuration for the pages
+            count: Number of pages to create
+
+        Returns:
+            List of (Page, Context) tuples
+        """
+        # Delegate to strategy
+        pages = await self._strategy.get_pages(crawlerRunConfig, count)
+
+        # Sync sessions if needed
+        if hasattr(self._strategy, 'sessions'):
+            self.sessions = self._strategy.sessions
+
+        return pages
+
     async def kill_session(self, session_id: str):
         """Kill a browser session and clean up resources.
@@ -23,7 +23,7 @@ from ..async_configs import BrowserConfig, CrawlerRunConfig
 from ..config import DOWNLOAD_PAGE_TIMEOUT
 from ..js_snippet import load_js_script
 from ..utils import get_home_folder
-from .utils import get_playwright, get_browser_executable, get_browser_disable_options, create_temp_directory, is_windows
+from .utils import get_playwright, get_browser_executable, get_browser_disable_options, create_temp_directory, is_windows, is_browser_running

 from playwright_stealth import StealthConfig
@@ -85,6 +85,22 @@ class BaseBrowserStrategy(ABC):
         """
         pass

+    async def get_pages(self, crawlerRunConfig: CrawlerRunConfig, count: int = 1) -> List[Tuple[Page, BrowserContext]]:
+        """Get multiple pages with the same configuration.
+
+        Args:
+            crawlerRunConfig: Configuration for the pages
+            count: Number of pages to create
+
+        Returns:
+            List of (Page, Context) tuples
+        """
+        pages = []
+        for _ in range(count):
+            page, context = await self.get_page(crawlerRunConfig)
+            pages.append((page, context))
+        return pages
+
     @abstractmethod
     async def close(self):
         """Close the browser and clean up resources."""
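
The base implementation above opens pages sequentially; a strategy wanting true concurrency could override it along these lines (a hypothetical sketch, not part of this commit):

```python
import asyncio
from typing import List, Tuple


class ConcurrentPagesMixin:
    """Hypothetical mixin: create the requested pages concurrently."""

    async def get_pages(self, crawlerRunConfig, count: int = 1) -> List[Tuple]:
        # self.get_page() is the per-page factory every strategy defines.
        return await asyncio.gather(
            *(self.get_page(crawlerRunConfig) for _ in range(count))
        )
```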
@@ -136,9 +152,6 @@ class BaseBrowserStrategy(ABC):
         if self.config.cookies:
             await context.add_cookies(self.config.cookies)

-        if self.config.storage_state:
-            await context.storage_state(path=None)
-
         if self.config.accept_downloads:
             context.set_default_timeout(DOWNLOAD_PAGE_TIMEOUT)
             context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT)
@@ -161,7 +174,7 @@ class BaseBrowserStrategy(ABC):
                 {
                     "name": "cookiesEnabled",
                     "value": "true",
-                    "url": crawlerRunConfig.url if crawlerRunConfig else "https://crawl4ai.com/",
+                    "url": crawlerRunConfig and crawlerRunConfig.url or "https://crawl4ai.com/",
                 }
             ]
         )
@@ -324,12 +337,31 @@ class PlaywrightBrowserStrategy(BaseBrowserStrategy):
             "viewport": viewport_settings,
             "proxy": proxy_settings,
             "accept_downloads": self.config.accept_downloads,
-            "storage_state": self.config.storage_state,
             "ignore_https_errors": self.config.ignore_https_errors,
             "device_scale_factor": 1.0,
             "java_script_enabled": self.config.java_script_enabled,
         }

+        # Handle storage state properly - this is key for persistence
+        if self.config.storage_state:
+            context_settings["storage_state"] = self.config.storage_state
+            if self.logger:
+                if isinstance(self.config.storage_state, str):
+                    self.logger.debug(f"Using storage state from file: {self.config.storage_state}", tag="BROWSER")
+                else:
+                    self.logger.debug("Using storage state from config object", tag="BROWSER")
+
+        if self.config.user_data_dir:
+            context_settings["storage_state"] = os.path.join(
+                self.config.user_data_dir, "Default", "storage_state.json"
+            )
+            # Create the file if it doesn't exist
+            if not os.path.exists(context_settings["storage_state"]):
+                os.makedirs(os.path.dirname(context_settings["storage_state"]), exist_ok=True)
+                with open(context_settings["storage_state"], "w") as f:
+                    json.dump({}, f)
+
         if crawlerRunConfig:
             # Check if there is value for crawlerRunConfig.proxy_config set add that to context
             if crawlerRunConfig.proxy_config:
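
For reference, the round trip this enables, using Playwright's standard `storage_state()` export (the file name is illustrative):

```python
from crawl4ai.async_configs import BrowserConfig


async def save_state_for_next_run(context) -> BrowserConfig:
    """Sketch: persist cookies/localStorage from a live Playwright context,
    then build a config that replays them on the next run."""
    state = await context.storage_state(path="state.json")  # also returns the dict
    # storage_state accepts either a file path or the in-memory dict
    # (see the BrowserConfig docstring change earlier in this diff).
    return BrowserConfig(headless=True, storage_state=state)
```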
@@ -428,6 +460,21 @@ class PlaywrightBrowserStrategy(BaseBrowserStrategy):
         if self.config.sleep_on_close:
             await asyncio.sleep(0.5)

+        # If we have a user_data_dir configured, ensure persistence of storage state
+        if self.config.user_data_dir and self.browser and self.default_context:
+            for context in self.browser.contexts:
+                try:
+                    await context.storage_state(path=os.path.join(self.config.user_data_dir, "Default", "storage_state.json"))
+                    if self.logger:
+                        self.logger.debug("Ensuring storage state is persisted before closing browser", tag="BROWSER")
+                except Exception as e:
+                    if self.logger:
+                        self.logger.warning(
+                            message="Failed to ensure storage persistence: {error}",
+                            tag="BROWSER",
+                            params={"error": str(e)}
+                        )
+
         # Close all sessions
         session_ids = list(self.sessions.keys())
         for session_id in session_ids:
@@ -582,7 +629,7 @@ class CDPBrowserStrategy(BaseBrowserStrategy):
         Returns:
             List of command-line arguments for the browser
         """
-        browser_path = get_browser_executable(self.config.browser_type)
+        browser_path = await get_browser_executable(self.config.browser_type)
         base_args = [browser_path]

         if self.config.browser_type == "chromium":
@@ -727,6 +774,22 @@ class CDPBrowserStrategy(BaseBrowserStrategy):
         if self.config.sleep_on_close:
             await asyncio.sleep(0.5)

+        # If we have a user_data_dir configured, ensure persistence of storage state
+        if self.config.user_data_dir and self.browser:
+            try:
+                # Create a brief sleep to allow the browser to flush any pending operations
+                # This helps ensure all storage state (localStorage, cookies, etc.) gets saved
+                await asyncio.sleep(0.3)
+                if self.logger:
+                    self.logger.debug("Ensuring storage state is persisted before closing CDP browser", tag="BROWSER")
+            except Exception as e:
+                if self.logger:
+                    self.logger.warning(
+                        message="Failed to ensure storage persistence: {error}",
+                        tag="BROWSER",
+                        params={"error": str(e)}
+                    )
+
         # Close all sessions
         session_ids = list(self.sessions.keys())
         for session_id in session_ids:
@@ -775,19 +838,46 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
             logger: Logger for recording events and errors
         """
         super().__init__(config, logger)
-        self.builtin_browser_dir = os.path.join(get_home_folder(), "builtin-browser")
+        self.builtin_browser_dir = os.path.join(get_home_folder(), "builtin-browser") if not self.config.user_data_dir else self.config.user_data_dir
         self.builtin_config_file = os.path.join(self.builtin_browser_dir, "browser_config.json")
+
+        # Raise error if user data dir is already engaged
+        if self._check_user_dir_is_engaged(self.builtin_browser_dir):
+            raise Exception(f"User data directory {self.builtin_browser_dir} is already engaged by another browser instance.")
+
         os.makedirs(self.builtin_browser_dir, exist_ok=True)

+    def _check_user_dir_is_engaged(self, user_data_dir: str) -> bool:
+        """Check if the user data directory is already in use.
+
+        Returns:
+            bool: True if the directory is engaged, False otherwise
+        """
+        # Load browser config file, then iterate in port_map values, check "user_data_dir" key if it matches
+        # the current user data directory
+        if os.path.exists(self.builtin_config_file):
+            try:
+                with open(self.builtin_config_file, 'r') as f:
+                    browser_info_dict = json.load(f)
+
+                # Check if user data dir is already engaged
+                for port_str, browser_info in browser_info_dict.get("port_map", {}).items():
+                    if browser_info.get("user_data_dir") == user_data_dir:
+                        return True
+            except Exception as e:
+                if self.logger:
+                    self.logger.error(f"Error reading built-in browser config: {str(e)}", tag="BUILTIN")
+        return False
+
     async def start(self):
         """Start or connect to the built-in browser.

         Returns:
             self: For method chaining
         """
-        # Check for existing built-in browser
-        browser_info = self.get_builtin_browser_info()
-        if browser_info and self._is_browser_running(browser_info.get('pid')):
+        # Check for existing built-in browser (get_browser_info already checks if running)
+        browser_info = self.get_browser_info()
+        if browser_info:
             if self.logger:
                 self.logger.info(f"Using existing built-in browser at {browser_info.get('cdp_url')}", tag="BROWSER")
             self.config.cdp_url = browser_info.get('cdp_url')
@@ -797,7 +887,7 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
             cdp_url = await self.launch_builtin_browser(
                 browser_type=self.config.browser_type,
                 debugging_port=self.config.debugging_port,
-                headless=self.config.headless
+                headless=self.config.headless,
             )
             if not cdp_url:
                 if self.logger:
@@ -808,55 +898,62 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
         # Call parent class implementation with updated CDP URL
         return await super().start()

-    def get_builtin_browser_info(self) -> Optional[Dict[str, Any]]:
-        """Get information about the built-in browser.
-
-        Returns:
-            dict: Browser information or None if no built-in browser is configured
-        """
-        if not os.path.exists(self.builtin_config_file):
-            return None
-
-        try:
-            with open(self.builtin_config_file, 'r') as f:
-                browser_info = json.load(f)
-
-            # Check if the browser is still running
-            if not self._is_browser_running(browser_info.get('pid')):
-                if self.logger:
-                    self.logger.warning("Built-in browser is not running", tag="BUILTIN")
-                return None
-
-            return browser_info
-        except Exception as e:
-            if self.logger:
-                self.logger.error(f"Error reading built-in browser config: {str(e)}", tag="BUILTIN")
-            return None
-
-    def _is_browser_running(self, pid: Optional[int]) -> bool:
-        """Check if a process with the given PID is running.
+    @classmethod
+    def get_builtin_browser_info(cls, debugging_port: int, config_file: str, logger: Optional[AsyncLogger] = None) -> Optional[Dict[str, Any]]:
+        """Get information about the built-in browser for a specific debugging port.

         Args:
-            pid: Process ID to check
+            debugging_port: The debugging port to look for
+            config_file: Path to the config file
+            logger: Optional logger for recording events

         Returns:
-            bool: True if the process is running, False otherwise
+            dict: Browser information or None if no running browser is configured for this port
         """
-        if not pid:
-            return False
+        if not os.path.exists(config_file):
+            return None

         try:
-            # Check if the process exists
-            if is_windows():
-                process = subprocess.run(["tasklist", "/FI", f"PID eq {pid}"],
-                                         capture_output=True, text=True)
-                return str(pid) in process.stdout
-            else:
-                # Unix-like systems
-                os.kill(pid, 0)  # This doesn't actually kill the process, just checks if it exists
-                return True
-        except (ProcessLookupError, PermissionError, OSError):
-            return False
+            with open(config_file, 'r') as f:
+                browser_info_dict = json.load(f)
+
+            # Get browser info from port map
+            if isinstance(browser_info_dict, dict) and "port_map" in browser_info_dict:
+                port_str = str(debugging_port)
+                if port_str in browser_info_dict["port_map"]:
+                    browser_info = browser_info_dict["port_map"][port_str]
+
+                    # Check if the browser is still running
+                    if not is_browser_running(browser_info.get('pid')):
+                        if logger:
+                            logger.warning(f"Built-in browser on port {debugging_port} is not running", tag="BUILTIN")
+                        # Remove this port from the dictionary
+                        del browser_info_dict["port_map"][port_str]
+                        with open(config_file, 'w') as f:
+                            json.dump(browser_info_dict, f, indent=2)
+                        return None
+
+                    return browser_info
+
+            return None
+
+        except Exception as e:
+            if logger:
+                logger.error(f"Error reading built-in browser config: {str(e)}", tag="BUILTIN")
+            return None
+
+    def get_browser_info(self) -> Optional[Dict[str, Any]]:
+        """Get information about the current built-in browser instance.
+
+        Returns:
+            dict: Browser information or None if no running browser is configured
+        """
+        return self.get_builtin_browser_info(
+            debugging_port=self.config.debugging_port,
+            config_file=self.builtin_config_file,
+            logger=self.logger
+        )

     async def launch_builtin_browser(self,
                                      browser_type: str = "chromium",
@@ -873,18 +970,27 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
             str: CDP URL for the browser, or None if launch failed
         """
         # Check if there's an existing browser still running
-        browser_info = self.get_builtin_browser_info()
-        if browser_info and self._is_browser_running(browser_info.get('pid')):
+        browser_info = self.get_builtin_browser_info(
+            debugging_port=debugging_port,
+            config_file=self.builtin_config_file,
+            logger=self.logger
+        )
+        if browser_info:
             if self.logger:
-                self.logger.info("Built-in browser is already running", tag="BUILTIN")
+                self.logger.info(f"Built-in browser is already running on port {debugging_port}", tag="BUILTIN")
             return browser_info.get('cdp_url')

         # Create a user data directory for the built-in browser
         user_data_dir = os.path.join(self.builtin_browser_dir, "user_data")
+        # Raise error if user data dir is already engaged
+        if self._check_user_dir_is_engaged(user_data_dir):
+            raise Exception(f"User data directory {user_data_dir} is already engaged by another browser instance.")
+
+        # Create the user data directory if it doesn't exist
         os.makedirs(user_data_dir, exist_ok=True)

         # Prepare browser launch arguments
-        browser_path = get_browser_executable(browser_type)
+        browser_path = await get_browser_executable(browser_type)
         if browser_type == "chromium":
             args = [
                 browser_path,
@@ -957,7 +1063,7 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
             if self.logger:
                 self.logger.warning(f"Could not verify browser: {str(e)}", tag="BUILTIN")

-        # Save browser info
+        # Create browser info
         browser_info = {
             'pid': process.pid,
             'cdp_url': cdp_url,
@@ -968,8 +1074,31 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
             'config': config_json
         }

+        # Read existing config file if it exists
+        port_map = {}
+        if os.path.exists(self.builtin_config_file):
+            try:
+                with open(self.builtin_config_file, 'r') as f:
+                    existing_data = json.load(f)
+
+                # Check if it already uses port mapping
+                if isinstance(existing_data, dict) and "port_map" in existing_data:
+                    port_map = existing_data["port_map"]
+                # Convert legacy format to port mapping
+                elif isinstance(existing_data, dict) and "debugging_port" in existing_data:
+                    old_port = str(existing_data.get("debugging_port"))
+                    if is_browser_running(existing_data.get("pid")):
+                        port_map[old_port] = existing_data
+            except Exception as e:
+                if self.logger:
+                    self.logger.warning(f"Could not read existing config: {str(e)}", tag="BUILTIN")
+
+        # Add/update this browser in the port map
+        port_map[str(debugging_port)] = browser_info
+
+        # Write updated config
         with open(self.builtin_config_file, 'w') as f:
-            json.dump(browser_info, f, indent=2)
+            json.dump({"port_map": port_map}, f, indent=2)

         # Detach from the browser process - don't keep any references
         # This is important to allow the Python script to exit while the browser continues running
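
With this change, `browser_config.json` keeps one record per debugging port rather than a single browser record; the shape is roughly as follows (field values are illustrative):

```python
# Illustrative shape of the builtin browser_config.json after this commit:
browser_config_example = {
    "port_map": {
        "9222": {
            "pid": 12345,                        # checked via is_browser_running()
            "cdp_url": "http://localhost:9222",  # CDP endpoint reused by later runs
            "user_data_dir": "~/.crawl4ai/builtin-browser/user_data",
            "debugging_port": 9222,
            "config": {},                        # serialized BrowserConfig
        }
    }
}
```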
@@ -990,10 +1119,10 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
         Returns:
             bool: True if the browser was killed, False otherwise
         """
-        browser_info = self.get_builtin_browser_info()
+        browser_info = self.get_browser_info()
         if not browser_info:
             if self.logger:
-                self.logger.warning("No built-in browser found", tag="BUILTIN")
+                self.logger.warning(f"No built-in browser found on port {self.config.debugging_port}", tag="BUILTIN")
             return False

         pid = browser_info.get('pid')
@@ -1007,16 +1136,29 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
                 os.kill(pid, signal.SIGTERM)
                 # Wait for termination
                 for _ in range(5):
-                    if not self._is_browser_running(pid):
+                    if not is_browser_running(pid):
                         break
                     await asyncio.sleep(0.5)
                 else:
                     # Force kill if still running
                     os.kill(pid, signal.SIGKILL)

-            # Remove config file
-            if os.path.exists(self.builtin_config_file):
-                os.unlink(self.builtin_config_file)
+            # Update config file to remove this browser
+            with open(self.builtin_config_file, 'r') as f:
+                browser_info_dict = json.load(f)
+            # Remove this port from the dictionary
+            port_str = str(self.config.debugging_port)
+            if port_str in browser_info_dict.get("port_map", {}):
+                del browser_info_dict["port_map"][port_str]
+                with open(self.builtin_config_file, 'w') as f:
+                    json.dump(browser_info_dict, f, indent=2)
+
+            # Remove user data directory if it exists
+            if os.path.exists(self.builtin_browser_dir):
+                shutil.rmtree(self.builtin_browser_dir)
+
+            # Clear the browser info cache
+            self.browser = None
+            self.temp_dir = None
+            self.shutting_down = True

             if self.logger:
                 self.logger.success("Built-in browser terminated", tag="BUILTIN")
@@ -1032,17 +1174,29 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
         Returns:
             dict: Status information with running, cdp_url, and info fields
         """
-        browser_info = self.get_builtin_browser_info()
+        browser_info = self.get_browser_info()

         if not browser_info:
             return {
                 'running': False,
                 'cdp_url': None,
-                'info': None
+                'info': None,
+                'port': self.config.debugging_port
             }

         return {
             'running': True,
             'cdp_url': browser_info.get('cdp_url'),
-            'info': browser_info
+            'info': browser_info,
+            'port': self.config.debugging_port
         }
+
+    # Override the close method to handle built-in browser cleanup
+    async def close(self):
+        """Close the built-in browser and clean up resources."""
+        # Call parent class close method
+        await super().close()
+
+        # Clean up built-in browser if we created it
+        if self.shutting_down:
+            await self.kill_builtin_browser()
@@ -8,14 +8,18 @@ and Playwright instance management.
 import asyncio
 import os
 import sys
-import platform
+import time
 import tempfile
-from typing import Optional, Any
+import subprocess
+from typing import Optional

 from playwright.async_api import async_playwright

-from ..async_logger import AsyncLogger
 from ..utils import get_chromium_path
+from ..async_configs import BrowserConfig, CrawlerRunConfig
+
+from ..async_logger import AsyncLogger
+

 _playwright_instance = None

@@ -30,7 +34,7 @@ async def get_playwright():
         _playwright_instance = await async_playwright().start()
     return _playwright_instance

-def get_browser_executable(browser_type: str) -> str:
+async def get_browser_executable(browser_type: str) -> str:
     """Get the path to browser executable, with platform-specific handling.

     Args:
@@ -39,7 +43,7 @@ def get_browser_executable(browser_type: str) -> str:
     Returns:
         Path to browser executable
     """
-    return get_chromium_path(browser_type)
+    return await get_chromium_path(browser_type)

 def create_temp_directory(prefix="browser-profile-") -> str:
     """Create a temporary directory for browser data.
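
Because `get_browser_executable()` (and the `get_chromium_path()` it wraps) is now a coroutine, existing callers must be updated to await it, e.g. (the module path is assumed from the relative imports above):

```python
from crawl4ai.browser.utils import get_browser_executable


async def resolve_browser_path() -> str:
    # Previously synchronous: get_browser_executable("chromium")
    return await get_browser_executable("chromium")
```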
@@ -76,6 +80,31 @@ def is_linux() -> bool:
     """
     return not (is_windows() or is_macos())

+def is_browser_running(pid: Optional[int]) -> bool:
+    """Check if a process with the given PID is running.
+
+    Args:
+        pid: Process ID to check
+
+    Returns:
+        bool: True if the process is running, False otherwise
+    """
+    if not pid:
+        return False
+
+    try:
+        # Check if the process exists
+        if is_windows():
+            process = subprocess.run(["tasklist", "/FI", f"PID eq {pid}"],
+                                     capture_output=True, text=True)
+            return str(pid) in process.stdout
+        else:
+            # Unix-like systems
+            os.kill(pid, 0)  # This doesn't actually kill the process, just checks if it exists
+            return True
+    except (ProcessLookupError, PermissionError, OSError):
+        return False
+
 def get_browser_disable_options() -> list:
     """Get standard list of browser disable options for performance.

@@ -103,3 +132,197 @@ def get_browser_disable_options() -> list:
         "--password-store=basic",
         "--use-mock-keychain",
     ]
+
+
+async def find_optimal_browser_config(total_urls=50, verbose=True, rate_limit_delay=0.2):
+    """Find optimal browser configuration for crawling a specific number of URLs.
+
+    Args:
+        total_urls: Number of URLs to crawl
+        verbose: Whether to print progress
+        rate_limit_delay: Delay between page loads to avoid rate limiting
+
+    Returns:
+        dict: Contains fastest, lowest_memory, and optimal configurations
+    """
+    from .manager import BrowserManager
+    if verbose:
+        print(f"\n=== Finding optimal configuration for crawling {total_urls} URLs ===\n")
+
+    # Generate test URLs with timestamp to avoid caching
+    timestamp = int(time.time())
+    urls = [f"https://example.com/page_{i}?t={timestamp}" for i in range(total_urls)]
+
+    # Limit browser configurations to test (1 browser to max 10)
+    max_browsers = min(10, total_urls)
+    configs_to_test = []
+
+    # Generate configurations (browser count, pages distribution)
+    for num_browsers in range(1, max_browsers + 1):
+        base_pages = total_urls // num_browsers
+        remainder = total_urls % num_browsers
+
+        # Create distribution array like [3, 3, 2, 2] (some browsers get one more page)
+        if remainder > 0:
+            distribution = [base_pages + 1] * remainder + [base_pages] * (num_browsers - remainder)
+        else:
+            distribution = [base_pages] * num_browsers
+
+        configs_to_test.append((num_browsers, distribution))
+
+    results = []
+
+    # Test each configuration
+    for browser_count, page_distribution in configs_to_test:
+        if verbose:
+            print(f"Testing {browser_count} browsers with distribution {tuple(page_distribution)}")
+
+        try:
+            # Track memory if possible
+            try:
+                import psutil
+                process = psutil.Process()
+                start_memory = process.memory_info().rss / (1024 * 1024)  # MB
+            except ImportError:
+                if verbose:
+                    print("Memory tracking not available (psutil not installed)")
+                start_memory = 0
+
+            # Start browsers in parallel
+            managers = []
+            start_tasks = []
+            start_time = time.time()
+
+            logger = AsyncLogger(verbose=True, log_file=None)
+
+            for i in range(browser_count):
+                config = BrowserConfig(headless=True)
+                manager = BrowserManager(browser_config=config, logger=logger)
+                start_tasks.append(manager.start())
+                managers.append(manager)
+
+            await asyncio.gather(*start_tasks)
+
+            # Distribute URLs among browsers
+            urls_per_manager = {}
+            url_index = 0
+
+            for i, manager in enumerate(managers):
+                pages_for_this_browser = page_distribution[i]
+                end_index = url_index + pages_for_this_browser
+                urls_per_manager[manager] = urls[url_index:end_index]
+                url_index = end_index
+
+            # Create pages for each browser
+            all_pages = []
+            for manager, manager_urls in urls_per_manager.items():
+                if not manager_urls:
+                    continue
+                pages = await manager.get_pages(CrawlerRunConfig(), count=len(manager_urls))
+                all_pages.extend(zip(pages, manager_urls))
+
+            # Crawl pages with delay to avoid rate limiting
+            async def crawl_page(page_ctx, url):
+                page, _ = page_ctx
+                try:
+                    await page.goto(url)
+                    if rate_limit_delay > 0:
+                        await asyncio.sleep(rate_limit_delay)
+                    title = await page.title()
+                    return title
+                finally:
+                    await page.close()
+
+            crawl_start = time.time()
+            crawl_tasks = [crawl_page(page_ctx, url) for page_ctx, url in all_pages]
+            await asyncio.gather(*crawl_tasks)
+            crawl_time = time.time() - crawl_start
+            total_time = time.time() - start_time
+
+            # Measure final memory usage
+            if start_memory > 0:
+                end_memory = process.memory_info().rss / (1024 * 1024)
+                memory_used = end_memory - start_memory
+            else:
+                memory_used = 0
+
+            # Close all browsers
+            for manager in managers:
+                await manager.close()
+
+            # Calculate metrics
+            pages_per_second = total_urls / crawl_time
+
+            # Calculate efficiency score (higher is better)
+            # This balances speed vs memory
+            if memory_used > 0:
+                efficiency = pages_per_second / (memory_used + 1)
+            else:
+                efficiency = pages_per_second
+
+            # Store result
+            result = {
+                "browser_count": browser_count,
+                "distribution": tuple(page_distribution),
+                "crawl_time": crawl_time,
+                "total_time": total_time,
+                "memory_used": memory_used,
+                "pages_per_second": pages_per_second,
+                "efficiency": efficiency
+            }
+
+            results.append(result)
+
+            if verbose:
+                print(f"  ✓ Crawled {total_urls} pages in {crawl_time:.2f}s ({pages_per_second:.1f} pages/sec)")
+                if memory_used > 0:
+                    print(f"  ✓ Memory used: {memory_used:.1f}MB ({memory_used/total_urls:.1f}MB per page)")
+                print(f"  ✓ Efficiency score: {efficiency:.4f}")
+
+        except Exception as e:
+            if verbose:
+                print(f"  ✗ Error: {str(e)}")
+
+            # Clean up
+            for manager in managers:
+                try:
+                    await manager.close()
+                except:
+                    pass
+
+    # If no successful results, return None
+    if not results:
+        return None
+
+    # Find best configurations
+    fastest = sorted(results, key=lambda x: x["crawl_time"])[0]
+
+    # Only consider memory if available
+    memory_results = [r for r in results if r["memory_used"] > 0]
+    if memory_results:
+        lowest_memory = sorted(memory_results, key=lambda x: x["memory_used"])[0]
+    else:
+        lowest_memory = fastest
+
+    # Find most efficient (balanced speed vs memory)
+    optimal = sorted(results, key=lambda x: x["efficiency"], reverse=True)[0]
+
+    # Print summary
+    if verbose:
+        print("\n=== OPTIMAL CONFIGURATIONS ===")
+        print(f"⚡ Fastest: {fastest['browser_count']} browsers {fastest['distribution']}")
+        print(f"   {fastest['crawl_time']:.2f}s, {fastest['pages_per_second']:.1f} pages/sec")
+
+        print(f"💾 Memory-efficient: {lowest_memory['browser_count']} browsers {lowest_memory['distribution']}")
+        if lowest_memory["memory_used"] > 0:
+            print(f"   {lowest_memory['memory_used']:.1f}MB, {lowest_memory['memory_used']/total_urls:.2f}MB per page")
+
+        print(f"🌟 Balanced optimal: {optimal['browser_count']} browsers {optimal['distribution']}")
+        print(f"   {optimal['crawl_time']:.2f}s, {optimal['pages_per_second']:.1f} pages/sec, score: {optimal['efficiency']:.4f}")
+
+    return {
+        "fastest": fastest,
+        "lowest_memory": lowest_memory,
+        "optimal": optimal,
+        "all_configs": results
+    }
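
A sketch of how the new analysis helper might be driven (it prints its own summary and returns per-configuration metrics; the import path is assumed from the module layout above):

```python
import asyncio

from crawl4ai.browser.utils import find_optimal_browser_config


async def main():
    results = await find_optimal_browser_config(total_urls=20, rate_limit_delay=0.2)
    if results:
        best = results["optimal"]
        print(best["browser_count"], "browsers,",
              f"{best['pages_per_second']:.1f} pages/sec")


asyncio.run(main())
```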
@@ -171,9 +171,9 @@ async def run_tests():
     """Run all tests sequentially."""
     results = []

-    # results.append(await test_basic_browser_manager())
-    # results.append(await test_custom_browser_config())
-    # results.append(await test_multiple_pages())
+    results.append(await test_basic_browser_manager())
+    results.append(await test_custom_browser_config())
+    results.append(await test_multiple_pages())
     results.append(await test_session_management())

     # Print summary
@@ -1,12 +1,12 @@
 """
-Test script for browser_profiler and builtin browser functionality.
+Test script for builtin browser functionality in the browser module.

 This script tests:
 1. Creating a builtin browser
 2. Getting browser information
 3. Killing the browser
 4. Restarting the browser
-5. Testing crawling with different browser modes
+5. Testing operations with different browser strategies
 6. Testing edge cases
 """

@@ -14,13 +14,20 @@ import asyncio
 import os
 import sys
 import time
-from colorama import Fore, init
+from typing import List, Dict, Any
+from colorama import Fore, Style, init

 # Add the project root to the path for imports
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))

-from crawl4ai.browser_profiler import BrowserProfiler
-from crawl4ai.async_webcrawler import AsyncWebCrawler
+from rich.console import Console
+from rich.table import Table
+from rich.panel import Panel
+from rich.text import Text
+from rich.box import Box, SIMPLE
+
+from crawl4ai.browser import BrowserManager
+from crawl4ai.browser.strategies import BuiltinBrowserStrategy
 from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
 from crawl4ai.async_logger import AsyncLogger
@@ -37,250 +44,751 @@ RESET = Fore.RESET
|
|||||||
# Create logger
|
# Create logger
|
||||||
logger = AsyncLogger(verbose=True)
|
logger = AsyncLogger(verbose=True)
|
||||||
|
|
||||||
async def test_browser_profiler():
|
|
||||||
"""Test the BrowserProfiler class functionality"""
|
|
||||||
print(f"\n{INFO}========== Testing BrowserProfiler =========={RESET}")
|
|
||||||
|
|
||||||
# Initialize browser profiler
|
async def test_builtin_browser_creation():
|
||||||
profiler = BrowserProfiler(logger=logger)
|
"""Test creating a builtin browser using the BrowserManager with BuiltinBrowserStrategy"""
|
||||||
|
print(f"\n{INFO}========== Testing Builtin Browser Creation =========={RESET}")
|
||||||
|
|
||||||
# Step 1: Check if builtin browser exists and kill it if it does
|
# Step 1: Create a BrowserManager with builtin mode
|
||||||
print(f"\n{INFO}1. Checking if builtin browser exists{RESET}")
|
print(f"\n{INFO}1. Creating BrowserManager with builtin mode{RESET}")
|
||||||
browser_info = profiler.get_builtin_browser_info()
|
browser_config = BrowserConfig(browser_mode="builtin", headless=True, verbose=True)
|
||||||
if browser_info:
|
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||||
print(f"{SUCCESS}Builtin browser found: {browser_info['cdp_url']}{RESET}")
|
|
||||||
# Kill it to start with a clean state
|
# Step 2: Check if we have a BuiltinBrowserStrategy
|
||||||
print(f"{INFO}Killing existing browser...{RESET}")
|
print(f"\n{INFO}2. Checking if we have a BuiltinBrowserStrategy{RESET}")
|
||||||
await profiler.kill_builtin_browser()
|
if isinstance(manager._strategy, BuiltinBrowserStrategy):
|
||||||
browser_info = profiler.get_builtin_browser_info()
|
print(
|
||||||
if not browser_info:
|
f"{SUCCESS}Correct strategy type: {manager._strategy.__class__.__name__}{RESET}"
|
||||||
print(f"{SUCCESS}Browser successfully killed{RESET}")
|
)
|
||||||
else:
|
|
||||||
print(f"{ERROR}Failed to kill browser{RESET}")
|
|
||||||
else:
|
else:
|
||||||
print(f"{WARNING}No builtin browser found{RESET}")
|
print(
|
||||||
|
f"{ERROR}Wrong strategy type: {manager._strategy.__class__.__name__}{RESET}"
|
||||||
|
)
|
||||||
|
return None
|
||||||
|
|
||||||
# Step 2: Launch a new builtin browser
|
# Step 3: Start the manager to launch or connect to builtin browser
|
||||||
print(f"\n{INFO}2. Launching new builtin browser{RESET}")
|
print(f"\n{INFO}3. Starting the browser manager{RESET}")
|
||||||
cdp_url = await profiler.launch_builtin_browser(headless=True)
|
try:
|
||||||
if cdp_url:
|
await manager.start()
|
||||||
print(f"{SUCCESS}Builtin browser launched at: {cdp_url}{RESET}")
|
print(f"{SUCCESS}Browser manager started successfully{RESET}")
|
||||||
else:
|
except Exception as e:
|
||||||
print(f"{ERROR}Failed to launch builtin browser{RESET}")
|
print(f"{ERROR}Failed to start browser manager: {str(e)}{RESET}")
|
||||||
return
|
return None
|
||||||
|
|
||||||
# Step 3: Get and display browser information
|
# Step 4: Get browser info from the strategy
|
||||||
print(f"\n{INFO}3. Getting browser information{RESET}")
|
print(f"\n{INFO}4. Getting browser information{RESET}")
|
||||||
browser_info = profiler.get_builtin_browser_info()
|
browser_info = manager._strategy.get_builtin_browser_info()
|
||||||
if browser_info:
|
if browser_info:
|
||||||
print(f"{SUCCESS}Browser info retrieved:{RESET}")
|
print(f"{SUCCESS}Browser info retrieved:{RESET}")
|
||||||
for key, value in browser_info.items():
|
for key, value in browser_info.items():
|
||||||
if key != 'config': # Skip the verbose config section
|
if key != "config": # Skip the verbose config section
|
||||||
print(f" {key}: {value}")
|
print(f" {key}: {value}")
|
||||||
|
|
||||||
|
cdp_url = browser_info.get("cdp_url")
|
||||||
|
print(f"{SUCCESS}CDP URL: {cdp_url}{RESET}")
|
||||||
else:
|
else:
|
||||||
print(f"{ERROR}Failed to get browser information{RESET}")
|
print(f"{ERROR}Failed to get browser information{RESET}")
|
||||||
|
cdp_url = None
|
||||||
|
|
||||||
# Step 4: Get browser status
|
# Save manager for later tests
|
||||||
print(f"\n{INFO}4. Getting browser status{RESET}")
|
return manager, cdp_url
|
||||||
status = await profiler.get_builtin_browser_status()
|
|
||||||
print(f"Running: {status['running']}")
|
|
||||||
print(f"CDP URL: {status['cdp_url']}")
|
|
||||||
|
|
||||||
# Pause to let the browser run for a moment
|
|
||||||
print(f"\n{INFO}Waiting for 2 seconds...{RESET}")
|
|
||||||
await asyncio.sleep(2)
|
|
||||||
|
|
||||||
return cdp_url # Return the CDP URL for the crawling tests
|
async def test_page_operations(manager: BrowserManager):
|
||||||
|
"""Test page operations with the builtin browser"""
|
||||||
async def test_crawling_with_builtin_browser(cdp_url):
|
print(
|
||||||
"""Test crawling with the builtin browser"""
|
f"\n{INFO}========== Testing Page Operations with Builtin Browser =========={RESET}"
|
||||||
print(f"\n{INFO}========== Testing Crawling with Builtin Browser =========={RESET}")
|
|
||||||
|
|
||||||
# Step 1: Create a crawler with 'builtin' browser mode
|
|
||||||
print(f"\n{INFO}1. Creating crawler with 'builtin' browser mode{RESET}")
|
|
||||||
browser_config = BrowserConfig(
|
|
||||||
browser_mode="builtin",
|
|
||||||
headless=True
|
|
||||||
)
|
)
|
||||||
crawler = AsyncWebCrawler(config=browser_config)
|
|
||||||
|
|
||||||
# Step 2: Test crawling without explicitly starting (should auto-start)
|
# Step 1: Get a single page
|
||||||
print(f"\n{INFO}2. Testing auto-start with arun{RESET}")
|
print(f"\n{INFO}1. Getting a single page{RESET}")
|
||||||
try:
|
try:
|
||||||
result = await crawler.arun("https://crawl4ai.com")
|
crawler_config = CrawlerRunConfig()
|
||||||
print(f"{SUCCESS}Auto-start crawling successful!{RESET}")
|
page, context = await manager.get_page(crawler_config)
|
||||||
print(f" Got {len(result.markdown.raw_markdown)} chars of markdown content")
|
print(f"{SUCCESS}Got page successfully{RESET}")
|
||||||
|
|
||||||
|
# Navigate to a test URL
|
||||||
|
await page.goto("https://example.com")
|
||||||
|
title = await page.title()
|
||||||
|
print(f"{SUCCESS}Page title: {title}{RESET}")
|
||||||
|
|
||||||
|
# Close the page
|
||||||
|
await page.close()
|
||||||
|
print(f"{SUCCESS}Page closed successfully{RESET}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"{ERROR}Auto-start crawling failed: {str(e)}{RESET}")
|
print(f"{ERROR}Page operation failed: {str(e)}{RESET}")
|
||||||
|
return False
|
||||||
|
|
||||||
# Close the crawler
|
# Step 2: Get multiple pages
|
||||||
await crawler.close()
|
print(f"\n{INFO}2. Getting multiple pages with get_pages(){RESET}")
|
||||||
|
|
||||||
# Step 3: Test with explicit start
|
|
||||||
print(f"\n{INFO}3. Testing with explicit start{RESET}")
|
|
||||||
crawler = AsyncWebCrawler(config=browser_config)
|
|
||||||
try:
|
try:
|
||||||
await crawler.start()
|
# Request 3 pages
|
||||||
print(f"{SUCCESS}Explicit start successful!{RESET}")
|
crawler_config = CrawlerRunConfig()
|
||||||
result = await crawler.arun("https://example.com")
|
pages = await manager.get_pages(crawler_config, count=3)
|
||||||
print(f" Got {len(result.markdown.raw_markdown)} chars of markdown content")
|
print(f"{SUCCESS}Got {len(pages)} pages{RESET}")
|
||||||
# Try second time, no start needed
|
|
||||||
print(f"{INFO}Testing second arun call without start{RESET}")
|
|
||||||
result = await crawler.arun("https://example.com")
|
|
||||||
print(f" Got {len(result.markdown.raw_markdown)} chars of markdown content")
|
|
||||||
except Exception as e:
|
|
||||||
print(f"{ERROR}Explicit start crawling failed: {str(e)}{RESET}")
|
|
||||||
|
|
||||||
# Close the crawler
|
# Test each page
|
||||||
await crawler.close()
|
for i, (page, context) in enumerate(pages):
|
||||||
|
await page.goto(f"https://example.com?test={i}")
|
||||||
|
title = await page.title()
|
||||||
|
print(f"{SUCCESS}Page {i + 1} title: {title}{RESET}")
|
||||||
|
await page.close()
|
||||||
|
|
||||||
# Step 4: Test with context manager
|
print(f"{SUCCESS}All pages tested and closed successfully{RESET}")
|
||||||
print(f"\n{INFO}4. Testing with context manager{RESET}")
|
|
||||||
try:
|
|
||||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
|
||||||
result = await crawler.arun("https://httpbin.org/html")
|
|
||||||
print(f"{SUCCESS}Context manager crawling successful!{RESET}")
|
|
||||||
print(f" Got {len(result.markdown.raw_markdown)} chars of markdown content")
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"{ERROR}Context manager crawling failed: {str(e)}{RESET}")
|
print(f"{ERROR}Multiple page operation failed: {str(e)}{RESET}")
|
||||||
|
return False
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
-async def test_crawling_without_builtin_browser():
-    """Test crawling after killing the builtin browser"""
-    print(f"\n{INFO}========== Testing Crawling Without Builtin Browser =========={RESET}")
-
-    # Step 1: Kill the builtin browser
-    print(f"\n{INFO}1. Killing the builtin browser{RESET}")
-    profiler = BrowserProfiler(logger=logger)
-    await profiler.kill_builtin_browser()
-
-    # Step 2: Create a crawler with 'builtin' mode (should fall back to dedicated)
-    print(f"\n{INFO}2. Creating crawler with 'builtin' mode (should fall back){RESET}")
-    browser_config = BrowserConfig(
-        browser_mode="builtin",
-        headless=True
-    )
-
-    try:
-        async with AsyncWebCrawler(config=browser_config) as crawler:
-            result = await crawler.arun("https://httpbin.org/get")
-            print(f"{SUCCESS}Fallback to dedicated browser successful!{RESET}")
-            print(f"  Got {len(result.markdown.raw_markdown)} chars of markdown content")
-    except Exception as e:
-        print(f"{ERROR}Fallback crawler failed: {str(e)}{RESET}")
-
-    # Step 3: Test with direct CDP URL
-    print(f"\n{INFO}3. Testing with direct CDP URL connection{RESET}")
-
-    # Launch a standalone browser to get a CDP URL
-    print(f"{INFO}Launching standalone browser...{RESET}")
-    cdp_url = await profiler.launch_standalone_browser(headless=True)
-    if not cdp_url:
-        print(f"{ERROR}Failed to launch standalone browser{RESET}")
-        return
-
-    print(f"{SUCCESS}Got CDP URL: {cdp_url}{RESET}")
-
-    # Create a crawler with the CDP URL
-    browser_config = BrowserConfig(
-        browser_mode="dedicated",
-        cdp_url=cdp_url,
-        use_managed_browser=True,
-        headless=True
-    )
-
-    try:
-        async with AsyncWebCrawler(config=browser_config) as crawler:
-            result = await crawler.arun("https://httpbin.org/ip")
-            print(f"{SUCCESS}Direct CDP URL crawling successful!{RESET}")
-            print(f"  Got {len(result.markdown.raw_markdown)} chars of markdown content")
-    except Exception as e:
-        print(f"{ERROR}Direct CDP URL crawling failed: {str(e)}{RESET}")
-
-    return True
+async def test_browser_status_management(manager: BrowserManager):
+    """Test browser status and management operations"""
+    print(f"\n{INFO}========== Testing Browser Status and Management =========={RESET}")
+
+    # Step 1: Get browser status
+    print(f"\n{INFO}1. Getting browser status{RESET}")
+    try:
+        status = await manager._strategy.get_builtin_browser_status()
+        print(f"{SUCCESS}Browser status:{RESET}")
+        print(f"  Running: {status['running']}")
+        print(f"  CDP URL: {status['cdp_url']}")
+    except Exception as e:
+        print(f"{ERROR}Failed to get browser status: {str(e)}{RESET}")
+        return False
+
+    # Step 2: Test killing the browser
+    print(f"\n{INFO}2. Testing killing the browser{RESET}")
+    try:
+        result = await manager._strategy.kill_builtin_browser()
+        if result:
+            print(f"{SUCCESS}Browser killed successfully{RESET}")
+        else:
+            print(f"{ERROR}Failed to kill browser{RESET}")
+    except Exception as e:
+        print(f"{ERROR}Browser kill operation failed: {str(e)}{RESET}")
+        return False
+
+    # Step 3: Check status after kill
+    print(f"\n{INFO}3. Checking status after kill{RESET}")
+    try:
+        status = await manager._strategy.get_builtin_browser_status()
+        if not status["running"]:
+            print(f"{SUCCESS}Browser is correctly reported as not running{RESET}")
+        else:
+            print(f"{ERROR}Browser is incorrectly reported as still running{RESET}")
+    except Exception as e:
+        print(f"{ERROR}Failed to get browser status: {str(e)}{RESET}")
+        return False
+
+    # Step 4: Launch a new browser
+    print(f"\n{INFO}4. Launching a new browser{RESET}")
+    try:
+        cdp_url = await manager._strategy.launch_builtin_browser(
+            browser_type="chromium", headless=True
+        )
+        if cdp_url:
+            print(f"{SUCCESS}New browser launched at: {cdp_url}{RESET}")
+        else:
+            print(f"{ERROR}Failed to launch new browser{RESET}")
+            return False
+    except Exception as e:
+        print(f"{ERROR}Browser launch failed: {str(e)}{RESET}")
+        return False
+
+    return True
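Condensed, the lifecycle the test above walks through (status → kill → status → relaunch) can be expressed as one helper; a sketch that reuses the same private _strategy accessors this test file relies on:

async def recycle_builtin_browser(manager: BrowserManager):
    # Kill any running builtin browser, then launch a fresh one and
    # return its CDP URL (None on failure, mirroring the test above).
    status = await manager._strategy.get_builtin_browser_status()
    if status["running"]:
        await manager._strategy.kill_builtin_browser()
    return await manager._strategy.launch_builtin_browser(
        browser_type="chromium", headless=True
    )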
async def test_multiple_managers():
    """Test creating multiple BrowserManagers that use the same builtin browser"""
    print(f"\n{INFO}========== Testing Multiple Browser Managers =========={RESET}")

    # Step 1: Create first manager
    print(f"\n{INFO}1. Creating first browser manager{RESET}")
    browser_config1 = BrowserConfig(browser_mode="builtin", headless=True)
    manager1 = BrowserManager(browser_config=browser_config1, logger=logger)

    # Step 2: Create second manager
    print(f"\n{INFO}2. Creating second browser manager{RESET}")
    browser_config2 = BrowserConfig(browser_mode="builtin", headless=True)
    manager2 = BrowserManager(browser_config=browser_config2, logger=logger)

    # Step 3: Start both managers (should connect to the same builtin browser)
    print(f"\n{INFO}3. Starting both managers{RESET}")
    try:
        await manager1.start()
        print(f"{SUCCESS}First manager started{RESET}")

        await manager2.start()
        print(f"{SUCCESS}Second manager started{RESET}")

        # Check if they got the same CDP URL
        cdp_url1 = manager1._strategy.config.cdp_url
        cdp_url2 = manager2._strategy.config.cdp_url

        if cdp_url1 == cdp_url2:
            print(
                f"{SUCCESS}Both managers connected to the same browser: {cdp_url1}{RESET}"
            )
        else:
            print(
                f"{WARNING}Managers connected to different browsers: {cdp_url1} and {cdp_url2}{RESET}"
            )
    except Exception as e:
        print(f"{ERROR}Failed to start managers: {str(e)}{RESET}")
        return False

    # Step 4: Test using both managers
    print(f"\n{INFO}4. Testing operations with both managers{RESET}")
    try:
        # First manager creates a page
        page1, ctx1 = await manager1.get_page(CrawlerRunConfig())
        await page1.goto("https://example.com")
        title1 = await page1.title()
        print(f"{SUCCESS}Manager 1 page title: {title1}{RESET}")

        # Second manager creates a page
        page2, ctx2 = await manager2.get_page(CrawlerRunConfig())
        await page2.goto("https://example.org")
        title2 = await page2.title()
        print(f"{SUCCESS}Manager 2 page title: {title2}{RESET}")

        # Clean up
        await page1.close()
        await page2.close()
    except Exception as e:
        print(f"{ERROR}Failed to use both managers: {str(e)}{RESET}")
        return False

    # Step 5: Close both managers
    print(f"\n{INFO}5. Closing both managers{RESET}")
    try:
        await manager1.close()
        print(f"{SUCCESS}First manager closed{RESET}")

        await manager2.close()
        print(f"{SUCCESS}Second manager closed{RESET}")
    except Exception as e:
        print(f"{ERROR}Failed to close managers: {str(e)}{RESET}")
        return False

    return True
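The shared-browser check in step 3 above can be reused as an assertion; a small sketch assuming both managers have already been started in builtin mode:

def assert_shared_builtin_browser(manager1: BrowserManager, manager2: BrowserManager):
    # Both managers should have resolved to the same CDP endpoint.
    url1 = manager1._strategy.config.cdp_url
    url2 = manager2._strategy.config.cdp_url
    assert url1 == url2, f"Expected a shared browser, got {url1} and {url2}"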
-async def test_edge_cases():
-    """Test edge cases like multiple starts, killing browser during crawl, etc."""
-    print(f"\n{INFO}========== Testing Edge Cases =========={RESET}")
-
-    # Step 1: Launch the builtin browser if it doesn't exist
-    print(f"\n{INFO}1. Ensuring builtin browser exists{RESET}")
-    profiler = BrowserProfiler(logger=logger)
-    browser_info = profiler.get_builtin_browser_info()
-    if not browser_info:
-        cdp_url = await profiler.launch_builtin_browser(headless=True)
-        if cdp_url:
-            print(f"{SUCCESS}Builtin browser launched at: {cdp_url}{RESET}")
-        else:
-            print(f"{ERROR}Failed to launch builtin browser{RESET}")
-            return
-    else:
-        print(f"{SUCCESS}Using existing builtin browser: {browser_info['cdp_url']}{RESET}")
-
-    # Step 2: Test multiple starts with the same crawler
-    print(f"\n{INFO}2. Testing multiple starts with the same crawler{RESET}")
-    browser_config = BrowserConfig(browser_mode="builtin", headless=True)
-    crawler = AsyncWebCrawler(config=browser_config)
-
-    await crawler.start()
-    print(f"{SUCCESS}First start successful!{RESET}")
-
-    try:
-        await crawler.start()
-        print(f"{SUCCESS}Second start didn't cause errors!{RESET}")
-    except Exception as e:
-        print(f"{ERROR}Second start failed: {str(e)}{RESET}")
-
-    # Run a crawl to verify functionality
-    try:
-        result = await crawler.arun("https://httpbin.org/user-agent")
-        print(f"{SUCCESS}Crawling after multiple starts successful!{RESET}")
-        print(f"  Got {len(result.markdown.raw_markdown)} chars of markdown content")
-    except Exception as e:
-        print(f"{ERROR}Crawling after multiple starts failed: {str(e)}{RESET}")
-
-    await crawler.close()
-
-    # Step 3: Test killing browser while crawler is active
-    print(f"\n{INFO}3. Testing killing browser while crawler is active{RESET}")
-
-    # Create and start a crawler
-    browser_config = BrowserConfig(browser_mode="builtin", headless=True)
-    crawler = AsyncWebCrawler(config=browser_config)
-    await crawler.start()
-
-    # Kill the browser
-    print(f"{INFO}Killing the browser...{RESET}")
-    await profiler.kill_builtin_browser()
-
-    # Try to crawl (should fail)
-    try:
-        result = await crawler.arun("https://httpbin.org/get")
-        print(f"{WARNING}Crawling succeeded despite killed browser!{RESET}")
-    except Exception as e:
-        print(f"{SUCCESS}Crawling failed as expected: {str(e)}{RESET}")
-
-    await crawler.close()
-
-    return True
+async def test_edge_cases():
+    """Test edge cases like multiple starts, killing browser during operations, etc."""
+    print(f"\n{INFO}========== Testing Edge Cases =========={RESET}")
+
+    # Step 1: Test multiple starts with the same manager
+    print(f"\n{INFO}1. Testing multiple starts with the same manager{RESET}")
+    browser_config = BrowserConfig(browser_mode="builtin", headless=True)
+    manager = BrowserManager(browser_config=browser_config, logger=logger)
+
+    try:
+        await manager.start()
+        print(f"{SUCCESS}First start successful{RESET}")
+
+        # Try to start again
+        await manager.start()
+        print(f"{SUCCESS}Second start completed without errors{RESET}")
+
+        # Test if it's still functional
+        page, context = await manager.get_page(CrawlerRunConfig())
+        await page.goto("https://example.com")
+        title = await page.title()
+        print(
+            f"{SUCCESS}Page operations work after multiple starts. Title: {title}{RESET}"
+        )
+        await page.close()
+    except Exception as e:
+        print(f"{ERROR}Multiple starts test failed: {str(e)}{RESET}")
+        return False
+    finally:
+        await manager.close()
+
+    # Step 2: Test killing the browser while manager is active
+    print(f"\n{INFO}2. Testing killing the browser while manager is active{RESET}")
+    manager = BrowserManager(browser_config=browser_config, logger=logger)
+
+    try:
+        await manager.start()
+        print(f"{SUCCESS}Manager started{RESET}")
+
+        # Kill the browser directly
+        print(f"{INFO}Killing the browser...{RESET}")
+        await manager._strategy.kill_builtin_browser()
+        print(f"{SUCCESS}Browser killed{RESET}")
+
+        # Try to get a page (should fail or launch a new browser)
+        try:
+            page, context = await manager.get_page(CrawlerRunConfig())
+            print(
+                f"{WARNING}Page request succeeded despite killed browser (might have auto-restarted){RESET}"
+            )
+            title = await page.title()
+            print(f"{SUCCESS}Got page title: {title}{RESET}")
+            await page.close()
+        except Exception as e:
+            print(
+                f"{SUCCESS}Page request failed as expected after browser was killed: {str(e)}{RESET}"
+            )
+    except Exception as e:
+        print(f"{ERROR}Kill during operation test failed: {str(e)}{RESET}")
+        return False
+    finally:
+        await manager.close()
+
+    return True
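The kill-while-active case above notes that get_page() may fail or auto-restart; a retry wrapper is one way a caller could smooth that over. This is a sketch of the idea only, not behavior this commit implements:

async def get_page_with_restart(manager: BrowserManager, config: CrawlerRunConfig, retries: int = 1):
    # Retry after restarting the manager if the underlying browser has gone away.
    for attempt in range(retries + 1):
        try:
            return await manager.get_page(config)
        except Exception:
            if attempt == retries:
                raise
            await manager.close()
            await manager.start()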
async def cleanup_browsers():
    """Clean up any remaining builtin browsers"""
    print(f"\n{INFO}========== Cleaning Up Builtin Browsers =========={RESET}")

    browser_config = BrowserConfig(browser_mode="builtin", headless=True)
    manager = BrowserManager(browser_config=browser_config, logger=logger)

    try:
        # No need to start, just access the strategy directly
        strategy = manager._strategy
        if isinstance(strategy, BuiltinBrowserStrategy):
            result = await strategy.kill_builtin_browser()
            if result:
                print(f"{SUCCESS}Successfully killed all builtin browsers{RESET}")
            else:
                print(f"{WARNING}No builtin browsers found to kill{RESET}")
        else:
            print(f"{ERROR}Wrong strategy type: {strategy.__class__.__name__}{RESET}")
    except Exception as e:
        print(f"{ERROR}Cleanup failed: {str(e)}{RESET}")
    finally:
        # Just to be safe
        try:
            await manager.close()
        except:
            pass
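If cleanup_browsers() is meant to run even when a test script dies early, it could be registered as an exit hook; a sketch under the assumption that no event loop is still running at interpreter exit:

import atexit

def _cleanup_on_exit():
    # Best-effort teardown of any builtin browsers left behind.
    try:
        asyncio.run(cleanup_browsers())
    except Exception:
        pass

atexit.register(_cleanup_on_exit)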
async def test_performance_scaling():
    """Test performance with multiple browsers and pages.

    This test creates multiple browsers on different ports,
    spawns multiple pages per browser, and measures performance metrics.
    """
    print(f"\n{INFO}========== Testing Performance Scaling =========={RESET}")

    # Configuration parameters
    num_browsers = 10
    pages_per_browser = 10
    total_pages = num_browsers * pages_per_browser
    base_port = 9222

    # Set up a measuring mechanism for memory
    import psutil
    import gc

    # Force garbage collection before starting
    gc.collect()
    process = psutil.Process()
    initial_memory = process.memory_info().rss / 1024 / 1024  # in MB
    peak_memory = initial_memory

    # Report initial configuration
    print(
        f"{INFO}Test configuration: {num_browsers} browsers × {pages_per_browser} pages = {total_pages} total crawls{RESET}"
    )

    # List to track managers
    managers: List[BrowserManager] = []
    all_pages = []

    # Get crawl4ai home directory
    crawl4ai_home = os.path.expanduser("~/.crawl4ai")
    temp_dir = os.path.join(crawl4ai_home, "temp")
    os.makedirs(temp_dir, exist_ok=True)

    # Create all managers but don't start them yet
    manager_configs = []
    for i in range(num_browsers):
        port = base_port + i
        browser_config = BrowserConfig(
            browser_mode="builtin",
            headless=True,
            debugging_port=port,
            user_data_dir=os.path.join(temp_dir, f"browser_profile_{i}"),
        )
        manager = BrowserManager(browser_config=browser_config, logger=logger)
        manager._strategy.shutting_down = True
        manager_configs.append((manager, i, port))

    # Define async function to start a single manager
    async def start_manager(manager, index, port):
        try:
            await manager.start()
            return manager
        except Exception as e:
            print(
                f"{ERROR}Failed to start browser {index + 1} on port {port}: {str(e)}{RESET}"
            )
            return None

    # Start all managers in parallel
    start_tasks = [
        start_manager(manager, i, port) for manager, i, port in manager_configs
    ]
    started_managers = await asyncio.gather(*start_tasks)

    # Filter out None values (failed starts) and add to managers list
    managers = [m for m in started_managers if m is not None]

    if len(managers) == 0:
        print(f"{ERROR}All browser managers failed to start. Aborting test.{RESET}")
        return False

    if len(managers) < num_browsers:
        print(
            f"{WARNING}Only {len(managers)} out of {num_browsers} browser managers started successfully{RESET}"
        )

    # Create pages for each browser
    for i, manager in enumerate(managers):
        try:
            pages = await manager.get_pages(CrawlerRunConfig(), count=pages_per_browser)
            all_pages.extend(pages)
        except Exception as e:
            print(f"{ERROR}Failed to create pages for browser {i + 1}: {str(e)}{RESET}")

    # Check memory after page creation
    gc.collect()
    current_memory = process.memory_info().rss / 1024 / 1024
    peak_memory = max(peak_memory, current_memory)

    # Ask for confirmation before loading
    confirmation = input(
        f"{WARNING}Do you want to proceed with loading pages? (y/n): {RESET}"
    )
    start_time = time.time()

    if confirmation.lower() == "y":
        load_start_time = time.time()

        # Function to load a single page
        async def load_page(page_ctx, index):
            page, _ = page_ctx
            try:
                await page.goto(f"https://example.com/page{index}", timeout=30000)
                title = await page.title()
                return title
            except Exception as e:
                return f"Error: {str(e)}"

        # Load all pages concurrently
        load_tasks = [load_page(page_ctx, i) for i, page_ctx in enumerate(all_pages)]
        load_results = await asyncio.gather(*load_tasks, return_exceptions=True)

        # Count successes and failures
        successes = sum(
            1 for r in load_results if isinstance(r, str) and not r.startswith("Error")
        )
        failures = len(load_results) - successes

        load_time = time.time() - load_start_time
        total_test_time = time.time() - start_time

        # Check memory after loading (peak memory)
        gc.collect()
        current_memory = process.memory_info().rss / 1024 / 1024
        peak_memory = max(peak_memory, current_memory)

        # Calculate key metrics
        memory_per_page = peak_memory / successes if successes > 0 else 0
        time_per_crawl = total_test_time / successes if successes > 0 else 0
        crawls_per_second = successes / total_test_time if total_test_time > 0 else 0
        crawls_per_minute = crawls_per_second * 60
        crawls_per_hour = crawls_per_minute * 60

        # Print simplified performance summary
        from rich.console import Console
        from rich.table import Table

        console = Console()

        # Create a simple summary table
        table = Table(title="CRAWL4AI PERFORMANCE SUMMARY")

        table.add_column("Metric", style="cyan")
        table.add_column("Value", style="green")

        table.add_row("Total Crawls Completed", f"{successes}")
        table.add_row("Total Time", f"{total_test_time:.2f} seconds")
        table.add_row("Time Per Crawl", f"{time_per_crawl:.2f} seconds")
        table.add_row("Crawling Speed", f"{crawls_per_second:.2f} crawls/second")
        table.add_row("Projected Rate (1 minute)", f"{crawls_per_minute:.0f} crawls")
        table.add_row("Projected Rate (1 hour)", f"{crawls_per_hour:.0f} crawls")
        table.add_row("Peak Memory Usage", f"{peak_memory:.2f} MB")
        table.add_row("Memory Per Crawl", f"{memory_per_page:.2f} MB")

        # Display the table
        console.print(table)

    # Ask confirmation before cleanup
    confirmation = input(
        f"{WARNING}Do you want to proceed with cleanup? (y/n): {RESET}"
    )
    if confirmation.lower() != "y":
        print(f"{WARNING}Cleanup aborted by user{RESET}")
        return False

    # Close all pages
    for page, _ in all_pages:
        try:
            await page.close()
        except:
            pass

    # Close all managers
    for manager in managers:
        try:
            await manager.close()
        except:
            pass

    # Remove the temp directory
    import shutil

    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)

    return True
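To make the projected-rate rows concrete, a quick worked example with made-up numbers — 80 successful crawls in 20 seconds — using exactly the arithmetic above:

successes, total_test_time = 80, 20.0            # illustrative values only
crawls_per_second = successes / total_test_time  # 4.0
crawls_per_minute = crawls_per_second * 60       # 240
crawls_per_hour = crawls_per_minute * 60         # 14400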
async def test_performance_scaling_lab(num_browsers: int = 10, pages_per_browser: int = 10):
    """Test performance with multiple browsers and pages.

    This test creates multiple browsers on different ports,
    spawns multiple pages per browser, and measures performance metrics.
    """
    print(f"\n{INFO}========== Testing Performance Scaling =========={RESET}")

    # Configuration parameters (num_browsers and pages_per_browser come from the signature)
    total_pages = num_browsers * pages_per_browser
    base_port = 9222

    # Set up a measuring mechanism for memory
    import psutil
    import gc

    # Force garbage collection before starting
    gc.collect()
    process = psutil.Process()
    initial_memory = process.memory_info().rss / 1024 / 1024  # in MB
    peak_memory = initial_memory

    # Report initial configuration
    print(
        f"{INFO}Test configuration: {num_browsers} browsers × {pages_per_browser} pages = {total_pages} total crawls{RESET}"
    )

    # List to track managers
    managers: List[BrowserManager] = []
    all_pages = []

    # Get crawl4ai home directory
    crawl4ai_home = os.path.expanduser("~/.crawl4ai")
    temp_dir = os.path.join(crawl4ai_home, "temp")
    os.makedirs(temp_dir, exist_ok=True)

    # Create all managers but don't start them yet
    manager_configs = []
    for i in range(num_browsers):
        port = base_port + i
        browser_config = BrowserConfig(
            browser_mode="builtin",
            headless=True,
            debugging_port=port,
            user_data_dir=os.path.join(temp_dir, f"browser_profile_{i}"),
        )
        manager = BrowserManager(browser_config=browser_config, logger=logger)
        manager._strategy.shutting_down = True
        manager_configs.append((manager, i, port))

    # Define async function to start a single manager
    async def start_manager(manager, index, port):
        try:
            await manager.start()
            return manager
        except Exception as e:
            print(
                f"{ERROR}Failed to start browser {index + 1} on port {port}: {str(e)}{RESET}"
            )
            return None

    # Start all managers in parallel
    start_tasks = [
        start_manager(manager, i, port) for manager, i, port in manager_configs
    ]
    started_managers = await asyncio.gather(*start_tasks)

    # Filter out None values (failed starts) and add to managers list
    managers = [m for m in started_managers if m is not None]

    if len(managers) == 0:
        print(f"{ERROR}All browser managers failed to start. Aborting test.{RESET}")
        return False

    if len(managers) < num_browsers:
        print(
            f"{WARNING}Only {len(managers)} out of {num_browsers} browser managers started successfully{RESET}"
        )

    # Create pages for each browser
    for i, manager in enumerate(managers):
        try:
            pages = await manager.get_pages(CrawlerRunConfig(), count=pages_per_browser)
            all_pages.extend(pages)
        except Exception as e:
            print(f"{ERROR}Failed to create pages for browser {i + 1}: {str(e)}{RESET}")

    # Check memory after page creation
    gc.collect()
    current_memory = process.memory_info().rss / 1024 / 1024
    peak_memory = max(peak_memory, current_memory)

    # Ask for confirmation before loading
    confirmation = input(
        f"{WARNING}Do you want to proceed with loading pages? (y/n): {RESET}"
    )
    start_time = time.time()

    if confirmation.lower() == "y":
        load_start_time = time.time()

        # Function to load a single page
        async def load_page(page_ctx, index):
            page, _ = page_ctx
            try:
                await page.goto(f"https://example.com/page{index}", timeout=30000)
                title = await page.title()
                return title
            except Exception as e:
                return f"Error: {str(e)}"

        # Load all pages concurrently
        load_tasks = [load_page(page_ctx, i) for i, page_ctx in enumerate(all_pages)]
        load_results = await asyncio.gather(*load_tasks, return_exceptions=True)

        # Count successes and failures
        successes = sum(
            1 for r in load_results if isinstance(r, str) and not r.startswith("Error")
        )
        failures = len(load_results) - successes

        load_time = time.time() - load_start_time
        total_test_time = time.time() - start_time

        # Check memory after loading (peak memory)
        gc.collect()
        current_memory = process.memory_info().rss / 1024 / 1024
        peak_memory = max(peak_memory, current_memory)

        # Calculate key metrics
        memory_per_page = peak_memory / successes if successes > 0 else 0
        time_per_crawl = total_test_time / successes if successes > 0 else 0
        crawls_per_second = successes / total_test_time if total_test_time > 0 else 0
        crawls_per_minute = crawls_per_second * 60
        crawls_per_hour = crawls_per_minute * 60

        # Print simplified performance summary
        from rich.console import Console
        from rich.table import Table

        console = Console()

        # Create a simple summary table
        table = Table(title="CRAWL4AI PERFORMANCE SUMMARY")

        table.add_column("Metric", style="cyan")
        table.add_column("Value", style="green")

        table.add_row("Total Crawls Completed", f"{successes}")
        table.add_row("Total Time", f"{total_test_time:.2f} seconds")
        table.add_row("Time Per Crawl", f"{time_per_crawl:.2f} seconds")
        table.add_row("Crawling Speed", f"{crawls_per_second:.2f} crawls/second")
        table.add_row("Projected Rate (1 minute)", f"{crawls_per_minute:.0f} crawls")
        table.add_row("Projected Rate (1 hour)", f"{crawls_per_hour:.0f} crawls")
        table.add_row("Peak Memory Usage", f"{peak_memory:.2f} MB")
        table.add_row("Memory Per Crawl", f"{memory_per_page:.2f} MB")

        # Display the table
        console.print(table)

    # Ask confirmation before cleanup
    confirmation = input(
        f"{WARNING}Do you want to proceed with cleanup? (y/n): {RESET}"
    )
    if confirmation.lower() != "y":
        print(f"{WARNING}Cleanup aborted by user{RESET}")
        return False

    # Close all pages
    for page, _ in all_pages:
        try:
            await page.close()
        except:
            pass

    # Close all managers
    for manager in managers:
        try:
            await manager.close()
        except:
            pass

    # Remove the temp directory
    import shutil

    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)

    return True
@@ -288,13 +796,13 @@ async def main():
-async def main():
-    """Run all tests"""
-    try:
-        print(f"{INFO}Starting browser_profiler and builtin browser tests{RESET}")
-
-        # Run browser profiler tests
-        cdp_url = await test_browser_profiler()
-
-        # Run crawling tests with builtin browser
-        if cdp_url:
-            await test_crawling_with_builtin_browser(cdp_url)
-
-        # Run tests without builtin browser
-        # await test_crawling_without_builtin_browser()
-
-        # Run edge case tests
-        # await test_edge_cases()
-
-        print(f"\n{SUCCESS}All tests completed!{RESET}")
-    except Exception as e:
-        print(f"\n{ERROR}Test failed with error: {str(e)}{RESET}")
-        import traceback
-        traceback.print_exc()
-    finally:
-        # Clean up: kill any remaining builtin browser
-        print(f"\n{INFO}Cleaning up: killing any remaining builtin browser{RESET}")
-        profiler = BrowserProfiler(logger=logger)
-        await profiler.kill_builtin_browser()
-        print(f"{SUCCESS}Test cleanup complete{RESET}")
+async def main():
+    """Run all tests"""
+    try:
+        print(f"{INFO}Starting builtin browser tests with browser module{RESET}")
+
+        # # Run browser creation test
+        # manager, cdp_url = await test_builtin_browser_creation()
+        # if not manager:
+        #     print(f"{ERROR}Browser creation failed, cannot continue tests{RESET}")
+        #     return
+
+        # # Run page operations test
+        # await test_page_operations(manager)
+
+        # # Run browser status and management test
+        # await test_browser_status_management(manager)
+
+        # # Close manager before multiple manager test
+        # await manager.close()
+
+        # Run multiple managers test
+        # await test_multiple_managers()
+
+        # Run performance scaling test
+        await test_performance_scaling()
+
+        # Run cleanup test
+        # await cleanup_browsers()
+
+        # Run edge cases test
+        # await test_edge_cases()
+
+        print(f"\n{SUCCESS}All tests completed!{RESET}")
+    except Exception as e:
+        print(f"\n{ERROR}Test failed with error: {str(e)}{RESET}")
+        import traceback
+        traceback.print_exc()
+    finally:
+        # Clean up: kill any remaining builtin browsers
+        await cleanup_browsers()
+        print(f"{SUCCESS}Test cleanup complete{RESET}")

if __name__ == "__main__":
    asyncio.run(main())
tests/browser/test_parallel_crawling.py (new file, 902 lines)
@@ -0,0 +1,902 @@
"""
|
||||||
|
Test examples for parallel crawling with the browser module.
|
||||||
|
|
||||||
|
These examples demonstrate the functionality of parallel page creation
|
||||||
|
and serve as functional tests for multi-page crawling performance.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
# Add the project root to Python path if running directly
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||||
|
|
||||||
|
from crawl4ai.browser import BrowserManager
|
||||||
|
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
|
from crawl4ai.async_logger import AsyncLogger
|
||||||
|
|
||||||
|
# Create a logger for clear terminal output
|
||||||
|
logger = AsyncLogger(verbose=True, log_file=None)
|
||||||
|
|
||||||
|
async def test_get_pages_basic():
|
||||||
|
"""Test basic functionality of get_pages method."""
|
||||||
|
logger.info("Testing basic get_pages functionality", tag="TEST")
|
||||||
|
|
||||||
|
browser_config = BrowserConfig(headless=True)
|
||||||
|
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||||
|
|
||||||
|
try:
|
||||||
|
await manager.start()
|
||||||
|
|
||||||
|
# Request 3 pages
|
||||||
|
crawler_config = CrawlerRunConfig()
|
||||||
|
pages = await manager.get_pages(crawler_config, count=3)
|
||||||
|
|
||||||
|
# Verify we got the correct number of pages
|
||||||
|
assert len(pages) == 3, f"Expected 3 pages, got {len(pages)}"
|
||||||
|
|
||||||
|
# Verify each page is valid
|
||||||
|
for i, (page, context) in enumerate(pages):
|
||||||
|
await page.goto("https://example.com")
|
||||||
|
title = await page.title()
|
||||||
|
logger.info(f"Page {i+1} title: {title}", tag="TEST")
|
||||||
|
assert title, f"Page {i+1} has no title"
|
||||||
|
|
||||||
|
await manager.close()
|
||||||
|
logger.success("Basic get_pages test completed successfully", tag="TEST")
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||||
|
try:
|
||||||
|
await manager.close()
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
return False
|
||||||
|
|
||||||
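Because every test in this file returns a bool instead of raising, a single test can be run directly; one possible invocation, assuming this module's imports:

if __name__ == "__main__":
    ok = asyncio.run(test_get_pages_basic())
    sys.exit(0 if ok else 1)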
async def test_parallel_approaches_comparison():
    """Compare two parallel crawling approaches:
    1. Create a page for each URL on-demand (get_page + gather)
    2. Get all pages upfront with get_pages, then use them (get_pages + gather)
    """
    logger.info("Comparing different parallel crawling approaches", tag="TEST")

    urls = [
        "https://example.com/page1",
        "https://crawl4ai.com",
        "https://kidocode.com",
        "https://bbc.com",
        # "https://example.com/page1",
        # "https://example.com/page2",
        # "https://example.com/page3",
        # "https://example.com/page4",
    ]

    browser_config = BrowserConfig(headless=False)
    manager = BrowserManager(browser_config=browser_config, logger=logger)

    try:
        await manager.start()

        # Approach 1: Create a page for each URL on-demand and run in parallel
        logger.info("Testing approach 1: get_page for each URL + gather", tag="TEST")
        start_time = time.time()

        async def fetch_title_approach1(url):
            """Create a new page for each URL, go to the URL, and get title"""
            crawler_config = CrawlerRunConfig(url=url)
            page, context = await manager.get_page(crawler_config)
            try:
                await page.goto(url)
                title = await page.title()
                return title
            finally:
                await page.close()

        # Run fetch_title_approach1 for each URL in parallel
        tasks = [fetch_title_approach1(url) for url in urls]
        approach1_results = await asyncio.gather(*tasks)

        approach1_time = time.time() - start_time
        logger.info(f"Approach 1 time (get_page + gather): {approach1_time:.2f}s", tag="TEST")

        # Approach 2: Get all pages upfront with get_pages, then use them in parallel
        logger.info("Testing approach 2: get_pages upfront + gather", tag="TEST")
        start_time = time.time()

        # Get all pages upfront
        crawler_config = CrawlerRunConfig()
        pages = await manager.get_pages(crawler_config, count=len(urls))

        async def fetch_title_approach2(page_ctx, url):
            """Use a pre-created page to go to URL and get title"""
            page, _ = page_ctx
            try:
                await page.goto(url)
                title = await page.title()
                return title
            finally:
                await page.close()

        # Use the pre-created pages to fetch titles in parallel
        tasks = [fetch_title_approach2(page_ctx, url) for page_ctx, url in zip(pages, urls)]
        approach2_results = await asyncio.gather(*tasks)

        approach2_time = time.time() - start_time
        logger.info(f"Approach 2 time (get_pages + gather): {approach2_time:.2f}s", tag="TEST")

        # Compare results and performance
        speedup = approach1_time / approach2_time if approach2_time > 0 else 0
        if speedup > 1:
            logger.success(f"Approach 2 (get_pages upfront) was {speedup:.2f}x faster", tag="TEST")
        else:
            logger.info(f"Approach 1 (get_page + gather) was {1/speedup:.2f}x faster", tag="TEST")

        # Verify same content was retrieved in both approaches
        assert len(approach1_results) == len(approach2_results), "Result count mismatch"

        # Sort results for comparison since parallel execution might complete in different order
        assert sorted(approach1_results) == sorted(approach2_results), "Results content mismatch"

        await manager.close()
        return True

    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        try:
            await manager.close()
        except:
            pass
        return False
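Stripped of logging and timing, the two approaches compared above reduce to these two shapes; a sketch assuming a started BrowserManager and a list of URL strings:

# Approach 1: each task creates its own page, so page creation is
# interleaved with navigation.
async def crawl_on_demand(manager, urls):
    async def one(url):
        page, _ = await manager.get_page(CrawlerRunConfig(url=url))
        try:
            await page.goto(url)
            return await page.title()
        finally:
            await page.close()
    return await asyncio.gather(*(one(u) for u in urls))

# Approach 2: all pages are created upfront in one call; tasks only navigate.
async def crawl_prewarmed(manager, urls):
    pages = await manager.get_pages(CrawlerRunConfig(), count=len(urls))
    async def one(page_ctx, url):
        page, _ = page_ctx
        try:
            await page.goto(url)
            return await page.title()
        finally:
            await page.close()
    return await asyncio.gather(*(one(p, u) for p, u in zip(pages, urls)))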
async def test_multi_browser_scaling(num_browsers=3, pages_per_browser=5):
    """Test performance with multiple browsers and pages per browser.
    Compares two approaches:
    1. On-demand page creation (get_page + gather)
    2. Pre-created pages (get_pages + gather)
    """
    logger.info(f"Testing multi-browser scaling with {num_browsers} browsers × {pages_per_browser} pages", tag="TEST")

    # Generate test URLs
    total_pages = num_browsers * pages_per_browser
    urls = [f"https://example.com/page_{i}" for i in range(total_pages)]

    # Create browser managers
    managers = []
    base_port = 9222

    try:
        # Start all browsers in parallel
        start_tasks = []
        for i in range(num_browsers):
            browser_config = BrowserConfig(
                headless=True  # Using default browser mode like in test_parallel_approaches_comparison
            )
            manager = BrowserManager(browser_config=browser_config, logger=logger)
            start_tasks.append(manager.start())
            managers.append(manager)

        await asyncio.gather(*start_tasks)

        # Distribute URLs among managers
        urls_per_manager = {}
        for i, manager in enumerate(managers):
            start_idx = i * pages_per_browser
            end_idx = min(start_idx + pages_per_browser, len(urls))
            urls_per_manager[manager] = urls[start_idx:end_idx]

        # Approach 1: Create a page for each URL on-demand and run in parallel
        logger.info("Testing approach 1: get_page for each URL + gather", tag="TEST")
        start_time = time.time()

        async def fetch_title_approach1(manager, url):
            """Create a new page for the URL, go to the URL, and get title"""
            crawler_config = CrawlerRunConfig(url=url)
            page, context = await manager.get_page(crawler_config)
            try:
                await page.goto(url)
                title = await page.title()
                return title
            finally:
                await page.close()

        # Run fetch_title_approach1 for each URL in parallel
        tasks = []
        for manager, manager_urls in urls_per_manager.items():
            for url in manager_urls:
                tasks.append(fetch_title_approach1(manager, url))

        approach1_results = await asyncio.gather(*tasks)

        approach1_time = time.time() - start_time
        logger.info(f"Approach 1 time (get_page + gather): {approach1_time:.2f}s", tag="TEST")

        # Approach 2: Get all pages upfront with get_pages, then use them in parallel
        logger.info("Testing approach 2: get_pages upfront + gather", tag="TEST")
        start_time = time.time()

        # Get all pages upfront for each manager
        all_pages = []
        for manager, manager_urls in urls_per_manager.items():
            crawler_config = CrawlerRunConfig()
            pages = await manager.get_pages(crawler_config, count=len(manager_urls))
            all_pages.extend(zip(pages, manager_urls))

        async def fetch_title_approach2(page_ctx, url):
            """Use a pre-created page to go to URL and get title"""
            page, _ = page_ctx
            try:
                await page.goto(url)
                title = await page.title()
                return title
            finally:
                await page.close()

        # Use the pre-created pages to fetch titles in parallel
        tasks = [fetch_title_approach2(page_ctx, url) for page_ctx, url in all_pages]
        approach2_results = await asyncio.gather(*tasks)

        approach2_time = time.time() - start_time
        logger.info(f"Approach 2 time (get_pages + gather): {approach2_time:.2f}s", tag="TEST")

        # Compare results and performance
        speedup = approach1_time / approach2_time if approach2_time > 0 else 0
        pages_per_second = total_pages / approach2_time

        # Show a simple summary
        logger.info(f"📊 Summary: {num_browsers} browsers × {pages_per_browser} pages = {total_pages} total crawls", tag="TEST")
        logger.info(f"⚡ Performance: {pages_per_second:.1f} pages/second ({pages_per_second*60:.0f} pages/minute)", tag="TEST")
        logger.info(f"🚀 Total crawl time: {approach2_time:.2f} seconds", tag="TEST")

        if speedup > 1:
            logger.success(f"✅ Approach 2 (get_pages upfront) was {speedup:.2f}x faster", tag="TEST")
        else:
            logger.info(f"✅ Approach 1 (get_page + gather) was {1/speedup:.2f}x faster", tag="TEST")

        # Close all managers
        for manager in managers:
            await manager.close()

        return True

    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Clean up
        for manager in managers:
            try:
                await manager.close()
            except:
                pass
        return False
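The grid search below assigns URLs so that browser page counts differ by at most one; the same split, written as a standalone helper for clarity (a sketch, not code from this commit):

def split_urls(urls, num_browsers):
    # The first (len(urls) % num_browsers) browsers receive one extra URL.
    base, extra = divmod(len(urls), num_browsers)
    chunks, start = [], 0
    for i in range(num_browsers):
        size = base + (1 if i < extra else 0)
        chunks.append(urls[start:start + size])
        start += size
    return chunks

# split_urls(list(range(10)), 3) -> [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]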
async def grid_search_optimal_configuration(total_urls=50):
    """Perform a grid search to find the optimal balance between number of browsers and pages per browser.

    This function tests different combinations of browser count and pages per browser,
    while keeping the total number of URLs constant. It measures performance metrics
    for each configuration to find the "sweet spot" that provides the best speed
    with reasonable memory usage.

    Args:
        total_urls: Total number of URLs to crawl (default: 50)
    """
    logger.info(f"=== GRID SEARCH FOR OPTIMAL CRAWLING CONFIGURATION ({total_urls} URLs) ===", tag="TEST")

    # Generate test URLs once
    urls = [f"https://example.com/page_{i}" for i in range(total_urls)]

    # Define grid search configurations
    # We'll use a more flexible approach: test all browser counts from 1 to min(20, total_urls)
    # and distribute pages evenly (some browsers may have 1 more page than others)
    configurations = []

    # Maximum number of browsers to test
    max_browsers_to_test = min(20, total_urls)

    # Try configurations with 1 to max_browsers_to_test browsers
    for num_browsers in range(1, max_browsers_to_test + 1):
        base_pages_per_browser = total_urls // num_browsers
        remainder = total_urls % num_browsers

        # Generate exact page distribution array
        if remainder > 0:
            # First 'remainder' browsers get one more page
            page_distribution = [base_pages_per_browser + 1] * remainder + [base_pages_per_browser] * (num_browsers - remainder)
            pages_distribution = f"{base_pages_per_browser+1} pages × {remainder} browsers, {base_pages_per_browser} pages × {num_browsers - remainder} browsers"
        else:
            # All browsers get the same number of pages
            page_distribution = [base_pages_per_browser] * num_browsers
            pages_distribution = f"{base_pages_per_browser} pages × {num_browsers} browsers"

        # Format the distribution as a tuple string like (4, 4, 3, 3)
        distribution_str = str(tuple(page_distribution))

        configurations.append((num_browsers, base_pages_per_browser, pages_distribution, page_distribution, distribution_str))

    # Track results
    results = []

    # Test each configuration
    for num_browsers, pages_per_browser, pages_distribution, page_distribution, distribution_str in configurations:
        logger.info("-" * 80, tag="TEST")
        logger.info(f"Testing configuration: {num_browsers} browsers with distribution: {distribution_str}", tag="TEST")
        logger.info(f"Details: {pages_distribution}", tag="TEST")
        # Sleep a bit for randomness
        await asyncio.sleep(0.5)

        try:
            # Import psutil for memory tracking
            try:
                import psutil
                process = psutil.Process()
                initial_memory = process.memory_info().rss / (1024 * 1024)  # MB
            except ImportError:
                logger.warning("psutil not available, memory metrics will not be tracked", tag="TEST")
                initial_memory = 0

            # Create and start browser managers
            managers = []
            start_time = time.time()

            # Start all browsers in parallel
            start_tasks = []
            for i in range(num_browsers):
                browser_config = BrowserConfig(
                    headless=True
                )
                manager = BrowserManager(browser_config=browser_config, logger=logger)
                start_tasks.append(manager.start())
                managers.append(manager)

            await asyncio.gather(*start_tasks)
            browser_startup_time = time.time() - start_time

            # Measure memory after browser startup
            if initial_memory > 0:
                browser_memory = process.memory_info().rss / (1024 * 1024) - initial_memory
            else:
                browser_memory = 0

            # Distribute URLs among managers using the exact page distribution
            urls_per_manager = {}
            total_assigned = 0

            for i, manager in enumerate(managers):
                if i < len(page_distribution):
                    # Get the exact number of pages for this browser from our distribution
                    manager_pages = page_distribution[i]

                    # Get the URL slice for this manager
                    start_idx = total_assigned
                    end_idx = start_idx + manager_pages
                    urls_per_manager[manager] = urls[start_idx:end_idx]
                    total_assigned += manager_pages
                else:
                    # If we have more managers than our distribution (should never happen)
                    urls_per_manager[manager] = []

            # Use the more efficient approach (pre-created pages)
            logger.info("Running page crawling test...", tag="TEST")
            crawl_start_time = time.time()

            # Get all pages upfront for each manager
            all_pages = []
            for manager, manager_urls in urls_per_manager.items():
                if not manager_urls:  # Skip managers with no URLs
                    continue
                crawler_config = CrawlerRunConfig()
                pages = await manager.get_pages(crawler_config, count=len(manager_urls))
                all_pages.extend(zip(pages, manager_urls))

            # Measure memory after page creation
            if initial_memory > 0:
                pages_memory = process.memory_info().rss / (1024 * 1024) - browser_memory - initial_memory
            else:
                pages_memory = 0

            # Function to crawl a URL with a pre-created page
            async def fetch_title(page_ctx, url):
                page, _ = page_ctx
                try:
                    await page.goto(url)
                    title = await page.title()
                    return title
                finally:
                    await page.close()

            # Use the pre-created pages to fetch titles in parallel
            tasks = [fetch_title(page_ctx, url) for page_ctx, url in all_pages]
            crawl_results = await asyncio.gather(*tasks)

            crawl_time = time.time() - crawl_start_time
            total_time = time.time() - start_time

            # Final memory measurement
            if initial_memory > 0:
                peak_memory = max(browser_memory + pages_memory, process.memory_info().rss / (1024 * 1024) - initial_memory)
            else:
                peak_memory = 0

            # Close all managers
            for manager in managers:
                await manager.close()

            # Calculate metrics
            pages_per_second = total_urls / crawl_time

            # Store result metrics
            result = {
                "num_browsers": num_browsers,
                "pages_per_browser": pages_per_browser,
                "page_distribution": page_distribution,
                "distribution_str": distribution_str,
                "total_urls": total_urls,
                "browser_startup_time": browser_startup_time,
                "crawl_time": crawl_time,
                "total_time": total_time,
                "browser_memory": browser_memory,
                "pages_memory": pages_memory,
                "peak_memory": peak_memory,
                "pages_per_second": pages_per_second,
                # Calculate efficiency score (higher is better)
                # This balances speed vs memory usage
                "efficiency_score": pages_per_second / (peak_memory + 1) if peak_memory > 0 else pages_per_second,
            }
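            # Worked example for the efficiency score above (illustrative numbers):
            # at 5.0 pages/second with a 400 MB peak, the score is
            # 5.0 / (400 + 1) ≈ 0.0125; doubling speed to 10.0 pages/second at a
            # 1200 MB peak gives 10.0 / 1201 ≈ 0.0083, so the slower but leaner
            # configuration ranks higher. The +1 keeps the divisor non-zero.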
|
results.append(result)
|
||||||
|
|
||||||
|
# Log the results
|
||||||
|
logger.info(f"Browser startup: {browser_startup_time:.2f}s", tag="TEST")
|
||||||
|
logger.info(f"Crawl time: {crawl_time:.2f}s", tag="TEST")
|
||||||
|
logger.info(f"Total time: {total_time:.2f}s", tag="TEST")
|
||||||
|
logger.info(f"Performance: {pages_per_second:.1f} pages/second", tag="TEST")
|
||||||
|
|
||||||
|
if peak_memory > 0:
|
||||||
|
logger.info(f"Browser memory: {browser_memory:.1f}MB", tag="TEST")
|
||||||
|
logger.info(f"Pages memory: {pages_memory:.1f}MB", tag="TEST")
|
||||||
|
logger.info(f"Peak memory: {peak_memory:.1f}MB", tag="TEST")
|
||||||
|
logger.info(f"Efficiency score: {result['efficiency_score']:.6f}", tag="TEST")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error testing configuration: {str(e)}", tag="TEST")
|
||||||
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
|
|
||||||
|
# Clean up
|
||||||
|
for manager in managers:
|
||||||
|
try:
|
||||||
|
await manager.close()
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Print summary of all configurations
|
||||||
|
logger.info("=" * 100, tag="TEST")
|
||||||
|
logger.info("GRID SEARCH RESULTS SUMMARY", tag="TEST")
|
||||||
|
logger.info("=" * 100, tag="TEST")
|
||||||
|
|
||||||
|
# Rank configurations by efficiency score
|
||||||
|
ranked_results = sorted(results, key=lambda x: x["efficiency_score"], reverse=True)
|
||||||
|
|
||||||
|
# Also determine rankings by different metrics
|
||||||
|
fastest = sorted(results, key=lambda x: x["crawl_time"])[0]
|
||||||
|
lowest_memory = sorted(results, key=lambda x: x["peak_memory"] if x["peak_memory"] > 0 else float('inf'))[0]
|
||||||
|
most_efficient = ranked_results[0]
|
||||||
|
|
||||||
|
# Print top performers by category
|
||||||
|
logger.info("🏆 TOP PERFORMERS BY CATEGORY:", tag="TEST")
|
||||||
|
logger.info(f"⚡ Fastest: {fastest['num_browsers']} browsers × ~{fastest['pages_per_browser']} pages " +
|
||||||
|
f"({fastest['crawl_time']:.2f}s, {fastest['pages_per_second']:.1f} pages/s)", tag="TEST")
|
||||||
|
|
||||||
|
if lowest_memory["peak_memory"] > 0:
|
||||||
|
logger.info(f"💾 Lowest memory: {lowest_memory['num_browsers']} browsers × ~{lowest_memory['pages_per_browser']} pages " +
|
||||||
|
f"({lowest_memory['peak_memory']:.1f}MB)", tag="TEST")
|
||||||
|
|
||||||
|
logger.info(f"🌟 Most efficient: {most_efficient['num_browsers']} browsers × ~{most_efficient['pages_per_browser']} pages " +
|
||||||
|
f"(score: {most_efficient['efficiency_score']:.6f})", tag="TEST")
|
||||||
|
|
||||||
|
# Print result table header
|
||||||
|
logger.info("\n📊 COMPLETE RANKING TABLE (SORTED BY EFFICIENCY SCORE):", tag="TEST")
|
||||||
|
logger.info("-" * 120, tag="TEST")
|
||||||
|
|
||||||
|
# Define table header
|
||||||
|
header = f"{'Rank':<5} | {'Browsers':<8} | {'Distribution':<55} | {'Total Time(s)':<12} | {'Speed(p/s)':<12} | {'Memory(MB)':<12} | {'Efficiency':<10} | {'Notes'}"
|
||||||
|
logger.info(header, tag="TEST")
|
||||||
|
logger.info("-" * 120, tag="TEST")
|
||||||
|
|
||||||
|
# Print each configuration in ranked order
|
||||||
|
for rank, result in enumerate(ranked_results, 1):
|
||||||
|
# Add special notes for top performers
|
||||||
|
notes = []
|
||||||
|
if result == fastest:
|
||||||
|
notes.append("⚡ Fastest")
|
||||||
|
if result == lowest_memory:
|
||||||
|
notes.append("💾 Lowest Memory")
|
||||||
|
if result == most_efficient:
|
||||||
|
notes.append("🌟 Most Efficient")
|
||||||
|
|
||||||
|
notes_str = " | ".join(notes) if notes else ""
|
||||||
|
|
||||||
|
# Format memory if available
|
||||||
|
memory_str = f"{result['peak_memory']:.1f}" if result['peak_memory'] > 0 else "N/A"
|
||||||
|
|
||||||
|
# Get the distribution string
|
||||||
|
dist_str = result.get('distribution_str', str(tuple([result['pages_per_browser']] * result['num_browsers'])))

        # Build the row
        row = f"{rank:<5} | {result['num_browsers']:<8} | {dist_str:<55} | {result['total_time']:.2f}s{' ':<7} | "
        row += f"{result['pages_per_second']:.2f}{' ':<6} | {memory_str}{' ':<6} | {result['efficiency_score']:.4f}{' ':<4} | {notes_str}"

        logger.info(row, tag="TEST")

    logger.info("-" * 120, tag="TEST")

    # Generate visualization if matplotlib is available
    try:
        import matplotlib.pyplot as plt
        import numpy as np

        # Extract data for plotting from ranked results
        browser_counts = [r["num_browsers"] for r in ranked_results]
        efficiency_scores = [r["efficiency_score"] for r in ranked_results]
        crawl_times = [r["crawl_time"] for r in ranked_results]
        total_times = [r["total_time"] for r in ranked_results]

        # Filter results with memory data
        memory_results = [r for r in ranked_results if r["peak_memory"] > 0]
        memory_browser_counts = [r["num_browsers"] for r in memory_results]
        peak_memories = [r["peak_memory"] for r in memory_results]

        # Create figure with clean design
        plt.figure(figsize=(14, 12), facecolor='white')
        plt.style.use('ggplot')

        # Create grid for subplots
        gs = plt.GridSpec(3, 1, height_ratios=[1, 1, 1], hspace=0.3)

        # Plot 1: Efficiency Score (higher is better)
        ax1 = plt.subplot(gs[0])
        bar_colors = ['#3498db'] * len(browser_counts)

        # Highlight the most efficient
        most_efficient_idx = browser_counts.index(most_efficient["num_browsers"])
        bar_colors[most_efficient_idx] = '#e74c3c'  # Red for most efficient

        bars = ax1.bar(range(len(browser_counts)), efficiency_scores, color=bar_colors)
        ax1.set_xticks(range(len(browser_counts)))
        ax1.set_xticklabels([f"{bc}" for bc in browser_counts], rotation=45)
        ax1.set_xlabel('Number of Browsers')
        ax1.set_ylabel('Efficiency Score (higher is better)')
        ax1.set_title('Browser Configuration Efficiency (higher is better)')

        # Add value labels on top of bars
        for bar, score in zip(bars, efficiency_scores):
            height = bar.get_height()
            ax1.text(bar.get_x() + bar.get_width()/2., height + 0.02*max(efficiency_scores),
                     f'{score:.3f}', ha='center', va='bottom', rotation=90, fontsize=8)

        # Highlight best configuration
        ax1.text(0.02, 0.90, f"🌟 Most Efficient: {most_efficient['num_browsers']} browsers with ~{most_efficient['pages_per_browser']} pages",
                 transform=ax1.transAxes, fontsize=12, verticalalignment='top',
                 bbox=dict(boxstyle='round,pad=0.5', facecolor='yellow', alpha=0.3))

        # Plot 2: Time Performance
        ax2 = plt.subplot(gs[1])

        # Plot both total time and crawl time
        ax2.plot(browser_counts, crawl_times, 'bo-', label='Crawl Time (s)', linewidth=2)
        ax2.plot(browser_counts, total_times, 'go--', label='Total Time (s)', linewidth=2, alpha=0.6)

        # Mark the fastest configuration
        fastest_idx = browser_counts.index(fastest["num_browsers"])
        ax2.plot(browser_counts[fastest_idx], crawl_times[fastest_idx], 'ro', ms=10,
                 label=f'Fastest: {fastest["num_browsers"]} browsers')

        ax2.set_xlabel('Number of Browsers')
        ax2.set_ylabel('Time (seconds)')
        ax2.set_title(f'Time Performance for {total_urls} URLs by Browser Count')
        ax2.grid(True, linestyle='--', alpha=0.7)
        ax2.legend(loc='upper right')

        # Plot pages per second on second y-axis
        pages_per_second = [total_urls/t for t in crawl_times]
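        # Throughput is derived from crawl time alone, e.g. 50 URLs
        # crawled in 10s plots as 5.0 pages/second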
        ax2_twin = ax2.twinx()
        ax2_twin.plot(browser_counts, pages_per_second, 'r^--', label='Pages/second', alpha=0.5)
        ax2_twin.set_ylabel('Pages per second')

        # Add note about the fastest configuration
        ax2.text(0.02, 0.90, f"⚡ Fastest: {fastest['num_browsers']} browsers with ~{fastest['pages_per_browser']} pages" +
                 f"\n {fastest['crawl_time']:.2f}s ({fastest['pages_per_second']:.1f} pages/s)",
                 transform=ax2.transAxes, fontsize=12, verticalalignment='top',
                 bbox=dict(boxstyle='round,pad=0.5', facecolor='lightblue', alpha=0.3))

        # Plot 3: Memory Usage (if available)
        if memory_results:
            ax3 = plt.subplot(gs[2])

            # Prepare data for grouped bar chart
            memory_per_browser = [m/n for m, n in zip(peak_memories, memory_browser_counts)]
            memory_per_page = [m/(n*p) for m, n, p in zip(
                [r["peak_memory"] for r in memory_results],
                [r["num_browsers"] for r in memory_results],
                [r["pages_per_browser"] for r in memory_results])]

            x = np.arange(len(memory_browser_counts))
            width = 0.35

            # Create grouped bars
            ax3.bar(x - width/2, peak_memories, width, label='Total Memory (MB)', color='#9b59b6')
            ax3.bar(x + width/2, memory_per_browser, width, label='Memory per Browser (MB)', color='#3498db')

            # Configure axis
            ax3.set_xticks(x)
            ax3.set_xticklabels([f"{bc}" for bc in memory_browser_counts], rotation=45)
            ax3.set_xlabel('Number of Browsers')
            ax3.set_ylabel('Memory (MB)')
            ax3.set_title('Memory Usage by Browser Configuration')
            ax3.legend(loc='upper left')
            ax3.grid(True, linestyle='--', alpha=0.7)

            # Add second y-axis for memory per page
            ax3_twin = ax3.twinx()
            ax3_twin.plot(x, memory_per_page, 'ro-', label='Memory per Page (MB)')
            ax3_twin.set_ylabel('Memory per Page (MB)')

            # Get lowest memory configuration
            lowest_memory_idx = memory_browser_counts.index(lowest_memory["num_browsers"])

            # Add note about lowest memory configuration
            ax3.text(0.02, 0.90, f"💾 Lowest Memory: {lowest_memory['num_browsers']} browsers with ~{lowest_memory['pages_per_browser']} pages" +
                     f"\n {lowest_memory['peak_memory']:.1f}MB ({lowest_memory['peak_memory']/total_urls:.2f}MB per page)",
                     transform=ax3.transAxes, fontsize=12, verticalalignment='top',
                     bbox=dict(boxstyle='round,pad=0.5', facecolor='lightgreen', alpha=0.3))

        # Add overall title
        plt.suptitle(f'Browser Scaling Grid Search Results for {total_urls} URLs', fontsize=16, y=0.98)

        # Add timestamp and info at the bottom
        plt.figtext(0.5, 0.01, f"Generated by Crawl4AI at {time.strftime('%Y-%m-%d %H:%M:%S')}",
                    ha="center", fontsize=10, style='italic')

        # Get current directory and save the figure there
        import os
        __current_file = os.path.abspath(__file__)
        current_dir = os.path.dirname(__current_file)
        output_file = os.path.join(current_dir, 'browser_scaling_grid_search.png')

        # Adjust layout and save figure with high DPI
        plt.tight_layout(rect=[0, 0.03, 1, 0.97])
        plt.savefig(output_file, dpi=200, bbox_inches='tight')
        logger.success(f"Visualization saved to {output_file}", tag="TEST")

    except ImportError:
        logger.warning("matplotlib not available, skipping visualization", tag="TEST")

    return most_efficient["num_browsers"], most_efficient["pages_per_browser"]


async def find_optimal_browser_config(total_urls=50, verbose=True, rate_limit_delay=0.2):
    """Find optimal browser configuration for crawling a specific number of URLs.

    Args:
        total_urls: Number of URLs to crawl
        verbose: Whether to print progress
        rate_limit_delay: Delay between page loads to avoid rate limiting

    Returns:
        dict: Contains fastest, lowest_memory, and optimal configurations
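
    Example (mirrors run_tests below):
        configs = await find_optimal_browser_config(total_urls=20)
        if configs:
            best = configs["optimal"]
            print(best["browser_count"], best["distribution"])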
    """
    if verbose:
        print(f"\n=== Finding optimal configuration for crawling {total_urls} URLs ===\n")

    # Generate test URLs with timestamp to avoid caching
    timestamp = int(time.time())
    urls = [f"https://example.com/page_{i}?t={timestamp}" for i in range(total_urls)]

    # Limit browser configurations to test (1 browser to max 10)
    max_browsers = min(10, total_urls)
    configs_to_test = []

    # Generate configurations (browser count, pages distribution)
    for num_browsers in range(1, max_browsers + 1):
        base_pages = total_urls // num_browsers
        remainder = total_urls % num_browsers

        # Create distribution array like [3, 3, 2, 2] (some browsers get one more page)
        if remainder > 0:
            distribution = [base_pages + 1] * remainder + [base_pages] * (num_browsers - remainder)
        else:
            distribution = [base_pages] * num_browsers
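        # e.g. total_urls=50 with num_browsers=4: base_pages=12, remainder=2,
        # so the distribution is [13, 13, 12, 12]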

        configs_to_test.append((num_browsers, distribution))

    results = []

    # Test each configuration
    for browser_count, page_distribution in configs_to_test:
        if verbose:
            print(f"Testing {browser_count} browsers with distribution {tuple(page_distribution)}")

        # Created before the try block so the cleanup in the except handler
        # can always iterate it, even if startup fails early
        managers = []
        try:
            # Track memory if possible
            try:
                import psutil
                process = psutil.Process()
                start_memory = process.memory_info().rss / (1024 * 1024)  # MB
            except ImportError:
                if verbose:
                    print("Memory tracking not available (psutil not installed)")
                start_memory = 0

            # Start browsers in parallel
            start_tasks = []
            start_time = time.time()

            for i in range(browser_count):
                config = BrowserConfig(headless=True)
                manager = BrowserManager(browser_config=config, logger=logger)
                start_tasks.append(manager.start())
                managers.append(manager)

            await asyncio.gather(*start_tasks)

            # Distribute URLs among browsers
            urls_per_manager = {}
            url_index = 0

            for i, manager in enumerate(managers):
                pages_for_this_browser = page_distribution[i]
                end_index = url_index + pages_for_this_browser
                urls_per_manager[manager] = urls[url_index:end_index]
                url_index = end_index

            # Create pages for each browser
            all_pages = []
            for manager, manager_urls in urls_per_manager.items():
                if not manager_urls:
                    continue
                pages = await manager.get_pages(CrawlerRunConfig(), count=len(manager_urls))
                all_pages.extend(zip(pages, manager_urls))
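            # get_pages() pre-creates all pages for a browser in one call; each
            # entry is a two-item pair (apparently the page plus its context,
            # judging by the unpacking in crawl_page below)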

            # Crawl pages with delay to avoid rate limiting
            async def crawl_page(page_ctx, url):
                page, _ = page_ctx
                try:
                    await page.goto(url)
                    if rate_limit_delay > 0:
                        await asyncio.sleep(rate_limit_delay)
                    title = await page.title()
                    return title
                finally:
                    await page.close()
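            # The finally clause releases the page even when goto() raises,
            # so a failed URL never strands a browser page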

            crawl_start = time.time()
            crawl_tasks = [crawl_page(page_ctx, url) for page_ctx, url in all_pages]
            await asyncio.gather(*crawl_tasks)
            crawl_time = time.time() - crawl_start
            total_time = time.time() - start_time

            # Measure final memory usage
            if start_memory > 0:
                end_memory = process.memory_info().rss / (1024 * 1024)
                memory_used = end_memory - start_memory
            else:
                memory_used = 0

            # Close all browsers
            for manager in managers:
                await manager.close()

            # Calculate metrics
            pages_per_second = total_urls / crawl_time

            # Calculate efficiency score (higher is better)
            # This balances speed vs memory
            if memory_used > 0:
                efficiency = pages_per_second / (memory_used + 1)
            else:
                efficiency = pages_per_second
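            # e.g. 5.0 pages/s at 400MB scores 5/401 ≈ 0.0125, while 4.0 pages/s
            # at 100MB scores 4/101 ≈ 0.0396; the slower but leaner run wins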

            # Store result
            result = {
                "browser_count": browser_count,
                "distribution": tuple(page_distribution),
                "crawl_time": crawl_time,
                "total_time": total_time,
                "memory_used": memory_used,
                "pages_per_second": pages_per_second,
                "efficiency": efficiency
            }

            results.append(result)

            if verbose:
                print(f" ✓ Crawled {total_urls} pages in {crawl_time:.2f}s ({pages_per_second:.1f} pages/sec)")
                if memory_used > 0:
                    print(f" ✓ Memory used: {memory_used:.1f}MB ({memory_used/total_urls:.1f}MB per page)")
                print(f" ✓ Efficiency score: {efficiency:.4f}")

        except Exception as e:
            if verbose:
                print(f" ✗ Error: {e}")

            # Clean up
            for manager in managers:
                try:
                    await manager.close()
                except Exception:
                    pass

    # If no successful results, return None
    if not results:
        return None

    # Find best configurations
    fastest = sorted(results, key=lambda x: x["crawl_time"])[0]

    # Only consider memory if available
    memory_results = [r for r in results if r["memory_used"] > 0]
    if memory_results:
        lowest_memory = sorted(memory_results, key=lambda x: x["memory_used"])[0]
    else:
        lowest_memory = fastest

    # Find most efficient (balanced speed vs memory)
    optimal = sorted(results, key=lambda x: x["efficiency"], reverse=True)[0]

    # Print summary
    if verbose:
        print("\n=== OPTIMAL CONFIGURATIONS ===")
        print(f"⚡ Fastest: {fastest['browser_count']} browsers {fastest['distribution']}")
        print(f" {fastest['crawl_time']:.2f}s, {fastest['pages_per_second']:.1f} pages/sec")

        print(f"💾 Memory-efficient: {lowest_memory['browser_count']} browsers {lowest_memory['distribution']}")
        if lowest_memory["memory_used"] > 0:
            print(f" {lowest_memory['memory_used']:.1f}MB, {lowest_memory['memory_used']/total_urls:.2f}MB per page")

        print(f"🌟 Balanced optimal: {optimal['browser_count']} browsers {optimal['distribution']}")
        print(f" {optimal['crawl_time']:.2f}s, {optimal['pages_per_second']:.1f} pages/sec, score: {optimal['efficiency']:.4f}")

    return {
        "fastest": fastest,
        "lowest_memory": lowest_memory,
        "optimal": optimal,
        "all_configs": results
    }


async def run_tests():
    """Run all tests sequentially."""
    results = []

    # Find optimal configuration using our utility function
    configs = await find_optimal_browser_config(
        total_urls=20,  # Use a small number for faster testing
        verbose=True,
        rate_limit_delay=0.2  # 200ms delay between page loads to avoid rate limiting
    )

    if configs:
        # Show the optimal configuration
        optimal = configs["optimal"]
        print("\n🎯 Recommended configuration for production use:")
        print(f" {optimal['browser_count']} browsers with distribution {optimal['distribution']}")
        print(f" Estimated performance: {optimal['pages_per_second']:.1f} pages/second")
        results.append(True)
    else:
        print("\n❌ Failed to find optimal configuration")
        results.append(False)

    # Print summary
    total = len(results)
    passed = sum(results)
    print(f"\nTests complete: {passed}/{total} passed")

    if passed == total:
        print("All tests passed!")
    else:
        print(f"{total - passed} tests failed")


if __name__ == "__main__":
    asyncio.run(run_tests())