Enhance the storage state persistence mechanism in the CDP browser strategy by explicitly saving the storage state for each browser context, using a proper file path for the storage state, and removing an unnecessary sleep delay. Also includes test improvements: simplified test configurations in the Playwright tests, and some CDP tests temporarily disabled.
189 lines
6.8 KiB
Python
189 lines
6.8 KiB
Python
"""Browser manager module for Crawl4AI.
|
|
|
|
This module provides a central browser management class that uses the
|
|
strategy pattern internally while maintaining the existing API.
|
|
It also implements a page pooling mechanism for improved performance.
|
|
"""
|
|
|
|
import asyncio
|
|
import time
|
|
from typing import Optional, Tuple, List
|
|
|
|
from playwright.async_api import Page, BrowserContext
|
|
|
|
from ..async_logger import AsyncLogger
|
|
from ..async_configs import BrowserConfig, CrawlerRunConfig
|
|
|
|
from .strategies import (
|
|
BaseBrowserStrategy,
|
|
PlaywrightBrowserStrategy,
|
|
CDPBrowserStrategy,
|
|
BuiltinBrowserStrategy
|
|
)
|
|
|
|
class BrowserManager:
    """Main interface for browser management in Crawl4AI.

    This class maintains backward compatibility with the existing implementation
    while using the strategy pattern internally for different browser types.
    Every operation is delegated to the strategy chosen in ``_create_strategy``;
    the public attributes below are mirrors of the strategy's state that are
    refreshed after ``start``/``get_page``/``get_pages`` and cleared by ``close``.

    Attributes:
        config (BrowserConfig): Configuration object containing all browser settings
        logger: Logger instance for recording events and errors
        browser: The browser instance
        default_context: The default browser context
        managed_browser: The managed browser instance
        playwright: The Playwright instance
        sessions: Dictionary to store session information
        session_ttl: Session timeout in seconds
    """

    def __init__(self, browser_config: Optional[BrowserConfig] = None, logger: Optional[AsyncLogger] = None):
        """Initialize the BrowserManager with a browser configuration.

        Args:
            browser_config: Configuration object containing all browser settings.
                A default ``BrowserConfig()`` is used when omitted (or falsy).
            logger: Logger instance for recording events and errors
        """
        self.config = browser_config or BrowserConfig()
        self.logger = logger

        # Create strategy based on configuration
        self._strategy = self._create_strategy()

        # Initialize state variables for compatibility with existing code
        self.browser = None
        self.default_context = None
        self.managed_browser = None
        self.playwright = None

        # For session management (from existing implementation)
        self.sessions = {}
        self.session_ttl = 1800  # 30 minutes

        # Strong references to in-flight session-cleanup tasks. The event loop
        # only keeps weak references to tasks, so without this set a scheduled
        # cleanup could be garbage-collected before it finishes.
        self._cleanup_tasks = set()

    def _create_strategy(self) -> BaseBrowserStrategy:
        """Create appropriate browser strategy based on configuration.

        ``browser_mode == "builtin"`` is checked first; otherwise a CDP
        strategy is used when ``cdp_url`` or ``use_managed_browser`` is set,
        and the plain Playwright strategy is the fallback.

        Returns:
            BaseBrowserStrategy: The selected browser strategy
        """
        if self.config.browser_mode == "builtin":
            return BuiltinBrowserStrategy(self.config, self.logger)
        if self.config.cdp_url or self.config.use_managed_browser:
            return CDPBrowserStrategy(self.config, self.logger)
        return PlaywrightBrowserStrategy(self.config, self.logger)

    def _sync_sessions(self, *, include_ttl: bool = False) -> None:
        """Mirror the strategy's session registry onto this manager.

        Does nothing when the strategy has no ``sessions`` attribute.

        Args:
            include_ttl: Also copy the strategy's ``session_ttl``.
        """
        if not hasattr(self._strategy, 'sessions'):
            return
        self.sessions = self._strategy.sessions
        if include_ttl:
            self.session_ttl = self._strategy.session_ttl

    async def start(self) -> "BrowserManager":
        """Start the browser instance and set up the default context.

        Returns:
            self: For method chaining
        """
        # Start the strategy
        await self._strategy.start()

        # Update legacy references
        self.browser = self._strategy.browser
        self.default_context = self._strategy.default_context

        # Set browser process reference (for CDP strategy): the strategy
        # object itself is exposed as the managed browser.
        if hasattr(self._strategy, 'browser_process'):
            self.managed_browser = self._strategy

        # Set Playwright reference
        self.playwright = self._strategy.playwright

        # Sync sessions (and their TTL) if the strategy tracks them
        self._sync_sessions(include_ttl=True)

        return self

    async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]:
        """Get a page for the given configuration.

        Args:
            crawlerRunConfig: Configuration object for the crawler run

        Returns:
            Tuple of (Page, BrowserContext)
        """
        # Delegate to strategy
        page, context = await self._strategy.get_page(crawlerRunConfig)

        # Sync sessions if needed
        self._sync_sessions()

        return page, context

    async def get_pages(self, crawlerRunConfig: CrawlerRunConfig, count: int = 1) -> List[Tuple[Page, BrowserContext]]:
        """Get multiple pages with the same configuration.

        This method asks the strategy for several browser pages sharing one
        configuration, which is useful for parallel crawling of multiple URLs.

        Args:
            crawlerRunConfig: Configuration for the pages
            count: Number of pages to create

        Returns:
            List of (Page, Context) tuples
        """
        # Delegate to strategy
        pages = await self._strategy.get_pages(crawlerRunConfig, count)

        # Sync sessions if needed
        self._sync_sessions()

        return pages

    async def kill_session(self, session_id: str):
        """Kill a browser session and clean up resources.

        The strategy's own ``_kill_session`` is used when it has one.
        Otherwise the session is looked up in ``self.sessions``; unknown ids
        are ignored.

        Args:
            session_id: The session ID to kill
        """
        # Handle kill_session via our strategy if it supports it
        if hasattr(self._strategy, '_kill_session'):
            await self._strategy._kill_session(session_id)
            return

        # Remove the entry *before* awaiting anything so that overlapping
        # kill requests for the same id cannot both close the page and then
        # fail with KeyError when deleting the entry a second time.
        session = self.sessions.pop(session_id, None)
        if session is None:
            return

        context, page, _ = session
        await page.close()

        # Only close context if not using CDP
        shares_browser = (
            self.config.use_managed_browser
            or self.config.cdp_url
            or self.config.browser_mode == "builtin"
        )
        if not shares_browser:
            await context.close()

    def _cleanup_expired_sessions(self):
        """Clean up expired sessions based on TTL.

        Delegates to the strategy when it provides its own implementation.
        Otherwise every session whose last-used timestamp is older than
        ``session_ttl`` seconds gets a ``kill_session`` task scheduled on the
        running event loop.

        Raises:
            RuntimeError: If sessions have expired but no event loop is running.
        """
        # Use strategy's implementation if available
        if hasattr(self._strategy, '_cleanup_expired_sessions'):
            self._strategy._cleanup_expired_sessions()
            return

        # Otherwise use our own implementation
        now = time.time()
        expired_sessions = [
            sid
            for sid, (_, _, last_used) in self.sessions.items()
            if now - last_used > self.session_ttl
        ]
        if not expired_sessions:
            return

        # Resolve the loop first: if none is running this raises before any
        # coroutine object is created, so nothing is left un-awaited.
        loop = asyncio.get_running_loop()
        for sid in expired_sessions:
            task = loop.create_task(self.kill_session(sid))
            # Hold a strong reference until the task is done.
            self._cleanup_tasks.add(task)
            task.add_done_callback(self._cleanup_tasks.discard)

    async def close(self):
        """Close the browser and clean up resources.

        The legacy references are reset even when the strategy's ``close``
        raises, so the manager never keeps handles to a browser whose
        shutdown was attempted; the exception still propagates to the caller.
        """
        try:
            # Delegate to strategy
            await self._strategy.close()
        finally:
            # Reset legacy references
            self.browser = None
            self.default_context = None
            self.managed_browser = None
            self.playwright = None
            self.sessions = {}
|