refactor(browser): consolidate browser strategy implementations
Moves common browser functionality into BaseBrowserStrategy class to reduce code duplication and improve maintainability. Key changes: - Adds shared browser argument building and session management to base class - Standardizes storage state handling across strategies - Improves process cleanup and error handling - Consolidates CDP URL management and container lifecycle BREAKING CHANGE: Changes browser_mode="custom" to "cdp" for consistency
This commit is contained in:
@@ -8,6 +8,8 @@ from abc import ABC, abstractmethod
|
||||
import asyncio
|
||||
import json
|
||||
import hashlib
|
||||
import os
|
||||
import time
|
||||
from typing import Optional, Tuple, List
|
||||
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
@@ -16,14 +18,31 @@ from ...async_logger import AsyncLogger
|
||||
from ...async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from ...config import DOWNLOAD_PAGE_TIMEOUT
|
||||
from ...js_snippet import load_js_script
|
||||
from ..utils import get_playwright
|
||||
|
||||
|
||||
class BaseBrowserStrategy(ABC):
|
||||
"""Base class for all browser strategies.
|
||||
|
||||
This abstract class defines the interface that all browser strategies
|
||||
must implement. It handles common functionality like context caching.
|
||||
must implement. It handles common functionality like context caching,
|
||||
browser configuration, and session management.
|
||||
"""
|
||||
|
||||
_playwright_instance = None
|
||||
|
||||
@classmethod
|
||||
async def get_playwright(cls):
|
||||
"""Get or create a shared Playwright instance.
|
||||
|
||||
Returns:
|
||||
Playwright: The shared Playwright instance
|
||||
"""
|
||||
# For now I dont want Singleton pattern for Playwright
|
||||
if cls._playwright_instance is None or True:
|
||||
cls._playwright_instance = await get_playwright()
|
||||
return cls._playwright_instance
|
||||
|
||||
def __init__(self, config: BrowserConfig, logger: Optional[AsyncLogger] = None):
|
||||
"""Initialize the strategy with configuration and logger.
|
||||
|
||||
@@ -35,23 +54,40 @@ class BaseBrowserStrategy(ABC):
|
||||
self.logger = logger
|
||||
self.browser = None
|
||||
self.default_context = None
|
||||
self.contexts_by_config = {}
|
||||
self._contexts_lock = asyncio.Lock()
|
||||
self.playwright = None
|
||||
|
||||
# Context management
|
||||
self.contexts_by_config = {} # config_signature -> context
|
||||
|
||||
self._contexts_lock = asyncio.Lock()
|
||||
|
||||
# Session management
|
||||
self.sessions = {}
|
||||
self.session_ttl = 1800 # 30 minutes default
|
||||
|
||||
# Playwright instance
|
||||
self.playwright = None
|
||||
|
||||
@abstractmethod
|
||||
async def start(self):
|
||||
"""Start the browser.
|
||||
|
||||
This method should be implemented by concrete strategies to initialize
|
||||
the browser in the appropriate way (direct launch, CDP connection, etc.)
|
||||
|
||||
Returns:
|
||||
self: For method chaining
|
||||
"""
|
||||
pass
|
||||
# Base implementation gets the playwright instance
|
||||
self.playwright = await self.get_playwright()
|
||||
return self
|
||||
|
||||
@abstractmethod
|
||||
async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]:
|
||||
"""Get a page with specified configuration.
|
||||
|
||||
This method should be implemented by concrete strategies to create
|
||||
or retrieve a page according to their browser management approach.
|
||||
|
||||
Args:
|
||||
crawlerRunConfig: Crawler run configuration
|
||||
|
||||
@@ -75,15 +111,122 @@ class BaseBrowserStrategy(ABC):
|
||||
page, context = await self.get_page(crawlerRunConfig)
|
||||
pages.append((page, context))
|
||||
return pages
|
||||
|
||||
@abstractmethod
|
||||
async def close(self):
|
||||
"""Close the browser and clean up resources."""
|
||||
pass
|
||||
|
||||
def _build_browser_args(self) -> dict:
|
||||
"""Build browser launch arguments from config.
|
||||
|
||||
Returns:
|
||||
dict: Browser launch arguments for Playwright
|
||||
"""
|
||||
# Define common browser arguments that improve performance and stability
|
||||
args = [
|
||||
"--disable-gpu",
|
||||
"--disable-gpu-compositing",
|
||||
"--disable-software-rasterizer",
|
||||
"--no-sandbox",
|
||||
"--disable-dev-shm-usage",
|
||||
"--no-first-run",
|
||||
"--no-default-browser-check",
|
||||
"--disable-infobars",
|
||||
"--window-position=0,0",
|
||||
"--ignore-certificate-errors",
|
||||
"--ignore-certificate-errors-spki-list",
|
||||
"--disable-blink-features=AutomationControlled",
|
||||
"--window-position=400,0",
|
||||
"--disable-renderer-backgrounding",
|
||||
"--disable-ipc-flooding-protection",
|
||||
"--force-color-profile=srgb",
|
||||
"--mute-audio",
|
||||
"--disable-background-timer-throttling",
|
||||
f"--window-size={self.config.viewport_width},{self.config.viewport_height}",
|
||||
]
|
||||
|
||||
# Define browser disable options for light mode
|
||||
browser_disable_options = [
|
||||
"--disable-background-networking",
|
||||
"--disable-background-timer-throttling",
|
||||
"--disable-backgrounding-occluded-windows",
|
||||
"--disable-breakpad",
|
||||
"--disable-client-side-phishing-detection",
|
||||
"--disable-component-extensions-with-background-pages",
|
||||
"--disable-default-apps",
|
||||
"--disable-extensions",
|
||||
"--disable-features=TranslateUI",
|
||||
"--disable-hang-monitor",
|
||||
"--disable-ipc-flooding-protection",
|
||||
"--disable-popup-blocking",
|
||||
"--disable-prompt-on-repost",
|
||||
"--disable-sync",
|
||||
"--force-color-profile=srgb",
|
||||
"--metrics-recording-only",
|
||||
"--no-first-run",
|
||||
"--password-store=basic",
|
||||
"--use-mock-keychain",
|
||||
]
|
||||
|
||||
# Apply light mode settings if enabled
|
||||
if self.config.light_mode:
|
||||
args.extend(browser_disable_options)
|
||||
|
||||
# Apply text mode settings if enabled (disables images, JS, etc)
|
||||
if self.config.text_mode:
|
||||
args.extend([
|
||||
"--blink-settings=imagesEnabled=false",
|
||||
"--disable-remote-fonts",
|
||||
"--disable-images",
|
||||
"--disable-javascript",
|
||||
"--disable-software-rasterizer",
|
||||
"--disable-dev-shm-usage",
|
||||
])
|
||||
|
||||
# Add any extra arguments from the config
|
||||
if self.config.extra_args:
|
||||
args.extend(self.config.extra_args)
|
||||
|
||||
# Build the core browser args dictionary
|
||||
browser_args = {"headless": self.config.headless, "args": args}
|
||||
|
||||
# Add chrome channel if specified
|
||||
if self.config.chrome_channel:
|
||||
browser_args["channel"] = self.config.chrome_channel
|
||||
|
||||
# Configure downloads
|
||||
if self.config.accept_downloads:
|
||||
browser_args["downloads_path"] = self.config.downloads_path or os.path.join(
|
||||
os.getcwd(), "downloads"
|
||||
)
|
||||
os.makedirs(browser_args["downloads_path"], exist_ok=True)
|
||||
|
||||
# Check for user data directory
|
||||
if self.config.user_data_dir:
|
||||
# Ensure the directory exists
|
||||
os.makedirs(self.config.user_data_dir, exist_ok=True)
|
||||
browser_args["user_data_dir"] = self.config.user_data_dir
|
||||
|
||||
# Configure proxy settings
|
||||
if self.config.proxy or self.config.proxy_config:
|
||||
from playwright.async_api import ProxySettings
|
||||
|
||||
proxy_settings = (
|
||||
ProxySettings(server=self.config.proxy)
|
||||
if self.config.proxy
|
||||
else ProxySettings(
|
||||
server=self.config.proxy_config.server,
|
||||
username=self.config.proxy_config.username,
|
||||
password=self.config.proxy_config.password,
|
||||
)
|
||||
)
|
||||
browser_args["proxy"] = proxy_settings
|
||||
|
||||
return browser_args
|
||||
|
||||
def _make_config_signature(self, crawlerRunConfig: CrawlerRunConfig) -> str:
|
||||
"""Create a signature hash from configuration for context caching.
|
||||
|
||||
Converts the crawlerRunConfig into a dict, excludes ephemeral fields,
|
||||
then returns a hash of the sorted JSON. This yields a stable signature
|
||||
that identifies configurations requiring a unique browser context.
|
||||
|
||||
Args:
|
||||
crawlerRunConfig: Crawler run configuration
|
||||
|
||||
@@ -157,6 +300,7 @@ class BaseBrowserStrategy(ABC):
|
||||
"viewport": viewport_settings,
|
||||
"proxy": proxy_settings,
|
||||
"accept_downloads": self.config.accept_downloads,
|
||||
"storage_state": self.config.storage_state,
|
||||
"ignore_https_errors": self.config.ignore_https_errors,
|
||||
"device_scale_factor": 1.0,
|
||||
"java_script_enabled": self.config.java_script_enabled,
|
||||
@@ -167,8 +311,7 @@ class BaseBrowserStrategy(ABC):
|
||||
text_mode_settings = {
|
||||
"has_touch": False,
|
||||
"is_mobile": False,
|
||||
# Disable javascript in text mode
|
||||
"java_script_enabled": False
|
||||
"java_script_enabled": False, # Disable javascript in text mode
|
||||
}
|
||||
# Update context settings with text mode settings
|
||||
context_settings.update(text_mode_settings)
|
||||
@@ -177,16 +320,25 @@ class BaseBrowserStrategy(ABC):
|
||||
|
||||
# Handle storage state properly - this is key for persistence
|
||||
if self.config.storage_state:
|
||||
context_settings["storage_state"] = self.config.storage_state
|
||||
if self.logger:
|
||||
if isinstance(self.config.storage_state, str):
|
||||
self.logger.debug(f"Using storage state from file: {self.config.storage_state}", tag="BROWSER")
|
||||
else:
|
||||
self.logger.debug("Using storage state from config object", tag="BROWSER")
|
||||
|
||||
# If user_data_dir is specified, browser persistence should be automatic
|
||||
if self.config.user_data_dir and self.logger:
|
||||
self.logger.debug(f"Using user data directory: {self.config.user_data_dir}", tag="BROWSER")
|
||||
|
||||
if self.config.user_data_dir:
|
||||
# For CDP-based browsers, storage persistence is typically handled by the user_data_dir
|
||||
# at the browser level, but we'll create a storage_state location for Playwright as well
|
||||
storage_path = os.path.join(self.config.user_data_dir, "storage_state.json")
|
||||
if not os.path.exists(storage_path):
|
||||
# Create parent directory if it doesn't exist
|
||||
os.makedirs(os.path.dirname(storage_path), exist_ok=True)
|
||||
with open(storage_path, "w") as f:
|
||||
json.dump({}, f)
|
||||
self.config.storage_state = storage_path
|
||||
|
||||
if self.logger:
|
||||
self.logger.debug(f"Using user data directory: {self.config.user_data_dir}", tag="BROWSER")
|
||||
|
||||
# Apply crawler-specific configurations if provided
|
||||
if crawlerRunConfig:
|
||||
@@ -227,12 +379,19 @@ class BaseBrowserStrategy(ABC):
|
||||
context: The browser context to set up
|
||||
crawlerRunConfig: Configuration object containing all browser settings
|
||||
"""
|
||||
# Set HTTP headers
|
||||
if self.config.headers:
|
||||
await context.set_extra_http_headers(self.config.headers)
|
||||
|
||||
# Add cookies
|
||||
if self.config.cookies:
|
||||
await context.add_cookies(self.config.cookies)
|
||||
|
||||
# Apply storage state if provided
|
||||
if self.config.storage_state:
|
||||
await context.storage_state(path=None)
|
||||
|
||||
# Configure downloads
|
||||
if self.config.accept_downloads:
|
||||
context.set_default_timeout(DOWNLOAD_PAGE_TIMEOUT)
|
||||
context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT)
|
||||
@@ -250,12 +409,13 @@ class BaseBrowserStrategy(ABC):
|
||||
await context.set_extra_http_headers(combined_headers)
|
||||
|
||||
# Add default cookie
|
||||
target_url = (crawlerRunConfig and crawlerRunConfig.url) or "https://crawl4ai.com/"
|
||||
await context.add_cookies(
|
||||
[
|
||||
{
|
||||
"name": "cookiesEnabled",
|
||||
"value": "true",
|
||||
"url": crawlerRunConfig and crawlerRunConfig.url or "https://crawl4ai.com/",
|
||||
"url": target_url,
|
||||
}
|
||||
]
|
||||
)
|
||||
@@ -268,3 +428,150 @@ class BaseBrowserStrategy(ABC):
|
||||
or crawlerRunConfig.magic
|
||||
):
|
||||
await context.add_init_script(load_js_script("navigator_overrider"))
|
||||
|
||||
async def kill_session(self, session_id: str):
|
||||
"""Kill a browser session and clean up resources.
|
||||
|
||||
Args:
|
||||
session_id (str): The session ID to kill.
|
||||
"""
|
||||
if session_id not in self.sessions:
|
||||
return
|
||||
|
||||
context, page, _ = self.sessions[session_id]
|
||||
|
||||
# Close the page
|
||||
try:
|
||||
await page.close()
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.error(f"Error closing page for session {session_id}: {str(e)}", tag="BROWSER")
|
||||
|
||||
# Remove session from tracking
|
||||
del self.sessions[session_id]
|
||||
|
||||
# Clean up any contexts that no longer have pages
|
||||
await self._cleanup_unused_contexts()
|
||||
|
||||
if self.logger:
|
||||
self.logger.debug(f"Killed session: {session_id}", tag="BROWSER")
|
||||
|
||||
async def _cleanup_unused_contexts(self):
|
||||
"""Clean up contexts that no longer have any pages."""
|
||||
async with self._contexts_lock:
|
||||
# Get all contexts we're managing
|
||||
contexts_to_check = list(self.contexts_by_config.values())
|
||||
|
||||
for context in contexts_to_check:
|
||||
# Check if the context has any pages left
|
||||
if not context.pages:
|
||||
# No pages left, we can close this context
|
||||
config_signature = next((sig for sig, ctx in self.contexts_by_config.items()
|
||||
if ctx == context), None)
|
||||
if config_signature:
|
||||
try:
|
||||
await context.close()
|
||||
del self.contexts_by_config[config_signature]
|
||||
if self.logger:
|
||||
self.logger.debug(f"Closed unused context", tag="BROWSER")
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.error(f"Error closing unused context: {str(e)}", tag="BROWSER")
|
||||
|
||||
def _cleanup_expired_sessions(self):
|
||||
"""Clean up expired sessions based on TTL."""
|
||||
current_time = time.time()
|
||||
expired_sessions = [
|
||||
sid
|
||||
for sid, (_, _, last_used) in self.sessions.items()
|
||||
if current_time - last_used > self.session_ttl
|
||||
]
|
||||
|
||||
for sid in expired_sessions:
|
||||
if self.logger:
|
||||
self.logger.debug(f"Session expired: {sid}", tag="BROWSER")
|
||||
asyncio.create_task(self.kill_session(sid))
|
||||
|
||||
async def close(self):
|
||||
"""Close the browser and clean up resources.
|
||||
|
||||
This method handles common cleanup tasks like:
|
||||
1. Persisting storage state if a user_data_dir is configured
|
||||
2. Closing all sessions
|
||||
3. Closing all browser contexts
|
||||
4. Closing the browser
|
||||
5. Stopping Playwright
|
||||
|
||||
Child classes should override this method to add their specific cleanup logic,
|
||||
but should call super().close() to ensure common cleanup tasks are performed.
|
||||
"""
|
||||
# Set a flag to prevent race conditions during cleanup
|
||||
self.shutting_down = True
|
||||
|
||||
try:
|
||||
# Add brief delay if configured
|
||||
if self.config.sleep_on_close:
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
# Persist storage state if using a user data directory
|
||||
if self.config.user_data_dir and self.browser:
|
||||
for context in self.browser.contexts:
|
||||
try:
|
||||
# Ensure the directory exists
|
||||
storage_dir = os.path.join(self.config.user_data_dir, "Default")
|
||||
os.makedirs(storage_dir, exist_ok=True)
|
||||
|
||||
# Save storage state
|
||||
storage_path = os.path.join(storage_dir, "storage_state.json")
|
||||
await context.storage_state(path=storage_path)
|
||||
|
||||
if self.logger:
|
||||
self.logger.debug("Storage state persisted before closing browser", tag="BROWSER")
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.warning(
|
||||
message="Failed to ensure storage persistence: {error}",
|
||||
tag="BROWSER",
|
||||
params={"error": str(e)}
|
||||
)
|
||||
|
||||
# Close all active sessions
|
||||
session_ids = list(self.sessions.keys())
|
||||
for session_id in session_ids:
|
||||
await self.kill_session(session_id)
|
||||
|
||||
# Close all cached contexts
|
||||
for ctx in self.contexts_by_config.values():
|
||||
try:
|
||||
await ctx.close()
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.error(
|
||||
message="Error closing context: {error}",
|
||||
tag="BROWSER",
|
||||
params={"error": str(e)}
|
||||
)
|
||||
self.contexts_by_config.clear()
|
||||
|
||||
# Close the browser if it exists
|
||||
if self.browser:
|
||||
await self.browser.close()
|
||||
self.browser = None
|
||||
|
||||
# Stop playwright
|
||||
if self.playwright:
|
||||
await self.playwright.stop()
|
||||
self.playwright = None
|
||||
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.error(
|
||||
message="Error during browser cleanup: {error}",
|
||||
tag="BROWSER",
|
||||
params={"error": str(e)}
|
||||
)
|
||||
finally:
|
||||
# Reset shutting down flag
|
||||
self.shutting_down = False
|
||||
|
||||
|
||||
Reference in New Issue
Block a user