Moves common browser functionality into the BaseBrowserStrategy class to reduce code duplication and improve maintainability.

Key changes:
- Adds shared browser argument building and session management to the base class
- Standardizes storage state handling across strategies
- Improves process cleanup and error handling
- Consolidates CDP URL management and container lifecycle

BREAKING CHANGE: Changes browser_mode="custom" to "cdp" for consistency
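Migration sketch for the breaking change (illustrative only; it assumes BrowserConfig continues to expose browser_mode and cdp_url parameters as in earlier releases):

    # Before
    config = BrowserConfig(browser_mode="custom", cdp_url="http://localhost:9222")
    # After
    config = BrowserConfig(browser_mode="cdp", cdp_url="http://localhost:9222")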
"""Browser strategies module for Crawl4AI.
|
|
|
|
This module implements the browser strategy pattern for different
|
|
browser implementations, including Playwright, CDP, and builtin browsers.
|
|
"""
|
|
|
|
from abc import ABC, abstractmethod
|
|
import asyncio
|
|
import json
|
|
import hashlib
|
|
import os
|
|
import time
|
|
from typing import Optional, Tuple, List
|
|
|
|
from playwright.async_api import BrowserContext, Page
|
|
|
|
from ...async_logger import AsyncLogger
|
|
from ...async_configs import BrowserConfig, CrawlerRunConfig
|
|
from ...config import DOWNLOAD_PAGE_TIMEOUT
|
|
from ...js_snippet import load_js_script
|
|
from ..utils import get_playwright
|
|
|
|
|
|
class BaseBrowserStrategy(ABC):
|
|
"""Base class for all browser strategies.
|
|
|
|
This abstract class defines the interface that all browser strategies
|
|
must implement. It handles common functionality like context caching,
|
|
browser configuration, and session management.
|
|
"""
|
|
|
|
_playwright_instance = None
|
|
|
|
@classmethod
|
|
async def get_playwright(cls):
|
|
"""Get or create a shared Playwright instance.
|
|
|
|
Returns:
|
|
Playwright: The shared Playwright instance
|
|
"""
|
|
# For now I dont want Singleton pattern for Playwright
|
|
if cls._playwright_instance is None or True:
|
|
cls._playwright_instance = await get_playwright()
|
|
return cls._playwright_instance
|
|
|
|

    def __init__(self, config: BrowserConfig, logger: Optional[AsyncLogger] = None):
        """Initialize the strategy with configuration and logger.

        Args:
            config: Browser configuration
            logger: Logger for recording events and errors
        """
        self.config = config
        self.logger = logger
        self.browser = None
        self.default_context = None

        # Flag set by close() while shutdown is in progress
        self.shutting_down = False

        # Context management
        self.contexts_by_config = {}  # config_signature -> context
        self._contexts_lock = asyncio.Lock()

        # Session management
        self.sessions = {}  # session_id -> (context, page, last_used)
        self.session_ttl = 1800  # 30 minutes default

        # Playwright instance
        self.playwright = None

    @abstractmethod
    async def start(self):
        """Start the browser.

        This method should be implemented by concrete strategies to initialize
        the browser in the appropriate way (direct launch, CDP connection, etc.).

        Returns:
            self: For method chaining
        """
        # Base implementation gets the playwright instance
        self.playwright = await self.get_playwright()
        return self
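
    # Illustrative sketch (not part of the base class): a concrete strategy's start()
    # would typically call super().start() to obtain Playwright, then launch or connect
    # using the shared argument builder. The names below are assumptions; a persistent
    # profile (user_data_dir) would use launch_persistent_context instead of launch.
    #
    #     async def start(self):
    #         await super().start()
    #         self.browser = await self.playwright.chromium.launch(**self._build_browser_args())
    #         self.default_context = await self.create_browser_context()
    #         return self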

    @abstractmethod
    async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]:
        """Get a page with specified configuration.

        This method should be implemented by concrete strategies to create
        or retrieve a page according to their browser management approach.

        Args:
            crawlerRunConfig: Crawler run configuration

        Returns:
            Tuple of (Page, BrowserContext)
        """
        pass

    async def get_pages(self, crawlerRunConfig: CrawlerRunConfig, count: int = 1) -> List[Tuple[Page, BrowserContext]]:
        """Get multiple pages with the same configuration.

        Args:
            crawlerRunConfig: Configuration for the pages
            count: Number of pages to create

        Returns:
            List of (Page, Context) tuples
        """
        pages = []
        for _ in range(count):
            page, context = await self.get_page(crawlerRunConfig)
            pages.append((page, context))
        return pages
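
    # Illustrative usage (hypothetical caller code): pre-warming several pages that
    # share one configuration, e.g. for concurrent crawling. `strategy` and
    # `run_config` are placeholders for a concrete strategy and a CrawlerRunConfig.
    #
    #     pages = await strategy.get_pages(run_config, count=3)
    #     for page, context in pages:
    #         await page.goto("https://example.com")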

    def _build_browser_args(self) -> dict:
        """Build browser launch arguments from config.

        Returns:
            dict: Browser launch arguments for Playwright
        """
        # Define common browser arguments that improve performance and stability
        args = [
            "--disable-gpu",
            "--disable-gpu-compositing",
            "--disable-software-rasterizer",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--no-first-run",
            "--no-default-browser-check",
            "--disable-infobars",
            "--window-position=0,0",
            "--ignore-certificate-errors",
            "--ignore-certificate-errors-spki-list",
            "--disable-blink-features=AutomationControlled",
            "--window-position=400,0",
            "--disable-renderer-backgrounding",
            "--disable-ipc-flooding-protection",
            "--force-color-profile=srgb",
            "--mute-audio",
            "--disable-background-timer-throttling",
            f"--window-size={self.config.viewport_width},{self.config.viewport_height}",
        ]

        # Define browser disable options for light mode
        browser_disable_options = [
            "--disable-background-networking",
            "--disable-background-timer-throttling",
            "--disable-backgrounding-occluded-windows",
            "--disable-breakpad",
            "--disable-client-side-phishing-detection",
            "--disable-component-extensions-with-background-pages",
            "--disable-default-apps",
            "--disable-extensions",
            "--disable-features=TranslateUI",
            "--disable-hang-monitor",
            "--disable-ipc-flooding-protection",
            "--disable-popup-blocking",
            "--disable-prompt-on-repost",
            "--disable-sync",
            "--force-color-profile=srgb",
            "--metrics-recording-only",
            "--no-first-run",
            "--password-store=basic",
            "--use-mock-keychain",
        ]

        # Apply light mode settings if enabled
        if self.config.light_mode:
            args.extend(browser_disable_options)

        # Apply text mode settings if enabled (disables images, JS, etc.)
        if self.config.text_mode:
            args.extend([
                "--blink-settings=imagesEnabled=false",
                "--disable-remote-fonts",
                "--disable-images",
                "--disable-javascript",
                "--disable-software-rasterizer",
                "--disable-dev-shm-usage",
            ])

        # Add any extra arguments from the config
        if self.config.extra_args:
            args.extend(self.config.extra_args)

        # Build the core browser args dictionary
        browser_args = {"headless": self.config.headless, "args": args}

        # Add chrome channel if specified
        if self.config.chrome_channel:
            browser_args["channel"] = self.config.chrome_channel

        # Configure downloads
        if self.config.accept_downloads:
            browser_args["downloads_path"] = self.config.downloads_path or os.path.join(
                os.getcwd(), "downloads"
            )
            os.makedirs(browser_args["downloads_path"], exist_ok=True)

        # Check for user data directory
        if self.config.user_data_dir:
            # Ensure the directory exists
            os.makedirs(self.config.user_data_dir, exist_ok=True)
            browser_args["user_data_dir"] = self.config.user_data_dir

        # Configure proxy settings
        if self.config.proxy or self.config.proxy_config:
            from playwright.async_api import ProxySettings

            proxy_settings = (
                ProxySettings(server=self.config.proxy)
                if self.config.proxy
                else ProxySettings(
                    server=self.config.proxy_config.server,
                    username=self.config.proxy_config.username,
                    password=self.config.proxy_config.password,
                )
            )
            browser_args["proxy"] = proxy_settings

        return browser_args

    def _make_config_signature(self, crawlerRunConfig: CrawlerRunConfig) -> str:
        """Create a signature hash from configuration for context caching.

        Converts the crawlerRunConfig into a dict, excludes ephemeral fields,
        then returns a hash of the sorted JSON. This yields a stable signature
        that identifies configurations requiring a unique browser context.

        Args:
            crawlerRunConfig: Crawler run configuration

        Returns:
            str: Unique hash for this configuration
        """
        config_dict = crawlerRunConfig.__dict__.copy()
        # Exclude items that do not affect browser-level setup
        ephemeral_keys = [
            "session_id",
            "js_code",
            "scraping_strategy",
            "extraction_strategy",
            "chunking_strategy",
            "cache_mode",
            "content_filter",
            "semaphore_count",
            "url",
        ]
        for key in ephemeral_keys:
            if key in config_dict:
                del config_dict[key]

        # Convert to canonical JSON string
        signature_json = json.dumps(config_dict, sort_keys=True, default=str)

        # Hash the JSON so we get a compact, unique string
        signature_hash = hashlib.sha256(signature_json.encode("utf-8")).hexdigest()
        return signature_hash
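
    # Illustrative sketch of how the signature enables context reuse in a concrete
    # strategy's get_page() (assumed pattern, not part of this base class):
    #
    #     sig = self._make_config_signature(crawlerRunConfig)
    #     async with self._contexts_lock:
    #         if sig not in self.contexts_by_config:
    #             ctx = await self.create_browser_context(crawlerRunConfig)
    #             await self.setup_context(ctx, crawlerRunConfig)
    #             self.contexts_by_config[sig] = ctx
    #     page = await self.contexts_by_config[sig].new_page()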

    async def create_browser_context(self, crawlerRunConfig: Optional[CrawlerRunConfig] = None) -> BrowserContext:
        """Creates and returns a new browser context with configured settings.

        Args:
            crawlerRunConfig: Configuration object for the crawler run

        Returns:
            BrowserContext: Browser context object with the specified configurations
        """
        if not self.browser:
            raise ValueError("Browser must be initialized before creating context")

        # Base settings
        user_agent = self.config.headers.get("User-Agent", self.config.user_agent)
        viewport_settings = {
            "width": self.config.viewport_width,
            "height": self.config.viewport_height,
        }
        proxy_settings = {"server": self.config.proxy} if self.config.proxy else None

        # Define blocked extensions for resource optimization
        blocked_extensions = [
            # Images
            "jpg", "jpeg", "png", "gif", "webp", "svg", "ico", "bmp", "tiff", "psd",
            # Fonts
            "woff", "woff2", "ttf", "otf", "eot",
            # Media
            "mp4", "webm", "ogg", "avi", "mov", "wmv", "flv", "m4v", "mp3", "wav", "aac",
            "m4a", "opus", "flac",
            # Documents
            "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",
            # Archives
            "zip", "rar", "7z", "tar", "gz",
            # Scripts and data
            "xml", "swf", "wasm",
        ]

        # Common context settings
        context_settings = {
            "user_agent": user_agent,
            "viewport": viewport_settings,
            "proxy": proxy_settings,
            "accept_downloads": self.config.accept_downloads,
            "storage_state": self.config.storage_state,
            "ignore_https_errors": self.config.ignore_https_errors,
            "device_scale_factor": 1.0,
            "java_script_enabled": self.config.java_script_enabled,
        }

        # Apply text mode settings if enabled
        if self.config.text_mode:
            text_mode_settings = {
                "has_touch": False,
                "is_mobile": False,
                "java_script_enabled": False,  # Disable JavaScript in text mode
            }
            # Update context settings with text mode settings
            context_settings.update(text_mode_settings)
            if self.logger:
                self.logger.debug("Text mode enabled for browser context", tag="BROWSER")

        # Handle storage state properly - this is key for persistence
        if self.config.storage_state:
            if self.logger:
                if isinstance(self.config.storage_state, str):
                    self.logger.debug(f"Using storage state from file: {self.config.storage_state}", tag="BROWSER")
                else:
                    self.logger.debug("Using storage state from config object", tag="BROWSER")

        if self.config.user_data_dir:
            # For CDP-based browsers, storage persistence is typically handled by the user_data_dir
            # at the browser level, but we create a storage_state location for Playwright as well.
            storage_path = os.path.join(self.config.user_data_dir, "storage_state.json")
            if not os.path.exists(storage_path):
                # Create parent directory if it doesn't exist
                os.makedirs(os.path.dirname(storage_path), exist_ok=True)
                with open(storage_path, "w") as f:
                    json.dump({}, f)
            self.config.storage_state = storage_path

            if self.logger:
                self.logger.debug(f"Using user data directory: {self.config.user_data_dir}", tag="BROWSER")

        # Apply crawler-specific configurations if provided
        if crawlerRunConfig:
            # If the crawler run defines its own proxy, it overrides the browser-level proxy
            if crawlerRunConfig.proxy_config:
                proxy_settings = {
                    "server": crawlerRunConfig.proxy_config.server,
                }
                if crawlerRunConfig.proxy_config.username:
                    proxy_settings.update({
                        "username": crawlerRunConfig.proxy_config.username,
                        "password": crawlerRunConfig.proxy_config.password,
                    })
                context_settings["proxy"] = proxy_settings

        # Create and return the context
        try:
            # Create the context with appropriate settings
            context = await self.browser.new_context(**context_settings)

            # Apply text mode resource blocking if enabled
            if self.config.text_mode:
                # Create and apply route patterns for each extension
                for ext in blocked_extensions:
                    await context.route(f"**/*.{ext}", lambda route: route.abort())

            return context
        except Exception as e:
            if self.logger:
                self.logger.error(f"Error creating browser context: {str(e)}", tag="BROWSER")
            # Fall back to basic context creation if the advanced settings fail
            return await self.browser.new_context()

    async def setup_context(self, context: BrowserContext, crawlerRunConfig: Optional[CrawlerRunConfig] = None):
        """Set up a browser context with the configured options.

        Args:
            context: The browser context to set up
            crawlerRunConfig: Configuration object containing all browser settings
        """
        # Set HTTP headers
        if self.config.headers:
            await context.set_extra_http_headers(self.config.headers)

        # Add cookies
        if self.config.cookies:
            await context.add_cookies(self.config.cookies)

        # Storage state (if configured) is applied when the context is created
        # in create_browser_context(), so no further action is needed here.

        # Configure downloads
        if self.config.accept_downloads:
            context.set_default_timeout(DOWNLOAD_PAGE_TIMEOUT)
            context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT)
            if self.config.downloads_path:
                # Reaches into Playwright internals to force download handling on an existing context
                context._impl_obj._options["accept_downloads"] = True
                context._impl_obj._options["downloads_path"] = self.config.downloads_path

        # Handle user agent and browser hints
        if self.config.user_agent:
            combined_headers = {
                "User-Agent": self.config.user_agent,
                "sec-ch-ua": self.config.browser_hint,
            }
            combined_headers.update(self.config.headers or {})
            await context.set_extra_http_headers(combined_headers)

        # Add default cookie
        target_url = (crawlerRunConfig and crawlerRunConfig.url) or "https://crawl4ai.com/"
        await context.add_cookies(
            [
                {
                    "name": "cookiesEnabled",
                    "value": "true",
                    "url": target_url,
                }
            ]
        )

        # Handle navigator overrides
        if crawlerRunConfig:
            if (
                crawlerRunConfig.override_navigator
                or crawlerRunConfig.simulate_user
                or crawlerRunConfig.magic
            ):
                await context.add_init_script(load_js_script("navigator_overrider"))
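
    # Illustrative note (assumed workflow): to persist cookies/localStorage for later runs,
    # a caller could export the context's storage state and feed it back through
    # BrowserConfig.storage_state. The file path shown is just an example.
    #
    #     await context.storage_state(path="./my_storage_state.json")
    #     config = BrowserConfig(storage_state="./my_storage_state.json")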

    async def kill_session(self, session_id: str):
        """Kill a browser session and clean up resources.

        Args:
            session_id (str): The session ID to kill.
        """
        if session_id not in self.sessions:
            return

        context, page, _ = self.sessions[session_id]

        # Close the page
        try:
            await page.close()
        except Exception as e:
            if self.logger:
                self.logger.error(f"Error closing page for session {session_id}: {str(e)}", tag="BROWSER")

        # Remove session from tracking
        del self.sessions[session_id]

        # Clean up any contexts that no longer have pages
        await self._cleanup_unused_contexts()

        if self.logger:
            self.logger.debug(f"Killed session: {session_id}", tag="BROWSER")

    async def _cleanup_unused_contexts(self):
        """Clean up contexts that no longer have any pages."""
        async with self._contexts_lock:
            # Get all contexts we're managing
            contexts_to_check = list(self.contexts_by_config.values())

            for context in contexts_to_check:
                # Check if the context has any pages left
                if not context.pages:
                    # No pages left, we can close this context
                    config_signature = next((sig for sig, ctx in self.contexts_by_config.items()
                                             if ctx == context), None)
                    if config_signature:
                        try:
                            await context.close()
                            del self.contexts_by_config[config_signature]
                            if self.logger:
                                self.logger.debug("Closed unused context", tag="BROWSER")
                        except Exception as e:
                            if self.logger:
                                self.logger.error(f"Error closing unused context: {str(e)}", tag="BROWSER")

    def _cleanup_expired_sessions(self):
        """Clean up expired sessions based on TTL."""
        current_time = time.time()
        expired_sessions = [
            sid
            for sid, (_, _, last_used) in self.sessions.items()
            if current_time - last_used > self.session_ttl
        ]

        for sid in expired_sessions:
            if self.logger:
                self.logger.debug(f"Session expired: {sid}", tag="BROWSER")
            asyncio.create_task(self.kill_session(sid))
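
    # Illustrative sketch (assumed pattern): a concrete strategy would register a session
    # when a crawl requests one, then rely on the TTL cleanup above to reap stale entries.
    #
    #     if crawlerRunConfig.session_id:
    #         self._cleanup_expired_sessions()
    #         self.sessions[crawlerRunConfig.session_id] = (context, page, time.time())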

    async def close(self):
        """Close the browser and clean up resources.

        This method handles common cleanup tasks like:
        1. Persisting storage state if a user_data_dir is configured
        2. Closing all sessions
        3. Closing all browser contexts
        4. Closing the browser
        5. Stopping Playwright

        Child classes should override this method to add their specific cleanup logic,
        but should call super().close() to ensure common cleanup tasks are performed.
        """
        # Set a flag to prevent race conditions during cleanup
        self.shutting_down = True

        try:
            # Add brief delay if configured
            if self.config.sleep_on_close:
                await asyncio.sleep(0.5)

            # Persist storage state if using a user data directory
            if self.config.user_data_dir and self.browser:
                for context in self.browser.contexts:
                    try:
                        # Ensure the directory exists
                        storage_dir = os.path.join(self.config.user_data_dir, "Default")
                        os.makedirs(storage_dir, exist_ok=True)

                        # Save storage state
                        storage_path = os.path.join(storage_dir, "storage_state.json")
                        await context.storage_state(path=storage_path)

                        if self.logger:
                            self.logger.debug("Storage state persisted before closing browser", tag="BROWSER")
                    except Exception as e:
                        if self.logger:
                            self.logger.warning(
                                message="Failed to ensure storage persistence: {error}",
                                tag="BROWSER",
                                params={"error": str(e)}
                            )

            # Close all active sessions
            session_ids = list(self.sessions.keys())
            for session_id in session_ids:
                await self.kill_session(session_id)

            # Close all cached contexts
            for ctx in self.contexts_by_config.values():
                try:
                    await ctx.close()
                except Exception as e:
                    if self.logger:
                        self.logger.error(
                            message="Error closing context: {error}",
                            tag="BROWSER",
                            params={"error": str(e)}
                        )
            self.contexts_by_config.clear()

            # Close the browser if it exists
            if self.browser:
                await self.browser.close()
                self.browser = None

            # Stop Playwright
            if self.playwright:
                await self.playwright.stop()
                self.playwright = None

        except Exception as e:
            if self.logger:
                self.logger.error(
                    message="Error during browser cleanup: {error}",
                    tag="BROWSER",
                    params={"error": str(e)}
                )
        finally:
            # Reset the shutting-down flag
            self.shutting_down = False