From 6eeb2e4076d9822b429e71081d34f64875a92b5d Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 23 Mar 2025 19:07:13 +0800 Subject: [PATCH] feat(browser): enhance browser context creation with user data directory support and improved storage state handling --- crawl4ai/browser/strategies.py | 222 +++++++++++++++++++++------------ 1 file changed, 139 insertions(+), 83 deletions(-) diff --git a/crawl4ai/browser/strategies.py b/crawl4ai/browser/strategies.py index 85feef36..68d2d97d 100644 --- a/crawl4ai/browser/strategies.py +++ b/crawl4ai/browser/strategies.py @@ -139,6 +139,112 @@ class BaseBrowserStrategy(ABC): signature_hash = hashlib.sha256(signature_json.encode("utf-8")).hexdigest() return signature_hash + async def create_browser_context(self, crawlerRunConfig: Optional[CrawlerRunConfig] = None) -> BrowserContext: + """Creates and returns a new browser context with configured settings. + + Args: + crawlerRunConfig: Configuration object for the crawler run + + Returns: + BrowserContext: Browser context object with the specified configurations + """ + if not self.browser: + raise ValueError("Browser must be initialized before creating context") + + # Base settings + user_agent = self.config.headers.get("User-Agent", self.config.user_agent) + viewport_settings = { + "width": self.config.viewport_width, + "height": self.config.viewport_height, + } + proxy_settings = {"server": self.config.proxy} if self.config.proxy else None + + # Define blocked extensions for resource optimization + blocked_extensions = [ + # Images + "jpg", "jpeg", "png", "gif", "webp", "svg", "ico", "bmp", "tiff", "psd", + # Fonts + "woff", "woff2", "ttf", "otf", "eot", + # Media + "mp4", "webm", "ogg", "avi", "mov", "wmv", "flv", "m4v", "mp3", "wav", "aac", + "m4a", "opus", "flac", + # Documents + "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx", + # Archives + "zip", "rar", "7z", "tar", "gz", + # Scripts and data + "xml", "swf", "wasm", + ] + + # Common context settings + context_settings = { + "user_agent": user_agent, + "viewport": viewport_settings, + "proxy": proxy_settings, + "accept_downloads": self.config.accept_downloads, + "ignore_https_errors": self.config.ignore_https_errors, + "device_scale_factor": 1.0, + "java_script_enabled": self.config.java_script_enabled, + } + + # Apply text mode settings if enabled + if self.config.text_mode: + text_mode_settings = { + "has_touch": False, + "is_mobile": False, + # Disable javascript in text mode + "java_script_enabled": False + } + # Update context settings with text mode settings + context_settings.update(text_mode_settings) + if self.logger: + self.logger.debug("Text mode enabled for browser context", tag="BROWSER") + + # Handle storage state properly - this is key for persistence + if self.config.storage_state: + context_settings["storage_state"] = self.config.storage_state + if self.logger: + if isinstance(self.config.storage_state, str): + self.logger.debug(f"Using storage state from file: {self.config.storage_state}", tag="BROWSER") + else: + self.logger.debug("Using storage state from config object", tag="BROWSER") + + # If user_data_dir is specified, browser persistence should be automatic + if self.config.user_data_dir and self.logger: + self.logger.debug(f"Using user data directory: {self.config.user_data_dir}", tag="BROWSER") + + # Apply crawler-specific configurations if provided + if crawlerRunConfig: + # Check if there is value for crawlerRunConfig.proxy_config set add that to context + if crawlerRunConfig.proxy_config: + proxy_settings = { + "server": crawlerRunConfig.proxy_config.server, + } + if crawlerRunConfig.proxy_config.username: + proxy_settings.update({ + "username": crawlerRunConfig.proxy_config.username, + "password": crawlerRunConfig.proxy_config.password, + }) + context_settings["proxy"] = proxy_settings + + # Create and return the context + try: + # Create the context with appropriate settings + context = await self.browser.new_context(**context_settings) + + # Apply text mode resource blocking if enabled + if self.config.text_mode: + # Create and apply route patterns for each extension + for ext in blocked_extensions: + await context.route(f"**/*.{ext}", lambda route: route.abort()) + + return context + except Exception as e: + if self.logger: + self.logger.error(f"Error creating browser context: {str(e)}", tag="BROWSER") + # Fallback to basic context creation if the advanced settings fail + return await self.browser.new_context() + async def setup_context(self, context: BrowserContext, crawlerRunConfig: Optional[CrawlerRunConfig] = None): """Set up a browser context with the configured options. @@ -301,97 +407,32 @@ class PlaywrightBrowserStrategy(BaseBrowserStrategy): async def create_browser_context(self, crawlerRunConfig: Optional[CrawlerRunConfig] = None) -> BrowserContext: """Creates and returns a new browser context with configured settings. + This implementation extends the base class version to handle user_data_dir specifically. + Args: crawlerRunConfig: Configuration object for the crawler run Returns: BrowserContext: Browser context object with the specified configurations """ - # Base settings - user_agent = self.config.headers.get("User-Agent", self.config.user_agent) - viewport_settings = { - "width": self.config.viewport_width, - "height": self.config.viewport_height, - } - proxy_settings = {"server": self.config.proxy} if self.config.proxy else None - - blocked_extensions = [ - # Images - "jpg", "jpeg", "png", "gif", "webp", "svg", "ico", "bmp", "tiff", "psd", - # Fonts - "woff", "woff2", "ttf", "otf", "eot", - # Media - "mp4", "webm", "ogg", "avi", "mov", "wmv", "flv", "m4v", "mp3", "wav", "aac", - "m4a", "opus", "flac", - # Documents - "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx", - # Archives - "zip", "rar", "7z", "tar", "gz", - # Scripts and data - "xml", "swf", "wasm", - ] - - # Common context settings - context_settings = { - "user_agent": user_agent, - "viewport": viewport_settings, - "proxy": proxy_settings, - "accept_downloads": self.config.accept_downloads, - "ignore_https_errors": self.config.ignore_https_errors, - "device_scale_factor": 1.0, - "java_script_enabled": self.config.java_script_enabled, - } - - # Handle storage state properly - this is key for persistence - if self.config.storage_state: - context_settings["storage_state"] = self.config.storage_state - if self.logger: - if isinstance(self.config.storage_state, str): - self.logger.debug(f"Using storage state from file: {self.config.storage_state}", tag="BROWSER") - else: - self.logger.debug("Using storage state from config object", tag="BROWSER") - + # Handle user_data_dir explicitly to ensure storage persistence if self.config.user_data_dir: - context_settings["storage_state"] = os.path.join( - self.config.user_data_dir, "Default", "storage_state.json" - ) + # Create a storage state file path if none exists + storage_path = os.path.join(self.config.user_data_dir, "Default", "storage_state.json") + # Create the file if it doesn't exist - if not os.path.exists(context_settings["storage_state"]): - os.makedirs(os.path.dirname(context_settings["storage_state"]), exist_ok=True) - with open(context_settings["storage_state"], "w") as f: + if not os.path.exists(storage_path): + os.makedirs(os.path.dirname(storage_path), exist_ok=True) + with open(storage_path, "w") as f: json.dump({}, f) - - - if crawlerRunConfig: - # Check if there is value for crawlerRunConfig.proxy_config set add that to context - if crawlerRunConfig.proxy_config: - proxy_settings = { - "server": crawlerRunConfig.proxy_config.server, - } - if crawlerRunConfig.proxy_config.username: - proxy_settings.update({ - "username": crawlerRunConfig.proxy_config.username, - "password": crawlerRunConfig.proxy_config.password, - }) - context_settings["proxy"] = proxy_settings - - if self.config.text_mode: - text_mode_settings = { - "has_touch": False, - "is_mobile": False, - } - # Update context settings with text mode settings - context_settings.update(text_mode_settings) - - # Create and return the context with all settings - context = await self.browser.new_context(**context_settings) - - # Apply text mode settings if enabled - if self.config.text_mode: - # Create and apply route patterns for each extension - for ext in blocked_extensions: - await context.route(f"**/*.{ext}", lambda route: route.abort()) - return context + + # Override storage_state with our specific path + self.config.storage_state = storage_path + if self.logger: + self.logger.debug(f"Using persistent storage state at: {storage_path}", tag="BROWSER") + + # Now call the base class implementation which handles everything else + return await super().create_browser_context(crawlerRunConfig) def _cleanup_expired_sessions(self): """Clean up expired sessions based on TTL.""" @@ -704,13 +745,28 @@ class CDPBrowserStrategy(BaseBrowserStrategy): async def create_browser_context(self, crawlerRunConfig: Optional[CrawlerRunConfig] = None) -> BrowserContext: """Create a new browser context. + Uses the base class implementation which handles all configurations. + Args: crawlerRunConfig: Configuration object for the crawler run Returns: BrowserContext: Browser context object """ - return await self.browser.new_context() + # Handle user_data_dir for CDP browsers + if self.config.user_data_dir: + # For CDP-based browsers, storage persistence is typically handled by the user_data_dir + # at the browser level, but we'll create a storage_state location for Playwright as well + storage_path = os.path.join(self.config.user_data_dir, "storage_state.json") + if not os.path.exists(storage_path): + # Create parent directory if it doesn't exist + os.makedirs(os.path.dirname(storage_path), exist_ok=True) + with open(storage_path, "w") as f: + json.dump({}, f) + self.config.storage_state = storage_path + + # Use the base class implementation + return await super().create_browser_context(crawlerRunConfig) def _cleanup_expired_sessions(self): """Clean up expired sessions based on TTL."""