feat(browser): enhance browser context creation with user data directory support and improved storage state handling

2025-03-23 19:07:13 +08:00
parent 0094cac675
commit 6eeb2e4076
1 changed files with 139 additions and 83 deletions
--- a/crawl4ai/browser/strategies.py
+++ b/crawl4ai/browser/strategies.py
@@ -139,6 +139,112 @@ class BaseBrowserStrategy(ABC):
        signature_hash = hashlib.sha256(signature_json.encode("utf-8")).hexdigest()
        return signature_hash
        
+    async def create_browser_context(self, crawlerRunConfig: Optional[CrawlerRunConfig] = None) -> BrowserContext:
+        """Creates and returns a new browser context with configured settings.
+
+        Settings are assembled from self.config (user agent, viewport, proxy,
+        downloads, HTTPS errors, JavaScript, text mode, storage state). If the
+        context cannot be created or configured, the error is logged and a
+        default context (no custom settings) is returned instead.
+
+        Args:
+            crawlerRunConfig: Optional per-run configuration; when its proxy_config
+                is set it replaces the browser-level proxy for this context
+
+        Returns:
+            BrowserContext: Browser context object with the specified configurations,
+                or a default context if applying them failed
+
+        Raises:
+            ValueError: If the browser has not been initialized
+        """
+        if not self.browser:
+            raise ValueError("Browser must be initialized before creating context")
+
+        # Base settings; an explicit User-Agent header wins over config.user_agent
+        user_agent = self.config.headers.get("User-Agent", self.config.user_agent)
+        viewport_settings = {
+            "width": self.config.viewport_width,
+            "height": self.config.viewport_height,
+        }
+        proxy_settings = {"server": self.config.proxy} if self.config.proxy else None
+
+        # Define blocked extensions for resource optimization (only used in text mode)
+        blocked_extensions = [
+            # Images
+            "jpg", "jpeg", "png", "gif", "webp", "svg", "ico", "bmp", "tiff", "psd",
+            # Fonts
+            "woff", "woff2", "ttf", "otf", "eot",
+            # Media
+            "mp4", "webm", "ogg", "avi", "mov", "wmv", "flv", "m4v", "mp3", "wav", "aac",
+            "m4a", "opus", "flac",
+            # Documents
+            "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",
+            # Archives
+            "zip", "rar", "7z", "tar", "gz",
+            # Scripts and data
+            "xml", "swf", "wasm",
+        ]
+
+        # Common context settings
+        context_settings = {
+            "user_agent": user_agent,
+            "viewport": viewport_settings,
+            "proxy": proxy_settings,
+            "accept_downloads": self.config.accept_downloads,
+            "ignore_https_errors": self.config.ignore_https_errors,
+            "device_scale_factor": 1.0,
+            "java_script_enabled": self.config.java_script_enabled,
+        }
+
+        # Apply text mode settings if enabled
+        if self.config.text_mode:
+            text_mode_settings = {
+                "has_touch": False,
+                "is_mobile": False,
+                # Disable javascript in text mode (overrides config.java_script_enabled)
+                "java_script_enabled": False
+            }
+            # Update context settings with text mode settings
+            context_settings.update(text_mode_settings)
+            if self.logger:
+                self.logger.debug("Text mode enabled for browser context", tag="BROWSER")
+
+        # Handle storage state properly - this is key for persistence
+        if self.config.storage_state:
+            context_settings["storage_state"] = self.config.storage_state
+            if self.logger:
+                if isinstance(self.config.storage_state, str):
+                    self.logger.debug(f"Using storage state from file: {self.config.storage_state}", tag="BROWSER")
+                else:
+                    self.logger.debug("Using storage state from config object", tag="BROWSER")
+
+        # user_data_dir is only logged here; it is not passed to new_context
+        if self.config.user_data_dir and self.logger:
+            self.logger.debug(f"Using user data directory: {self.config.user_data_dir}", tag="BROWSER")
+
+        # Apply crawler-specific configurations if provided
+        if crawlerRunConfig:
+            # A per-run proxy_config replaces the browser-level proxy for this context
+            if crawlerRunConfig.proxy_config:
+                proxy_settings = {
+                    "server": crawlerRunConfig.proxy_config.server,
+                }
+                if crawlerRunConfig.proxy_config.username:
+                    proxy_settings.update({
+                        "username": crawlerRunConfig.proxy_config.username,
+                        "password": crawlerRunConfig.proxy_config.password,
+                    })
+                context_settings["proxy"] = proxy_settings
+
+        # Create and return the context. `context` is tracked outside the try so
+        # the except branch can tell whether creation or route setup failed.
+        context = None
+        try:
+            # Create the context with appropriate settings
+            context = await self.browser.new_context(**context_settings)
+
+            # Apply text mode resource blocking if enabled
+            if self.config.text_mode:
+                # Create and apply route patterns for each extension
+                for ext in blocked_extensions:
+                    await context.route(f"**/*.{ext}", lambda route: route.abort())
+
+            return context
+        except Exception as e:
+            if self.logger:
+                self.logger.error(f"Error creating browser context: {str(e)}", tag="BROWSER")
+            # If the context was created but route setup failed, close it so it is
+            # not leaked alongside the fallback context. Cleanup is best-effort.
+            if context is not None:
+                try:
+                    await context.close()
+                except Exception as close_error:
+                    if self.logger:
+                        self.logger.debug(
+                            f"Error closing partially configured context: {str(close_error)}",
+                            tag="BROWSER",
+                        )
+            # Fallback to basic context creation if the advanced settings fail.
+            # NOTE(review): this context has none of the settings above, including
+            # proxy and storage state - confirm that silently dropping them is intended.
+            return await self.browser.new_context()
+        
    async def setup_context(self, context: BrowserContext, crawlerRunConfig: Optional[CrawlerRunConfig] = None):
        """Set up a browser context with the configured options.
        
@@ -301,97 +407,32 @@ class PlaywrightBrowserStrategy(BaseBrowserStrategy):
    async def create_browser_context(self, crawlerRunConfig: Optional[CrawlerRunConfig] = None) -> BrowserContext:
        """Creates and returns a new browser context with configured settings.
        
+        Extends the base class version: when user_data_dir is configured, this makes sure
+        <user_data_dir>/Default/storage_state.json exists, sets self.config.storage_state
+        to that path (replacing any previous value), and then delegates to the base class.
+        
        Args:
            crawlerRunConfig: Configuration object for the crawler run
            
        Returns:
            BrowserContext: Browser context object with the specified configurations
        """
-        # Base settings
-        user_agent = self.config.headers.get("User-Agent", self.config.user_agent) 
-        viewport_settings = {
-            "width": self.config.viewport_width,
-            "height": self.config.viewport_height,
-        }
-        proxy_settings = {"server": self.config.proxy} if self.config.proxy else None
-
-        blocked_extensions = [
-            # Images
-            "jpg", "jpeg", "png", "gif", "webp", "svg", "ico", "bmp", "tiff", "psd",
-            # Fonts
-            "woff", "woff2", "ttf", "otf", "eot",
-            # Media
-            "mp4", "webm", "ogg", "avi", "mov", "wmv", "flv", "m4v", "mp3", "wav", "aac",
-            "m4a", "opus", "flac",
-            # Documents
-            "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",
-            # Archives
-            "zip", "rar", "7z", "tar", "gz",
-            # Scripts and data
-            "xml", "swf", "wasm",
-        ]
-
-        # Common context settings
-        context_settings = {
-            "user_agent": user_agent,
-            "viewport": viewport_settings,
-            "proxy": proxy_settings,
-            "accept_downloads": self.config.accept_downloads,
-            "ignore_https_errors": self.config.ignore_https_errors,
-            "device_scale_factor": 1.0,
-            "java_script_enabled": self.config.java_script_enabled,
-        }
-        
-        # Handle storage state properly - this is key for persistence
-        if self.config.storage_state:
-            context_settings["storage_state"] = self.config.storage_state
-            if self.logger:
-                if isinstance(self.config.storage_state, str):
-                    self.logger.debug(f"Using storage state from file: {self.config.storage_state}", tag="BROWSER")
-                else:
-                    self.logger.debug("Using storage state from config object", tag="BROWSER")
-        
+        # Handle user_data_dir explicitly to ensure storage persistence
        if self.config.user_data_dir:
-            context_settings["storage_state"] = os.path.join(
-                self.config.user_data_dir, "Default", "storage_state.json"
-            )
+            # Build the storage state file path under <user_data_dir>/Default
+            storage_path = os.path.join(self.config.user_data_dir, "Default", "storage_state.json")
+            
            # Create the file if it doesn't exist
-            if not os.path.exists(context_settings["storage_state"]):
-                os.makedirs(os.path.dirname(context_settings["storage_state"]), exist_ok=True)
-                with open(context_settings["storage_state"], "w") as f:
+            if not os.path.exists(storage_path):
+                os.makedirs(os.path.dirname(storage_path), exist_ok=True)
+                with open(storage_path, "w") as f:
                    json.dump({}, f)
-
-        
-        if crawlerRunConfig:
-            # Check if there is value for crawlerRunConfig.proxy_config set add that to context
-            if crawlerRunConfig.proxy_config:
-                proxy_settings = {
-                    "server": crawlerRunConfig.proxy_config.server,
-                }
-                if crawlerRunConfig.proxy_config.username:
-                    proxy_settings.update({
-                        "username": crawlerRunConfig.proxy_config.username,
-                        "password": crawlerRunConfig.proxy_config.password,
-                    })
-                context_settings["proxy"] = proxy_settings
-
-        if self.config.text_mode:
-            text_mode_settings = {
-                "has_touch": False,
-                "is_mobile": False,
-            }
-            # Update context settings with text mode settings
-            context_settings.update(text_mode_settings)
-
-        # Create and return the context with all settings
-        context = await self.browser.new_context(**context_settings)
-
-        # Apply text mode settings if enabled
-        if self.config.text_mode:
-            # Create and apply route patterns for each extension
-            for ext in blocked_extensions:
-                await context.route(f"**/*.{ext}", lambda route: route.abort())
-        return context
+                    
+            # Override storage_state with our specific path; this mutates self.config
+            # and replaces any storage_state that was set on it before
+            self.config.storage_state = storage_path
+            if self.logger:
+                self.logger.debug(f"Using persistent storage state at: {storage_path}", tag="BROWSER")
+                
+        # Now call the base class implementation which handles everything else
+        return await super().create_browser_context(crawlerRunConfig)
    
    def _cleanup_expired_sessions(self):
        """Clean up expired sessions based on TTL."""
@@ -704,13 +745,28 @@ class CDPBrowserStrategy(BaseBrowserStrategy):
    async def create_browser_context(self, crawlerRunConfig: Optional[CrawlerRunConfig] = None) -> BrowserContext:
        """Create a new browser context.
        
+        When user_data_dir is configured, makes sure <user_data_dir>/storage_state.json
+        exists and sets self.config.storage_state to that path (replacing any previous
+        value), then delegates to the base class implementation.
+        
        Args:
            crawlerRunConfig: Configuration object for the crawler run
            
        Returns:
            BrowserContext: Browser context object
        """
-        return await self.browser.new_context()
+        # Handle user_data_dir for CDP browsers
+        if self.config.user_data_dir:
+            # For CDP-based browsers, storage persistence is typically handled by the user_data_dir
+            # at the browser level, but we'll create a storage_state location for Playwright as well
+            storage_path = os.path.join(self.config.user_data_dir, "storage_state.json")
+            if not os.path.exists(storage_path):
+                # Create parent directory if it doesn't exist
+                os.makedirs(os.path.dirname(storage_path), exist_ok=True)
+                # Seed the file with an empty JSON object
+                with open(storage_path, "w") as f:
+                    json.dump({}, f)
+            # Mutates self.config, replacing any storage_state that was set on it before
+            self.config.storage_state = storage_path
+            
+        # Use the base class implementation
+        return await super().create_browser_context(crawlerRunConfig)
    
    def _cleanup_expired_sessions(self):
        """Clean up expired sessions based on TTL."""