feat(browser): enhance browser context creation with user data directory support and improved storage state handling

2025-03-23 19:07:13 +08:00
parent 0094cac675
commit 6eeb2e4076
1 changed files with 139 additions and 83 deletions
--- a/crawl4ai/browser/strategies.py
+++ b/crawl4ai/browser/strategies.py
@@ -139,6 +139,112 @@ class BaseBrowserStrategy(ABC):
        signature_hash = hashlib.sha256(signature_json.encode("utf-8")).hexdigest()
        return signature_hash
    async def create_browser_context(self, crawlerRunConfig: Optional[CrawlerRunConfig] = None) -> BrowserContext:
        """Create and return a new browser context built from the configured settings.

        Context options (user agent, viewport, proxy, download / HTTPS / JavaScript
        flags and optional storage state) are read from ``self.config``. When text
        mode is enabled, JavaScript is disabled and requests for common binary
        resources (images, fonts, media, documents, archives) are aborted.

        If creating or configuring the context fails, the error is logged and a
        context with default settings is returned instead. A context that was
        created but could not be fully configured is closed before falling back,
        so it is not leaked.

        Args:
            crawlerRunConfig: Optional configuration for the crawler run. When it
                carries a ``proxy_config``, that proxy replaces the one taken
                from ``self.config``.

        Returns:
            BrowserContext: The newly created browser context.

        Raises:
            ValueError: If ``self.browser`` has not been initialized.
        """
        if not self.browser:
            raise ValueError("Browser must be initialized before creating context")

        # File extensions whose requests are aborted in text mode.
        blocked_extensions = (
            # Images
            "jpg", "jpeg", "png", "gif", "webp", "svg", "ico", "bmp", "tiff", "psd",
            # Fonts
            "woff", "woff2", "ttf", "otf", "eot",
            # Media
            "mp4", "webm", "ogg", "avi", "mov", "wmv", "flv", "m4v", "mp3", "wav", "aac",
            "m4a", "opus", "flac",
            # Documents
            "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",
            # Archives
            "zip", "rar", "7z", "tar", "gz",
            # Scripts and data
            "xml", "swf", "wasm",
        )

        # A "User-Agent" entry in the configured headers wins over config.user_agent.
        context_settings = {
            "user_agent": self.config.headers.get("User-Agent", self.config.user_agent),
            "viewport": {
                "width": self.config.viewport_width,
                "height": self.config.viewport_height,
            },
            "proxy": {"server": self.config.proxy} if self.config.proxy else None,
            "accept_downloads": self.config.accept_downloads,
            "ignore_https_errors": self.config.ignore_https_errors,
            "device_scale_factor": 1.0,
            "java_script_enabled": self.config.java_script_enabled,
        }

        if self.config.text_mode:
            # Text mode overrides the configured JavaScript flag.
            context_settings.update({
                "has_touch": False,
                "is_mobile": False,
                "java_script_enabled": False,
            })
            if self.logger:
                self.logger.debug("Text mode enabled for browser context", tag="BROWSER")

        # Storage state may be a file path (str) or an in-memory object.
        if self.config.storage_state:
            context_settings["storage_state"] = self.config.storage_state
            if self.logger:
                if isinstance(self.config.storage_state, str):
                    self.logger.debug(f"Using storage state from file: {self.config.storage_state}", tag="BROWSER")
                else:
                    self.logger.debug("Using storage state from config object", tag="BROWSER")

        # user_data_dir is only reported here; it is not passed to new_context().
        if self.config.user_data_dir and self.logger:
            self.logger.debug(f"Using user data directory: {self.config.user_data_dir}", tag="BROWSER")

        # A per-run proxy replaces the browser-level proxy.
        if crawlerRunConfig and crawlerRunConfig.proxy_config:
            run_proxy = crawlerRunConfig.proxy_config
            proxy_settings = {"server": run_proxy.server}
            if run_proxy.username:
                proxy_settings.update({
                    "username": run_proxy.username,
                    "password": run_proxy.password,
                })
            context_settings["proxy"] = proxy_settings

        context = None
        try:
            context = await self.browser.new_context(**context_settings)
            if self.config.text_mode:
                # One shared handler is enough; every blocked request is aborted.
                def abort_request(route):
                    return route.abort()

                for ext in blocked_extensions:
                    await context.route(f"**/*.{ext}", abort_request)
            return context
        except Exception as e:
            if self.logger:
                self.logger.error(f"Error creating browser context: {e}", tag="BROWSER")
            # The context may exist but be only partially configured (e.g. a
            # route registration failed); close it so it is not leaked.
            if context is not None:
                try:
                    await context.close()
                except Exception as close_error:
                    if self.logger:
                        self.logger.debug(
                            f"Failed to close partially configured context: {close_error}",
                            tag="BROWSER",
                        )
            # Best-effort fallback: a context with default settings.
            return await self.browser.new_context()
    async def setup_context(self, context: BrowserContext, crawlerRunConfig: Optional[CrawlerRunConfig] = None):
        """Set up a browser context with the configured options.
@@ -301,97 +407,32 @@ class PlaywrightBrowserStrategy(BaseBrowserStrategy):
    async def create_browser_context(self, crawlerRunConfig: Optional[CrawlerRunConfig] = None) -> BrowserContext:
        """Create and return a new browser context with configured settings.

        Extends the base class implementation to handle ``user_data_dir``: when
        it is set, a storage state file at
        ``<user_data_dir>/Default/storage_state.json`` is created if missing
        (containing an empty JSON object) and ``self.config.storage_state`` is
        pointed at it. Everything else is delegated to the base class.

        Note that this overrides any ``storage_state`` previously set on
        ``self.config`` whenever ``user_data_dir`` is configured.

        Args:
            crawlerRunConfig: Configuration object for the crawler run.

        Returns:
            BrowserContext: Browser context object with the specified configurations.
        """
        # Handle user_data_dir explicitly to ensure storage persistence
        if self.config.user_data_dir:
            storage_path = os.path.join(self.config.user_data_dir, "Default", "storage_state.json")
            os.makedirs(os.path.dirname(storage_path), exist_ok=True)
            # Exclusive create: seed an empty state only when the file is absent,
            # and never truncate a file that already exists.
            try:
                with open(storage_path, "x", encoding="utf-8") as f:
                    json.dump({}, f)
            except FileExistsError:
                pass
            # Override storage_state with our specific path
            self.config.storage_state = storage_path
            if self.logger:
                self.logger.debug(f"Using persistent storage state at: {storage_path}", tag="BROWSER")
        # Now call the base class implementation which handles everything else
        return await super().create_browser_context(crawlerRunConfig)
    def _cleanup_expired_sessions(self):
        """Clean up expired sessions based on TTL."""
@@ -704,13 +745,28 @@ class CDPBrowserStrategy(BaseBrowserStrategy):
    async def create_browser_context(self, crawlerRunConfig: Optional[CrawlerRunConfig] = None) -> BrowserContext:
        """Create a new browser context.

        When ``user_data_dir`` is configured, a storage state file at
        ``<user_data_dir>/storage_state.json`` is created if missing (containing
        an empty JSON object) and ``self.config.storage_state`` is pointed at
        it. The base class implementation then handles all other configuration.

        Args:
            crawlerRunConfig: Configuration object for the crawler run.

        Returns:
            BrowserContext: Browser context object.
        """
        # Handle user_data_dir for CDP browsers
        if self.config.user_data_dir:
            # For CDP-based browsers, storage persistence is typically handled by the user_data_dir
            # at the browser level, but we'll create a storage_state location for Playwright as well
            storage_path = os.path.join(self.config.user_data_dir, "storage_state.json")
            os.makedirs(os.path.dirname(storage_path), exist_ok=True)
            # Exclusive create: seed an empty state only when the file is absent,
            # and never truncate a file that already exists.
            try:
                with open(storage_path, "x", encoding="utf-8") as f:
                    json.dump({}, f)
            except FileExistsError:
                pass
            self.config.storage_state = storage_path
        # Use the base class implementation
        return await super().create_browser_context(crawlerRunConfig)
    def _cleanup_expired_sessions(self):
        """Clean up expired sessions based on TTL."""