From 462d5765e29293170fb9d320f0090a4061985883 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 23 Mar 2025 21:06:41 +0800 Subject: [PATCH] fix(browser): improve storage state persistence in CDP strategy Enhance storage state persistence mechanism in CDP browser strategy by: - Explicitly saving storage state for each browser context - Using proper file path for storage state - Removing unnecessary sleep delay Also includes test improvements: - Simplified test configurations in playwright tests - Temporarily disabled some CDP tests --- crawl4ai/browser/manager.py | 4 +-- crawl4ai/browser/profiles.py | 1 - crawl4ai/browser/strategies.py | 32 +++++++++++------------ tests/browser/test_cdp_strategy.py | 4 +-- tests/browser/test_playwright_strategy.py | 14 +++------- 5 files changed, 21 insertions(+), 34 deletions(-) diff --git a/crawl4ai/browser/manager.py b/crawl4ai/browser/manager.py index 3a37efcb..9b0cf073 100644 --- a/crawl4ai/browser/manager.py +++ b/crawl4ai/browser/manager.py @@ -7,9 +7,7 @@ It also implements a page pooling mechanism for improved performance. import asyncio import time -import os -import psutil -from typing import Optional, Tuple, Dict, Any, List, Set +from typing import Optional, Tuple, List from playwright.async_api import Page, BrowserContext diff --git a/crawl4ai/browser/profiles.py b/crawl4ai/browser/profiles.py index 58a8bff2..afd0d78a 100644 --- a/crawl4ai/browser/profiles.py +++ b/crawl4ai/browser/profiles.py @@ -17,7 +17,6 @@ from colorama import Fore, Style, init from ..async_configs import BrowserConfig from ..async_logger import AsyncLogger, AsyncLoggerBase from ..utils import get_home_folder -from .strategies import is_windows class BrowserProfileManager: """Manages browser profiles for Crawl4AI. diff --git a/crawl4ai/browser/strategies.py b/crawl4ai/browser/strategies.py index 68d2d97d..f2a9525e 100644 --- a/crawl4ai/browser/strategies.py +++ b/crawl4ai/browser/strategies.py @@ -11,12 +11,11 @@ import time import json import hashlib import subprocess -import sys import shutil import signal from typing import Optional, Dict, Tuple, List, Any -from playwright.async_api import Browser, BrowserContext, Page, ProxySettings +from playwright.async_api import BrowserContext, Page, ProxySettings from ..async_logger import AsyncLogger from ..async_configs import BrowserConfig, CrawlerRunConfig @@ -831,26 +830,25 @@ class CDPBrowserStrategy(BaseBrowserStrategy): await asyncio.sleep(0.5) # If we have a user_data_dir configured, ensure persistence of storage state - if self.config.user_data_dir and self.browser: - try: - # Create a brief sleep to allow the browser to flush any pending operations - # This helps ensure all storage state (localStorage, cookies, etc.) gets saved - await asyncio.sleep(0.3) - if self.logger: - self.logger.debug("Ensuring storage state is persisted before closing CDP browser", tag="BROWSER") - except Exception as e: - if self.logger: - self.logger.warning( - message="Failed to ensure storage persistence: {error}", - tag="BROWSER", - params={"error": str(e)} - ) + if self.config.user_data_dir and self.browser and self.default_context: + for context in self.browser.contexts: + try: + await context.storage_state(path=os.path.join(self.config.user_data_dir, "Default", "storage_state.json")) + if self.logger: + self.logger.debug("Ensuring storage state is persisted before closing browser", tag="BROWSER") + except Exception as e: + if self.logger: + self.logger.warning( + message="Failed to ensure storage persistence: {error}", + tag="BROWSER", + params={"error": str(e)} + ) # Close all sessions session_ids = list(self.sessions.keys()) for session_id in session_ids: await self._kill_session(session_id) - + # Close browser if self.browser: await self.browser.close() diff --git a/tests/browser/test_cdp_strategy.py b/tests/browser/test_cdp_strategy.py index 4ec1f7f1..abadf42a 100644 --- a/tests/browser/test_cdp_strategy.py +++ b/tests/browser/test_cdp_strategy.py @@ -209,8 +209,8 @@ async def run_tests(): """Run all tests sequentially.""" results = [] - results.append(await test_cdp_launch_connect()) - results.append(await test_cdp_with_user_data_dir()) + # results.append(await test_cdp_launch_connect()) + # results.append(await test_cdp_with_user_data_dir()) results.append(await test_cdp_session_management()) # Print summary diff --git a/tests/browser/test_playwright_strategy.py b/tests/browser/test_playwright_strategy.py index 1d897bcf..2344c9ba 100644 --- a/tests/browser/test_playwright_strategy.py +++ b/tests/browser/test_playwright_strategy.py @@ -143,15 +143,11 @@ async def test_playwright_context_reuse(): # Create identical crawler configs crawler_config1 = CrawlerRunConfig( - url="https://example.com", - viewport_width=1280, - viewport_height=800 + css_selector="body", ) crawler_config2 = CrawlerRunConfig( - url="https://example.org", # Different URL but same browser parameters - viewport_width=1280, - viewport_height=800 + css_selector="body", ) # Get pages with these configs @@ -163,11 +159,7 @@ async def test_playwright_context_reuse(): logger.info(f"Contexts reused: {is_same_context}", tag="TEST") # Now try with a different config - crawler_config3 = CrawlerRunConfig( - url="https://example.net", - viewport_width=800, # Different viewport size - viewport_height=600 - ) + crawler_config3 = CrawlerRunConfig() page3, context3 = await manager.get_page(crawler_config3)