fix(browser): improve storage state persistence in CDP strategy

Enhance storage state persistence mechanism in CDP browser strategy by:
- Explicitly saving storage state for each browser context
- Using proper file path for storage state
- Removing unnecessary sleep delay

Also includes test improvements:
- Simplified test configurations in playwright tests
- Temporarily disabled some CDP tests
This commit is contained in:
UncleCode
2025-03-23 21:06:41 +08:00
parent 6eeb2e4076
commit 462d5765e2
5 changed files with 21 additions and 34 deletions

View File

@@ -7,9 +7,7 @@ It also implements a page pooling mechanism for improved performance.
import asyncio import asyncio
import time import time
import os from typing import Optional, Tuple, List
import psutil
from typing import Optional, Tuple, Dict, Any, List, Set
from playwright.async_api import Page, BrowserContext from playwright.async_api import Page, BrowserContext

View File

@@ -17,7 +17,6 @@ from colorama import Fore, Style, init
from ..async_configs import BrowserConfig from ..async_configs import BrowserConfig
from ..async_logger import AsyncLogger, AsyncLoggerBase from ..async_logger import AsyncLogger, AsyncLoggerBase
from ..utils import get_home_folder from ..utils import get_home_folder
from .strategies import is_windows
class BrowserProfileManager: class BrowserProfileManager:
"""Manages browser profiles for Crawl4AI. """Manages browser profiles for Crawl4AI.

View File

@@ -11,12 +11,11 @@ import time
import json import json
import hashlib import hashlib
import subprocess import subprocess
import sys
import shutil import shutil
import signal import signal
from typing import Optional, Dict, Tuple, List, Any from typing import Optional, Dict, Tuple, List, Any
from playwright.async_api import Browser, BrowserContext, Page, ProxySettings from playwright.async_api import BrowserContext, Page, ProxySettings
from ..async_logger import AsyncLogger from ..async_logger import AsyncLogger
from ..async_configs import BrowserConfig, CrawlerRunConfig from ..async_configs import BrowserConfig, CrawlerRunConfig
@@ -831,26 +830,25 @@ class CDPBrowserStrategy(BaseBrowserStrategy):
await asyncio.sleep(0.5) await asyncio.sleep(0.5)
# If we have a user_data_dir configured, ensure persistence of storage state # If we have a user_data_dir configured, ensure persistence of storage state
if self.config.user_data_dir and self.browser: if self.config.user_data_dir and self.browser and self.default_context:
try: for context in self.browser.contexts:
# Create a brief sleep to allow the browser to flush any pending operations try:
# This helps ensure all storage state (localStorage, cookies, etc.) gets saved await context.storage_state(path=os.path.join(self.config.user_data_dir, "Default", "storage_state.json"))
await asyncio.sleep(0.3) if self.logger:
if self.logger: self.logger.debug("Ensuring storage state is persisted before closing browser", tag="BROWSER")
self.logger.debug("Ensuring storage state is persisted before closing CDP browser", tag="BROWSER") except Exception as e:
except Exception as e: if self.logger:
if self.logger: self.logger.warning(
self.logger.warning( message="Failed to ensure storage persistence: {error}",
message="Failed to ensure storage persistence: {error}", tag="BROWSER",
tag="BROWSER", params={"error": str(e)}
params={"error": str(e)} )
)
# Close all sessions # Close all sessions
session_ids = list(self.sessions.keys()) session_ids = list(self.sessions.keys())
for session_id in session_ids: for session_id in session_ids:
await self._kill_session(session_id) await self._kill_session(session_id)
# Close browser # Close browser
if self.browser: if self.browser:
await self.browser.close() await self.browser.close()

View File

@@ -209,8 +209,8 @@ async def run_tests():
"""Run all tests sequentially.""" """Run all tests sequentially."""
results = [] results = []
results.append(await test_cdp_launch_connect()) # results.append(await test_cdp_launch_connect())
results.append(await test_cdp_with_user_data_dir()) # results.append(await test_cdp_with_user_data_dir())
results.append(await test_cdp_session_management()) results.append(await test_cdp_session_management())
# Print summary # Print summary

View File

@@ -143,15 +143,11 @@ async def test_playwright_context_reuse():
# Create identical crawler configs # Create identical crawler configs
crawler_config1 = CrawlerRunConfig( crawler_config1 = CrawlerRunConfig(
url="https://example.com", css_selector="body",
viewport_width=1280,
viewport_height=800
) )
crawler_config2 = CrawlerRunConfig( crawler_config2 = CrawlerRunConfig(
url="https://example.org", # Different URL but same browser parameters css_selector="body",
viewport_width=1280,
viewport_height=800
) )
# Get pages with these configs # Get pages with these configs
@@ -163,11 +159,7 @@ async def test_playwright_context_reuse():
logger.info(f"Contexts reused: {is_same_context}", tag="TEST") logger.info(f"Contexts reused: {is_same_context}", tag="TEST")
# Now try with a different config # Now try with a different config
crawler_config3 = CrawlerRunConfig( crawler_config3 = CrawlerRunConfig()
url="https://example.net",
viewport_width=800, # Different viewport size
viewport_height=600
)
page3, context3 = await manager.get_page(crawler_config3) page3, context3 = await manager.get_page(crawler_config3)