feat(browser): improve browser session management and profile handling

Enhance browser session management with the following improvements:
- Add state cloning between browser contexts
- Implement smarter page closing logic based on total pages and browser config
- Add storage state persistence during profile creation
- Improve managed browser context handling with storage state support

This change improves browser session reliability and persistence across runs.
This commit is contained in:
UncleCode
2025-05-21 20:23:17 +08:00
parent 85ac6fa523
commit 08ad7ef257
3 changed files with 114 additions and 7 deletions

View File

@@ -1065,7 +1065,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
finally:
# If no session_id is given we should close the page
if not config.session_id:
all_contexts = page.context.browser.contexts
total_pages = sum(len(context.pages) for context in all_contexts)
if config.session_id:
pass
elif total_pages <= 1 and (self.browser_config.use_managed_browser or self.browser_config.headless):
pass
else:
# Detach listeners before closing to prevent potential errors during close
if config.capture_network_requests:
page.remove_listener("request", handle_request_capture)
@@ -1075,6 +1081,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
page.remove_listener("console", handle_console_capture)
page.remove_listener("pageerror", handle_pageerror_capture)
# Close the page
await page.close()
async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1):

View File

@@ -511,6 +511,56 @@ class ManagedBrowser:
return profiler.delete_profile(profile_name_or_path)
async def clone_runtime_state(
src: BrowserContext,
dst: BrowserContext,
crawlerRunConfig: CrawlerRunConfig | None = None,
browserConfig: BrowserConfig | None = None,
) -> None:
"""
Bring everything that *can* be changed at runtime from `src` → `dst`.
1. Cookies
2. localStorage (and sessionStorage, same API)
3. Extra headers, permissions, geolocation if supplied in configs
"""
# ── 1. cookies ────────────────────────────────────────────────────────────
cookies = await src.cookies()
if cookies:
await dst.add_cookies(cookies)
# ── 2. localStorage / sessionStorage ──────────────────────────────────────
state = await src.storage_state()
for origin in state.get("origins", []):
url = origin["origin"]
kvs = origin.get("localStorage", [])
if not kvs:
continue
page = dst.pages[0] if dst.pages else await dst.new_page()
await page.goto(url, wait_until="domcontentloaded")
for k, v in kvs:
await page.evaluate("(k,v)=>localStorage.setItem(k,v)", k, v)
# ── 3. runtime-mutable extras from configs ────────────────────────────────
# headers
if browserConfig and browserConfig.headers:
await dst.set_extra_http_headers(browserConfig.headers)
# geolocation
if crawlerRunConfig and crawlerRunConfig.geolocation:
await dst.grant_permissions(["geolocation"])
await dst.set_geolocation(
{
"latitude": crawlerRunConfig.geolocation.latitude,
"longitude": crawlerRunConfig.geolocation.longitude,
"accuracy": crawlerRunConfig.geolocation.accuracy,
}
)
return dst
class BrowserManager:
@@ -960,11 +1010,39 @@ class BrowserManager:
# If using a managed browser, just grab the shared default_context
if self.config.use_managed_browser:
context = self.default_context
pages = context.pages
page = next((p for p in pages if p.url == crawlerRunConfig.url), None)
if not page:
page = context.pages[0] # await context.new_page()
if self.config.storage_state:
context = await self.create_browser_context(crawlerRunConfig)
ctx = self.default_context # default context, one window only
ctx = await clone_runtime_state(context, ctx, crawlerRunConfig, self.config)
# import json
# with open(self.config.storage_state, "r") as fh:
# state = json.load(fh)
# # 1. cookies
# await ctx.add_cookies(state["cookies"])
# # 2. local- / sessionStorage
# if state.get("origins"):
# page = ctx.pages[0] if ctx.pages else await ctx.new_page()
# for origin in state["origins"]:
# url = origin["origin"]
# for key, value in origin["localStorage"]:
# await page.goto(url)
# await page.evaluate(
# "(k, v) => localStorage.setItem(k, v)", key, value
# )
# If storage state is provided, create a new context
# context = await self.create_browser_context(crawlerRunConfig)
# await self.setup_context(context, crawlerRunConfig)
# self.default_context = context
page = await ctx.new_page()
else:
context = self.default_context
pages = context.pages
page = next((p for p in pages if p.url == crawlerRunConfig.url), None)
if not page:
page = context.pages[0] # await context.new_page()
else:
# Otherwise, check if we have an existing context for this config
config_signature = self._make_config_signature(crawlerRunConfig)

View File

@@ -218,8 +218,18 @@ class BrowserProfiler:
termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
try:
from playwright.async_api import async_playwright
# Start the browser
await managed_browser.start()
# await managed_browser.start()
# 1. ── Start the browser ─────────────────────────────────────────
cdp_url = await managed_browser.start()
# 2. ── Attach Playwright to that running Chrome ──────────────────
pw = await async_playwright().start()
browser = await pw.chromium.connect_over_cdp(cdp_url)
# Grab the existing default context (there is always one)
context = browser.contexts[0]
# Check if browser started successfully
browser_process = managed_browser.browser_process
@@ -244,6 +254,18 @@ class BrowserProfiler:
except asyncio.CancelledError:
pass
# 3. ── Persist storage state *before* we kill Chrome ─────────────
state_file = os.path.join(profile_path, "storage_state.json")
try:
await context.storage_state(path=state_file)
self.logger.info(f"[PROFILE].i storage_state saved → {state_file}", tag="PROFILE")
except Exception as e:
self.logger.warning(f"[PROFILE].w failed to save storage_state: {e}", tag="PROFILE")
# 4. ── Close everything cleanly ──────────────────────────────────
await browser.close()
await pw.stop()
# If the browser is still running and the user pressed 'q', terminate it
if browser_process.poll() is None and user_done_event.is_set():
self.logger.info("Terminating browser process...", tag="PROFILE")