feat(browser): improve browser session management and profile handling
Enhance browser session management with the following improvements:

- Add state cloning between browser contexts
- Implement smarter page closing logic based on total pages and browser config
- Add storage state persistence during profile creation
- Improve managed browser context handling with storage state support

This change improves browser session reliability and persistence across runs.
This commit is contained in:
@@ -1065,7 +1065,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
|
|
||||||
finally:
|
finally:
|
||||||
# If no session_id is given we should close the page
|
# If no session_id is given we should close the page
|
||||||
if not config.session_id:
|
all_contexts = page.context.browser.contexts
|
||||||
|
total_pages = sum(len(context.pages) for context in all_contexts)
|
||||||
|
if config.session_id:
|
||||||
|
pass
|
||||||
|
elif total_pages <= 1 and (self.browser_config.use_managed_browser or self.browser_config.headless):
|
||||||
|
pass
|
||||||
|
else:
|
||||||
# Detach listeners before closing to prevent potential errors during close
|
# Detach listeners before closing to prevent potential errors during close
|
||||||
if config.capture_network_requests:
|
if config.capture_network_requests:
|
||||||
page.remove_listener("request", handle_request_capture)
|
page.remove_listener("request", handle_request_capture)
|
||||||
@@ -1075,6 +1081,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
page.remove_listener("console", handle_console_capture)
|
page.remove_listener("console", handle_console_capture)
|
||||||
page.remove_listener("pageerror", handle_pageerror_capture)
|
page.remove_listener("pageerror", handle_pageerror_capture)
|
||||||
|
|
||||||
|
# Close the page
|
||||||
await page.close()
|
await page.close()
|
||||||
|
|
||||||
async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1):
|
async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1):
|
||||||
|
|||||||
@@ -511,6 +511,56 @@ class ManagedBrowser:
|
|||||||
return profiler.delete_profile(profile_name_or_path)
|
return profiler.delete_profile(profile_name_or_path)
|
||||||
|
|
||||||
|
|
||||||
|
async def clone_runtime_state(
    src: BrowserContext,
    dst: BrowserContext,
    crawlerRunConfig: CrawlerRunConfig | None = None,
    browserConfig: BrowserConfig | None = None,
) -> BrowserContext:
    """
    Bring everything that *can* be changed at runtime from `src` → `dst`.

    1. Cookies
    2. localStorage, per origin (sessionStorage is not part of Playwright's
       storage-state snapshot, so it is not copied)
    3. Extra headers and geolocation, if supplied in the configs

    Args:
        src: Context whose cookies and localStorage are read.
        dst: Context that receives the state. If any origin has localStorage
            entries, a page of `dst` (its first page, or a newly created one
            when it has none) is navigated to each such origin in turn.
        crawlerRunConfig: Optional run config; only `geolocation` is read.
        browserConfig: Optional browser config; only `headers` is read.

    Returns:
        `dst`, so the call can be used as an expression by callers.
    """
    # ── 1. cookies ────────────────────────────────────────────────────────────
    cookies = await src.cookies()
    if cookies:
        await dst.add_cookies(cookies)

    # ── 2. localStorage ───────────────────────────────────────────────────────
    state = await src.storage_state()
    page = None  # resolved lazily: only needed when there is something to write
    for origin in state.get("origins", []):
        entries = origin.get("localStorage", [])
        if not entries:
            continue

        if page is None:
            page = dst.pages[0] if dst.pages else await dst.new_page()

        # localStorage is origin-scoped, so the page must be on that origin
        # before the entries can be written.
        await page.goto(origin["origin"], wait_until="domcontentloaded")

        # Each entry is a {"name": ..., "value": ...} mapping, and
        # page.evaluate() accepts a single argument, so hand over the whole
        # list and write every key in one round trip.
        await page.evaluate(
            "items => items.forEach(({name, value}) => localStorage.setItem(name, value))",
            entries,
        )

    # ── 3. runtime-mutable extras from configs ────────────────────────────────
    # headers
    if browserConfig and browserConfig.headers:
        await dst.set_extra_http_headers(browserConfig.headers)

    # geolocation
    if crawlerRunConfig and crawlerRunConfig.geolocation:
        geo = crawlerRunConfig.geolocation
        await dst.grant_permissions(["geolocation"])
        await dst.set_geolocation(
            {
                "latitude": geo.latitude,
                "longitude": geo.longitude,
                "accuracy": geo.accuracy,
            }
        )

    return dst
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class BrowserManager:
|
class BrowserManager:
|
||||||
@@ -960,11 +1010,39 @@ class BrowserManager:
|
|||||||
|
|
||||||
# If using a managed browser, just grab the shared default_context
|
# If using a managed browser, just grab the shared default_context
|
||||||
if self.config.use_managed_browser:
|
if self.config.use_managed_browser:
|
||||||
context = self.default_context
|
if self.config.storage_state:
|
||||||
pages = context.pages
|
context = await self.create_browser_context(crawlerRunConfig)
|
||||||
page = next((p for p in pages if p.url == crawlerRunConfig.url), None)
|
ctx = self.default_context # default context, one window only
|
||||||
if not page:
|
ctx = await clone_runtime_state(context, ctx, crawlerRunConfig, self.config)
|
||||||
page = context.pages[0] # await context.new_page()
|
# import json
|
||||||
|
# with open(self.config.storage_state, "r") as fh:
|
||||||
|
# state = json.load(fh)
|
||||||
|
|
||||||
|
# # 1. cookies
|
||||||
|
# await ctx.add_cookies(state["cookies"])
|
||||||
|
|
||||||
|
# # 2. local- / sessionStorage
|
||||||
|
# if state.get("origins"):
|
||||||
|
# page = ctx.pages[0] if ctx.pages else await ctx.new_page()
|
||||||
|
# for origin in state["origins"]:
|
||||||
|
# url = origin["origin"]
|
||||||
|
# for key, value in origin["localStorage"]:
|
||||||
|
# await page.goto(url)
|
||||||
|
# await page.evaluate(
|
||||||
|
# "(k, v) => localStorage.setItem(k, v)", key, value
|
||||||
|
# )
|
||||||
|
|
||||||
|
# If storage state is provided, create a new context
|
||||||
|
# context = await self.create_browser_context(crawlerRunConfig)
|
||||||
|
# await self.setup_context(context, crawlerRunConfig)
|
||||||
|
# self.default_context = context
|
||||||
|
page = await ctx.new_page()
|
||||||
|
else:
|
||||||
|
context = self.default_context
|
||||||
|
pages = context.pages
|
||||||
|
page = next((p for p in pages if p.url == crawlerRunConfig.url), None)
|
||||||
|
if not page:
|
||||||
|
page = context.pages[0] # await context.new_page()
|
||||||
else:
|
else:
|
||||||
# Otherwise, check if we have an existing context for this config
|
# Otherwise, check if we have an existing context for this config
|
||||||
config_signature = self._make_config_signature(crawlerRunConfig)
|
config_signature = self._make_config_signature(crawlerRunConfig)
|
||||||
|
|||||||
@@ -218,8 +218,18 @@ class BrowserProfiler:
|
|||||||
termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
|
termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
from playwright.async_api import async_playwright
|
||||||
|
|
||||||
# Start the browser
|
# Start the browser
|
||||||
await managed_browser.start()
|
# await managed_browser.start()
|
||||||
|
# 1. ── Start the browser ─────────────────────────────────────────
|
||||||
|
cdp_url = await managed_browser.start()
|
||||||
|
|
||||||
|
# 2. ── Attach Playwright to that running Chrome ──────────────────
|
||||||
|
pw = await async_playwright().start()
|
||||||
|
browser = await pw.chromium.connect_over_cdp(cdp_url)
|
||||||
|
# Grab the existing default context (there is always one)
|
||||||
|
context = browser.contexts[0]
|
||||||
|
|
||||||
# Check if browser started successfully
|
# Check if browser started successfully
|
||||||
browser_process = managed_browser.browser_process
|
browser_process = managed_browser.browser_process
|
||||||
@@ -244,6 +254,18 @@ class BrowserProfiler:
|
|||||||
except asyncio.CancelledError:
|
except asyncio.CancelledError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# 3. ── Persist storage state *before* we kill Chrome ─────────────
|
||||||
|
state_file = os.path.join(profile_path, "storage_state.json")
|
||||||
|
try:
|
||||||
|
await context.storage_state(path=state_file)
|
||||||
|
self.logger.info(f"[PROFILE].i storage_state saved → {state_file}", tag="PROFILE")
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"[PROFILE].w failed to save storage_state: {e}", tag="PROFILE")
|
||||||
|
|
||||||
|
# 4. ── Close everything cleanly ──────────────────────────────────
|
||||||
|
await browser.close()
|
||||||
|
await pw.stop()
|
||||||
|
|
||||||
# If the browser is still running and the user pressed 'q', terminate it
|
# If the browser is still running and the user pressed 'q', terminate it
|
||||||
if browser_process.poll() is None and user_done_event.is_set():
|
if browser_process.poll() is None and user_done_event.is_set():
|
||||||
self.logger.info("Terminating browser process...", tag="PROFILE")
|
self.logger.info("Terminating browser process...", tag="PROFILE")
|
||||||
|
|||||||
Reference in New Issue
Block a user