From 08ad7ef257d891896840e5398d48b2a422d7b28a Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 21 May 2025 20:23:17 +0800 Subject: [PATCH] feat(browser): improve browser session management and profile handling Enhance browser session management with the following improvements: - Add state cloning between browser contexts - Implement smarter page closing logic based on total pages and browser config - Add storage state persistence during profile creation - Improve managed browser context handling with storage state support This change improves browser session reliability and persistence across runs. --- crawl4ai/async_crawler_strategy.py | 9 ++- crawl4ai/browser_manager.py | 88 ++++++++++++++++++++++++++++-- crawl4ai/browser_profiler.py | 24 +++++++- 3 files changed, 114 insertions(+), 7 deletions(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 55ad550d..6294e2f4 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -1065,7 +1065,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): finally: # If no session_id is given we should close the page - if not config.session_id: + all_contexts = page.context.browser.contexts + total_pages = sum(len(context.pages) for context in all_contexts) + if config.session_id: + pass + elif total_pages <= 1 and (self.browser_config.use_managed_browser or self.browser_config.headless): + pass + else: # Detach listeners before closing to prevent potential errors during close if config.capture_network_requests: page.remove_listener("request", handle_request_capture) @@ -1075,6 +1081,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): page.remove_listener("console", handle_console_capture) page.remove_listener("pageerror", handle_pageerror_capture) + # Close the page await page.close() async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1): diff --git a/crawl4ai/browser_manager.py 
b/crawl4ai/browser_manager.py index d4e074cf..4d518d0b 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -511,6 +511,56 @@ class ManagedBrowser: return profiler.delete_profile(profile_name_or_path) +async def clone_runtime_state( + src: BrowserContext, + dst: BrowserContext, + crawlerRunConfig: CrawlerRunConfig | None = None, + browserConfig: BrowserConfig | None = None, +) -> BrowserContext: + """ + Bring everything that *can* be changed at runtime from `src` → `dst`. + + 1. Cookies + 2. localStorage (sessionStorage is not captured by storage_state) + 3. Extra headers, permissions, geolocation if supplied in configs + """ + + # ── 1. cookies ──────────────────────────────────────────────────────────── + cookies = await src.cookies() + if cookies: + await dst.add_cookies(cookies) + + # ── 2. localStorage / sessionStorage ────────────────────────────────────── + state = await src.storage_state() + for origin in state.get("origins", []): + url = origin["origin"] + kvs = origin.get("localStorage", []) + if not kvs: + continue + + page = dst.pages[0] if dst.pages else await dst.new_page() + await page.goto(url, wait_until="domcontentloaded") + for k, v in ((item["name"], item["value"]) for item in kvs): + await page.evaluate("([k, v]) => localStorage.setItem(k, v)", [k, v]) + + # ── 3. 
runtime-mutable extras from configs ──────────────────────────────── + # headers + if browserConfig and browserConfig.headers: + await dst.set_extra_http_headers(browserConfig.headers) + + # geolocation + if crawlerRunConfig and crawlerRunConfig.geolocation: + await dst.grant_permissions(["geolocation"]) + await dst.set_geolocation( + { + "latitude": crawlerRunConfig.geolocation.latitude, + "longitude": crawlerRunConfig.geolocation.longitude, + "accuracy": crawlerRunConfig.geolocation.accuracy, + } + ) + + return dst + class BrowserManager: @@ -960,11 +1010,39 @@ class BrowserManager: # If using a managed browser, just grab the shared default_context if self.config.use_managed_browser: - context = self.default_context - pages = context.pages - page = next((p for p in pages if p.url == crawlerRunConfig.url), None) - if not page: - page = context.pages[0] # await context.new_page() + if self.config.storage_state: + context = await self.create_browser_context(crawlerRunConfig) + ctx = self.default_context # default context, one window only + ctx = await clone_runtime_state(context, ctx, crawlerRunConfig, self.config) + # import json + # with open(self.config.storage_state, "r") as fh: + # state = json.load(fh) + + # # 1. cookies + # await ctx.add_cookies(state["cookies"]) + + # # 2. 
local- / sessionStorage + # if state.get("origins"): + # page = ctx.pages[0] if ctx.pages else await ctx.new_page() + # for origin in state["origins"]: + # url = origin["origin"] + # for key, value in origin["localStorage"]: + # await page.goto(url) + # await page.evaluate( + # "(k, v) => localStorage.setItem(k, v)", key, value + # ) + + # If storage state is provided, create a new context + # context = await self.create_browser_context(crawlerRunConfig) + # await self.setup_context(context, crawlerRunConfig) + # self.default_context = context + page = await ctx.new_page() + else: + context = self.default_context + pages = context.pages + page = next((p for p in pages if p.url == crawlerRunConfig.url), None) + if not page: + page = context.pages[0] # await context.new_page() else: # Otherwise, check if we have an existing context for this config config_signature = self._make_config_signature(crawlerRunConfig) diff --git a/crawl4ai/browser_profiler.py b/crawl4ai/browser_profiler.py index 961ba740..4a5be13c 100644 --- a/crawl4ai/browser_profiler.py +++ b/crawl4ai/browser_profiler.py @@ -218,8 +218,18 @@ class BrowserProfiler: termios.tcsetattr(fd, termios.TCSADRAIN, old_settings) try: + from playwright.async_api import async_playwright + # Start the browser - await managed_browser.start() + # await managed_browser.start() + # 1. ── Start the browser ───────────────────────────────────────── + cdp_url = await managed_browser.start() + + # 2. ── Attach Playwright to that running Chrome ────────────────── + pw = await async_playwright().start() + browser = await pw.chromium.connect_over_cdp(cdp_url) + # Grab the existing default context (there is always one) + context = browser.contexts[0] # Check if browser started successfully browser_process = managed_browser.browser_process @@ -244,6 +254,18 @@ class BrowserProfiler: except asyncio.CancelledError: pass + # 3. 
── Persist storage state *before* we kill Chrome ───────────── + state_file = os.path.join(profile_path, "storage_state.json") + try: + await context.storage_state(path=state_file) + self.logger.info(f"[PROFILE].i storage_state saved → {state_file}", tag="PROFILE") + except Exception as e: + self.logger.warning(f"[PROFILE].w failed to save storage_state: {e}", tag="PROFILE") + + # 4. ── Close everything cleanly ────────────────────────────────── + await browser.close() + await pw.stop() + # If the browser is still running and the user pressed 'q', terminate it if browser_process.poll() is None and user_done_event.is_set(): self.logger.info("Terminating browser process...", tag="PROFILE")