fix(browser_manager): serialize new_page on persistent context to avoid races ref #1198

- Add _page_lock and guarded page creation; handle an empty context.pages safely
- Prevents BrowserContext.new_page “Target page/context closed” errors during concurrent arun_many
2025-08-11 18:55:43 +08:00
parent a5bcac4c9d
commit 96c4b0de67
2 changed files with 63 additions and 2 deletions
--- a/crawl4ai/browser_manager.py
+++ b/crawl4ai/browser_manager.py
@@ -608,6 +608,11 @@ class BrowserManager:
        self.contexts_by_config = {}
        self._contexts_lock = asyncio.Lock()
        # Serialize context.new_page() across concurrent tasks to avoid races
        # when using a shared persistent context (context.pages may be empty
        # for all racers). Prevents 'Target page/context closed' errors.
        self._page_lock = asyncio.Lock()
        # Stealth-related attributes
        self._stealth_instance = None
        self._stealth_cm = None 
@@ -1027,13 +1032,26 @@ class BrowserManager:
                context = await self.create_browser_context(crawlerRunConfig)
                ctx = self.default_context        # default context, one window only
                ctx = await clone_runtime_state(context, ctx, crawlerRunConfig, self.config)
-                page = await ctx.new_page()
+                # Avoid concurrent new_page on shared persistent context
                # See GH-1198: context.pages can be empty under races
                async with self._page_lock:
                    page = await ctx.new_page()
            else:
                context = self.default_context
                pages = context.pages
                page = next((p for p in pages if p.url == crawlerRunConfig.url), None)
                if not page:
-                    page = context.pages[0] # await context.new_page()
+                    if pages:
                        page = pages[0]
                    else:
                        # Double-check under lock to avoid TOCTOU and ensure only
                        # one task calls new_page when pages=[] concurrently
                        async with self._page_lock:
                            pages = context.pages
                            if pages:
                                page = pages[0]
                            else:
                                page = await context.new_page()
        else:
            # Otherwise, check if we have an existing context for this config
            config_signature = self._make_config_signature(crawlerRunConfig)
--- a/tests/general/test_persistent_context.py
+++ b/tests/general/test_persistent_context.py
@@ -0,0 +1,43 @@
 import asyncio
 import os
 from crawl4ai.async_webcrawler import AsyncWebCrawler
 from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, CacheMode
 # Simple concurrency test for persistent context page creation
 # Usage: python tests/general/test_persistent_context.py
 # Target pages handed to crawler.arun_many() in main() as a single batch.
 URLS = [
    # "https://example.com",
    "https://httpbin.org/html",
    "https://www.python.org/",
    "https://www.rust-lang.org/",
 ]
 async def main():
    """Crawl every entry of URLS in one arun_many() call on a persistent profile.

    Ensures the profile directory under ~/.crawl4ai/profiles exists, opens a
    headless managed Chromium with a persistent context pointing at it, and
    runs the batch with the cache bypassed and streaming disabled. For each
    result it prints the URL, the success flag, and the length of the raw
    markdown (0 when the result carries no markdown).
    """
    home = os.path.expanduser("~")
    user_profile = os.path.join(home, ".crawl4ai", "profiles", "test-persistent-profile")
    os.makedirs(user_profile, exist_ok=True)
    # Persistent context + managed browser is the combination this script exercises.
    browser_options = dict(
        browser_type="chromium",
        headless=True,
        use_persistent_context=True,
        user_data_dir=user_profile,
        use_managed_browser=True,
        verbose=True,
    )
    browser_config = BrowserConfig(**browser_options)
    run_options = dict(cache_mode=CacheMode.BYPASS, stream=False, verbose=True)
    run_cfg = CrawlerRunConfig(**run_options)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        outcomes = await crawler.arun_many(URLS, config=run_cfg)
        for outcome in outcomes:
            url, succeeded = outcome.url, outcome.success
            if outcome.markdown:
                markdown_len = len(outcome.markdown.raw_markdown)
            else:
                markdown_len = 0
            print(url, succeeded, markdown_len)
        # Single-URL variant for debugging: swap the batch call above for
        # crawler.arun(url=URLS[0], config=run_cfg) and print that one result.
 # Script entry point: run main() on a fresh event loop only when this file
 # is executed directly, not when it is imported.
 if __name__ == "__main__":
    asyncio.run(main())